2023-06-11
原文作者:奇小葩 原文地址:https://blog.csdn.net/u012489236/category_9614673.html

众所周知,Linux内存管理的核心是伙伴系统(buddy system)。其实在linux启动的那一刻,内存管理就已经开始了,只不过不是buddy在管理。在内核中,实现物理内存管理的allocator包括:

  • 连续物理内存管理buddy allocator

  • 非连续物理内存管理vmalloc allocator

  • 小块物理内存管理slab allocator

  • 高端物理内存管理kmapper

  • 初始化阶段物理内存管理memblock

    在系统初始化阶段会先启用bootmem分配器和memblock分配器,它们是专门用于启动阶段的分配器。一个bootmem分配器管理着一个node结点的所有内存,也就是说在numa架构中多个node对应多个bootmem,它们被链入bdata_list链表中保存。而伙伴系统的初始化就是将bootmem管理的所有物理页框释放到伙伴系统中去,本章主要分析从bootmem过渡到buddy的整个流程。

1. 由mem_init开始

    /*
     * mem_init() - finish boot-time memory setup.
     *
     * In order: records max_mapnr, calls free_unused_memmap(),
     * free_all_bootmem() and free_highpages(), prints the memory summary via
     * mem_init_print_info(), then prints the kernel's virtual memory layout
     * and sanity-checks a few address-space boundaries.
     *
     * The (1)..(5) markers at the right-hand side are the article's callouts;
     * they are not part of the C source.
     */
    void __init mem_init(void)
    {
    #ifdef CONFIG_HAVE_TCM
    	/* These pointers are filled in on TCM detection */
    	extern u32 dtcm_end;
    	extern u32 itcm_end;
    #endif
    
    	set_max_mapnr(pfn_to_page(max_pfn) - mem_map);                                   --------------(1)
    
    	/* this will put all unused low memory onto the freelists */
    	free_unused_memmap();                                                            --------------(2)
    	free_all_bootmem();                                                              --------------(3)
    
    #ifdef CONFIG_SA1111
    	/* now that our DMA memory is actually so designated, we can free it */
    	free_reserved_area(__va(PHYS_OFFSET), swapper_pg_dir, -1, NULL);
    #endif
    
    	free_highpages();                                                               --------------(4)
    
    	mem_init_print_info(NULL);                                                      --------------(5)
    
    	/*
    	 * Each helper below expands to THREE printk arguments: base, top and
    	 * the size (top - base), expressed as
    	 *   MLK         - size >> 10,
    	 *   MLM         - size >> 20,
    	 *   MLK_ROUNDUP - size divided by SZ_1K, rounded up.
    	 * They are local to this function and #undef'd right after use.
    	 */
    #define MLK(b, t) b, t, ((t) - (b)) >> 10
    #define MLM(b, t) b, t, ((t) - (b)) >> 20
    #define MLK_ROUNDUP(b, t) b, t, DIV_ROUND_UP(((t) - (b)), SZ_1K)
    
    	/*
    	 * The conditional lines of the format string and the conditional
    	 * argument groups further down are guarded by the same CONFIG_*
    	 * symbols, so the two lists stay in step.
    	 */
    	pr_notice("Virtual kernel memory layout:\n"
    			"    vector  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
    #ifdef CONFIG_HAVE_TCM
    			"    DTCM    : 0x%08lx - 0x%08lx   (%4ld kB)\n"
    			"    ITCM    : 0x%08lx - 0x%08lx   (%4ld kB)\n"
    #endif
    			"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
    			"    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"
    			"    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"
    #ifdef CONFIG_HIGHMEM
    			"    pkmap   : 0x%08lx - 0x%08lx   (%4ld MB)\n"
    #endif
    #ifdef CONFIG_MODULES
    			"    modules : 0x%08lx - 0x%08lx   (%4ld MB)\n"
    #endif
    			"      .text : 0x%p" " - 0x%p" "   (%4td kB)\n"
    			"      .init : 0x%p" " - 0x%p" "   (%4td kB)\n"
    			"      .data : 0x%p" " - 0x%p" "   (%4td kB)\n"
    			"       .bss : 0x%p" " - 0x%p" "   (%4td kB)\n",
    
    			MLK(UL(CONFIG_VECTORS_BASE), UL(CONFIG_VECTORS_BASE) +
    				(PAGE_SIZE)),
    #ifdef CONFIG_HAVE_TCM
    			MLK(DTCM_OFFSET, (unsigned long) dtcm_end),
    			MLK(ITCM_OFFSET, (unsigned long) itcm_end),
    #endif
    			MLK(FIXADDR_START, FIXADDR_END),
    			MLM(VMALLOC_START, VMALLOC_END),
    			MLM(PAGE_OFFSET, (unsigned long)high_memory),
    #ifdef CONFIG_HIGHMEM
    			MLM(PKMAP_BASE, (PKMAP_BASE) + (LAST_PKMAP) *
    				(PAGE_SIZE)),
    #endif
    #ifdef CONFIG_MODULES
    			MLM(MODULES_VADDR, MODULES_END),
    #endif
    
    			MLK_ROUNDUP(_text, _etext),
    			MLK_ROUNDUP(__init_begin, __init_end),
    			MLK_ROUNDUP(_sdata, _edata),
    			MLK_ROUNDUP(__bss_start, __bss_stop));
    
    #undef MLK
    #undef MLM
    #undef MLK_ROUNDUP
    
    	/*
    	 * Check boundaries twice: Some fundamental inconsistencies can
    	 * be detected at build time already.
    	 */
    #ifdef CONFIG_MMU
    	BUILD_BUG_ON(TASK_SIZE				> MODULES_VADDR);
    	BUG_ON(TASK_SIZE 				> MODULES_VADDR);
    #endif
    
    #ifdef CONFIG_HIGHMEM
    	/* The pkmap window must end at or below PAGE_OFFSET. */
    	BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE > PAGE_OFFSET);
    	BUG_ON(PKMAP_BASE + LAST_PKMAP * PAGE_SIZE	> PAGE_OFFSET);
    #endif
    
    	/* Only applies with pages of at least 16 KiB and at most 128 of them. */
    	if (PAGE_SIZE >= 16384 && get_num_physpages() <= 128) {
    		extern int sysctl_overcommit_memory;
    		/*
    		 * On a machine this small we won't get
    		 * anywhere without overcommit, so turn
    		 * it on by default.
    		 */
    		sysctl_overcommit_memory = OVERCOMMIT_ALWAYS;
    	}
    }
  • 函数set_max_mapnr()用于设置max_mapnr,其值为pfn_to_page(max_pfn) - mem_map,即mem_map覆盖的页框总数,对应实际物理内存大小
  • free_unused_memmap将物理上不存在的页(hole)在页管理位图中全部记录为"不使用"。
  • 在页管理位图中记录为"不使用"后,free_all_bootmem函数进行释放,使其能够在伙伴系统中管理空白页。
  • free_highpages将高端内存区域释放到伙伴系统,使其能管理空白页
  • mem_init_print_info()是把内核映像的各个段地址打印出来,后面主要是将整个内核空间的虚拟映射空间打印出来,对于我们现在使用的开发板其打印信息如下

202306111242374521.png

2. 空闲内存释放

函数free_unused_memmap()和free_all_bootmem()都是把空闲内存释放到伙伴系统,前者释放memblock中空闲内存,后者释放bootmem中内存。

    /*
     * free_unused_memmap() - walk the memblock "memory" regions in address
     * order and call free_memmap() on every pfn gap between the (aligned) end
     * of one region and the (aligned) start of the next.
     *
     * free_memmap() itself is not shown here; presumably it releases the
     * struct page array that backs the given pfn range - TODO confirm.
     */
    static void __init free_unused_memmap(void)
    {
    	/* prev_end == 0 doubles as "no region seen yet". */
    	unsigned long start, prev_end = 0;
    	struct memblock_region *reg;
    
    	/*
    	 * This relies on each bank being in address order.
    	 * The banks are sorted previously in bootmem_init().
    	 */
    	for_each_memblock(memory, reg) {
    		start = memblock_region_memory_base_pfn(reg);
    
    #ifdef CONFIG_SPARSEMEM
    		/*
    		 * Cap the gap's upper bound at the section boundary that
    		 * follows prev_end.
    		 */
    		start = min(start,
    				 ALIGN(prev_end, PAGES_PER_SECTION));
    #else
    		/* Round the region start down to a MAX_ORDER_NR_PAGES multiple. */
    		start = round_down(start, MAX_ORDER_NR_PAGES);
    #endif
    
    		/* Skip the first region (prev_end == 0) and empty gaps. */
    		if (prev_end && prev_end < start)
    			free_memmap(prev_end, start);
            
    		/* Remember where this region ends, rounded up to MAX_ORDER_NR_PAGES. */
    		prev_end = ALIGN(memblock_region_memory_end_pfn(reg),
    				 MAX_ORDER_NR_PAGES);
    	}
    
    #ifdef CONFIG_SPARSEMEM
    	/*
    	 * After the last region: if prev_end is not section-aligned, also
    	 * handle the tail up to the next section boundary.
    	 */
    	if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION))
    		free_memmap(prev_end,
    			    ALIGN(prev_end, PAGES_PER_SECTION));
    #endif
    }

该函数主要是遍历memblock的memory区域,对于IMX开发板,其reg的起始地址为0x80000000,得到对应的start为0x80000,此时prev_end为0,所以不满足调用free_memmap的条件,之后拿到的prev_end为0xa0000,而该开发板只有一片内存,所以memblock中没有相应的空洞需要释放。系统在分配内存节点的mem_map时是按照这个内存节点起始地址到末尾地址分配的,这个地址空间中可能有空洞,这些空洞地址对应的page数据结构是可以释放掉的,如下图所示

202306111242389532.png

下面我们来看看bootmem的释放,首先我们来看看bootmem的struct bootmem_data结构:

    /*
     * Per-node descriptor of the boot-time (bootmem) allocator.
     * Field meanings follow the table given right after this listing.
     */
    typedef struct bootmem_data {
    	unsigned long node_min_pfn;	/* first page frame number of this memory block */
    	unsigned long node_low_pfn;	/* last page frame number; on 32-bit the last pfn of ZONE_NORMAL */
    	void *node_bootmem_map;		/* bitmap, one bit per page frame below ZONE_HIGHMEM */
    	unsigned long last_end_off;	/* offset recorded by the previous allocation */
    	unsigned long hint_idx;		/* NOTE(review): not described in the article - confirm meaning */
    	struct list_head list;		/* links this descriptor into bdata_list */
    } bootmem_data_t;
结构体成员 含义
node_min_pfn 此块内存开始页框号
node_low_pfn 此块内存结束页框号,如果是32位系统下此保存的是ZONE_NORMAL最后一个页框号
node_bootmem_map 指向位图内存区,node中所有ZONE_HIGHMEM之前的页框都在这里面有一个位,每次需要分配内存时就会扫描找出一个空闲页框,空洞的内存也会占用位,不过空洞的内存应该设置为已分配
last_end_off 上次分配距离末尾的偏移量
hint_idx
list 链入bdata_list结构链表

bootmem分配器的核心就是node_bootmem_map这个位图,每一位代表这个node的一个页,当需要分配时,就会去扫描这个位图,然后获得一段物理页框进行分配,一般都会从开始处向后分配。而伙伴系统初始化时会根据这个位图,将位图中空闲的页释放回伙伴系统,而已经分配出去的页则不会在初始化阶段释放回伙伴系统,不过有可能在运行过程中释放回伙伴系统。对于支持memblock的内核,由于内核配置了CONFIG_NO_BOOTMEM,free_all_bootmem的实现在mm/nobootmem.c中,具体实现如下:

    /*
     * free_all_bootmem() - release the free low memory found by
     * free_low_memory_core_early() and account for it in totalram_pages.
     *
     * Return: the number of pages reported by free_low_memory_core_early().
     *
     * The (1)/(2) markers at the right-hand side are the article's callouts;
     * they are not part of the C source.
     */
    unsigned long __init free_all_bootmem(void)
    {
    	unsigned long pages;
    
    	/* Per the article: zeroes managed_pages of every zone on every node. */
    	reset_all_zones_managed_pages();                                               ---------------(1)
    
    	pages = free_low_memory_core_early();                                          ---------------(2)
    	totalram_pages += pages;
    
    	return pages;
    }
  • 设置所有node的所有zone的managed_pages为0,该函数只会启动时候调用一次
  • 遍历所有需要释放的启动内存数据块,释放bdata启动内存块中所有页框到页框分配器中,计算所有的内存页数据,存储在totalram_pages中,并返回总共释放的页数量。

继续看free_low_memory_core_early,其主要的实现如下所示

    /*
     * free_low_memory_core_early() - two passes over memblock:
     *   1. every reserved region is handed to reserve_bootmem_region();
     *   2. every free range is handed to __free_memory_core().
     *
     * Return: the sum of the values returned by __free_memory_core().
     */
    static unsigned long __init free_low_memory_core_early(void)
    {
    	unsigned long count = 0;
    	phys_addr_t start, end;
    	u64 i;
    
    	/*
    	 * NOTE(review): presumably clears the hotplug flag over the whole
    	 * address range (base 0, size -1) - helper not shown, confirm.
    	 */
    	memblock_clear_hotplug(0, -1);
    
    	/* Pass 1: reserved regions. */
    	for_each_reserved_mem_region(i, &start, &end)
    		reserve_bootmem_region(start, end);
    
    	/*
    	 * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
    	 *  because in some case like Node0 doesn't have RAM installed
    	 *  low ram will be on Node1
    	 */
    	/* Pass 2: free ranges; accumulate what __free_memory_core() reports. */
    	for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
    				NULL)
    		count += __free_memory_core(start, end);
    
    	return count;
    }
  • 遍历memblock.reserved类型的regions,对每个regions设置页面属性为Reserved,对于Imx,这个reserved区域为
        [root@qemu_imx6ul:~]# cat /sys/kernel/debug/memblock/reserved 
           0: 0x0000000080003000..0x0000000080007fff
           1: 0x0000000080200000..0x00000000810e8eeb
           2: 0x0000000088000000..0x0000000088014303
           3: 0x000000008bad3000..0x000000008bb40fff
           4: 0x000000008bb413c0..0x000000008bb433bf
           5: 0x000000008bb433f4..0x000000008bffefff
           6: 0x000000008bfff740..0x000000008bfff77b
           7: 0x000000008bfff780..0x000000008bfff7bb
           8: 0x000000008bfff7c0..0x000000008bfff837
           9: 0x000000008bfff840..0x000000008bfff843
          10: 0x000000008bfff880..0x000000008bfff883
          11: 0x000000008bfff8c0..0x000000008bfff8c3
          12: 0x000000008bfff900..0x000000008bfff903
          13: 0x000000008bfff940..0x000000008bfff9a1
          14: 0x000000008bfff9c0..0x000000008bfffa21
          15: 0x000000008bfffa40..0x000000008bfffaa1
          16: 0x000000008bfffaac..0x000000008bfffac6
          17: 0x000000008bfffac8..0x000000008bfffae2
          18: 0x000000008bfffae4..0x000000008bfffb5e
          19: 0x000000008bfffb60..0x000000008bfffb7a
          20: 0x000000008bfffb7c..0x000000008bfffb96
          21: 0x000000008bfffb98..0x000000008bfffbb2
          22: 0x000000008bfffbb4..0x000000008bfffbce
          23: 0x000000008bfffbd0..0x000000008bfffbea
          24: 0x000000008bfffbec..0x000000008bfffc06
          25: 0x000000008bfffc08..0x000000008bfffc22
          26: 0x000000008bfffc24..0x000000008bfffccc
          27: 0x000000008bfffcd0..0x000000008bfffce8
          28: 0x000000008bfffcec..0x000000008bfffd04
          29: 0x000000008bfffd08..0x000000008bfffd20
          30: 0x000000008bfffd24..0x000000008bfffd3c
          31: 0x000000008bfffd40..0x000000008bfffd5c
          32: 0x000000008bfffd60..0x000000008bfffd7c
          33: 0x000000008bfffd80..0x000000008bfffdc7
          34: 0x000000008bfffdd8..0x000000009fffffff
  • 遍历所有在memblock.memory中,但是不在memblock.reserved中的regions。然后清Reserved页面属性

下面重点看看页面是如何完成reserved的配置,其代码如下,主要是对reserved区域内的每个有效页框,初始化其page->lru链表头,并将page->flags中的PG_reserved位置位,即把该页标记为保留页。

    /*
     * reserve_bootmem_region() - flag the page frames of a reserved range.
     *
     * Covers every pfn from PFN_DOWN(start) up to, but not including,
     * PFN_UP(end). Each valid frame is passed to init_reserved_page(), gets
     * its lru list head initialised and has PG_reserved set; invalid frames
     * are skipped.
     */
    void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
    {
    	unsigned long pfn = PFN_DOWN(start);
    	const unsigned long stop_pfn = PFN_UP(end);
    
    	while (pfn < stop_pfn) {
    		const unsigned long cur = pfn++;
    		struct page *pg;
    
    		if (!pfn_valid(cur))
    			continue;
    
    		pg = pfn_to_page(cur);
    		init_reserved_page(cur);
    
    		/* Avoid false-positive PageTail() */
    		INIT_LIST_HEAD(&pg->lru);
    
    		SetPageReserved(pg);
    	}
    }

我们看看重点的__free_memory_core,它由free_low_memory_core_early对每一段在memblock.memory中、但不在memblock.reserved中的空闲区域调用,对于imx的开发板,这些区域的信息如下

    0: 0x0000000080000000 .. 0x0000000080003000
    1: 0x0000000080008000 .. 0x0000000080200000
    2: 0x00000000810e9000 .. 0x0000000088000000
    3: 0x0000000088015000 .. 0x000000008bad3000
    4: 0x000000008bb41000 .. 0x000000008bb41000
    5: 0x000000008bfff000 .. 0x000000008bfff000
    /*
     * __free_memory_core() - release the whole page frames inside a
     * physical range.
     *
     * The range is shrunk inwards to page boundaries (PFN_UP on the start,
     * PFN_DOWN on the end) and its end is capped at max_low_pfn.
     *
     * Return: number of page frames handed to __free_pages_memory(), or 0 if
     * the adjusted range is inverted.
     */
    static unsigned long __init __free_memory_core(phys_addr_t start,
    				 phys_addr_t end)
    {
    	unsigned long first_pfn = PFN_UP(start);
    	unsigned long limit_pfn = min_t(unsigned long,
    					PFN_DOWN(end), max_low_pfn);
    
    	if (first_pfn <= limit_pfn) {
    		__free_pages_memory(first_pfn, limit_pfn);
    		return limit_pfn - first_pfn;
    	}
    
    	return 0;
    }

核心的__free_pages_memory函数,该函数以阶(order)为单位释放页,清空各页的PG_reserved位,将page->_refcount设置为0后,调用__free_pages,代码实现为

    /*
     * __free_pages() - drop one reference to @page and free it if that was
     * the last one.
     *
     * When put_page_testzero() reports true, a request with order 0 goes to
     * free_hot_cold_page(); any other order goes to __free_pages_ok().
     * Otherwise nothing is freed.
     */
    void __free_pages(struct page *page, unsigned int order)
    {
    	if (!put_page_testzero(page))
    		return;
    
    	if (order != 0) {
    		__free_pages_ok(page, order);
    		return;
    	}
    
    	free_hot_cold_page(page, false);
    }
  • 首先检查页的引用计数page->_refcount减1后的值是否为0

  • 引用计数减为0后,若order为0,即释放单个页时调用free_hot_cold_page,否则调用__free_pages_ok,以阶(order)为单位释放页

    该过程比较复杂,涉及到伙伴系统的一些算法,先留一个疑问,后面深入分析下。

    3. 高端内存释放

        /*
         * free_highpages() - pass the non-reserved parts of every memory
         * region above max_low_pfn to free_area_high().
         *
         * The whole body is compiled only with CONFIG_HIGHMEM; without it the
         * function is empty.
         */
        static void __init free_highpages(void)
        {
        #ifdef CONFIG_HIGHMEM
        	unsigned long max_low = max_low_pfn;
        	struct memblock_region *mem, *res;
        
        	/* set highmem page free */
        	for_each_memblock(memory, mem) {
        		unsigned long start = memblock_region_memory_base_pfn(mem);
        		unsigned long end = memblock_region_memory_end_pfn(mem);
        		/* Ignore complete lowmem entries */
        		if (end <= max_low)
        			continue;
        
        		/* Regions for which memblock_is_nomap() is true are skipped. */
        		if (memblock_is_nomap(mem))
        			continue;
        
        		/* Truncate partial highmem entries */
        		if (start < max_low)
        			start = max_low;
        
        		/*
        		 * From here on 'start' acts as a cursor: everything in
        		 * [start, end) that precedes the next reserved region
        		 * is freed, then the cursor jumps past that region.
        		 * NOTE(review): this appears to assume the reserved
        		 * regions are visited in ascending order - confirm.
        		 */
        		/* Find and exclude any reserved regions */
        		for_each_memblock(reserved, res) {
        			unsigned long res_start, res_end;
        
        			res_start = memblock_region_reserved_base_pfn(res);
        			res_end = memblock_region_reserved_end_pfn(res);
        
        			/* Reserved region ends below the cursor: ignore it. */
        			if (res_end < start)
        				continue;
        			/* Clamp the reserved region to [start, end]. */
        			if (res_start < start)
        				res_start = start;
        			if (res_start > end)
        				res_start = end;
        			if (res_end > end)
        				res_end = end;
        			/* Free the gap between the cursor and the reserved region. */
        			if (res_start != start)
        				free_area_high(start, res_start);
        			/* Move the cursor past the reserved region. */
        			start = res_end;
        			if (start == end)
        				break;
        		}
        
        		/* And now free anything which remains */
        		if (start < end)
        			free_area_high(start, end);
        	}
        #endif
        }
存在高端内存时,该代码求出高端内存的起始页帧和尾页帧,然后调用free_area_high函数使伙伴系统管理这些空闲页,free_area_high函数在内部调用__free_page函数,将这些空闲页和一般内存区域的页一样释放到伙伴系统。

# 4. 总结 #

本章的mem_init()函数结束了启动阶段的内存分配器memblock和bootmem的使命,将bootmem和memblock管理的空闲页以阶(order)为单位构建成空闲链表,构建好的伙伴系统将为Linux的内存分配器slab提供空闲页。
阅读全文