From b3d9ed3fd872fc074286674ae8595ee880938bbf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 8 Sep 2015 15:00:59 -0700 Subject: sparc32: do not include swap.h from pgtable_32.h "memcg: export struct mem_cgroup" will add includes into linux/memcontrol.h which lead to further header dependency issues as reported by Guenter Roeck: In file included from include/linux/highmem.h:7:0, from include/linux/bio.h:23, from include/linux/writeback.h:192, from include/linux/memcontrol.h:30, from include/linux/swap.h:8, from ./arch/sparc/include/asm/pgtable_32.h:17, from ./arch/sparc/include/asm/pgtable.h:6, from arch/sparc/kernel/traps_32.c:23: include/linux/mm.h: In function 'is_vmalloc_addr': include/linux/mm.h:371:17: error: 'VMALLOC_START' undeclared (first use in this function) include/linux/mm.h:371:17: note: each undeclared identifier is reported only once for each function it appears in include/linux/mm.h:371:41: error: 'VMALLOC_END' undeclared (first use in this function) include/linux/mm.h: In function 'maybe_mkwrite': include/linux/mm.h:556:3: error: implicit declaration of function 'pte_mkwrite' The issue is that pgtable_32.h depends on swap.h to get swap_entry_t but that goes all the way down to linux/mm.h which wants to have VMALLOC_* which is defined later in pgtable_32.h, though. swap_entry_t is defined in include/mm_types.h so it should be sufficient to include this header without more dependencies. Signed-off-by: Michal Hocko Reported-by: Guenter Roeck Tested-by: Guenter Roeck Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_32.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sparc/include/asm/pgtable_32.h b/arch/sparc/include/asm/pgtable_32.h index f06b36a00a3b..91b963a887b7 100644 --- a/arch/sparc/include/asm/pgtable_32.h +++ b/arch/sparc/include/asm/pgtable_32.h @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include -- cgit v1.2.3 From 95cf82ecc1fcb44df1768162343cc8eb88083b86 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 8 Sep 2015 15:02:03 -0700 Subject: mem-hotplug: handle node hole when initializing numa_meminfo. When parsing SRAT, all memory ranges are added into numa_meminfo. In numa_init(), before entering numa_cleanup_meminfo(), all possible memory ranges are in numa_meminfo. And numa_cleanup_meminfo() removes all ranges over max_pfn or empty. But, this only works if the nodes are continuous. Let's have a look at the following example: We have an SRAT like this: SRAT: Node 0 PXM 0 [mem 0x00000000-0x5fffffff] SRAT: Node 0 PXM 0 [mem 0x100000000-0x1ffffffffff] SRAT: Node 1 PXM 1 [mem 0x20000000000-0x3ffffffffff] SRAT: Node 4 PXM 2 [mem 0x40000000000-0x5ffffffffff] hotplug SRAT: Node 5 PXM 3 [mem 0x60000000000-0x7ffffffffff] hotplug SRAT: Node 2 PXM 4 [mem 0x80000000000-0x9ffffffffff] hotplug SRAT: Node 3 PXM 5 [mem 0xa0000000000-0xbffffffffff] hotplug SRAT: Node 6 PXM 6 [mem 0xc0000000000-0xdffffffffff] hotplug SRAT: Node 7 PXM 7 [mem 0xe0000000000-0xfffffffffff] hotplug On boot, only node 0,1,2,3 exist. And the numa_meminfo will look like this: numa_meminfo.nr_blks = 9 1. on node 0: [0, 60000000] 2. on node 0: [100000000, 20000000000] 3. on node 1: [20000000000, 40000000000] 4. on node 4: [40000000000, 60000000000] 5. on node 5: [60000000000, 80000000000] 6. on node 2: [80000000000, a0000000000] 7. on node 3: [a0000000000, a0800000000] 8. on node 6: [c0000000000, a0800000000] 9. on node 7: [e0000000000, a0800000000] And numa_cleanup_meminfo() will merge 1 and 2, and remove 8,9 because the end address is over max_pfn, which is a0800000000. But 4 and 5 are not removed because their end addresses are less then max_pfn. But in fact, node 4 and 5 don't exist. In a word, numa_cleanup_meminfo() is not able to handle holes between nodes. Since memory ranges in node 4 and 5 are in numa_meminfo, in numa_register_memblks(), node 4 and 5 will be mistakenly set to online. If you run lscpu, it will show: NUMA node0 CPU(s): 0-14,128-142 NUMA node1 CPU(s): 15-29,143-157 NUMA node2 CPU(s): NUMA node3 CPU(s): NUMA node4 CPU(s): 62-76,190-204 NUMA node5 CPU(s): 78-92,206-220 In this patch, we use memblock_overlaps_region() to check if ranges in numa_meminfo overlap with ranges in memory_block. Since memory_block contains all available memory at boot time, if they overlap, it means the ranges exist. If not, then remove them from numa_meminfo. After this patch, lscpu will show: NUMA node0 CPU(s): 0-14,128-142 NUMA node1 CPU(s): 15-29,143-157 NUMA node4 CPU(s): 62-76,190-204 NUMA node5 CPU(s): 78-92,206-220 Signed-off-by: Tang Chen Reviewed-by: Yasuaki Ishimatsu Cc: Thomas Gleixner Cc: Tejun Heo Cc: Luiz Capitulino Cc: Xishi Qiu Cc: Will Deacon Cc: Vladimir Murzin Cc: Fabian Frederick Cc: Alexander Kuleshov Cc: Baoquan He Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 4053bb58bf92..c3b3f653ed0c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -246,8 +246,10 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty block */ - if (bi->start >= bi->end) + /* and there's no empty or non-exist block */ + if (bi->start >= bi->end || + !memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) numa_remove_memblk_from(i--, mi); } -- cgit v1.2.3 From 1570f0d7ab425c1e0905715bf9cc98b2a82e723f Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Tue, 8 Sep 2015 15:03:04 -0700 Subject: arm64: support initrd outside kernel linear map The use of mem= could leave part or all of the initrd outside of the kernel linear map. This will lead to an error when unpacking the initrd and a probable failure to boot. This patch catches that situation and relocates the initrd to be fully within the linear map. Signed-off-by: Mark Salter Acked-by: Will Deacon Cc: Catalin Marinas Cc: Arnd Bergmann Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Russell King Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/setup.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 62 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 888478881243..6bab21f84a9f 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -339,6 +339,67 @@ static void __init request_standard_resources(void) } } +#ifdef CONFIG_BLK_DEV_INITRD +/* + * Relocate initrd if it is not completely within the linear mapping. + * This would be the case if mem= cuts out all or part of it. + */ +static void __init relocate_initrd(void) +{ + phys_addr_t orig_start = __virt_to_phys(initrd_start); + phys_addr_t orig_end = __virt_to_phys(initrd_end); + phys_addr_t ram_end = memblock_end_of_DRAM(); + phys_addr_t new_start; + unsigned long size, to_free = 0; + void *dest; + + if (orig_end <= ram_end) + return; + + /* + * Any of the original initrd which overlaps the linear map should + * be freed after relocating. + */ + if (orig_start < ram_end) + to_free = ram_end - orig_start; + + size = orig_end - orig_start; + + /* initrd needs to be relocated completely inside linear mapping */ + new_start = memblock_find_in_range(0, PFN_PHYS(max_pfn), + size, PAGE_SIZE); + if (!new_start) + panic("Cannot relocate initrd of size %ld\n", size); + memblock_reserve(new_start, size); + + initrd_start = __phys_to_virt(new_start); + initrd_end = initrd_start + size; + + pr_info("Moving initrd from [%llx-%llx] to [%llx-%llx]\n", + orig_start, orig_start + size - 1, + new_start, new_start + size - 1); + + dest = (void *)initrd_start; + + if (to_free) { + memcpy(dest, (void *)__phys_to_virt(orig_start), to_free); + dest += to_free; + } + + copy_from_early_mem(dest, orig_start + to_free, size - to_free); + + if (to_free) { + pr_info("Freeing original RAMDISK from [%llx-%llx]\n", + orig_start, orig_start + to_free - 1); + memblock_free(orig_start, to_free); + } +} +#else +static inline void __init relocate_initrd(void) +{ +} +#endif + u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID }; void __init setup_arch(char **cmdline_p) @@ -372,6 +433,7 @@ void __init setup_arch(char **cmdline_p) acpi_boot_table_init(); paging_init(); + relocate_initrd(); request_standard_resources(); early_ioremap_reset(); -- cgit v1.2.3 From 5dd2c4bded8776ee93c8f38b739fea531095067f Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Tue, 8 Sep 2015 15:03:07 -0700 Subject: x86: use generic early mem copy The early_ioremap library now has a generic copy_from_early_mem() function. Use the generic copy function for x86 relocate_initrd(). [akpm@linux-foundation.org: remove MAX_MAP_CHUNK define, per Yinghai Lu] Signed-off-by: Mark Salter Cc: Catalin Marinas Cc: Will Deacon Cc: Arnd Bergmann Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Russell King Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/setup.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b143c2d04420..baadbf90a7c5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -317,15 +317,12 @@ static u64 __init get_ramdisk_size(void) return ramdisk_size; } -#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) static void __init relocate_initrd(void) { /* Assume only end is not page aligned */ u64 ramdisk_image = get_ramdisk_image(); u64 ramdisk_size = get_ramdisk_size(); u64 area_size = PAGE_ALIGN(ramdisk_size); - unsigned long slop, clen, mapaddr; - char *p, *q; /* We need to move the initrd down into directly mapped mem */ relocated_ramdisk = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), @@ -343,25 +340,8 @@ static void __init relocate_initrd(void) printk(KERN_INFO "Allocated new RAMDISK: [mem %#010llx-%#010llx]\n", relocated_ramdisk, relocated_ramdisk + ramdisk_size - 1); - q = (char *)initrd_start; - - /* Copy the initrd */ - while (ramdisk_size) { - slop = ramdisk_image & ~PAGE_MASK; - clen = ramdisk_size; - if (clen > MAX_MAP_CHUNK-slop) - clen = MAX_MAP_CHUNK-slop; - mapaddr = ramdisk_image & PAGE_MASK; - p = early_memremap(mapaddr, clen+slop); - memcpy(q, p+slop, clen); - early_memunmap(p, clen+slop); - q += clen; - ramdisk_image += clen; - ramdisk_size -= clen; - } + copy_from_early_mem((void *)initrd_start, ramdisk_image, ramdisk_size); - ramdisk_image = get_ramdisk_image(); - ramdisk_size = get_ramdisk_size(); printk(KERN_INFO "Move RAMDISK from [mem %#010llx-%#010llx] to" " [mem %#010llx-%#010llx]\n", ramdisk_image, ramdisk_image + ramdisk_size - 1, -- cgit v1.2.3 From 96db800f5d73cd5c49461253d45766e094f0f8c2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 8 Sep 2015 15:03:50 -0700 Subject: mm: rename alloc_pages_exact_node() to __alloc_pages_node() alloc_pages_exact_node() was introduced in commit 6484eb3e2a81 ("page allocator: do not check NUMA node ID when the caller knows the node is valid") as an optimized variant of alloc_pages_node(), that doesn't fallback to current node for nid == NUMA_NO_NODE. Unfortunately the name of the function can easily suggest that the allocation is restricted to the given node and fails otherwise. In truth, the node is only preferred, unless __GFP_THISNODE is passed among the gfp flags. The misleading name has lead to mistakes in the past, see for example commits 5265047ac301 ("mm, thp: really limit transparent hugepage allocation to local node") and b360edb43f8e ("mm, mempolicy: migrate_to_node should only migrate to node"). Another issue with the name is that there's a family of alloc_pages_exact*() functions where 'exact' means exact size (instead of page order), which leads to more confusion. To prevent further mistakes, this patch effectively renames alloc_pages_exact_node() to __alloc_pages_node() to better convey that it's an optimized variant of alloc_pages_node() not intended for general usage. Both functions get described in comments. It has been also considered to really provide a convenience function for allocations restricted to a node, but the major opinion seems to be that __GFP_THISNODE already provides that functionality and we shouldn't duplicate the API needlessly. The number of users would be small anyway. Existing callers of alloc_pages_exact_node() are simply converted to call __alloc_pages_node(), with the exception of sba_alloc_coherent() which open-codes the check for NUMA_NO_NODE, so it is converted to use alloc_pages_node() instead. This means it no longer performs some VM_BUG_ON checks, and since the current check for nid in alloc_pages_node() uses a 'nid < 0' comparison (which includes NUMA_NO_NODE), it may hide wrong values which would be previously exposed. Both differences will be rectified by the next patch. To sum up, this patch makes no functional changes, except temporarily hiding potentially buggy callers. Restricting the checks in alloc_pages_node() is left for the next patch which can in turn expose more existing buggy callers. Signed-off-by: Vlastimil Babka Acked-by: Johannes Weiner Acked-by: Robin Holt Acked-by: Michal Hocko Acked-by: Christoph Lameter Acked-by: Michael Ellerman Cc: Mel Gorman Cc: David Rientjes Cc: Greg Thelen Cc: Aneesh Kumar K.V Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Naoya Horiguchi Cc: Tony Luck Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Gleb Natapov Cc: Paolo Bonzini Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Cliff Whickman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/sba_iommu.c | 6 +----- arch/ia64/kernel/uncached.c | 2 +- arch/ia64/sn/pci/pci_dma.c | 2 +- arch/powerpc/platforms/cell/ras.c | 2 +- arch/x86/kvm/vmx.c | 2 +- 5 files changed, 5 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 344387a55406..a6d6190c9d24 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1140,13 +1140,9 @@ sba_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, #ifdef CONFIG_NUMA { - int node = ioc->node; struct page *page; - if (node == NUMA_NO_NODE) - node = numa_node_id(); - - page = alloc_pages_exact_node(node, flags, get_order(size)); + page = alloc_pages_node(ioc->node, flags, get_order(size)); if (unlikely(!page)) return NULL; diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 20e8a9b21d75..f3976da36721 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -97,7 +97,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ - page = alloc_pages_exact_node(nid, + page = __alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d0853e8e8623..8f59907007cb 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -92,7 +92,7 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, */ node = pcibus_to_node(pdev->bus); if (likely(node >=0)) { - struct page *p = alloc_pages_exact_node(node, + struct page *p = __alloc_pages_node(node, flags, get_order(size)); if (likely(p)) diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index e865d748179b..2d4f60c0119a 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -123,7 +123,7 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_exact_node(area->nid, + area->pages = __alloc_pages_node(area->nid, GFP_KERNEL|__GFP_THISNODE, area->order); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4a4eec30cc08..148ea2016022 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3150,7 +3150,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); + pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); -- cgit v1.2.3