From 7ecd19cfdfcbb625cc059dfa5b267d2436732c1c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:41 -0800 Subject: mm: percpu: generalize percpu related config Patch series "mm: percpu: Cleanup percpu first chunk function". When adding support for the page mapping percpu first chunk allocator on arm64, we found a lot of duplicated code in the percpu embed/page first chunk allocators. This patchset aims to clean that up and should introduce no functional change. The current support status for 'embed' and 'page' across architectures is shown below, embed: NEED_PER_CPU_EMBED_FIRST_CHUNK page: NEED_PER_CPU_PAGE_FIRST_CHUNK embed page ------------------------ arm64 Y Y mips Y N powerpc Y Y riscv Y N sparc Y Y x86 Y Y ------------------------ There are two interfaces to the percpu first chunk allocator, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); The pcpu_fc_alloc_fn_t/pcpu_fc_free_fn_t callbacks are killed; generic pcpu_fc_alloc() and pcpu_fc_free() functions are provided instead and called from pcpu_embed/page_first_chunk(). 1) For pcpu_embed_first_chunk(), a pcpu_fc_cpu_to_node_fn_t callback needs to be provided when the arch supports NUMA. 2) For pcpu_page_first_chunk(), the pcpu_fc_populate_pte_fn_t callback is killed too; a generic pcpu_populate_pte() marked '__weak' is provided, and an arch that needs a different pte-population function (like x86) can supply its own implementation. [1] https://github.com/kevin78/linux.git percpu-cleanup This patch (of 4): The HAVE_SETUP_PER_CPU_AREA/NEED_PER_CPU_EMBED_FIRST_CHUNK/ NEED_PER_CPU_PAGE_FIRST_CHUNK/USE_PERCPU_NUMA_NODE_ID configs have duplicate definitions on the platforms that subscribe to them. Move them into mm, drop the redundant definitions and instead just select them on the applicable platforms. Link: https://lkml.kernel.org/r/20211216112359.103822-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20211216112359.103822-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Catalin Marinas [arm64] Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Dennis Zhou Cc: Greg Kroah-Hartman Cc: "Rafael J.
Wysocki" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..9b5de3f54158 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -432,6 +432,18 @@ config NEED_PER_CPU_KM bool default y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + bool + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + bool + +config USE_PERCPU_NUMA_NODE_ID + bool + +config HAVE_SETUP_PER_CPU_AREA + bool + config CLEANCACHE bool "Enable cleancache driver to cache clean pages if tmem is present" help -- cgit v1.2.3 From 1ca3fb3abd2b615c4b61728de545760a6e2c2d8b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:45 -0800 Subject: mm: percpu: add pcpu_fc_cpu_to_node_fn_t typedef Add pcpu_fc_cpu_to_node_fn_t and pass it into pcpu_fc_alloc_fn_t, pcpu first chunk allocation will call it to alloc memblock on the corresponding node by it, this is prepare for the next patch. Link: https://lkml.kernel.org/r/20211216112359.103822-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 12 +++++++++--- arch/powerpc/kernel/setup_64.c | 15 +++++++++++---- arch/sparc/kernel/smp_64.c | 13 ++++++++++--- arch/x86/kernel/setup_percpu.c | 18 +++++++++++++----- drivers/base/arch_numa.c | 8 +++++--- include/linux/percpu.h | 7 +++++-- mm/percpu.c | 14 +++++++++----- 7 files changed, 62 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 325e1552cbea..1d8f2844704c 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -519,12 +519,17 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(cpu_to_node(from), cpu_to_node(to)); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, - size_t align) +static int __init pcpu_cpu_to_node(int cpu) +{ + return cpu_to_node(cpu); +} + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, - cpu_to_node(cpu)); + cpu_to_nd_fn(cpu)); } static void __init pcpu_fc_free(void *ptr, size_t size) @@ -545,6 +550,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) panic("Failed to initialize percpu areas."); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6052f5d5ded3..b79b10ae466f 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -784,12 +784,12 @@ void __init emergency_stack_init(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure. 
*/ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA - int node = early_cpu_to_node(cpu); + int node = cpu_to_nd_fn(cpu); void *ptr; if (!node_online(node) || !NODE_DATA(node)) { @@ -823,6 +823,11 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } +static __init int pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -891,6 +896,7 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_PAGE) { rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_alloc_bootmem, pcpu_free_bootmem); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " @@ -899,7 +905,8 @@ } if (rc < 0) - rc = pcpu_page_first_chunk(0, pcpu_alloc_bootmem, pcpu_free_bootmem, + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, + pcpu_alloc_bootmem, pcpu_free_bootmem, pcpu_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index b98a7bbe6728..14d719aa318d 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1539,12 +1539,12 @@ void smp_send_stop(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA - int node = cpu_to_node(cpu); + int node = cpu_to_nd_fn(cpu); void *ptr; if (!node_online(node) || !NODE_DATA(node)) { @@ -1578,6 +1578,11 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } +static int __init pcpu_cpu_to_node(int cpu) +{ + return cpu_to_node(cpu); +} + static void __init pcpu_populate_pte(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); @@ -1641,6 +1646,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, 4 << 20, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_alloc_bootmem, pcpu_free_bootmem); if (rc) @@ -1650,6 +1656,7 @@ } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + pcpu_cpu_to_node, pcpu_alloc_bootmem, pcpu_free_bootmem, pcpu_populate_pte); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7b65275544b2..1d41f4844149 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -97,12 +97,12 @@ static bool __init pcpu_need_numa(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure.
*/ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA - int node = early_cpu_to_node(cpu); + int node = cpu_to_nd_fn(cpu); void *ptr; if (!node_online(node) || !NODE_DATA(node)) { @@ -128,9 +128,10 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, /* * Helpers for first chunk memory allocation */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { - return pcpu_alloc_bootmem(cpu, size, align); + return pcpu_alloc_bootmem(cpu, size, align, cpu_to_nd_fn); } static void __init pcpu_fc_free(void *ptr, size_t size) @@ -150,6 +151,11 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) #endif } +static int __init pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -205,6 +211,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) pr_warn("%s allocator failed (%d), falling back to page size\n", @@ -212,6 +219,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); if (rc < 0) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index bc1876915457..dae861838535 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -155,10 +155,10 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { - int nid = early_cpu_to_node(cpu); + int nid = cpu_to_nd_fn(cpu); return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); @@ -229,6 +229,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, + early_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) @@ -240,6 +241,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + early_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free, pcpu_populate_pte); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index ae4004e7957e..e4078bf45fd5 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -94,8 +94,9 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, - size_t align); +typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void 
(*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); @@ -111,12 +112,14 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a54..267a4d295fcf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3001,6 +3001,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional + * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * @alloc_fn: function to allocate percpu page * @free_fn: function to free percpu page * @@ -3030,6 +3031,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn) { @@ -3066,7 +3068,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ - ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; @@ -3143,6 +3145,7 @@ out_free: /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes + * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: function to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte @@ -3157,6 +3160,7 @@ out_free: * 0 on success, -errno on failure. */ int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) @@ -3201,7 +3205,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); @@ -3278,8 +3282,8 @@ out_free_ar: unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); } @@ -3300,7 +3304,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. 
*/ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL, pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialize percpu areas."); -- cgit v1.2.3 From 23f917169ef157aa7a6bf80d8c4aad6f1282852c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:49 -0800 Subject: mm: percpu: add generic pcpu_fc_alloc/free funciton With the previous patch, we could add a generic pcpu first chunk allocate and free function to cleanup the duplicated definations on each architecture. Link: https://lkml.kernel.org/r/20211216112359.103822-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: "Rafael J. Wysocki" Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 16 +-------- arch/powerpc/kernel/setup_64.c | 51 ++------------------------- arch/sparc/kernel/smp_64.c | 50 +-------------------------- arch/x86/kernel/setup_percpu.c | 59 +------------------------------- drivers/base/arch_numa.c | 19 +--------- include/linux/percpu.h | 9 +---- mm/percpu.c | 78 +++++++++++++++++++++++++----------------- 7 files changed, 54 insertions(+), 228 deletions(-) (limited to 'mm') diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 1d8f2844704c..5a8002839550 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -524,19 +524,6 @@ static int __init pcpu_cpu_to_node(int cpu) return cpu_to_node(cpu); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_ACCESSIBLE, - cpu_to_nd_fn(cpu)); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -550,8 +537,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) panic("Failed to initialize percpu areas."); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index b79b10ae466f..a0c55c6e3023 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -771,50 +771,6 @@ void __init emergency_stack_init(void) } #ifdef CONFIG_SMP -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. 
- */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = cpu_to_nd_fn(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -static void __init pcpu_free_bootmem(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int pcpu_cpu_distance(unsigned int from, unsigned int to) { if (early_cpu_to_node(from) == early_cpu_to_node(to)) @@ -896,8 +852,7 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_PAGE) { rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_alloc_bootmem, pcpu_free_bootmem); + pcpu_cpu_to_node); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " "falling back to page size\n", @@ -905,9 +860,7 @@ } if (rc < 0) - rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, - pcpu_alloc_bootmem, pcpu_free_bootmem, - pcpu_populate_pte); + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, pcpu_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 14d719aa318d..ef815b3f0592 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1526,50 +1526,6 @@ void smp_send_stop(void) smp_call_function(stop_this_cpu, NULL, 0); } -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure.
- */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = cpu_to_nd_fn(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -static void __init pcpu_free_bootmem(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { if (cpu_to_node(from) == cpu_to_node(to)) @@ -1646,9 +1602,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, 4 << 20, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_alloc_bootmem, - pcpu_free_bootmem); + pcpu_cpu_to_node); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " "falling back to page size\n", @@ -1657,8 +1611,6 @@ void __init setup_per_cpu_areas(void) if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_cpu_to_node, - pcpu_alloc_bootmem, - pcpu_free_bootmem, pcpu_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1d41f4844149..15c5bf3cbe5f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -84,61 +84,6 @@ static bool __init pcpu_need_numa(void) } #endif -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. 
- */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = cpu_to_nd_fn(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, - node); - - pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", - cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -/* - * Helpers for first chunk memory allocation - */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - return pcpu_alloc_bootmem(cpu, size, align, cpu_to_nd_fn); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { #ifdef CONFIG_NUMA @@ -211,8 +156,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) pr_warn("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); @@ -220,7 +164,6 @@ void __init setup_per_cpu_areas(void) if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index dae861838535..23a10cc36165 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -155,20 +155,6 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - int nid = cpu_to_nd_fn(cpu); - - return memblock_alloc_try_nid(size, align, - __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK static void __init pcpu_populate_pte(unsigned long addr) { @@ -229,8 +215,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, - early_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free); + early_cpu_to_node); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n", @@ -242,8 +227,6 @@ void __init setup_per_cpu_areas(void) if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node, - pcpu_fc_alloc, - pcpu_fc_free, pcpu_populate_pte); #endif if (rc < 0) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e4078bf45fd5..d73c97ef4ff4 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -95,9 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); -typedef void * 
(*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); -typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); @@ -112,16 +109,12 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif diff --git a/mm/percpu.c b/mm/percpu.c index 267a4d295fcf..0f79b6d9a6d6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2992,6 +2992,42 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( return ai; } + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NUMA + int node = NUMA_NO_NODE; + void *ptr; + + if (cpu_to_nd_fn) + node = cpu_to_nd_fn(cpu); + + if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) { + ptr = memblock_alloc_from(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n", + cpu, size, (u64)__pa(ptr)); + } else { + ptr = memblock_alloc_try_nid(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, + node); + + pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n", + cpu, size, node, (u64)__pa(ptr)); + } + return ptr; +#else + return memblock_alloc_from(size, align, goal); +#endif +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + memblock_free(ptr, size); +} #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ #if defined(BUILD_EMBED_FIRST_CHUNK) @@ -3002,14 +3038,12 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @cpu_to_nd_fn: callback to convert cpu to it's node, optional - * @alloc_fn: function to allocate percpu page - * @free_fn: function to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * by calling @alloc_fn and used as-is without being mapped into + * by calling pcpu_fc_alloc and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. * @@ -3023,7 +3057,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned using @free_fn. + * size, the leftover is returned using pcpu_fc_free. * * RETURNS: * 0 on success, -errno on failure. 
@@ -3031,9 +3065,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn) + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; @@ -3068,7 +3100,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ - ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); + ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; @@ -3107,12 +3139,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ - free_fn(ptr, ai->unit_size); + pcpu_fc_free(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); - free_fn(ptr + size_sum, ai->unit_size - size_sum); + pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3131,7 +3163,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) - free_fn(areas[group], + pcpu_fc_free(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); @@ -3146,8 +3178,6 @@ out_free: * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional - * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE - * @free_fn: function to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu @@ -3161,8 +3191,6 @@ out_free: */ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; @@ -3205,7 +3233,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); + ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); @@ -3257,7 +3285,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, enomem: while (--j >= 0) - free_fn(page_address(pages[j]), PAGE_SIZE); + pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: memblock_free(pages, pages_size); @@ -3282,17 +3310,6 @@ out_free_ar: unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); -} - -static void __init pcpu_dfl_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -3303,9 +3320,8 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. 
That's * what the legacy allocator did. */ - rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL, - pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, + PAGE_SIZE, NULL, NULL); if (rc < 0) panic("Failed to initialize percpu areas."); -- cgit v1.2.3 From 20c035764626c56c4f6514936b9ee4be0f4cd962 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:53 -0800 Subject: mm: percpu: add generic pcpu_populate_pte() function With NEED_PER_CPU_PAGE_FIRST_CHUNK enabled, we need a function to populate pte, this patch adds a generic pcpu populate pte function, pcpu_populate_pte(), which is marked __weak and used on most architectures, but it is overridden on x86, which has its own implementation. Link: https://lkml.kernel.org/r/20211216112359.103822-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Thomas Bogendoerfer Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/setup_64.c | 47 +------------------------- arch/sparc/kernel/smp_64.c | 56 +------------------------------ arch/x86/kernel/setup_percpu.c | 5 ++- drivers/base/arch_numa.c | 51 +--------------------------- include/linux/percpu.h | 5 ++- mm/percpu.c | 76 +++++++++++++++++++++++++++++++++++++++--- 6 files changed, 78 insertions(+), 162 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a0c55c6e3023..f7cf408217c5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -787,51 +787,6 @@ static __init int pcpu_cpu_to_node(int cpu) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static void __init pcpu_populate_pte(unsigned long addr) -{ - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); -} - - void __init setup_per_cpu_areas(void) { const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -860,7 +815,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) - rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, pcpu_populate_pte); + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index ef815b3f0592..a1f78e9ddaf3 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ 
-1539,59 +1539,6 @@ static int __init pcpu_cpu_to_node(int cpu) return cpu_to_node(cpu); } -static void __init pcpu_populate_pte(unsigned long addr) -{ - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - if (pgd_none(*pgd)) { - pud_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pgd_populate(&init_mm, pgd, new); - } - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); -} - void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -1610,8 +1557,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, - pcpu_cpu_to_node, - pcpu_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 15c5bf3cbe5f..49325caa7307 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -101,7 +101,7 @@ static int __init pcpu_cpu_to_node(int cpu) return early_cpu_to_node(cpu); } -static void __init pcpup_populate_pte(unsigned long addr) +void __init pcpu_populate_pte(unsigned long addr) { populate_extra_pte(addr); } @@ -163,8 +163,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_cpu_to_node, - pcpup_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 23a10cc36165..eaa31e567d1e 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -14,7 +14,6 @@ #include #include -#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -155,52 +154,6 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK -static void __init pcpu_populate_pte(unsigned long addr) -{ - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); -} -#endif - void __init 
setup_per_cpu_areas(void) { unsigned long delta; @@ -225,9 +178,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) - rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, - early_cpu_to_node, - pcpu_populate_pte); + rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node); #endif if (rc < 0) panic("Failed to initialize percpu areas (err=%d).", rc); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d73c97ef4ff4..f1ec5ad1351c 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -95,7 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); -typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, @@ -113,9 +112,9 @@ extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +void __init pcpu_populate_pte(unsigned long addr); extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); #endif extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); diff --git a/mm/percpu.c b/mm/percpu.c index 0f79b6d9a6d6..fc6f591cb54f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3174,11 +3174,79 @@ out_free: #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK +#include + +#ifndef P4D_TABLE_SIZE +#define P4D_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PUD_TABLE_SIZE +#define PUD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PMD_TABLE_SIZE +#define PMD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PTE_TABLE_SIZE +#define PTE_TABLE_SIZE PAGE_SIZE +#endif +void __init __weak pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + if (pgd_none(*pgd)) { + p4d_t *new; + + new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); + if (!new) + goto err_alloc; + pgd_populate(&init_mm, pgd, new); + } + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + if (!new) + goto err_alloc; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + if (!new) + goto err_alloc; + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + if (!new) + goto err_alloc; + pmd_populate_kernel(&init_mm, pmd, new); + } + + return; + +err_alloc: + panic("%s: Failed to allocate memory\n", __func__); +} + /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional - * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. @@ -3189,9 +3257,7 @@ out_free: * RETURNS: * 0 on success, -errno on failure. 
*/ -int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -3255,7 +3321,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) - populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], -- cgit v1.2.3 From a3d5dc908a5f572ce3e31fe83fd2459a1c3c5422 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:02 -0800 Subject: delayacct: support swapin delay accounting for swapping without blkio Currently delayacct accounts swapin delay only for swapping that cause blkio. If we use zram for swapping, tools/accounting/getdelays can't get any SWAP delay. It's useful to get zram swapin delay information, for example to adjust compress algorithm or /proc/sys/vm/swappiness. Reference to PSI, it accounts any kind of swapping by doing its work in swap_readpage(), no matter whether swapping causes blkio. Let delayacct do the similar work. Link: https://lkml.kernel.org/r/20211112083813.8559-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Balbir Singh Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 44 ++++++++++++++++++++++---------------------- kernel/delayacct.c | 33 ++++++++++++++++++--------------- mm/memory.c | 4 ---- mm/page_io.c | 3 +++ 4 files changed, 43 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index af7e6eb50283..b96d68f310a2 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -9,14 +9,6 @@ #include -/* - * Per-task flags relevant to delay accounting - * maintained privately to avoid exhausting similar flags in sched.h:PF_* - * Used to set current->delays->flags - */ -#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ -#define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ - #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; @@ -37,13 +29,13 @@ struct task_delay_info { * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds. 
*/ - u64 blkio_start; /* Shared by blkio, swapin */ + u64 blkio_start; u64 blkio_delay; /* wait for sync block io completion */ - u64 swapin_delay; /* wait for swapin block io completion */ + u64 swapin_start; + u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ - u32 swapin_count; /* total count of the number of swapin block */ - /* io operations performed */ + u32 swapin_count; /* total count of swapin */ u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ @@ -79,14 +71,8 @@ extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); - -static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) -{ - if (p->delays) - return (p->delays->flags & DELAYACCT_PF_BLKIO); - else - return 0; -} +extern void __delayacct_swapin_start(void); +extern void __delayacct_swapin_end(void); static inline void delayacct_set_flag(struct task_struct *p, int flag) { @@ -123,7 +109,6 @@ static inline void delayacct_blkio_start(void) if (!static_branch_unlikely(&delayacct_key)) return; - delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -135,7 +120,6 @@ static inline void delayacct_blkio_end(struct task_struct *p) if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) @@ -169,6 +153,18 @@ static inline void delayacct_thrashing_end(void) __delayacct_thrashing_end(); } +static inline void delayacct_swapin_start(void) +{ + if (current->delays) + __delayacct_swapin_start(); +} + +static inline void delayacct_swapin_end(void) +{ + if (current->delays) + __delayacct_swapin_end(); +} + #else static inline void delayacct_set_flag(struct task_struct *p, int flag) {} @@ -199,6 +195,10 @@ static inline void delayacct_thrashing_start(void) {} static inline void delayacct_thrashing_end(void) {} +static inline void delayacct_swapin_start(void) +{} +static inline void delayacct_swapin_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 51530d5b15a8..97699848c1f0 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -100,19 +100,10 @@ void __delayacct_blkio_start(void) */ void __delayacct_blkio_end(struct task_struct *p) { - struct task_delay_info *delays = p->delays; - u64 *total; - u32 *count; - - if (p->delays->flags & DELAYACCT_PF_SWAPIN) { - total = &delays->swapin_delay; - count = &delays->swapin_count; - } else { - total = &delays->blkio_delay; - count = &delays->blkio_count; - } - - delayacct_end(&delays->lock, &delays->blkio_start, total, count); + delayacct_end(&p->delays->lock, + &p->delays->blkio_start, + &p->delays->blkio_delay, + &p->delays->blkio_count); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -179,8 +170,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) unsigned long flags; raw_spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); + ret = nsec_to_clock_t(tsk->delays->blkio_delay); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } @@ -210,3 +200,16 @@ void __delayacct_thrashing_end(void) ¤t->delays->thrashing_delay, ¤t->delays->thrashing_count); } + +void __delayacct_swapin_start(void) +{ + current->delays->swapin_start = 
local_clock(); +} + +void __delayacct_swapin_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->swapin_start, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); +} diff --git a/mm/memory.c b/mm/memory.c index 8f1de811a1dc..ced3274c3deb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3507,7 +3507,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - delayacct_set_flag(current, DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3555,7 +3554,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3569,13 +3567,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; diff --git a/mm/page_io.c b/mm/page_io.c index 9725c7e1eeea..0bf8e40f4e57 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -25,6 +25,7 @@ #include #include #include +#include void end_swap_bio_write(struct bio *bio) { @@ -370,6 +371,7 @@ int swap_readpage(struct page *page, bool synchronous) * significant part of overall IO time. */ psi_memstall_enter(&pflags); + delayacct_swapin_start(); if (frontswap_load(page) == 0) { SetPageUptodate(page); @@ -432,6 +434,7 @@ int swap_readpage(struct page *page, bool synchronous) out: psi_memstall_leave(&pflags); + delayacct_swapin_end(); return ret; } -- cgit v1.2.3 From 5bf18281534451bf1ad56a45a3085cd7ad46860d Mon Sep 17 00:00:00 2001 From: wangyong Date: Wed, 19 Jan 2022 18:10:15 -0800 Subject: delayacct: track delays from memory compact Delay accounting does not track the delay of memory compact. When there is not enough free memory, tasks can spend a amount of their time waiting for compact. To get the impact of tasks in direct memory compact, measure the delay when allocating memory through memory compact. 
Also update tools/accounting/getdelays.c: / # ./getdelays_next -di -p 304 print delayacct stats ON printing IO accounting PID 304 CPU count real total virtual total delay total delay average 277 780000000 849039485 18877296 0.068ms IO count delay total delay average 0 0 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 5 11088812685 2217ms THRASHING count delay total delay average 0 0 0ms COMPACT count delay total delay average 3 72758 0ms watch: read=0, write=0, cancelled_write=0 Link: https://lkml.kernel.org/r/1638619795-71451-1-git-send-email-wang.yong12@zte.com.cn Signed-off-by: wangyong Reviewed-by: Jiang Xuexin Reviewed-by: Zhang Wenya Reviewed-by: Yang Yang Reviewed-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 28 ++++++++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 16 ++++++++++++++++ mm/page_alloc.c | 3 +++ tools/accounting/getdelays.c | 8 +++++++- 5 files changed, 59 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 435c3654a0ff..3e03d010bd2e 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -42,8 +42,12 @@ struct task_delay_info { u64 thrashing_start; u64 thrashing_delay; /* wait for thrashing page */ + u64 compact_start; + u64 compact_delay; /* wait for memory compact */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ + u32 compact_count; /* total count of memory compact */ }; #endif @@ -72,6 +76,8 @@ extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); +extern void __delayacct_compact_start(void); +extern void __delayacct_compact_end(void); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -170,6 +176,24 @@ static inline void delayacct_swapin_end(void) __delayacct_swapin_end(); } +static inline void delayacct_compact_start(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_start(); +} + +static inline void delayacct_compact_end(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_end(); +} + #else static inline void delayacct_init(void) {} @@ -200,6 +224,10 @@ static inline void delayacct_swapin_start(void) {} static inline void delayacct_swapin_end(void) {} +static inline void delayacct_compact_start(void) +{} +static inline void delayacct_compact_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index ccbd08709321..12327d32378f 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 10 +#define TASKSTATS_VERSION 11 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -172,6 +172,10 @@ struct taskstats { /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ + + /* Delay waiting for memory compact */ + __u64 compact_count; + __u64 compact_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 97699848c1f0..c5e8cea9e05f 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -155,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_delay_total = (tmp < 
d->freepages_delay_total) ? 0 : tmp; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + tmp = d->compact_delay_total + tsk->delays->compact_delay; + d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; + d->compact_count += tsk->delays->compact_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -213,3 +216,16 @@ void __delayacct_swapin_end(void) ¤t->delays->swapin_delay, ¤t->delays->swapin_count); } + +void __delayacct_compact_start(void) +{ + current->delays->compact_start = local_clock(); +} + +void __delayacct_compact_end(void) +{ + delayacct_end(¤t->delays->lock, + ¤t->delays->compact_start, + ¤t->delays->compact_delay, + ¤t->delays->compact_count); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5952749ad40..635063f49671 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -4348,6 +4349,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; psi_memstall_enter(&pflags); + delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, @@ -4355,6 +4357,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + delayacct_compact_end(); if (*compact_result == COMPACT_SKIPPED) return NULL; diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 5ef1c15e88ad..11e86739456d 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -205,6 +205,8 @@ static void print_delayacct(struct taskstats *t) "RECLAIM %12s%15s%15s\n" " %15llu%15llu%15llums\n" "THRASHING%12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "COMPACT %12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -228,7 +230,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay total", "delay average", (unsigned long long)t->thrashing_count, (unsigned long long)t->thrashing_delay_total, - average_ms(t->thrashing_delay_total, t->thrashing_count)); + average_ms(t->thrashing_delay_total, t->thrashing_count), + "count", "delay total", "delay average", + (unsigned long long)t->compact_count, + (unsigned long long)t->compact_delay_total, + average_ms(t->compact_delay_total, t->compact_count)); } static void task_context_switch_counts(struct taskstats *t) -- cgit v1.2.3
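For orientation on the percpu half of this log: below is a condensed sketch of the arch-side setup_per_cpu_areas() that remains once the cleanup series above is applied. It is modeled on the mips, arch_numa (arm64) and sparc variants shown in the diffs rather than taken verbatim from any one of them; the reserve sizes and the early_cpu_to_node() lookup follow those files and stand in for whatever a given architecture actually uses. Allocation, freeing and (via the __weak helper) pte population now happen inside mm/percpu.c, so only the cpu-distance and cpu-to-node callbacks stay in arch code.

#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <asm/sections.h>

unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

/* cpu -> NUMA node hook handed to the generic first chunk code */
static int __init pcpu_cpu_to_node(int cpu)
{
	return early_cpu_to_node(cpu);
}

static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
{
	return node_distance(early_cpu_to_node(from), early_cpu_to_node(to));
}

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc = -EINVAL;

	/* try the embed allocator first; memblock alloc/free now happen in mm/percpu.c */
	if (pcpu_chosen_fc != PCPU_FC_PAGE)
		rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
					    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE,
					    pcpu_cpu_distance, pcpu_cpu_to_node);

#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
	/* page-mapped fallback; ptes come from the __weak generic pcpu_populate_pte() */
	if (rc < 0)
		rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_cpu_to_node);
#endif
	if (rc < 0)
		panic("Failed to initialize percpu areas (err=%d).", rc);

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}

Compared with the pre-series code above, the per-arch pcpu_alloc_bootmem()/pcpu_free_bootmem() wrappers and the open-coded pcpu_populate_pte() copies disappear; x86 keeps only its pcpu_populate_pte() override, which maps through populate_extra_pte().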