summaryrefslogtreecommitdiff
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorMel Gorman <mgorman@techsingularity.net>2021-06-29 05:42:09 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-29 20:53:54 +0300
commitbbbecb35a41cb5c63ef78e14cc8b95fa9130bc1a (patch)
tree43ff967d35abfa1583e44c7ba1e594dfce09b7a6 /mm/page_alloc.c
parent151e084af4946344fe0d021f4110b69edaac1e8d (diff)
downloadlinux-bbbecb35a41cb5c63ef78e14cc8b95fa9130bc1a.tar.xz
mm/page_alloc: delete vm.percpu_pagelist_fraction
Patch series "Calculate pcp->high based on zone sizes and active CPUs", v2. The per-cpu page allocator (PCP) is meant to reduce contention on the zone lock but the sizing of batch and high is archaic and neither takes the zone size into account or the number of CPUs local to a zone. With larger zones and more CPUs per node, the contention is getting worse. Furthermore, the fact that vm.percpu_pagelist_fraction adjusts both batch and high values means that the sysctl can reduce zone lock contention but also increase allocation latencies. This series disassociates pcp->high from pcp->batch and then scales pcp->high based on the size of the local zone with limited impact to reclaim and accounting for active CPUs but leaves pcp->batch static. It also adapts the number of pages that can be on the pcp list based on recent freeing patterns. The motivation is partially to adjust to larger memory sizes but is also driven by the fact that large batches of page freeing via release_pages() often shows zone contention as a major part of the problem. Another is a bug report based on an older kernel where a multi-terabyte process can takes several minutes to exit. A workaround was to use vm.percpu_pagelist_fraction to increase the pcp->high value but testing indicated that a production workload could not use the same values because of an increase in allocation latencies. Unfortunately, I cannot reproduce this test case myself as the multi-terabyte machines are in active use but it should alleviate the problem. The series aims to address both and partially acts as a pre-requisite. pcp only works with order-0 which is useless for SLUB (when using high orders) and THP (unconditionally). To store high-order pages on PCP, the pcp->high values need to be increased first. This patch (of 6): The vm.percpu_pagelist_fraction is used to increase the batch and high limits for the per-cpu page allocator (PCP). The intent behind the sysctl is to reduce zone lock acquisition when allocating/freeing pages but it has a problem. While it can decrease contention, it can also increase latency on the allocation side due to unreasonably large batch sizes. This leads to games where an administrator adjusts percpu_pagelist_fraction on the fly to work around contention and allocation latency problems. This series aims to alleviate the problems with zone lock contention while avoiding the allocation-side latency problems. For the purposes of review, it's easier to remove this sysctl now and reintroduce a similar sysctl later in the series that deals only with pcp->high. Link: https://lkml.kernel.org/r/20210525080119.5455-1-mgorman@techsingularity.net Link: https://lkml.kernel.org/r/20210525080119.5455-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman <mgorman@techsingularity.net> Acked-by: Dave Hansen <dave.hansen@linux.intel.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Hillf Danton <hdanton@sina.com> Cc: Michal Hocko <mhocko@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c55
1 files changed, 4 insertions, 51 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 941a75b9fb5a..5abf2c1d4c58 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -120,7 +120,6 @@ typedef int __bitwise fpi_t;
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
-#define MIN_PERCPU_PAGELIST_FRACTION (8)
struct pagesets {
local_lock_t lock;
@@ -193,7 +192,6 @@ EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
-int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);
@@ -6735,22 +6733,15 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
/*
* Calculate and set new high and batch values for all per-cpu pagesets of a
- * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
+ * zone based on the zone's size.
*/
static void zone_set_pageset_high_and_batch(struct zone *zone)
{
unsigned long new_high, new_batch;
- if (percpu_pagelist_fraction) {
- new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
- new_batch = max(1UL, new_high / 4);
- if ((new_high / 4) > (PAGE_SHIFT * 8))
- new_batch = PAGE_SHIFT * 8;
- } else {
- new_batch = zone_batchsize(zone);
- new_high = 6 * new_batch;
- new_batch = max(1UL, 1 * new_batch);
- }
+ new_batch = zone_batchsize(zone);
+ new_high = 6 * new_batch;
+ new_batch = max(1UL, 1 * new_batch);
if (zone->pageset_high == new_high &&
zone->pageset_batch == new_batch)
@@ -8413,44 +8404,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-/*
- * percpu_pagelist_fraction - changes the pcp->high for each zone on each
- * cpu. It is the fraction of total pages in each zone that a hot per cpu
- * pagelist can have before it gets flushed back to buddy allocator.
- */
-int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
- void *buffer, size_t *length, loff_t *ppos)
-{
- struct zone *zone;
- int old_percpu_pagelist_fraction;
- int ret;
-
- mutex_lock(&pcp_batch_high_lock);
- old_percpu_pagelist_fraction = percpu_pagelist_fraction;
-
- ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (!write || ret < 0)
- goto out;
-
- /* Sanity checking to avoid pcp imbalance */
- if (percpu_pagelist_fraction &&
- percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
- percpu_pagelist_fraction = old_percpu_pagelist_fraction;
- ret = -EINVAL;
- goto out;
- }
-
- /* No change? */
- if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
- goto out;
-
- for_each_populated_zone(zone)
- zone_set_pageset_high_and_batch(zone);
-out:
- mutex_unlock(&pcp_batch_high_lock);
- return ret;
-}
-
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
* Returns the number of pages that arch has reserved but