-rw-r--r--  include/linux/mm_types.h |  3
-rw-r--r--  include/linux/sched.h    |  5
-rw-r--r--  kernel/sched/core.c      |  1
-rw-r--r--  kernel/sched/fair.c      | 29
-rw-r--r--  kernel/sysctl.c          |  7
-rw-r--r--  mm/huge_memory.c         |  2
-rw-r--r--  mm/memory.c              | 12
7 files changed, 44 insertions(+), 15 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c5fffa239861..e850a23dd6ec 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -410,6 +410,9 @@ struct mm_struct {
*/
unsigned long numa_next_scan;
+ /* numa_next_reset is when the PTE scanner period will be reset */
+ unsigned long numa_next_reset;
+
/* Restart point for scanning and setting pte_numa */
unsigned long numa_scan_offset;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d95a232b5b9..0f4ff2bd03f6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1562,9 +1562,9 @@ struct task_struct {
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
#ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int node, int pages);
+extern void task_numa_fault(int node, int pages, bool migrated);
#else
-static inline void task_numa_fault(int node, int pages)
+static inline void task_numa_fault(int node, int pages, bool migrated)
{
}
#endif
@@ -2009,6 +2009,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_period_reset;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_settle_count;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbfc4843063f..9d255bc0e278 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1537,6 +1537,7 @@ static void __sched_fork(struct task_struct *p)
#ifdef CONFIG_NUMA_BALANCING
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
p->mm->numa_next_scan = jiffies;
+ p->mm->numa_next_reset = jiffies;
p->mm->numa_scan_seq = 0;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd18087fd369..4b577863933f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,8 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* numa task sample period in ms
*/
unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*16;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -806,20 +807,19 @@ static void task_numa_placement(struct task_struct *p)
/*
* Got a PROT_NONE fault for a page on @node.
*/
-void task_numa_fault(int node, int pages)
+void task_numa_fault(int node, int pages, bool migrated)
{
struct task_struct *p = current;
/* FIXME: Allocate task-specific structure for placement policy here */
/*
- * Assume that as faults occur that pages are getting properly placed
- * and fewer NUMA hints are required. Note that this is a big
- * assumption, it assumes processes reach a steady steady with no
- * further phase changes.
+ * If pages are properly placed (did not migrate) then scan slower.
+ * This is reset periodically in case of phase changes
*/
- p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
- p->numa_scan_period + jiffies_to_msecs(2));
+ if (!migrated)
+ p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+ p->numa_scan_period + jiffies_to_msecs(10));
task_numa_placement(p);
}
@@ -858,6 +858,19 @@ void task_numa_work(struct callback_head *work)
return;
/*
+ * Reset the scan period if enough time has gone by. Objective is that
+ * scanning will be reduced if pages are properly placed. As tasks
+ * can enter different phases this needs to be re-examined. Lacking
+ * proper tracking of reference behaviour, this blunt hammer is used.
+ */
+ migrate = mm->numa_next_reset;
+ if (time_after(now, migrate)) {
+ p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+ next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+ xchg(&mm->numa_next_reset, next_scan);
+ }
+
+ /*
* Enforce maximal scan/migration frequency..
*/
migrate = mm->numa_next_scan;
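
Taken together, the fair.c hunks make the scan period adaptive: the period backs off only when a hinting fault finds the page already on the right node (no migration), and it is periodically snapped back to the minimum in case the workload changes phase. The following standalone C sketch mirrors that policy; all names in it (scan_period_ms, handle_hint_fault, maybe_reset_period, the *_ms constants) are illustrative stand-ins rather than kernel symbols, with defaults copied from the new sysctl values.

/*
 * Sketch of the adaptive scan-period policy in the fair.c hunks above.
 * Names are illustrative, not kernel symbols.
 */
#include <stdbool.h>
#include <stdio.h>

static const unsigned int period_min_ms   = 100;        /* scan_period_min   */
static const unsigned int period_max_ms   = 100 * 50;   /* scan_period_max   */
static const unsigned int period_reset_ms = 100 * 600;  /* scan_period_reset */

static unsigned int  scan_period_ms;   /* per-task, like p->numa_scan_period */
static unsigned long next_reset_ms;    /* per-mm, like mm->numa_next_reset   */

/* Per hinting fault, as in task_numa_fault(): back off only if well placed. */
static void handle_hint_fault(bool migrated)
{
	if (!migrated) {
		scan_period_ms += 10;   /* stands in for jiffies_to_msecs(10) */
		if (scan_period_ms > period_max_ms)
			scan_period_ms = period_max_ms;
	}
	/* A migrated page keeps the current, faster scan rate. */
}

/* From the scanner, as in task_numa_work(): periodic blunt-hammer reset. */
static void maybe_reset_period(unsigned long now_ms)
{
	if (now_ms >= next_reset_ms) {
		scan_period_ms = period_min_ms;
		next_reset_ms  = now_ms + period_reset_ms;
	}
}

int main(void)
{
	scan_period_ms = period_min_ms;
	next_reset_ms  = period_reset_ms;

	for (int i = 0; i < 20; i++)    /* 20 faults on well-placed pages */
		handle_hint_fault(false);
	printf("period after back-off: %u ms\n", scan_period_ms);

	maybe_reset_period(60UL * 1000);    /* pretend a minute has passed */
	printf("period after reset:    %u ms\n", scan_period_ms);
	return 0;
}

Note that faults on pages that did migrate leave the period untouched, so a task that keeps touching misplaced memory continues to be scanned at the faster rate.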
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 48a68cc258c1..8906f90d6fa2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,6 +367,13 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "numa_balancing_scan_period_reset",
+ .data = &sysctl_numa_balancing_scan_period_reset,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
.procname = "numa_balancing_scan_period_max_ms",
.data = &sysctl_numa_balancing_scan_period_max,
.maxlen = sizeof(unsigned int),
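
Because the entry is added to kern_table next to the existing scan-period knobs, the tunable should surface as /proc/sys/kernel/numa_balancing_scan_period_reset and take a value in milliseconds; that path is inferred from the table, not shown in the patch. A minimal userspace sketch for reading it:

/* Read the (assumed) /proc/sys path created by the kern_table entry above. */
#include <stdio.h>

int main(void)
{
	unsigned int reset_ms;
	FILE *f = fopen("/proc/sys/kernel/numa_balancing_scan_period_reset", "r");

	if (!f) {
		perror("open sysctl");
		return 1;
	}
	if (fscanf(f, "%u", &reset_ms) != 1) {
		fclose(f);
		return 1;
	}
	fclose(f);

	printf("scan period reset interval: %u ms\n", reset_ms);
	return 0;
}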
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 79b96064f8fc..199b261a257e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1068,7 +1068,7 @@ out_unlock:
spin_unlock(&mm->page_table_lock);
if (page) {
put_page(page);
- task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+ task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false);
}
return 0;
}
diff --git a/mm/memory.c b/mm/memory.c
index 84c6d9eab182..39edb11b63dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3468,6 +3468,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
int current_nid = -1;
int target_nid;
+ bool migrated = false;
/*
* The "pte" at this point cannot be used safely without
@@ -3509,12 +3510,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Migrate to the requested node */
- if (migrate_misplaced_page(page, target_nid))
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
current_nid = target_nid;
out:
if (current_nid != -1)
- task_numa_fault(current_nid, 1);
+ task_numa_fault(current_nid, 1, migrated);
return 0;
}
@@ -3554,6 +3556,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
int curr_nid = local_nid;
int target_nid;
+ bool migrated;
if (!pte_present(pteval))
continue;
if (!pte_numa(pteval))
@@ -3590,9 +3593,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Migrate to the requested node */
pte_unmap_unlock(pte, ptl);
- if (migrate_misplaced_page(page, target_nid))
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
curr_nid = target_nid;
- task_numa_fault(curr_nid, 1);
+ task_numa_fault(curr_nid, 1, migrated);
pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
}
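
The memory.c side of the change is mechanical: each caller records whether migrate_misplaced_page() actually moved the page and forwards that result to task_numa_fault(), so the scheduler back-off above fires only for faults that needed no migration. The sketch below shows that caller pattern in isolation; try_migrate_page() and account_numa_fault() are illustrative stubs, not kernel functions.

/*
 * Sketch of the caller pattern introduced in do_numa_page() and
 * do_pmd_numa_page(): capture the migration result and pass it along.
 */
#include <stdbool.h>
#include <stdio.h>

static bool try_migrate_page(void *page, int target_nid)
{
	(void)page;
	return target_nid == 1;   /* stub: "succeed" only for node 1 */
}

static void account_numa_fault(int nid, int nr_pages, bool migrated)
{
	printf("fault on node %d, %d page(s), migrated=%d\n",
	       nid, nr_pages, migrated);
}

/* Mirrors the tail of do_numa_page() after this patch. */
static void numa_hint_fault(void *page, int current_nid, int target_nid)
{
	bool migrated = false;

	if (target_nid != -1) {
		migrated = try_migrate_page(page, target_nid);
		if (migrated)
			current_nid = target_nid;   /* only on success */
	}

	if (current_nid != -1)
		account_numa_fault(current_nid, 1, migrated);
}

int main(void)
{
	int page;

	numa_hint_fault(&page, 0, 1);    /* misplaced, migration succeeds */
	numa_hint_fault(&page, 0, 2);    /* misplaced, migration fails    */
	numa_hint_fault(&page, 0, -1);   /* already on the right node     */
	return 0;
}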