summaryrefslogtreecommitdiff
path: root/mm/mempolicy.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--mm/mempolicy.c685
1 files changed, 502 insertions, 183 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index bb37cd1a51d8..3b1dfd08338b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -109,10 +109,12 @@
#include <linux/mmu_notifier.h>
#include <linux/printk.h>
#include <linux/swapops.h>
+#include <linux/gcd.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <linux/uaccess.h>
+#include <linux/memory.h>
#include "internal.h"
@@ -139,31 +141,138 @@ static struct mempolicy default_policy = {
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/*
- * iw_table is the sysfs-set interleave weight table, a value of 0 denotes
- * system-default value should be used. A NULL iw_table also denotes that
- * system-default values should be used. Until the system-default table
- * is implemented, the system-default is always 1.
- *
- * iw_table is RCU protected
+ * weightiness balances the tradeoff between small weights (cycles through nodes
+ * faster, more fair/even distribution) and large weights (smaller errors
+ * between actual bandwidth ratios and weight ratios). 32 is a number that has
+ * been found to perform at a reasonable compromise between the two goals.
+ */
+static const int weightiness = 32;
+
+/*
+ * A null weighted_interleave_state is interpreted as having .mode_auto = true,
+ * and .iw_table is interpreted as an array of 1s with length nr_node_ids.
+ */
+struct weighted_interleave_state {
+ /* true: weights derived from node_bw_table; false: manual (sysfs) weights */
+ bool mode_auto;
+ /* per-node interleave weight, indexed by node id (nr_node_ids entries) */
+ u8 iw_table[];
+};
+static struct weighted_interleave_state __rcu *wi_state;
+/* per-node bandwidth recorded by mempolicy_set_node_perf(), indexed by node id */
+static unsigned int *node_bw_table;
+
+/*
+ * wi_state_lock protects both wi_state and node_bw_table.
+ * node_bw_table is only used by writers to update wi_state.
*/
-static u8 __rcu *iw_table;
-static DEFINE_MUTEX(iw_table_lock);
+static DEFINE_MUTEX(wi_state_lock);
+/*
+ * Return the interleave weight of @node: the matching iw_table entry of the
+ * currently published wi_state, or 1 when no wi_state has been published.
+ * The table entry is returned as stored; there is no zero-to-default fallback.
+ */
static u8 get_il_weight(int node)
{
- u8 *table;
- u8 weight;
+ struct weighted_interleave_state *state;
+ u8 weight = 1;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- /* if no iw_table, use system default */
- weight = table ? table[node] : 1;
- /* if value in iw_table is 0, use system default */
- weight = weight ? weight : 1;
+ state = rcu_dereference(wi_state);
+ if (state)
+ weight = state->iw_table[node];
rcu_read_unlock();
return weight;
}
+/*
+ * Convert bandwidth values into weighted interleave weights.
+ * Call with wi_state_lock.
+ *
+ * @bw: per-node bandwidth values, indexed by node id.
+ * @new_iw: output weight table, indexed by node id. Only the entries of
+ * N_MEMORY nodes are written; all other entries are left untouched.
+ *
+ * Each N_MEMORY node gets weightiness * bw[nid] / sum(bw) (integer division),
+ * with 1 as the floor; the weights are then divided by their common GCD.
+ */
+static void reduce_interleave_weights(unsigned int *bw, u8 *new_iw)
+{
+ u64 sum_bw = 0;
+ unsigned int cast_sum_bw, scaling_factor = 1, iw_gcd = 0;
+ int nid;
+
+ for_each_node_state(nid, N_MEMORY)
+ sum_bw += bw[nid];
+
+ /* Scale bandwidths to whole numbers in the range [1, weightiness] */
+ for_each_node_state(nid, N_MEMORY) {
+ /*
+ * Try not to perform 64-bit division.
+ * If sum_bw < scaling_factor, then sum_bw < U32_MAX.
+ * If sum_bw > scaling_factor, then round the weight up to 1.
+ *
+ * NOTE(review): the product below is computed in 32-bit unsigned
+ * arithmetic, so this assumes bw[nid] <= U32_MAX / weightiness;
+ * confirm against the units callers report.
+ */
+ scaling_factor = weightiness * bw[nid];
+ if (bw[nid] && sum_bw < scaling_factor) {
+ cast_sum_bw = (unsigned int)sum_bw;
+ new_iw[nid] = scaling_factor / cast_sum_bw;
+ } else {
+ new_iw[nid] = 1;
+ }
+ /* iw_gcd accumulates the GCD of every weight assigned so far */
+ if (!iw_gcd)
+ iw_gcd = new_iw[nid];
+ iw_gcd = gcd(iw_gcd, new_iw[nid]);
+ }
+
+ /* 1:2 is strictly better than 16:32. Reduce by the weights' GCD. */
+ for_each_node_state(nid, N_MEMORY)
+ new_iw[nid] /= iw_gcd;
+}
+
+/**
+ * mempolicy_set_node_perf - record a node's bandwidth and refresh auto weights
+ * @node: node id whose bandwidth is being reported
+ * @coords: access coordinates; the smaller of read_bandwidth and
+ * write_bandwidth is stored for @node
+ *
+ * A new copy of node_bw_table with @node's entry updated is always installed.
+ * When the current wi_state is in auto mode (or no wi_state exists yet), a
+ * new wi_state with weights derived from the updated table is published too.
+ *
+ * Return: 0 on success, -ENOMEM if an allocation fails.
+ */
+int mempolicy_set_node_perf(unsigned int node, struct access_coordinate *coords)
+{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *old_bw, *new_bw;
+ unsigned int bw_val;
+ int i;
+
+ bw_val = min(coords->read_bandwidth, coords->write_bandwidth);
+ /* Both buffers are allocated up front, before wi_state_lock is taken. */
+ new_bw = kcalloc(nr_node_ids, sizeof(unsigned int), GFP_KERNEL);
+ if (!new_bw)
+ return -ENOMEM;
+
+ new_wi_state = kmalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state) {
+ kfree(new_bw);
+ return -ENOMEM;
+ }
+ new_wi_state->mode_auto = true;
+ /* Default every weight to 1; reduce_interleave_weights() only fills N_MEMORY nodes. */
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+
+ /*
+ * Update bandwidth info, even in manual mode. That way, when switching
+ * to auto mode in the future, iw_table can be overwritten using
+ * accurate bw data.
+ */
+ mutex_lock(&wi_state_lock);
+
+ old_bw = node_bw_table;
+ if (old_bw)
+ memcpy(new_bw, old_bw, nr_node_ids * sizeof(*old_bw));
+ new_bw[node] = bw_val;
+ node_bw_table = new_bw;
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state && !old_wi_state->mode_auto) {
+ /* Manual mode; skip reducing weights and updating wi_state */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ goto out;
+ }
+
+ /* NULL wi_state assumes auto=true; reduce weights and update wi_state */
+ reduce_interleave_weights(new_bw, new_wi_state->iw_table);
+ rcu_assign_pointer(wi_state, new_wi_state);
+
+ mutex_unlock(&wi_state_lock);
+ /* Free the replaced state only after an RCU grace period. */
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+out:
+ /* The previous bandwidth table was unlinked under the lock above. */
+ kfree(old_bw);
+ return 0;
+}
+
/**
* numa_nearest_node - Find nearest node by state
* @node: Node id to start the search
@@ -196,6 +305,37 @@ int numa_nearest_node(int node, unsigned int state)
}
EXPORT_SYMBOL_GPL(numa_nearest_node);
+/**
+ * nearest_node_nodemask - Find the node in @mask at the nearest distance
+ * from @node.
+ *
+ * @node: a valid node ID to start the search from.
+ * @mask: a pointer to a nodemask representing the allowed nodes.
+ *
+ * This function iterates over all nodes in @mask and calculates the
+ * distance from the starting @node, then it returns the node ID that is
+ * the closest to @node, or MAX_NUMNODES if no node is found.
+ *
+ * Note that @node must be a valid node ID usable with node_distance(),
+ * providing an invalid node ID (e.g., NUMA_NO_NODE) may result in crashes
+ * or unexpected behavior.
+ */
+int nearest_node_nodemask(int node, nodemask_t *mask)
+{
+ int dist, n, min_dist = INT_MAX, min_node = MAX_NUMNODES;
+
+ for_each_node_mask(n, *mask) {
+ dist = node_distance(node, n);
+ /* Strict '<': on equal distance the node visited first is kept. */
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(nearest_node_nodemask);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
@@ -535,6 +675,7 @@ static void queue_folios_pmd(pmd_t *pmd, struct mm_walk *walk)
static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
+ const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
struct vm_area_struct *vma = walk->vma;
struct folio *folio;
struct queue_pages *qp = walk->private;
@@ -542,6 +683,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
pte_t *pte, *mapped_pte;
pte_t ptent;
spinlock_t *ptl;
+ int max_nr, nr;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
@@ -555,7 +697,9 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
walk->action = ACTION_AGAIN;
return 0;
}
- for (; addr != end; pte++, addr += PAGE_SIZE) {
+ for (; addr != end; pte += nr, addr += nr * PAGE_SIZE) {
+ max_nr = (end - addr) >> PAGE_SHIFT;
+ nr = 1;
ptent = ptep_get(pte);
if (pte_none(ptent))
continue;
@@ -567,6 +711,10 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
folio = vm_normal_folio(vma, addr, ptent);
if (!folio || folio_is_zone_device(folio))
continue;
+ if (folio_test_large(folio) && max_nr != 1)
+ nr = folio_pte_batch(folio, addr, pte, ptent,
+ max_nr, fpb_flags,
+ NULL, NULL, NULL);
/*
* vm_normal_folio() filters out zero pages, but there might
* still be reserved folios to skip, perhaps in a VDSO.
@@ -599,7 +747,7 @@ static int queue_folios_pte_range(pmd_t *pmd, unsigned long addr,
if (!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ||
!vma_migratable(vma) ||
!migrate_folio_add(folio, qp->pagelist, flags)) {
- qp->nr_failed++;
+ qp->nr_failed += nr;
if (strictly_unmovable(flags))
break;
}
@@ -642,12 +790,12 @@ static int queue_folios_hugetlb(pte_t *pte, unsigned long hmask,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
if ((flags & MPOL_MF_MOVE_ALL) ||
- (!folio_likely_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
- if (!isolate_hugetlb(folio, qp->pagelist))
+ (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(pte)))
+ if (!folio_isolate_hugetlb(folio, qp->pagelist))
qp->nr_failed++;
unlock:
spin_unlock(ptl);
@@ -1033,10 +1181,10 @@ static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
* Unless MPOL_MF_MOVE_ALL, we try to avoid migrating a shared folio.
* Choosing not to migrate a shared folio is not counted as a failure.
*
- * See folio_likely_mapped_shared() on possible imprecision when we
+ * See folio_maybe_mapped_shared() on possible imprecision when we
* cannot easily detect if a folio is shared.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || !folio_likely_mapped_shared(folio)) {
+ if ((flags & MPOL_MF_MOVE_ALL) || !folio_maybe_mapped_shared(folio)) {
if (folio_isolate_lru(folio)) {
list_add_tail(&folio->lru, foliolist);
node_stat_mod_folio(folio,
@@ -1080,6 +1228,10 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
mmap_read_lock(mm);
vma = find_vma(mm, 0);
+ if (unlikely(!vma)) {
+ mmap_read_unlock(mm);
+ return 0;
+ }
/*
* This does not migrate the range, but isolates all pages that
@@ -1979,26 +2131,28 @@ static unsigned int read_once_policy_nodemask(struct mempolicy *pol,
static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
{
+ struct weighted_interleave_state *state;
nodemask_t nodemask;
unsigned int target, nr_nodes;
- u8 *table;
+ u8 *table = NULL;
unsigned int weight_total = 0;
u8 weight;
- int nid;
+ int nid = 0;
nr_nodes = read_once_policy_nodemask(pol, &nodemask);
if (!nr_nodes)
return numa_node_id();
rcu_read_lock();
- table = rcu_dereference(iw_table);
+
+ state = rcu_dereference(wi_state);
+ /* Uninitialized wi_state means we should assume all weights are 1 */
+ if (state)
+ table = state->iw_table;
+
/* calculate the total weight */
- for_each_node_mask(nid, nodemask) {
- /* detect system default usage */
- weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
- weight_total += weight;
- }
+ for_each_node_mask(nid, nodemask)
+ weight_total += table ? table[nid] : 1;
/* Calculate the node offset based on totals */
target = ilx % weight_total;
@@ -2006,7 +2160,6 @@ static unsigned int weighted_interleave_nid(struct mempolicy *pol, pgoff_t ilx)
while (target) {
/* detect system default usage */
weight = table ? table[nid] : 1;
- weight = weight ? weight : 1;
if (target < weight)
break;
target -= weight;
@@ -2201,9 +2354,9 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*/
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- page = __alloc_pages_noprof(preferred_gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(preferred_gfp, order, nid, nodemask);
if (!page)
- page = __alloc_pages_noprof(gfp, order, nid, NULL);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, NULL);
return page;
}
@@ -2218,7 +2371,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
*
* Return: The page on success or NULL if allocation fails.
*/
-struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
+static struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
nodemask_t *nodemask;
@@ -2249,8 +2402,9 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
* First, try to allocate THP only on local node, but
* don't reclaim unnecessarily, just compact.
*/
- page = __alloc_pages_node_noprof(nid,
- gfp | __GFP_THISNODE | __GFP_NORETRY, order);
+ page = __alloc_frozen_pages_noprof(
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order,
+ nid, NULL);
if (page || !(gfp & __GFP_DIRECT_RECLAIM))
return page;
/*
@@ -2262,9 +2416,10 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
}
}
- page = __alloc_pages_noprof(gfp, order, nid, nodemask);
+ page = __alloc_frozen_pages_noprof(gfp, order, nid, nodemask);
- if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) {
+ if (unlikely(pol->mode == MPOL_INTERLEAVE ||
+ pol->mode == MPOL_WEIGHTED_INTERLEAVE) && page) {
/* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */
if (static_branch_likely(&vm_numa_stat_key) &&
page_to_nid(page) == nid) {
@@ -2280,8 +2435,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
struct mempolicy *pol, pgoff_t ilx, int nid)
{
- return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP,
- order, pol, ilx, nid));
+ struct page *page = alloc_pages_mpol(gfp | __GFP_COMP, order, pol,
+ ilx, nid);
+ if (!page)
+ return NULL;
+
+ /*
+ * alloc_pages_mpol() allocates through __alloc_frozen_pages_noprof(), so
+ * the page presumably comes back without a reference; set it here before
+ * the folio is handed out.
+ */
+ set_page_refcounted(page);
+ return page_rmappable_folio(page);
}
/**
@@ -2295,7 +2455,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
* NUMA policy. The caller must hold the mmap_lock of the mm_struct of the
* VMA to prevent it from going away. Should be used for all allocations
* for folios that will be mapped into user space, excepting hugetlbfs, and
- * excepting where direct use of alloc_pages_mpol() is more appropriate.
+ * excepting where direct use of folio_alloc_mpol() is more appropriate.
*
* Return: The folio on success or NULL if allocation fails.
*/
@@ -2316,6 +2476,21 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
}
EXPORT_SYMBOL(vma_alloc_folio_noprof);
+/*
+ * Allocate 2^@order pages through alloc_pages_mpol(), using the calling
+ * task's mempolicy, or the system default_policy when called in interrupt
+ * context or with __GFP_THISNODE. set_page_refcounted() is not applied to
+ * the result here.
+ */
+struct page *alloc_frozen_pages_noprof(gfp_t gfp, unsigned order)
+{
+ struct mempolicy *pol = &default_policy;
+
+ /*
+ * No reference counting needed for current->mempolicy
+ * nor system default_policy
+ */
+ if (!in_interrupt() && !(gfp & __GFP_THISNODE))
+ pol = get_task_policy(current);
+
+ return alloc_pages_mpol(gfp, order, pol, NO_INTERLEAVE_INDEX,
+ numa_node_id());
+}
+
/**
* alloc_pages - Allocate pages.
* @gfp: GFP flags.
@@ -2332,17 +2507,11 @@ EXPORT_SYMBOL(vma_alloc_folio_noprof);
*/
struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order)
{
- struct mempolicy *pol = &default_policy;
-
- /*
- * No reference counting needed for current->mempolicy
- * nor system default_policy
- */
- if (!in_interrupt() && !(gfp & __GFP_THISNODE))
- pol = get_task_policy(current);
+ /* Policy selection now lives in alloc_frozen_pages_noprof(). */
+ struct page *page = alloc_frozen_pages_noprof(gfp, order);
- return alloc_pages_mpol_noprof(gfp, order, pol, NO_INTERLEAVE_INDEX,
- numa_node_id());
+ /* On success, set the page's refcount before returning it. */
+ if (page)
+ set_page_refcounted(page);
+ return page;
}
EXPORT_SYMBOL(alloc_pages_noprof);
@@ -2352,7 +2521,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
}
EXPORT_SYMBOL(folio_alloc_noprof);
-static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2371,13 +2540,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
if (delta) {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node + 1, NULL,
+ nr_pages_per_node + 1,
page_array);
delta--;
} else {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node, NULL, page_array);
+ nr_pages_per_node, page_array);
}
page_array += nr_allocated;
@@ -2387,17 +2556,18 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
+static unsigned long alloc_pages_bulk_weighted_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
+ struct weighted_interleave_state *state;
struct task_struct *me = current;
unsigned int cpuset_mems_cookie;
unsigned long total_allocated = 0;
unsigned long nr_allocated = 0;
unsigned long rounds;
unsigned long node_pages, delta;
- u8 *table, *weights, weight;
+ u8 *weights, weight;
unsigned int weight_total = 0;
unsigned long rem_pages = nr_pages;
nodemask_t nodes;
@@ -2426,7 +2596,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (weight && node_isset(node, nodes)) {
node_pages = min(rem_pages, weight);
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
/* if that's all the pages, no need to interleave */
@@ -2447,17 +2617,19 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
return total_allocated;
rcu_read_lock();
- table = rcu_dereference(iw_table);
- if (table)
- memcpy(weights, table, nr_node_ids);
- rcu_read_unlock();
+ state = rcu_dereference(wi_state);
+ if (state) {
+ memcpy(weights, state->iw_table, nr_node_ids * sizeof(u8));
+ rcu_read_unlock();
+ } else {
+ rcu_read_unlock();
+ for (i = 0; i < nr_node_ids; i++)
+ weights[i] = 1;
+ }
/* calculate total, detect system default usage */
- for_each_node_mask(node, nodes) {
- if (!weights[node])
- weights[node] = 1;
+ for_each_node_mask(node, nodes)
weight_total += weights[node];
- }
/*
* Calculate rounds/partial rounds to minimize __alloc_pages_bulk calls.
@@ -2489,7 +2661,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (!node_pages)
break;
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
if (total_allocated == nr_pages)
@@ -2502,7 +2674,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
return total_allocated;
}
-static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
+static unsigned long alloc_pages_bulk_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
@@ -2513,11 +2685,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
if (nr_allocated < nr_pages)
nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
- nr_pages - nr_allocated, NULL,
+ nr_pages - nr_allocated,
page_array + nr_allocated);
return nr_allocated;
}
@@ -2528,7 +2700,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
* It can accelerate memory allocation especially interleaving
* allocate memory.
*/
-unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
+unsigned long alloc_pages_bulk_mempolicy_noprof(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
@@ -2539,21 +2711,21 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
pol = get_task_policy(current);
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_pages_bulk_array_interleave(gfp, pol,
+ return alloc_pages_bulk_interleave(gfp, pol,
nr_pages, page_array);
if (pol->mode == MPOL_WEIGHTED_INTERLEAVE)
- return alloc_pages_bulk_array_weighted_interleave(
+ return alloc_pages_bulk_weighted_interleave(
gfp, pol, nr_pages, page_array);
if (pol->mode == MPOL_PREFERRED_MANY)
- return alloc_pages_bulk_array_preferred_many(gfp,
+ return alloc_pages_bulk_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
nid = numa_node_id();
nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
return alloc_pages_bulk_noprof(gfp, nid, nodemask,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
@@ -3368,6 +3540,14 @@ struct iw_node_attr {
int nid;
};
+/* sysfs state for the "weighted_interleave" directory. */
+struct sysfs_wi_group {
+ /* kobject backing the directory; its release callback frees this struct */
+ struct kobject wi_kobj;
+ /* serializes updates to nattrs[] */
+ struct mutex kobj_lock;
+ /* per-node attribute, indexed by node id (nr_node_ids slots); NULL if absent */
+ struct iw_node_attr *nattrs[];
+};
+
+static struct sysfs_wi_group *wi_group;
+
static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
@@ -3382,177 +3562,316 @@ static ssize_t node_show(struct kobject *kobj, struct kobj_attribute *attr,
+/*
+ * sysfs store handler for a per-node weight file. Empty, unparsable or zero
+ * input is rejected with -EINVAL. Otherwise a copy of the current weight
+ * table (or all 1s if none exists) is published with this node's entry
+ * replaced and mode_auto cleared, i.e. writing a weight selects manual mode.
+ *
+ * Return: @count on success, -EINVAL on bad input, -ENOMEM on allocation
+ * failure.
+ */
static ssize_t node_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
struct iw_node_attr *node_attr;
- u8 *new;
- u8 *old;
u8 weight = 0;
+ int i;
node_attr = container_of(attr, struct iw_node_attr, kobj_attr);
- if (count == 0 || sysfs_streq(buf, ""))
- weight = 0;
- else if (kstrtou8(buf, 0, &weight))
+ if (count == 0 || sysfs_streq(buf, "") ||
+ kstrtou8(buf, 0, &weight) || weight == 0)
return -EINVAL;
- new = kzalloc(nr_node_ids, GFP_KERNEL);
- if (!new)
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
return -ENOMEM;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- if (old)
- memcpy(new, old, nr_node_ids);
- new[node_attr->nid] = weight;
- rcu_assign_pointer(iw_table, new);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
+ mutex_lock(&wi_state_lock);
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (old_wi_state) {
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ } else {
+ for (i = 0; i < nr_node_ids; i++)
+ new_wi_state->iw_table[i] = 1;
+ }
+ new_wi_state->iw_table[node_attr->nid] = weight;
+ new_wi_state->mode_auto = false;
+
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ /* Free the replaced state only after an RCU grace period. */
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
return count;
}
-static struct iw_node_attr **node_attrs;
-
-static void sysfs_wi_node_release(struct iw_node_attr *node_attr,
- struct kobject *parent)
+/*
+ * sysfs show handler for the "auto" attribute: emits str_true_false() of the
+ * published state's mode_auto, followed by a newline. A missing wi_state is
+ * reported as auto (true).
+ */
+static ssize_t weighted_interleave_auto_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
- if (!node_attr)
- return;
- sysfs_remove_file(parent, &node_attr->kobj_attr.attr);
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
+ struct weighted_interleave_state *state;
+ bool wi_auto = true;
+
+ rcu_read_lock();
+ state = rcu_dereference(wi_state);
+ if (state)
+ wi_auto = state->mode_auto;
+ rcu_read_unlock();
+
+ return sysfs_emit(buf, "%s\n", str_true_false(wi_auto));
}
-static void sysfs_wi_release(struct kobject *wi_kobj)
+/*
+ * sysfs store handler for the "auto" attribute.
+ *
+ * Writing a false value switches to manual mode, keeping the weights that
+ * are currently published (or all 1s if no state exists). Writing a true
+ * value recomputes the weights from node_bw_table and switches to auto mode.
+ *
+ * Return: @count on success, -EINVAL if @buf is not a boolean, -ENOMEM on
+ * allocation failure, -ENODEV if auto mode is requested but no bandwidth
+ * data has been recorded yet.
+ */
+static ssize_t weighted_interleave_auto_store(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf, size_t count)
{
+ struct weighted_interleave_state *new_wi_state, *old_wi_state = NULL;
+ unsigned int *bw;
+ bool input;
int i;
+ if (kstrtobool(buf, &input))
+ return -EINVAL;
+
+ new_wi_state = kzalloc(struct_size(new_wi_state, iw_table, nr_node_ids),
+ GFP_KERNEL);
+ if (!new_wi_state)
+ return -ENOMEM;
for (i = 0; i < nr_node_ids; i++)
- sysfs_wi_node_release(node_attrs[i], wi_kobj);
- kobject_put(wi_kobj);
+ new_wi_state->iw_table[i] = 1;
+
+ mutex_lock(&wi_state_lock);
+ /*
+ * Look up the published state on every path, not only when switching to
+ * manual mode: the auto path replaces wi_state as well and must free
+ * the state it replaces.
+ */
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!input) {
+ if (!old_wi_state)
+ goto update_wi_state;
+ if (input == old_wi_state->mode_auto) {
+ /* Already in manual mode: nothing to publish, drop the new state. */
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return count;
+ }
+
+ memcpy(new_wi_state->iw_table, old_wi_state->iw_table,
+ nr_node_ids * sizeof(u8));
+ goto update_wi_state;
+ }
+
+ bw = node_bw_table;
+ if (!bw) {
+ mutex_unlock(&wi_state_lock);
+ kfree(new_wi_state);
+ return -ENODEV;
+ }
+
+ new_wi_state->mode_auto = true;
+ reduce_interleave_weights(bw, new_wi_state->iw_table);
+
+update_wi_state:
+ rcu_assign_pointer(wi_state, new_wi_state);
+ mutex_unlock(&wi_state_lock);
+ /* Free the replaced state only after an RCU grace period. */
+ if (old_wi_state) {
+ synchronize_rcu();
+ kfree(old_wi_state);
+ }
+ return count;
+}
+
+/*
+ * Remove the sysfs weight file for @nid, if one is registered. Out-of-range
+ * node ids and empty slots are ignored. The nattrs[] slot is cleared under
+ * kobj_lock; the file removal and the frees happen after the lock is dropped.
+ */
+static void sysfs_wi_node_delete(int nid)
+{
+ struct iw_node_attr *attr;
+
+ if (nid < 0 || nid >= nr_node_ids)
+ return;
+
+ mutex_lock(&wi_group->kobj_lock);
+ attr = wi_group->nattrs[nid];
+ if (!attr) {
+ mutex_unlock(&wi_group->kobj_lock);
+ return;
+ }
+
+ wi_group->nattrs[nid] = NULL;
+ mutex_unlock(&wi_group->kobj_lock);
+
+ sysfs_remove_file(&wi_group->wi_kobj, &attr->kobj_attr.attr);
+ kfree(attr->kobj_attr.attr.name);
+ kfree(attr);
+}
+
+/* Remove the sysfs weight file of every node id in [0, nr_node_ids). */
+static void sysfs_wi_node_delete_all(void)
+{
+ int nid;
+
+ for (nid = 0; nid < nr_node_ids; nid++)
+ sysfs_wi_node_delete(nid);
+}
+
+/*
+ * Unpublish the current wi_state, if any, and free it after an RCU grace
+ * period. Does nothing when no state is published.
+ */
+static void wi_state_free(void)
+{
+ struct weighted_interleave_state *old_wi_state;
+
+ mutex_lock(&wi_state_lock);
+
+ old_wi_state = rcu_dereference_protected(wi_state,
+ lockdep_is_held(&wi_state_lock));
+ if (!old_wi_state) {
+ mutex_unlock(&wi_state_lock);
+ return;
+ }
+
+ rcu_assign_pointer(wi_state, NULL);
+ mutex_unlock(&wi_state_lock);
+ synchronize_rcu();
+ kfree(old_wi_state);
+}
+
+/* "auto" attribute (mode 0664) wired to the auto show/store handlers. */
+static struct kobj_attribute wi_auto_attr =
+ __ATTR(auto, 0664, weighted_interleave_auto_show,
+ weighted_interleave_auto_store);
+
+/*
+ * Tear down what was set up under the weighted_interleave kobject: remove
+ * the "auto" file, remove every per-node weight file, and free the published
+ * wi_state. The kobject itself is left to the caller.
+ */
+static void wi_cleanup(void)
+{
+ sysfs_remove_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ sysfs_wi_node_delete_all();
+ wi_state_free();
+}
+
+/*
+ * kobject release callback: frees the global wi_group, which embeds the
+ * kobject as its wi_kobj member.
+ */
+static void wi_kobj_release(struct kobject *wi_kobj)
+{
+ kfree(wi_group);
}
static const struct kobj_type wi_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = sysfs_wi_release,
+ .release = wi_kobj_release,
};
-static int add_weight_node(int nid, struct kobject *wi_kobj)
+/*
+ * Create the "node<nid>" weight file under the weighted_interleave kobject
+ * and record its attribute in wi_group->nattrs[nid].
+ *
+ * Return: 0 on success, -EINVAL for an out-of-range @nid, -ENOMEM on
+ * allocation failure, -EEXIST if the slot is already populated, or the error
+ * returned by sysfs_create_file().
+ */
+static int sysfs_wi_node_add(int nid)
{
- struct iw_node_attr *node_attr;
+ int ret;
char *name;
+ struct iw_node_attr *new_attr;
- node_attr = kzalloc(sizeof(*node_attr), GFP_KERNEL);
- if (!node_attr)
+ if (nid < 0 || nid >= nr_node_ids) {
+ pr_err("invalid node id: %d\n", nid);
+ return -EINVAL;
+ }
+
+ new_attr = kzalloc(sizeof(*new_attr), GFP_KERNEL);
+ if (!new_attr)
return -ENOMEM;
name = kasprintf(GFP_KERNEL, "node%d", nid);
if (!name) {
- kfree(node_attr);
+ kfree(new_attr);
return -ENOMEM;
}
- sysfs_attr_init(&node_attr->kobj_attr.attr);
- node_attr->kobj_attr.attr.name = name;
- node_attr->kobj_attr.attr.mode = 0644;
- node_attr->kobj_attr.show = node_show;
- node_attr->kobj_attr.store = node_store;
- node_attr->nid = nid;
+ sysfs_attr_init(&new_attr->kobj_attr.attr);
+ new_attr->kobj_attr.attr.name = name;
+ new_attr->kobj_attr.attr.mode = 0644;
+ new_attr->kobj_attr.show = node_show;
+ new_attr->kobj_attr.store = node_store;
+ new_attr->nid = nid;
- if (sysfs_create_file(wi_kobj, &node_attr->kobj_attr.attr)) {
- kfree(node_attr->kobj_attr.attr.name);
- kfree(node_attr);
- pr_err("failed to add attribute to weighted_interleave\n");
- return -ENOMEM;
+ /* The slot check and the file creation are done under kobj_lock. */
+ mutex_lock(&wi_group->kobj_lock);
+ if (wi_group->nattrs[nid]) {
+ mutex_unlock(&wi_group->kobj_lock);
+ ret = -EEXIST;
+ goto out;
}
- node_attrs[nid] = node_attr;
+ ret = sysfs_create_file(&wi_group->wi_kobj, &new_attr->kobj_attr.attr);
+ if (ret) {
+ mutex_unlock(&wi_group->kobj_lock);
+ goto out;
+ }
+ wi_group->nattrs[nid] = new_attr;
+ mutex_unlock(&wi_group->kobj_lock);
return 0;
+
+out:
+ /* Error path only: the attribute was never registered, so free it. */
+ kfree(new_attr->kobj_attr.attr.name);
+ kfree(new_attr);
+ return ret;
}
-static int add_weighted_interleave_group(struct kobject *root_kobj)
+/*
+ * Memory hotplug notifier callback. For the node reported in
+ * status_change_nid, adds the per-node weight file on MEM_ONLINE and removes
+ * it on MEM_OFFLINE; a negative node id and all other actions are ignored.
+ * Always returns NOTIFY_OK: a failure to add the file is only logged.
+ */
+static int wi_node_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ int err;
+ struct memory_notify *arg = data;
+ int nid = arg->status_change_nid;
+
+ if (nid < 0)
+ return NOTIFY_OK;
+
+ switch (action) {
+ case MEM_ONLINE:
+ err = sysfs_wi_node_add(nid);
+ if (err)
+ pr_err("failed to add sysfs for node%d during hotplug: %d\n",
+ nid, err);
+ break;
+ case MEM_OFFLINE:
+ sysfs_wi_node_delete(nid);
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+/*
+ * Create the "weighted_interleave" directory under @mempolicy_kobj, add the
+ * "auto" file and one weight file per online node that has memory, then
+ * register the memory hotplug notifier.
+ *
+ * Return: 0 on success or a negative errno. On failure the kobject reference
+ * is dropped, and wi_ktype's release callback frees wi_group.
+ */
+static int __init add_weighted_interleave_group(struct kobject *mempolicy_kobj)
{
- struct kobject *wi_kobj;
int nid, err;
- wi_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (!wi_kobj)
+ wi_group = kzalloc(struct_size(wi_group, nattrs, nr_node_ids),
+ GFP_KERNEL);
+ if (!wi_group)
return -ENOMEM;
+ mutex_init(&wi_group->kobj_lock);
- err = kobject_init_and_add(wi_kobj, &wi_ktype, root_kobj,
+ err = kobject_init_and_add(&wi_group->wi_kobj, &wi_ktype, mempolicy_kobj,
"weighted_interleave");
- if (err) {
- kfree(wi_kobj);
- return err;
- }
+ if (err)
+ goto err_put_kobj;
- for_each_node_state(nid, N_POSSIBLE) {
- err = add_weight_node(nid, wi_kobj);
+ err = sysfs_create_file(&wi_group->wi_kobj, &wi_auto_attr.attr);
+ if (err)
+ goto err_put_kobj;
+
+ /* Only online nodes with memory get a weight file at init time. */
+ for_each_online_node(nid) {
+ if (!node_state(nid, N_MEMORY))
+ continue;
+
+ err = sysfs_wi_node_add(nid);
if (err) {
- pr_err("failed to add sysfs [node%d]\n", nid);
- break;
+ pr_err("failed to add sysfs for node%d during init: %d\n",
+ nid, err);
+ goto err_cleanup_kobj;
}
}
- if (err)
- kobject_put(wi_kobj);
- return 0;
-}
-static void mempolicy_kobj_release(struct kobject *kobj)
-{
- u8 *old;
+ hotplug_memory_notifier(wi_node_notifier, DEFAULT_CALLBACK_PRI);
+ return 0;
- mutex_lock(&iw_table_lock);
- old = rcu_dereference_protected(iw_table,
- lockdep_is_held(&iw_table_lock));
- rcu_assign_pointer(iw_table, NULL);
- mutex_unlock(&iw_table_lock);
- synchronize_rcu();
- kfree(old);
- kfree(node_attrs);
- kfree(kobj);
+err_cleanup_kobj:
+ wi_cleanup();
+ kobject_del(&wi_group->wi_kobj);
+err_put_kobj:
+ kobject_put(&wi_group->wi_kobj);
+ return err;
}
-static const struct kobj_type mempolicy_ktype = {
- .release = mempolicy_kobj_release
-};
-
+/*
+ * Create the "mempolicy" kobject under mm_kobj and populate it with the
+ * weighted_interleave group.
+ *
+ * Return: 0 on success, -ENOMEM if the kobject cannot be created, or the
+ * error from add_weighted_interleave_group() (the kobject is removed again).
+ */
static int __init mempolicy_sysfs_init(void)
{
int err;
static struct kobject *mempolicy_kobj;
- mempolicy_kobj = kzalloc(sizeof(*mempolicy_kobj), GFP_KERNEL);
- if (!mempolicy_kobj) {
- err = -ENOMEM;
- goto err_out;
- }
-
- node_attrs = kcalloc(nr_node_ids, sizeof(struct iw_node_attr *),
- GFP_KERNEL);
- if (!node_attrs) {
- err = -ENOMEM;
- goto mempol_out;
- }
+ mempolicy_kobj = kobject_create_and_add("mempolicy", mm_kobj);
+ if (!mempolicy_kobj)
+ return -ENOMEM;
- err = kobject_init_and_add(mempolicy_kobj, &mempolicy_ktype, mm_kobj,
- "mempolicy");
+ err = add_weighted_interleave_group(mempolicy_kobj);
if (err)
- goto node_out;
+ goto err_kobj;
- err = add_weighted_interleave_group(mempolicy_kobj);
- if (err) {
- pr_err("mempolicy sysfs structure failed to initialize\n");
- kobject_put(mempolicy_kobj);
- return err;
- }
+ return 0;
- return err;
-node_out:
- kfree(node_attrs);
-mempol_out:
- kfree(mempolicy_kobj);
-err_out:
- pr_err("failed to add mempolicy kobject to the system\n");
+err_kobj:
+ kobject_del(mempolicy_kobj);
+ kobject_put(mempolicy_kobj);
return err;
}