summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c90
-rw-r--r--mm/list_lru.c16
-rw-r--r--mm/truncate.c26
-rw-r--r--mm/vmstat.c1
-rw-r--r--mm/workingset.c161
5 files changed, 274 insertions, 20 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index a603c4d7d3c9..d6df3bacb0fb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -110,11 +110,17 @@
static void page_cache_tree_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- if (shadow) {
- void **slot;
+ struct radix_tree_node *node;
+ unsigned long index;
+ unsigned int offset;
+ unsigned int tag;
+ void **slot;
- slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
- radix_tree_replace_slot(slot, shadow);
+ VM_BUG_ON(!PageLocked(page));
+
+ __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+
+ if (shadow) {
mapping->nrshadows++;
/*
* Make sure the nrshadows update is committed before
@@ -123,9 +129,45 @@ static void page_cache_tree_delete(struct address_space *mapping,
* same time and miss a shadow entry.
*/
smp_wmb();
- } else
- radix_tree_delete(&mapping->page_tree, page->index);
+ }
mapping->nrpages--;
+
+ if (!node) {
+ /* Clear direct pointer tags in root node */
+ mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+ radix_tree_replace_slot(slot, shadow);
+ return;
+ }
+
+ /* Clear tree tags for the removed page */
+ index = page->index;
+ offset = index & RADIX_TREE_MAP_MASK;
+ for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+ if (test_bit(offset, node->tags[tag]))
+ radix_tree_tag_clear(&mapping->page_tree, index, tag);
+ }
+
+ /* Delete page, swap shadow entry */
+ radix_tree_replace_slot(slot, shadow);
+ workingset_node_pages_dec(node);
+ if (shadow)
+ workingset_node_shadows_inc(node);
+ else
+ if (__radix_tree_delete_node(&mapping->page_tree, node))
+ return;
+
+ /*
+ * Track node that only contains shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already tracked. The
+ * list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_pages(node) &&
+ list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&workingset_shadow_nodes, &node->private_list);
+ }
}
/*
@@ -471,27 +513,43 @@ EXPORT_SYMBOL_GPL(replace_page_cache_page);
static int page_cache_tree_insert(struct address_space *mapping,
struct page *page, void **shadowp)
{
+ struct radix_tree_node *node;
void **slot;
int error;
- slot = radix_tree_lookup_slot(&mapping->page_tree, page->index);
- if (slot) {
+ error = __radix_tree_create(&mapping->page_tree, page->index,
+ &node, &slot);
+ if (error)
+ return error;
+ if (*slot) {
void *p;
p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
if (!radix_tree_exceptional_entry(p))
return -EEXIST;
- radix_tree_replace_slot(slot, page);
- mapping->nrshadows--;
- mapping->nrpages++;
if (shadowp)
*shadowp = p;
- return 0;
+ mapping->nrshadows--;
+ if (node)
+ workingset_node_shadows_dec(node);
}
- error = radix_tree_insert(&mapping->page_tree, page->index, page);
- if (!error)
- mapping->nrpages++;
- return error;
+ radix_tree_replace_slot(slot, page);
+ mapping->nrpages++;
+ if (node) {
+ workingset_node_pages_inc(node);
+ /*
+ * Don't track node that contains actual pages.
+ *
+ * Avoid acquiring the list_lru lock if already
+ * untracked. The list_empty() test is safe as
+ * node->private_list is protected by
+ * mapping->tree_lock.
+ */
+ if (!list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
+ }
+ return 0;
}
static int __add_to_page_cache_locked(struct page *page,
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 72f9decb0104..f1a0db194173 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -87,11 +87,20 @@ restart:
ret = isolate(item, &nlru->lock, cb_arg);
switch (ret) {
+ case LRU_REMOVED_RETRY:
+ assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
if (--nlru->nr_items == 0)
node_clear(nid, lru->active_nodes);
WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
+ /*
+ * If the lru lock has been dropped, our list
+ * traversal is now invalid and so we have to
+ * restart from scratch.
+ */
+ if (ret == LRU_REMOVED_RETRY)
+ goto restart;
break;
case LRU_ROTATE:
list_move_tail(item, &nlru->list);
@@ -103,6 +112,7 @@ restart:
* The lru lock has been dropped, our list traversal is
* now invalid and so we have to restart from scratch.
*/
+ assert_spin_locked(&nlru->lock);
goto restart;
default:
BUG();
@@ -114,7 +124,7 @@ restart:
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-int list_lru_init(struct list_lru *lru)
+int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
@@ -126,12 +136,14 @@ int list_lru_init(struct list_lru *lru)
nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
+ if (key)
+ lockdep_set_class(&lru->node[i].lock, key);
INIT_LIST_HEAD(&lru->node[i].list);
lru->node[i].nr_items = 0;
}
return 0;
}
-EXPORT_SYMBOL_GPL(list_lru_init);
+EXPORT_SYMBOL_GPL(list_lru_init_key);
void list_lru_destroy(struct list_lru *lru)
{
diff --git a/mm/truncate.c b/mm/truncate.c
index 0db9258319f0..e5cc39ab0751 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -25,6 +25,9 @@
static void clear_exceptional_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
+ struct radix_tree_node *node;
+ void **slot;
+
/* Handled by shmem itself */
if (shmem_mapping(mapping))
return;
@@ -35,8 +38,27 @@ static void clear_exceptional_entry(struct address_space *mapping,
* without the tree itself locked. These unlocked entries
* need verification under the tree lock.
*/
- if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
- mapping->nrshadows--;
+ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
+ goto unlock;
+ if (*slot != entry)
+ goto unlock;
+ radix_tree_replace_slot(slot, NULL);
+ mapping->nrshadows--;
+ if (!node)
+ goto unlock;
+ workingset_node_shadows_dec(node);
+ /*
+ * Don't track node without shadow entries.
+ *
+ * Avoid acquiring the list_lru lock if already untracked.
+ * The list_empty() test is safe as node->private_list is
+ * protected by mapping->tree_lock.
+ */
+ if (!workingset_node_shadows(node) &&
+ !list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes, &node->private_list);
+ __radix_tree_delete_node(&mapping->page_tree, node);
+unlock:
spin_unlock_irq(&mapping->tree_lock);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 843125a0e255..f3155d51acfd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -772,6 +772,7 @@ const char * const vmstat_text[] = {
#endif
"workingset_refault",
"workingset_activate",
+ "workingset_nodereclaim",
"nr_anon_transparent_hugepages",
"nr_free_cma",
"nr_dirty_threshold",
diff --git a/mm/workingset.c b/mm/workingset.c
index 8a6c7cff4923..f7216fa7da27 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -251,3 +251,164 @@ void workingset_activation(struct page *page)
{
atomic_long_inc(&page_zone(page)->inactive_age);
}
+
+/*
+ * Shadow entries reflect the share of the working set that does not
+ * fit into memory, so their number depends on the access pattern of
+ * the workload. In most cases, they will refault or get reclaimed
+ * along with the inode, but a (malicious) workload that streams
+ * through files with a total size several times that of available
+ * memory, while preventing the inodes from being reclaimed, can
+ * create excessive amounts of shadow nodes. To keep a lid on this,
+ * track shadow nodes and reclaim them when they grow way past the
+ * point where they would still be useful.
+ */
+
+struct list_lru workingset_shadow_nodes;
+
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long shadow_nodes;
+ unsigned long max_nodes;
+ unsigned long pages;
+
+ /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ local_irq_disable();
+ shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+ local_irq_enable();
+
+ pages = node_present_pages(sc->nid);
+ /*
+ * Active cache pages are limited to 50% of memory, and shadow
+ * entries that represent a refault distance bigger than that
+ * do not have any effect. Limit the number of shadow nodes
+ * such that shadow entries do not exceed the number of active
+ * cache pages, assuming a worst-case node population density
+ * of 1/8th on average.
+ *
+ * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+ * each, this will reclaim shadow entries when they consume
+ * ~2% of available memory:
+ *
+ * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE
+ */
+ max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3);
+
+ if (shadow_nodes <= max_nodes)
+ return 0;
+
+ return shadow_nodes - max_nodes;
+}
+
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+ spinlock_t *lru_lock,
+ void *arg)
+{
+ struct address_space *mapping;
+ struct radix_tree_node *node;
+ unsigned int i;
+ int ret;
+
+ /*
+ * Page cache insertions and deletions synchroneously maintain
+ * the shadow node LRU under the mapping->tree_lock and the
+ * lru_lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the lru_lock pins any
+ * address_space that has radix tree nodes on the LRU.
+ *
+ * We can then safely transition to the mapping->tree_lock to
+ * pin only the address_space of the particular node we want
+ * to reclaim, take the node off-LRU, and drop the lru_lock.
+ */
+
+ node = container_of(item, struct radix_tree_node, private_list);
+ mapping = node->private_data;
+
+ /* Coming from the list, invert the lock order */
+ if (!spin_trylock(&mapping->tree_lock)) {
+ spin_unlock(lru_lock);
+ ret = LRU_RETRY;
+ goto out;
+ }
+
+ list_del_init(item);
+ spin_unlock(lru_lock);
+
+ /*
+ * The nodes should only contain one or more shadow entries,
+ * no pages, so we expect to be able to remove them all and
+ * delete and free the empty node afterwards.
+ */
+
+ BUG_ON(!node->count);
+ BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
+
+ for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
+ if (node->slots[i]) {
+ BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
+ node->slots[i] = NULL;
+ BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
+ node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
+ BUG_ON(!mapping->nrshadows);
+ mapping->nrshadows--;
+ }
+ }
+ BUG_ON(node->count);
+ inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
+ if (!__radix_tree_delete_node(&mapping->page_tree, node))
+ BUG();
+
+ spin_unlock(&mapping->tree_lock);
+ ret = LRU_REMOVED_RETRY;
+out:
+ local_irq_enable();
+ cond_resched();
+ local_irq_disable();
+ spin_lock(lru_lock);
+ return ret;
+}
+
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+ struct shrink_control *sc)
+{
+ unsigned long ret;
+
+ /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+ local_irq_disable();
+ ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+ shadow_lru_isolate, NULL, &sc->nr_to_scan);
+ local_irq_enable();
+ return ret;
+}
+
+static struct shrinker workingset_shadow_shrinker = {
+ .count_objects = count_shadow_nodes,
+ .scan_objects = scan_shadow_nodes,
+ .seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE,
+};
+
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
+static int __init workingset_init(void)
+{
+ int ret;
+
+ ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+ if (ret)
+ goto err;
+ ret = register_shrinker(&workingset_shadow_shrinker);
+ if (ret)
+ goto err_list_lru;
+ return 0;
+err_list_lru:
+ list_lru_destroy(&workingset_shadow_nodes);
+err:
+ return ret;
+}
+module_init(workingset_init);