summary | refs | log | tree | commit | diff
path: root/mm/vmstat.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/vmstat.c')
-rw-r--r-- mm/vmstat.c 163
1 files changed, 84 insertions, 79 deletions
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5e4300482897..77e42ef388c2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -570,49 +570,18 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
#ifdef CONFIG_NUMA
/*
- * zonelist = the list of zones passed to the allocator
- * z = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
- if (z->zone_pgdat == preferred_zone->zone_pgdat) {
- __inc_zone_state(z, NUMA_HIT);
- } else {
- __inc_zone_state(z, NUMA_MISS);
- __inc_zone_state(preferred_zone, NUMA_FOREIGN);
- }
- if (z->node == ((flags & __GFP_OTHER_NODE) ?
- preferred_zone->node : numa_node_id()))
- __inc_zone_state(z, NUMA_LOCAL);
- else
- __inc_zone_state(z, NUMA_OTHER);
-}
-
-/*
* Determine the per node value of a stat item.
*/
unsigned long node_page_state(int node, enum zone_stat_item item)
{
struct zone *zones = NODE_DATA(node)->node_zones;
+ int i;
+ unsigned long count = 0;
- return
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
- zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], item) +
- zone_page_state(&zones[ZONE_MOVABLE], item);
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ count += zone_page_state(zones + i, item);
+
+ return count;
}
#endif
@@ -1010,6 +979,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
if (!memmap_valid_within(pfn, page, zone))
continue;
+ if (page_zone(page) != zone)
+ continue;
+
mtype = get_pageblock_migratetype(page);
if (mtype < MIGRATE_TYPES)
@@ -1069,13 +1041,17 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
block_end_pfn = min(block_end_pfn, end_pfn);
page = pfn_to_page(pfn);
- pageblock_mt = get_pfnblock_migratetype(page, pfn);
+ pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
page = pfn_to_page(pfn);
+
+ if (page_zone(page) != zone)
+ continue;
+
if (PageBuddy(page)) {
pfn += (1UL << page_order(page)) - 1;
continue;
@@ -1376,7 +1352,66 @@ static const struct file_operations proc_vmstat_file_operations = {
static struct workqueue_struct *vmstat_wq;
static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
int sysctl_stat_interval __read_mostly = HZ;
-static cpumask_var_t cpu_stat_off;
+
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+ refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ long val;
+ int err;
+ int i;
+
+ /*
+ * The regular update, every sysctl_stat_interval, may come later
+ * than expected: leaving a significant amount in per_cpu buckets.
+ * This is particularly misleading when checking a quantity of HUGE
+ * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
+ * which can equally be echo'ed to or cat'ted from (by root),
+ * can be used to update the stats just before reading them.
+ *
+ * Oh, and since global_page_state() etc. are so careful to hide
+ * transiently negative values, report an error here if any of
+ * the stats is negative, so we know to go looking for imbalance.
+ */
+ err = schedule_on_each_cpu(refresh_vm_stats);
+ if (err)
+ return err;
+ for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+ val = atomic_long_read(&vm_stat[i]);
+ if (val < 0) {
+ switch (i) {
+ case NR_ALLOC_BATCH:
+ case NR_PAGES_SCANNED:
+ /*
+ * These are often seen to go negative in
+ * recent kernels, but not to go permanently
+ * negative. Whilst it would be nicer not to
+ * have exceptions, rooting them out would be
+ * another task, of rather low priority.
+ */
+ break;
+ default:
+ pr_warn("%s: %s %ld\n",
+ __func__, vmstat_text[i], val);
+ err = -EINVAL;
+ break;
+ }
+ }
+ }
+ if (err)
+ return err;
+ if (write)
+ *ppos += *lenp;
+ else
+ *lenp = 0;
+ return 0;
+}
+#endif /* CONFIG_PROC_FS */
static void vmstat_update(struct work_struct *w)
{
@@ -1385,24 +1420,10 @@ static void vmstat_update(struct work_struct *w)
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
- * If we were marked on cpu_stat_off clear the flag
- * so that vmstat_shepherd doesn't schedule us again.
*/
- if (!cpumask_test_and_clear_cpu(smp_processor_id(),
- cpu_stat_off)) {
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
- }
- } else {
- /*
- * We did not update any counters so the app may be in
- * a mode where it does not cause counter updates.
- * We may be uselessly running vmstat_update.
- * Defer the checking for differentials to the
- * shepherd thread on a different processor.
- */
- cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
}
}
@@ -1434,16 +1455,17 @@ static bool need_update(int cpu)
return false;
}
+/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
void quiet_vmstat(void)
{
if (system_state != SYSTEM_RUNNING)
return;
- /*
- * If we are already in hands of the shepherd then there
- * is nothing for us to do here.
- */
- if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
return;
if (!need_update(smp_processor_id()))
@@ -1458,7 +1480,6 @@ void quiet_vmstat(void)
refresh_cpu_vm_stats(false);
}
-
/*
* Shepherd worker thread that checks the
* differentials of processors that have their worker
@@ -1475,20 +1496,11 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
- for_each_cpu(cpu, cpu_stat_off) {
+ for_each_online_cpu(cpu) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
- if (need_update(cpu)) {
- if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
- queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
- } else {
- /*
- * Cancel the work if quiet_vmstat has put this
- * cpu on cpu_stat_off because the work item might
- * be still scheduled
- */
- cancel_delayed_work(dw);
- }
+ if (!delayed_work_pending(dw) && need_update(cpu))
+ queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
}
put_online_cpus();
@@ -1504,10 +1516,6 @@ static void __init start_shepherd_timer(void)
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
- if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
- BUG();
- cpumask_copy(cpu_stat_off, cpu_online_mask);
-
vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
@@ -1542,16 +1550,13 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_ONLINE_FROZEN:
refresh_zone_stat_thresholds();
node_set_state(cpu_to_node(cpu), N_CPU);
- cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
- cpumask_clear_cpu(cpu, cpu_stat_off);
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- cpumask_set_cpu(cpu, cpu_stat_off);
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN: