From bfcd17a6c5529bc37234cfa720a047cf9397bcfc Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Wed, 6 Aug 2008 15:12:22 +0200 Subject: Configure out file locking features This patch adds the CONFIG_FILE_LOCKING option which allows to remove support for advisory locks. With this patch enabled, the flock() system call, the F_GETLK, F_SETLK and F_SETLKW operations of fcntl() and NFS support are disabled. These features are not necessarly needed on embedded systems. It allows to save ~11 Kb of kernel code and data: text data bss dec hex filename 1125436 118764 212992 1457192 163c28 vmlinux.old 1114299 118564 212992 1445855 160fdf vmlinux -11137 -200 0 -11337 -2C49 +/- This patch has originally been written by Matt Mackall , and is part of the Linux Tiny project. Signed-off-by: Thomas Petazzoni Signed-off-by: Matt Mackall Cc: matthew@wil.cx Cc: linux-fsdevel@vger.kernel.org Cc: mpm@selenic.com Cc: akpm@linux-foundation.org Signed-off-by: J. Bruce Fields --- fs/proc/proc_misc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs/proc/proc_misc.c') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 29e20c6b1f7f..1aabbe2592e1 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -684,6 +684,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_FILE_LOCKING static int locks_open(struct inode *inode, struct file *filp) { return seq_open(filp, &locks_seq_operations); @@ -695,6 +696,7 @@ static const struct file_operations proc_locks_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif /* CONFIG_FILE_LOCKING */ static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -888,7 +890,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_PRINTK proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif +#ifdef CONFIG_FILE_LOCKING proc_create("locks", 0, NULL, &proc_locks_operations); +#endif proc_create("devices", 0, NULL, &proc_devinfo_operations); proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK -- cgit v1.2.3 From a70973c2141f98e2046f7ce9a29774bf254cf70f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 3 Oct 2008 00:31:19 +0400 Subject: proc: remove unused get_dma_list() Signed-off-by: Alexey Dobriyan --- arch/arm/kernel/dma.c | 17 ----------------- arch/sparc/include/asm/dma_32.h | 1 - fs/proc/proc_misc.c | 1 - 3 files changed, 19 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/arch/arm/kernel/dma.c b/arch/arm/kernel/dma.c index ba99a2035523..d006085ed7e7 100644 --- a/arch/arm/kernel/dma.c +++ b/arch/arm/kernel/dma.c @@ -25,23 +25,6 @@ EXPORT_SYMBOL(dma_spin_lock); static dma_t dma_chan[MAX_DMA_CHANNELS]; -/* - * Get dma list for /proc/dma - */ -int get_dma_list(char *buf) -{ - dma_t *dma; - char *p = buf; - int i; - - for (i = 0, dma = dma_chan; i < MAX_DMA_CHANNELS; i++, dma++) - if (dma->lock) - p += sprintf(p, "%2d: %14s %s\n", i, - dma->d_ops->type, dma->device_id); - - return p - buf; -} - /* * Request DMA channel * diff --git a/arch/sparc/include/asm/dma_32.h b/arch/sparc/include/asm/dma_32.h index cf7189c0079b..7fa752959b93 100644 --- a/arch/sparc/include/asm/dma_32.h +++ b/arch/sparc/include/asm/dma_32.h @@ -231,7 +231,6 @@ static inline void sparc_dma_pause(struct sparc_dma_registers *regs, #define for_each_dvma(dma) \ for((dma) = dma_chain; (dma); (dma) = (dma)->next) -extern int get_dma_list(char *); extern int request_dma(unsigned int, __const__ char *); extern void free_dma(unsigned int); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 29e20c6b1f7f..66c1ab87656c 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -68,7 +68,6 @@ extern int get_hardware_list(char *); extern int get_stram_list(char *); extern int get_exec_domain_list(char *); -extern int get_dma_list(char *); static int proc_calc_metrics(char *page, char **start, off_t off, int count, int *eof, int len) -- cgit v1.2.3 From da27c118eb4f87f27ae8da2635b916daedf7264f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:56 -0700 Subject: fs/proc: use nr_irqs Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- fs/proc/proc_misc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index b675a49c1823..a2173a2a5625 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -509,7 +509,7 @@ static int show_stat(struct seq_file *p, void *v) struct timespec boottime; unsigned int *per_irq_sum; - per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL); + per_irq_sum = kzalloc(sizeof(unsigned int)*nr_irqs, GFP_KERNEL); if (!per_irq_sum) return -ENOMEM; @@ -531,7 +531,7 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - for (j = 0; j < NR_IRQS; j++) { + for (j = 0; j < nr_irqs; j++) { unsigned int temp = kstat_cpu(i).irqs[j]; sum += temp; per_irq_sum[j] += temp; @@ -577,7 +577,7 @@ static int show_stat(struct seq_file *p, void *v) } seq_printf(p, "intr %llu", (unsigned long long)sum); - for (i = 0; i < NR_IRQS; i++) + for (i = 0; i < nr_irqs; i++) seq_printf(p, " %u", per_irq_sum[i]); seq_printf(p, @@ -631,13 +631,13 @@ static const struct file_operations proc_stat_operations = { */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { - return (*pos <= NR_IRQS) ? pos : NULL; + return (*pos <= nr_irqs) ? pos : NULL; } static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos > NR_IRQS) + if (*pos > nr_irqs) return NULL; return pos; } -- cgit v1.2.3 From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:09 -0700 Subject: x86: move kstat_irqs from kstat to irq_desc based on Eric's patch ... together mold it with dyn_array for irq_desc, will allcate kstat_irqs for nr_irq_desc alltogether if needed. -- at that point nr_cpus is known already. v2: make sure system without generic_hardirqs works they don't have irq_desc v3: fix merging v4: [mingo@elte.hu] fix typo [ mingo@elte.hu ] irq: build fix fix: arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow': arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs' Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/irq_32.c | 4 +- arch/x86/kernel/irq_64.c | 4 +- arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/xen/spinlock.c | 2 +- fs/proc/proc_misc.c | 2 +- include/linux/irq.h | 7 +++ include/linux/kernel_stat.h | 22 +++++++--- kernel/irq/chip.c | 15 +++---- kernel/irq/handle.c | 97 ++++++++++++++++++++++++++++++------------ kernel/sched.c | 5 +-- 11 files changed, 106 insertions(+), 56 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index c2160cfdec9b..204884b1415a 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -526,7 +526,7 @@ static void do_irq_balance(void) if (package_index == i) IRQ_DELTA(package_index, j) = 0; /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_cpu(i).irqs[j]; + value_now = (unsigned long) kstat_irqs_cpu(j, i); /* Determine the activity per processor per IRQ */ delta = value_now - LAST_CPU_IRQ(i, j); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index ede513be517d..576c5df6cad8 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -280,7 +280,7 @@ int show_interrupts(struct seq_file *p, void *v) any_count = kstat_irqs(i); #else for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; + any_count |= kstat_irqs_cpu(i, j); #endif action = desc->action; if (!action && !any_count) @@ -290,7 +290,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%10u ", kstat_irqs(i)); #else for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 738eb65a924e..4a0a4eb44dcb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -90,7 +90,7 @@ int show_interrupts(struct seq_file *p, void *v) any_count = kstat_irqs(i); #else for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; + any_count |= kstat_irqs_cpu(i, j); #endif action = desc->action; if (!action && !any_count) @@ -100,7 +100,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%10u ", kstat_irqs(i)); #else for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 9d85ab384435..817aa55a1209 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_cpu(smp_processor_id()).irqs[realirq]++; + kstat_irqs_this_cpu(desc)++; if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index dd71e3a021cd..bb6bc721b13d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(irq_to_desc(irq))++; out: raw_local_irq_restore(flags); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index a2173a2a5625..aa069acf61a0 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -532,7 +532,7 @@ static int show_stat(struct seq_file *p, void *v) steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); for (j = 0; j < nr_irqs; j++) { - unsigned int temp = kstat_cpu(i).irqs[j]; + unsigned int temp = kstat_irqs_cpu(j, i); sum += temp; per_irq_sum[j] += temp; } diff --git a/include/linux/irq.h b/include/linux/irq.h index 60c856aaac0f..cbf471aee1ce 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -157,6 +157,11 @@ struct irq_desc { #ifdef CONFIG_HAVE_SPARSE_IRQ struct irq_desc *next; struct timer_rand_state *timer_rand_state; +#endif +#ifdef CONFIG_HAVE_DYN_ARRAY + unsigned int *kstat_irqs; +#else + unsigned int kstat_irqs[NR_CPUS]; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; @@ -190,6 +195,8 @@ extern struct irq_desc *irq_to_desc(unsigned int irq); /* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; #endif +#define kstat_irqs_this_cpu(DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]) /* * Migration helpers for obsolete names, they will go away: diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index fe1f7fe534b4..f10616712de5 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,10 +28,8 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; -#ifdef CONFIG_HAVE_DYN_ARRAY - unsigned int *irqs; -#else - unsigned int irqs[NR_IRQS]; +#ifndef CONFIG_GENERIC_HARDIRQS + unsigned int irqs[NR_IRQS]; #endif }; @@ -43,15 +41,25 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); +#ifndef CONFIG_GENERIC_HARDIRQS +static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + return kstat_cpu(cpu).irqs[irq]; +} +#else +extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); +#endif + /* * Number of interrupts per specific IRQ source, since bootup */ -static inline int kstat_irqs(int irq) +static inline unsigned int kstat_irqs(unsigned int irq) { - int cpu, sum = 0; + unsigned int sum = 0; + int cpu; for_each_possible_cpu(cpu) - sum += kstat_cpu(cpu).irqs[irq]; + sum += kstat_irqs_cpu(irq, cpu); return sum; } diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 76c225cf4b26..2aa3d4b2fce8 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -312,14 +312,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) { struct irqaction *action; irqreturn_t action_ret; - const unsigned int cpu = smp_processor_id(); spin_lock(&desc->lock); if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) @@ -351,7 +350,6 @@ out_unlock: void handle_level_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -361,7 +359,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) if (unlikely(desc->status & IRQ_INPROGRESS)) goto out_unlock; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* * If its disabled or no action available @@ -399,7 +397,6 @@ out_unlock: void handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) { - unsigned int cpu = smp_processor_id(); struct irqaction *action; irqreturn_t action_ret; @@ -409,7 +406,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) goto out; desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* * If its disabled or no action available @@ -458,8 +455,6 @@ out: void handle_edge_irq(unsigned int irq, struct irq_desc *desc) { - const unsigned int cpu = smp_processor_id(); - spin_lock(&desc->lock); desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); @@ -476,7 +471,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) goto out_unlock; } - kstat_cpu(cpu).irqs[irq]++; + kstat_irqs_this_cpu(desc)++; /* Start handling the irq */ desc->chip->ack(irq); @@ -531,7 +526,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc) { irqreturn_t action_ret; - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; if (desc->chip->ack) desc->chip->ack(irq); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 9fc33b3378e6..1f346990f3f8 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -37,7 +37,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc) { print_irq_desc(irq, desc); - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; ack_bad_irq(irq); } @@ -80,17 +80,38 @@ static void init_one_irq_desc(struct irq_desc *desc) #endif } -#ifdef CONFIG_HAVE_SPARSE_IRQ -static int nr_irq_desc = 32; +extern int after_bootmem; +extern void *__alloc_bootmem_nopanic(unsigned long size, + unsigned long align, + unsigned long goal); -static int __init parse_nr_irq_desc(char *arg) +static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) { - if (arg) - nr_irq_desc = simple_strtoul(arg, NULL, 0); - return 0; + unsigned long bytes, total_bytes; + char *ptr; + int i; + unsigned long phys; + + /* Compute how many bytes we need per irq and allocate them */ + bytes = nr * sizeof(unsigned int); + total_bytes = bytes * nr_desc; + if (after_bootmem) + ptr = kzalloc(total_bytes, GFP_ATOMIC); + else + ptr = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + + if (!ptr) + panic(" can not allocate kstat_irqs\n"); + + phys = __pa(ptr); + printk(KERN_DEBUG "kstat_irqs ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + + for (i = 0; i < nr_desc; i++) { + desc[i].kstat_irqs = (unsigned int *)ptr; + ptr += bytes; + } } -early_param("nr_irq_desc", parse_nr_irq_desc); static void __init init_work(void *data) { @@ -100,25 +121,44 @@ static void __init init_work(void *data) desc = *da->name; - for (i = 0; i < *da->nr; i++) + for (i = 0; i < *da->nr; i++) { init_one_irq_desc(&desc[i]); +#ifndef CONFIG_HAVE_SPARSE_IRQ + desc[i].irq = i; +#endif + } +#ifdef CONFIG_HAVE_SPARSE_IRQ for (i = 1; i < *da->nr; i++) desc[i-1].next = &desc[i]; +#endif + + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, *da->nr, nr_cpu_ids); } +#ifdef CONFIG_HAVE_SPARSE_IRQ +static int nr_irq_desc = 32; + +static int __init parse_nr_irq_desc(char *arg) +{ + if (arg) + nr_irq_desc = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("nr_irq_desc", parse_nr_irq_desc); + static struct irq_desc *sparse_irqs; DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); -extern int after_bootmem; -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); struct irq_desc *irq_to_desc(unsigned int irq) { struct irq_desc *desc, *desc_pri; int i; int count = 0; + unsigned long phys; + unsigned long total_bytes; BUG_ON(irq == -1U); @@ -141,38 +181,34 @@ struct irq_desc *irq_to_desc(unsigned int irq) */ printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); + total_bytes = sizeof(struct irq_desc) * nr_irq_desc; if (after_bootmem) - desc = kzalloc(sizeof(struct irq_desc)*nr_irq_desc, GFP_ATOMIC); + desc = kzalloc(total_bytes, GFP_ATOMIC); else - desc = __alloc_bootmem_nopanic(sizeof(struct irq_desc)*nr_irq_desc, PAGE_SIZE, 0); + desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); if (!desc) panic("please boot with nr_irq_desc= %d\n", count * 2); + phys = __pa(desc); + printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + for (i = 0; i < nr_irq_desc; i++) init_one_irq_desc(&desc[i]); for (i = 1; i < nr_irq_desc; i++) desc[i-1].next = &desc[i]; + /* init kstat_irqs, nr_cpu_ids is ready already */ + init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); + desc->irq = irq; desc_pri->next = desc; return desc; } #else -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - int i; - struct irq_desc *desc; - - desc = *da->name; - for (i = 0; i < *da->nr; i++) - init_one_irq_desc(&desc[i]); - -} static struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); @@ -315,7 +351,7 @@ unsigned int __do_IRQ(unsigned int irq) struct irqaction *action; unsigned int status; - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(desc)++; if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; @@ -415,3 +451,10 @@ void early_init_irq_lock_class(void) } #endif +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + return desc->kstat_irqs[cpu]; +} +EXPORT_SYMBOL(kstat_irqs_cpu); + diff --git a/kernel/sched.c b/kernel/sched.c index b9d713781b5b..6f230596bd0c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4048,11 +4048,8 @@ static inline void idle_balance(int cpu, struct rq *rq) #endif DEFINE_PER_CPU(struct kernel_stat, kstat); -EXPORT_PER_CPU_SYMBOL(kstat); -#ifdef CONFIG_HAVE_DYN_ARRAY -DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL); -#endif +EXPORT_PER_CPU_SYMBOL(kstat); /* * Return p->sum_exec_runtime plus any more ns on the sched_clock -- cgit v1.2.3 From c7fb03a475bd80c642c1345d85c7c550f63514b8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:12 -0700 Subject: irq, fs/proc: replace loop with nr_irqs for proc/stat Replace another nr_irqs loop to avoid the allocation of all sparse irq entries - use for_each_irq_desc instead. v2: make sure arch without GENERIC_HARDIRQS works too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- fs/proc/proc_misc.c | 42 ++++++++++++++++++++++++++++-------------- include/linux/interrupt.h | 5 +++++ 2 files changed, 33 insertions(+), 14 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index aa069acf61a0..c3cbabe8b38e 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -501,17 +502,16 @@ static const struct file_operations proc_vmalloc_operations = { static int show_stat(struct seq_file *p, void *v) { - int i; + int i, j; unsigned long jif; cputime64_t user, nice, system, idle, iowait, irq, softirq, steal; cputime64_t guest; u64 sum = 0; struct timespec boottime; - unsigned int *per_irq_sum; - - per_irq_sum = kzalloc(sizeof(unsigned int)*nr_irqs, GFP_KERNEL); - if (!per_irq_sum) - return -ENOMEM; + unsigned int per_irq_sum; +#ifdef CONFIG_GENERIC_HARDIRQS + struct irq_desc *desc; +#endif user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; @@ -520,8 +520,6 @@ static int show_stat(struct seq_file *p, void *v) jif = boottime.tv_sec; for_each_possible_cpu(i) { - int j; - user = cputime64_add(user, kstat_cpu(i).cpustat.user); nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice); system = cputime64_add(system, kstat_cpu(i).cpustat.system); @@ -531,10 +529,12 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - for (j = 0; j < nr_irqs; j++) { - unsigned int temp = kstat_irqs_cpu(j, i); + for_each_irq_desc(j, desc) + { + unsigned int temp; + + temp = kstat_irqs_cpu(j, i); sum += temp; - per_irq_sum[j] += temp; } sum += arch_irq_stat_cpu(i); } @@ -577,8 +577,23 @@ static int show_stat(struct seq_file *p, void *v) } seq_printf(p, "intr %llu", (unsigned long long)sum); - for (i = 0; i < nr_irqs; i++) - seq_printf(p, " %u", per_irq_sum[i]); + /* sum again ? it could be updated? */ + for_each_irq_desc(j, desc) + { + per_irq_sum = 0; + for_each_possible_cpu(i) { + unsigned int temp; + + temp = kstat_irqs_cpu(j, i); + per_irq_sum += temp; + } + +#ifdef CONFIG_HAVE_SPARSE_IRQ + seq_printf(p, " %u:%u", j, per_irq_sum); +#else + seq_printf(p, " %u", per_irq_sum); +#endif + } seq_printf(p, "\nctxt %llu\n" @@ -592,7 +607,6 @@ static int show_stat(struct seq_file *p, void *v) nr_running(), nr_iowait()); - kfree(per_irq_sum); return 0; } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 511803853a5b..d4039a0b23f4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -17,6 +17,11 @@ extern int nr_irqs; +#ifndef CONFIG_GENERIC_HARDIRQS +#define for_each_irq_desc(irq, desc) \ + for (irq = 0; irq < nr_irqs; irq++) +#endif + /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When -- cgit v1.2.3 From 52b17329d6d0a4824b89206803a430915031ff23 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:20 -0700 Subject: x86_64: make /proc/interrupts work with dyn irq_desc loop with irq_desc list Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 31 ++++++++++++++++++++++++------- fs/proc/proc_misc.c | 28 ++++++++++++++++++++++++---- 2 files changed, 48 insertions(+), 11 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 5d5976e0311a..7bd841a9c640 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -70,9 +70,27 @@ static inline void stack_overflow_check(struct pt_regs *regs) int show_interrupts(struct seq_file *p, void *v) { - int i = *(loff_t *) v, j; + int i, j; struct irqaction * action; unsigned long flags; + unsigned int entries; + struct irq_desc *desc; + int tail = 0; + +#ifdef CONFIG_HAVE_SPARSE_IRQ + desc = (struct irq_desc *)v; + entries = -1U; + i = desc->irq; + if (!desc->next) + tail = 1; +#else + entries = nr_irqs - 1; + i = *(loff_t *) v; + if (i == nr_irqs) + tail = 1; + else + desc = irq_to_desc(i); +#endif if (i == 0) { seq_printf(p, " "); @@ -81,12 +99,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < nr_irqs) { + if (i <= entries) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); - - if (!desc) - return 0; spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP @@ -116,7 +130,9 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&desc->lock, flags); - } else if (i == nr_irqs) { + } + + if (tail) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); @@ -155,6 +171,7 @@ skip: seq_printf(p, " Spurious interrupts\n"); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); } + return 0; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index c3cbabe8b38e..72dd739a7f8a 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -645,15 +645,36 @@ static const struct file_operations proc_stat_operations = { */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { +#ifdef CONFIG_HAVE_SPARSE_IRQ + struct irq_desc *desc; + int irq; + int count = *pos; + + for_each_irq_desc(irq, desc) { + if (count-- == 0) + return desc; + } + + return NULL; +#else return (*pos <= nr_irqs) ? pos : NULL; +#endif } + static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { +#ifdef CONFIG_HAVE_SPARSE_IRQ + struct irq_desc *desc; + + desc = ((struct irq_desc *)v)->next; (*pos)++; - if (*pos > nr_irqs) - return NULL; - return pos; + + return desc; +#else + (*pos)++; + return (*pos <= nr_irqs) ? pos : NULL; +#endif } static void int_seq_stop(struct seq_file *f, void *v) @@ -661,7 +682,6 @@ static void int_seq_stop(struct seq_file *f, void *v) /* Nothing to do */ } - static const struct seq_operations int_seq_ops = { .start = int_seq_start, .next = int_seq_next, -- cgit v1.2.3 From 6d50bc26836e16a9589e0b128d527c29e30d722a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:22 -0700 Subject: x86: use 28 bits irq NR for pci msi/msix and ht also print out irq no in /proc/interrups and /proc/stat in hex, so could tell bus/dev/func. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 64 ++++++++++++++++++++++++++++++++++---------- arch/x86/kernel/irq_64.c | 2 +- drivers/pci/htirq.c | 22 +++++++++++++-- fs/proc/proc_misc.c | 2 +- include/linux/irq.h | 1 + 5 files changed, 73 insertions(+), 18 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 8ab7ae01773f..b0d4abc55a11 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -2520,17 +2520,21 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq; - int new; + unsigned int irq; + unsigned int new; unsigned long flags; struct irq_cfg *cfg_new; - irq = -ENOSPC; +#ifndef CONFIG_HAVE_SPARSE_IRQ + irq_want = nr_irqs - 1; +#endif + + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new >= 0; new--) { + for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2545,12 +2549,24 @@ int create_irq(void) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + void destroy_irq(unsigned int irq) { unsigned long flags; @@ -2803,13 +2819,29 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) return 0; } +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - int irq, ret; + unsigned int irq; + int ret; + unsigned int irq_want; - irq = create_irq(); - if (irq < 0) - return irq; + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) @@ -2836,18 +2868,22 @@ error: int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret, sub_handle; + unsigned int irq; + int ret, sub_handle; struct msi_desc *desc; + unsigned int irq_want; + #ifdef CONFIG_INTR_REMAP struct intel_iommu *iommu = 0; int index = 0; #endif + irq_want = build_irq_for_pci_dev(dev) + 0x100; sub_handle = 0; list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq(); - if (irq < 0) - return irq; + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) goto no_ir; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 7bd841a9c640..348a11168c2b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%3d: ",i); + seq_printf(p, "%#x: ",i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 279c940a0039..7c5aef13fcdb 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -82,6 +82,18 @@ void unmask_ht_irq(unsigned int irq) write_ht_irq_msg(irq, &msg); } +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + /** * __ht_create_irq - create an irq and attach it to a device. * @dev: The hypertransport device to find the irq capability on. @@ -97,7 +109,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) u32 data; int max_irq; int pos; - int irq; + unsigned int irq; + unsigned int irq_want; pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); if (!pos) @@ -125,8 +138,13 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; + irq_want= build_irq_for_pci_dev(dev); +#ifdef CONFIG_HAVE_SPARSE_IRQ + irq = create_irq_nr(irq_want + idx); +#else irq = create_irq(); - if (irq < 0) { +#endif + if (irq == 0) { kfree(cfg); return -EBUSY; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 72dd739a7f8a..d68c3592fe4a 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -589,7 +589,7 @@ static int show_stat(struct seq_file *p, void *v) } #ifdef CONFIG_HAVE_SPARSE_IRQ - seq_printf(p, " %u:%u", j, per_irq_sum); + seq_printf(p, " %#x:%u", j, per_irq_sum); #else seq_printf(p, " %u", per_irq_sum); #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index 788d5a35a580..704136138dc7 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -399,6 +399,7 @@ extern void set_irq_noprobe(unsigned int irq); extern void set_irq_probe(unsigned int irq); /* Handle dynamic irq creation and destruction */ +extern unsigned int create_irq_nr(unsigned int irq_want); extern int create_irq(void); extern void destroy_irq(unsigned int irq); -- cgit v1.2.3 From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:16:55 +0200 Subject: genirq: remove sparse irq code This code is not ready, but we need to rip it out instead of rebasing as we would lose the APIC/IO_APIC unification otherwise. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_apic.c | 130 ++----------------------------------------- arch/x86/kernel/irq_32.c | 8 --- arch/x86/kernel/irq_64.c | 8 --- drivers/char/random.c | 31 ----------- drivers/pci/htirq.c | 19 +------ drivers/pci/intr_remapping.c | 75 ------------------------- fs/proc/proc_misc.c | 43 ++------------ include/linux/irq.h | 20 ------- kernel/irq/handle.c | 114 ------------------------------------- 9 files changed, 10 insertions(+), 438 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index f959acbc0db2..683610517d2a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -111,9 +111,6 @@ struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_cfg *next; -#endif struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; @@ -151,15 +148,6 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) static struct irq_cfg *irq_cfgx; -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* - * Protect the irq_cfgx_free freelist: - */ -static DEFINE_SPINLOCK(irq_cfg_lock); - -static struct irq_cfg *irq_cfgx_free; -#endif - static void __init init_work(void *data) { struct dyn_array *da = data; @@ -174,114 +162,7 @@ static void __init init_work(void *data) legacy_count = ARRAY_SIZE(irq_cfg_legacy); for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); - -#ifdef CONFIG_HAVE_SPARSE_IRQ - for (i = 1; i < *da->nr; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = &irq_cfgx[legacy_count]; - irq_cfgx[legacy_count - 1].next = NULL; -#endif -} - -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); - -#define for_each_irq_cfg(irqX, cfg) \ - for (cfg = irq_cfgx, irqX = cfg->irq; cfg; cfg = cfg->next, irqX = cfg ? cfg->irq : -1U) - - -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); - -static struct irq_cfg *irq_cfg(unsigned int irq) -{ - struct irq_cfg *cfg; - - cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg = cfg->next; - } - - return NULL; -} - -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - struct irq_cfg *cfg, *cfg_pri; - unsigned long flags; - int count = 0; - int i; - - cfg_pri = cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg_pri = cfg; - cfg = cfg->next; - count++; - } - - spin_lock_irqsave(&irq_cfg_lock, flags); - if (!irq_cfgx_free) { - unsigned long phys; - unsigned long total_bytes; - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - - total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; - if (after_bootmem) - cfg = kzalloc(total_bytes, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); - - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = cfg; - } - - cfg = irq_cfgx_free; - irq_cfgx_free = irq_cfgx_free->next; - cfg->next = NULL; - if (cfg_pri) - cfg_pri->next = cfg; - else - irq_cfgx = cfg; - cfg->irq = irq; - - spin_unlock_irqrestore(&irq_cfg_lock, flags); - - return cfg; } -#else #define for_each_irq_cfg(irq, cfg) \ for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) @@ -290,17 +171,16 @@ DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq < nr_irqs) - return &irq_cfgx[irq]; + if (irq < nr_irqs) + return &irq_cfgx[irq]; - return NULL; + return NULL; } struct irq_cfg *irq_cfg_alloc(unsigned int irq) { - return irq_cfg(irq); + return irq_cfg(irq); } -#endif /* * This is performance-critical, we want to do it O(1) * @@ -3068,9 +2948,7 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned long flags; struct irq_cfg *cfg_new; -#ifndef CONFIG_HAVE_SPARSE_IRQ irq_want = nr_irqs - 1; -#endif irq = 0; spin_lock_irqsave(&vector_lock, flags); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 001772ffc918..ccf6c1bf7120 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -272,20 +272,12 @@ int show_interrupts(struct seq_file *p, void *v) struct irq_desc *desc = NULL; int tail = 0; -#ifdef CONFIG_HAVE_SPARSE_IRQ - desc = (struct irq_desc *)v; - entries = -1U; - i = desc->irq; - if (!desc->next) - tail = 1; -#else entries = nr_irqs - 1; i = *(loff_t *) v; if (i == nr_irqs) tail = 1; else desc = irq_to_desc(i); -#endif if (i == 0) { seq_printf(p, " "); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index ec2661091283..21f53b911113 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -77,20 +77,12 @@ int show_interrupts(struct seq_file *p, void *v) struct irq_desc *desc = NULL; int tail = 0; -#ifdef CONFIG_HAVE_SPARSE_IRQ - desc = (struct irq_desc *)v; - entries = -1U; - i = desc->irq; - if (!desc->next) - tail = 1; -#else entries = nr_irqs - 1; i = *(loff_t *) v; if (i == nr_irqs) tail = 1; else desc = irq_to_desc(i); -#endif if (i == 0) { seq_printf(p, " "); diff --git a/drivers/char/random.c b/drivers/char/random.c index 60c9c7ee6b2c..9ce80213007b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -558,8 +558,6 @@ struct timer_rand_state { unsigned dont_count_entropy:1; }; -#ifndef CONFIG_HAVE_SPARSE_IRQ - #ifdef CONFIG_HAVE_DYN_ARRAY static struct timer_rand_state **irq_timer_state; DEFINE_DYN_ARRAY(irq_timer_state, sizeof(struct timer_rand_state *), nr_irqs, PAGE_SIZE, NULL); @@ -583,33 +581,6 @@ static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *stat irq_timer_state[irq] = state; } -#else - -static struct timer_rand_state *get_timer_rand_state(unsigned int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - if (!desc) - return NULL; - - return desc->timer_rand_state; -} - -static void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - if (!desc) - return; - - desc->timer_rand_state = state; -} -#endif - static struct timer_rand_state input_timer_state; /* @@ -967,10 +938,8 @@ void rand_initialize_irq(int irq) { struct timer_rand_state *state; -#ifndef CONFIG_HAVE_SPARSE_IRQ if (irq >= nr_irqs) return; -#endif state = get_timer_rand_state(irq); diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c index 9e4929a00832..bf7d6ce9bbb3 100644 --- a/drivers/pci/htirq.c +++ b/drivers/pci/htirq.c @@ -82,18 +82,6 @@ void unmask_ht_irq(unsigned int irq) write_ht_irq_msg(irq, &msg); } -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - /** * __ht_create_irq - create an irq and attach it to a device. * @dev: The hypertransport device to find the irq capability on. @@ -110,7 +98,6 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) int max_irq; int pos; int irq; - unsigned int irq_want; pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ); if (!pos) @@ -138,12 +125,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update) cfg->msg.address_lo = 0xffffffff; cfg->msg.address_hi = 0xffffffff; - irq_want= build_irq_for_pci_dev(dev); -#ifdef CONFIG_HAVE_SPARSE_IRQ - irq = create_irq_nr(irq_want + idx); -#else irq = create_irq(); -#endif + if (irq <= 0) { kfree(cfg); return -EBUSY; diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c index 2dcf973890c4..0f43b265eee6 100644 --- a/drivers/pci/intr_remapping.c +++ b/drivers/pci/intr_remapping.c @@ -19,78 +19,6 @@ struct irq_2_iommu { u8 irte_mask; }; -#ifdef CONFIG_HAVE_SPARSE_IRQ -static struct irq_2_iommu *irq_2_iommuX; -/* fill one page ? */ -static int nr_irq_2_iommu = 0x100; -static int irq_2_iommu_index; -DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irq_2_iommu, PAGE_SIZE, NULL); - -extern void *__alloc_bootmem_nopanic(unsigned long size, - unsigned long align, - unsigned long goal); - -static struct irq_2_iommu *get_one_free_irq_2_iommu(int not_used) -{ - struct irq_2_iommu *iommu; - unsigned long total_bytes; - - if (irq_2_iommu_index >= nr_irq_2_iommu) { - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_iommu %d\n", nr_irq_2_iommu); - - total_bytes = sizeof(struct irq_2_iommu)*nr_irq_2_iommu; - - if (after_bootmem) - iommu = kzalloc(total_bytes, GFP_ATOMIC); - else - iommu = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!iommu) - panic("can not get more irq_2_iommu\n"); - - irq_2_iommuX = iommu; - irq_2_iommu_index = 0; - } - - iommu = &irq_2_iommuX[irq_2_iommu_index]; - irq_2_iommu_index++; - return iommu; -} - -static struct irq_2_iommu *irq_2_iommu(unsigned int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - - BUG_ON(!desc); - - return desc->irq_2_iommu; -} - -static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) -{ - struct irq_desc *desc; - struct irq_2_iommu *irq_iommu; - - /* - * alloc irq desc if not allocated already. - */ - desc = irq_to_desc_alloc(irq); - - irq_iommu = desc->irq_2_iommu; - - if (!irq_iommu) - desc->irq_2_iommu = get_one_free_irq_2_iommu(irq); - - return desc->irq_2_iommu; -} - -#else /* !CONFIG_HAVE_SPARSE_IRQ */ - #ifdef CONFIG_HAVE_DYN_ARRAY static struct irq_2_iommu *irq_2_iommuX; DEFINE_DYN_ARRAY(irq_2_iommuX, sizeof(struct irq_2_iommu), nr_irqs, PAGE_SIZE, NULL); @@ -109,7 +37,6 @@ static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq) { return irq_2_iommu(irq); } -#endif static DEFINE_SPINLOCK(irq_2_ir_lock); @@ -166,11 +93,9 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) if (!count) return -1; -#ifndef CONFIG_HAVE_SPARSE_IRQ /* protect irq_2_iommu_alloc later */ if (irq >= nr_irqs) return -1; -#endif /* * start the IRTE search from index 0. diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index d68c3592fe4a..3f5c7b9d1a70 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -529,13 +529,10 @@ static int show_stat(struct seq_file *p, void *v) softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq); steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); + for_each_irq_desc(j, desc) - { - unsigned int temp; + sum += kstat_irqs_cpu(j, i); - temp = kstat_irqs_cpu(j, i); - sum += temp; - } sum += arch_irq_stat_cpu(i); } sum += arch_irq_stat(); @@ -578,21 +575,13 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ - for_each_irq_desc(j, desc) - { + for_each_irq_desc(j, desc) { per_irq_sum = 0; - for_each_possible_cpu(i) { - unsigned int temp; - temp = kstat_irqs_cpu(j, i); - per_irq_sum += temp; - } + for_each_possible_cpu(i) + per_irq_sum += kstat_irqs_cpu(j, i); -#ifdef CONFIG_HAVE_SPARSE_IRQ - seq_printf(p, " %#x:%u", j, per_irq_sum); -#else seq_printf(p, " %u", per_irq_sum); -#endif } seq_printf(p, @@ -645,36 +634,14 @@ static const struct file_operations proc_stat_operations = { */ static void *int_seq_start(struct seq_file *f, loff_t *pos) { -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_desc *desc; - int irq; - int count = *pos; - - for_each_irq_desc(irq, desc) { - if (count-- == 0) - return desc; - } - - return NULL; -#else return (*pos <= nr_irqs) ? pos : NULL; -#endif } static void *int_seq_next(struct seq_file *f, void *v, loff_t *pos) { -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_desc *desc; - - desc = ((struct irq_desc *)v)->next; - (*pos)++; - - return desc; -#else (*pos)++; return (*pos <= nr_irqs) ? pos : NULL; -#endif } static void int_seq_stop(struct seq_file *f, void *v) diff --git a/include/linux/irq.h b/include/linux/irq.h index 7d1adacaadb4..68e0f3f9df30 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -167,15 +167,8 @@ struct irq_2_iommu; */ struct irq_desc { unsigned int irq; -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_desc *next; - struct timer_rand_state *timer_rand_state; -#endif #ifdef CONFIG_HAVE_DYN_ARRAY unsigned int *kstat_irqs; -#endif -#if defined(CONFIG_INTR_REMAP) && defined(CONFIG_HAVE_SPARSE_IRQ) - struct irq_2_iommu *irq_2_iommu; #endif irq_flow_handler_t handle_irq; struct irq_chip *chip; @@ -205,8 +198,6 @@ struct irq_desc { } ____cacheline_internodealigned_in_smp; -#ifndef CONFIG_HAVE_SPARSE_IRQ - #ifndef CONFIG_HAVE_DYN_ARRAY /* could be removed if we get rid of all irq_desc reference */ extern struct irq_desc irq_desc[NR_IRQS]; @@ -224,17 +215,6 @@ static inline struct irq_desc *irq_to_desc_alloc(unsigned int irq) return irq_to_desc(irq); } -#else - -extern struct irq_desc *irq_to_desc(unsigned int irq); -extern struct irq_desc *irq_to_desc_alloc(unsigned int irq); - -extern struct irq_desc *sparse_irqs; -#define for_each_irq_desc(irqX, desc) \ - for (desc = sparse_irqs, irqX = desc->irq; desc; desc = desc->next, irqX = desc ? desc->irq : -1U) - -#endif - #ifdef CONFIG_HAVE_DYN_ARRAY #define kstat_irqs_this_cpu(DESC) \ ((DESC)->kstat_irqs[smp_processor_id()]) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c19896f895f9..f837133cdfbe 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,15 +111,6 @@ static void init_kstat_irqs(struct irq_desc *desc, int nr_desc, int nr) } } -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* - * Protect the sparse_irqs_free freelist: - */ -static DEFINE_SPINLOCK(sparse_irq_lock); -static struct irq_desc *sparse_irqs_free; -struct irq_desc *sparse_irqs; -#endif - static void __init init_work(void *data) { struct dyn_array *da = data; @@ -130,121 +121,16 @@ static void __init init_work(void *data) for (i = 0; i < *da->nr; i++) { init_one_irq_desc(&desc[i]); -#ifndef CONFIG_HAVE_SPARSE_IRQ desc[i].irq = i; -#endif } /* init kstat_irqs, nr_cpu_ids is ready already */ init_kstat_irqs(desc, *da->nr, nr_cpu_ids); - -#ifdef CONFIG_HAVE_SPARSE_IRQ - for (i = 1; i < *da->nr; i++) - desc[i-1].next = &desc[i]; - - sparse_irqs_free = sparse_irqs; - sparse_irqs = NULL; -#endif -} - -#ifdef CONFIG_HAVE_SPARSE_IRQ -static int nr_irq_desc = 32; - -static int __init parse_nr_irq_desc(char *arg) -{ - if (arg) - nr_irq_desc = simple_strtoul(arg, NULL, 0); - return 0; -} - -early_param("nr_irq_desc", parse_nr_irq_desc); - -DEFINE_DYN_ARRAY(sparse_irqs, sizeof(struct irq_desc), nr_irq_desc, PAGE_SIZE, init_work); - -struct irq_desc *irq_to_desc(unsigned int irq) -{ - struct irq_desc *desc; - - desc = sparse_irqs; - while (desc) { - if (desc->irq == irq) - return desc; - - desc = desc->next; - } - return NULL; } -struct irq_desc *irq_to_desc_alloc(unsigned int irq) -{ - struct irq_desc *desc, *desc_pri; - unsigned long flags; - int count = 0; - int i; - - desc_pri = desc = sparse_irqs; - while (desc) { - if (desc->irq == irq) - return desc; - - desc_pri = desc; - desc = desc->next; - count++; - } - - spin_lock_irqsave(&sparse_irq_lock, flags); - /* - * we run out of pre-allocate ones, allocate more - */ - if (!sparse_irqs_free) { - unsigned long phys; - unsigned long total_bytes; - - printk(KERN_DEBUG "try to get more irq_desc %d\n", nr_irq_desc); - - total_bytes = sizeof(struct irq_desc) * nr_irq_desc; - if (after_bootmem) - desc = kzalloc(total_bytes, GFP_ATOMIC); - else - desc = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!desc) - panic("please boot with nr_irq_desc= %d\n", count * 2); - - phys = __pa(desc); - printk(KERN_DEBUG "irq_desc ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_desc; i++) - init_one_irq_desc(&desc[i]); - - for (i = 1; i < nr_irq_desc; i++) - desc[i-1].next = &desc[i]; - - /* init kstat_irqs, nr_cpu_ids is ready already */ - init_kstat_irqs(desc, nr_irq_desc, nr_cpu_ids); - - sparse_irqs_free = desc; - } - - desc = sparse_irqs_free; - sparse_irqs_free = sparse_irqs_free->next; - desc->next = NULL; - if (desc_pri) - desc_pri->next = desc; - else - sparse_irqs = desc; - desc->irq = irq; - - spin_unlock_irqrestore(&sparse_irq_lock, flags); - - return desc; -} -#else struct irq_desc *irq_desc; DEFINE_DYN_ARRAY(irq_desc, sizeof(struct irq_desc), nr_irqs, PAGE_SIZE, init_work); -#endif - #else struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { -- cgit v1.2.3 From 2be3b52a5785a6a5c5349fbd315f57595f7074be Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 14:50:27 +0200 Subject: proc: fixup irq iterator There is no need for irq_desc here. Even for sparse_irq we can handle this clever in for_each_irq_nr(). Signed-off-by: Thomas Gleixner --- fs/proc/proc_misc.c | 7 ++----- include/linux/irq.h | 3 +++ 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 3f5c7b9d1a70..97b4579134d5 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -509,9 +509,6 @@ static int show_stat(struct seq_file *p, void *v) u64 sum = 0; struct timespec boottime; unsigned int per_irq_sum; -#ifdef CONFIG_GENERIC_HARDIRQS - struct irq_desc *desc; -#endif user = nice = system = idle = iowait = irq = softirq = steal = cputime64_zero; @@ -530,7 +527,7 @@ static int show_stat(struct seq_file *p, void *v) steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal); guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest); - for_each_irq_desc(j, desc) + for_each_irq_nr(j) sum += kstat_irqs_cpu(j, i); sum += arch_irq_stat_cpu(i); @@ -575,7 +572,7 @@ static int show_stat(struct seq_file *p, void *v) seq_printf(p, "intr %llu", (unsigned long long)sum); /* sum again ? it could be updated? */ - for_each_irq_desc(j, desc) { + for_each_irq_nr(j) { per_irq_sum = 0; for_each_possible_cpu(i) diff --git a/include/linux/irq.h b/include/linux/irq.h index 31632aa65d16..0618fb362cb4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -27,6 +27,9 @@ extern int nr_irqs; irq > 0; irq--, desc--) #endif +#define for_each_irq_nr(irq) \ + for (irq = 0; irq < nr_irqs; irq++) + #ifndef CONFIG_S390 #include -- cgit v1.2.3 From f40cbaa5b0a4719489e6e7947351c99a159aca30 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Oct 2008 22:04:23 -0700 Subject: proc: move sysrq-trigger out of fs/proc/ Move it into sysrq.c, along with the rest of the sysrq implementation. Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/sysrq.c | 30 ++++++++++++++++++++++++++++++ fs/proc/proc_misc.c | 26 -------------------------- 2 files changed, 30 insertions(+), 26 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 2aa79ab1930d..dce4cc0e6953 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -534,3 +535,32 @@ int unregister_sysrq_key(int key, struct sysrq_key_op *op_p) return __sysrq_swap_key_ops(key, NULL, op_p); } EXPORT_SYMBOL(unregister_sysrq_key); + +#ifdef CONFIG_PROC_FS +/* + * writing 'C' to /proc/sysrq-trigger is like sysrq-C + */ +static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + if (count) { + char c; + + if (get_user(c, buf)) + return -EFAULT; + __handle_sysrq(c, NULL, 0); + } + return count; +} + +static const struct file_operations proc_sysrq_trigger_operations = { + .write = write_sysrq_trigger, +}; + +static int __init sysrq_init(void) +{ + proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); + return 0; +} +module_init(sysrq_init); +#endif diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index b675a49c1823..59ea42e1ef03 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include @@ -704,28 +703,6 @@ static int execdomains_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } -#ifdef CONFIG_MAGIC_SYSRQ -/* - * writing 'C' to /proc/sysrq-trigger is like sysrq-C - */ -static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (count) { - char c; - - if (get_user(c, buf)) - return -EFAULT; - __handle_sysrq(c, NULL, 0); - } - return count; -} - -static const struct file_operations proc_sysrq_trigger_operations = { - .write = write_sysrq_trigger, -}; -#endif - #ifdef CONFIG_PROC_PAGE_MONITOR #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) @@ -934,7 +911,4 @@ void __init proc_misc_init(void) #ifdef CONFIG_PROC_VMCORE proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); #endif -#ifdef CONFIG_MAGIC_SYSRQ - proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); -#endif } -- cgit v1.2.3 From 4f98a2fee8acdb4ac84545df98cccecfd130f8db Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 18 Oct 2008 20:26:32 -0700 Subject: vmscan: split LRU lists into anon & file sets Split the LRU lists in two, one set for pages that are backed by real file systems ("file") and one for pages that are backed by memory and swap ("anon"). The latter includes tmpfs. The advantage of doing this is that the VM will not have to scan over lots of anonymous pages (which we generally do not want to swap out), just to find the page cache pages that it should evict. This patch has the infrastructure and a basic policy to balance how much we scan the anon lists and how much we scan the file lists. The big policy changes are in separate patches. [lee.schermerhorn@hp.com: collect lru meminfo statistics from correct offset] [kosaki.motohiro@jp.fujitsu.com: prevent incorrect oom under split_lru] [kosaki.motohiro@jp.fujitsu.com: fix pagevec_move_tail() doesn't treat unevictable page] [hugh@veritas.com: memcg swapbacked pages active] [hugh@veritas.com: splitlru: BDI_CAP_SWAP_BACKED] [akpm@linux-foundation.org: fix /proc/vmstat units] [nishimura@mxp.nes.nec.co.jp: memcg: fix handling of shmem migration] [kosaki.motohiro@jp.fujitsu.com: adjust Quicklists field of /proc/meminfo] [kosaki.motohiro@jp.fujitsu.com: fix style issue of get_scan_ratio()] Signed-off-by: Rik van Riel Signed-off-by: Lee Schermerhorn Signed-off-by: KOSAKI Motohiro Signed-off-by: Hugh Dickins Signed-off-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 56 +++--- fs/cifs/file.c | 4 +- fs/nfs/dir.c | 2 +- fs/ntfs/file.c | 4 +- fs/proc/proc_misc.c | 77 ++++---- fs/ramfs/file-nommu.c | 4 +- include/linux/backing-dev.h | 13 ++ include/linux/memcontrol.h | 2 +- include/linux/mm_inline.h | 50 ++++-- include/linux/mmzone.h | 47 ++++- include/linux/pagevec.h | 29 ++- include/linux/swap.h | 20 ++- include/linux/vmstat.h | 10 ++ mm/filemap.c | 22 ++- mm/hugetlb.c | 10 +- mm/memcontrol.c | 88 ++++++---- mm/memory.c | 6 +- mm/page-writeback.c | 8 +- mm/page_alloc.c | 25 ++- mm/readahead.c | 2 +- mm/shmem.c | 2 +- mm/swap.c | 14 +- mm/swap_state.c | 4 +- mm/vmscan.c | 416 +++++++++++++++++++++++--------------------- mm/vmstat.c | 14 +- 25 files changed, 562 insertions(+), 367 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 5116b78c6325..fc7e9bf0cdbc 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -61,34 +61,44 @@ static ssize_t node_read_meminfo(struct sys_device * dev, si_meminfo_node(&i, nid); n = sprintf(buf, "\n" - "Node %d MemTotal: %8lu kB\n" - "Node %d MemFree: %8lu kB\n" - "Node %d MemUsed: %8lu kB\n" - "Node %d Active: %8lu kB\n" - "Node %d Inactive: %8lu kB\n" + "Node %d MemTotal: %8lu kB\n" + "Node %d MemFree: %8lu kB\n" + "Node %d MemUsed: %8lu kB\n" + "Node %d Active: %8lu kB\n" + "Node %d Inactive: %8lu kB\n" + "Node %d Active(anon): %8lu kB\n" + "Node %d Inactive(anon): %8lu kB\n" + "Node %d Active(file): %8lu kB\n" + "Node %d Inactive(file): %8lu kB\n" #ifdef CONFIG_HIGHMEM - "Node %d HighTotal: %8lu kB\n" - "Node %d HighFree: %8lu kB\n" - "Node %d LowTotal: %8lu kB\n" - "Node %d LowFree: %8lu kB\n" + "Node %d HighTotal: %8lu kB\n" + "Node %d HighFree: %8lu kB\n" + "Node %d LowTotal: %8lu kB\n" + "Node %d LowFree: %8lu kB\n" #endif - "Node %d Dirty: %8lu kB\n" - "Node %d Writeback: %8lu kB\n" - "Node %d FilePages: %8lu kB\n" - "Node %d Mapped: %8lu kB\n" - "Node %d AnonPages: %8lu kB\n" - "Node %d PageTables: %8lu kB\n" - "Node %d NFS_Unstable: %8lu kB\n" - "Node %d Bounce: %8lu kB\n" - "Node %d WritebackTmp: %8lu kB\n" - "Node %d Slab: %8lu kB\n" - "Node %d SReclaimable: %8lu kB\n" - "Node %d SUnreclaim: %8lu kB\n", + "Node %d Dirty: %8lu kB\n" + "Node %d Writeback: %8lu kB\n" + "Node %d FilePages: %8lu kB\n" + "Node %d Mapped: %8lu kB\n" + "Node %d AnonPages: %8lu kB\n" + "Node %d PageTables: %8lu kB\n" + "Node %d NFS_Unstable: %8lu kB\n" + "Node %d Bounce: %8lu kB\n" + "Node %d WritebackTmp: %8lu kB\n" + "Node %d Slab: %8lu kB\n" + "Node %d SReclaimable: %8lu kB\n" + "Node %d SUnreclaim: %8lu kB\n", nid, K(i.totalram), nid, K(i.freeram), nid, K(i.totalram - i.freeram), - nid, K(node_page_state(nid, NR_ACTIVE)), - nid, K(node_page_state(nid, NR_INACTIVE)), + nid, K(node_page_state(nid, NR_ACTIVE_ANON) + + node_page_state(nid, NR_ACTIVE_FILE)), + nid, K(node_page_state(nid, NR_INACTIVE_ANON) + + node_page_state(nid, NR_INACTIVE_FILE)), + nid, K(node_page_state(nid, NR_ACTIVE_ANON)), + nid, K(node_page_state(nid, NR_INACTIVE_ANON)), + nid, K(node_page_state(nid, NR_ACTIVE_FILE)), + nid, K(node_page_state(nid, NR_INACTIVE_FILE)), #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c4a8a0605125..62d8bd8f14c0 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1791,7 +1791,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping, SetPageUptodate(page); unlock_page(page); if (!pagevec_add(plru_pvec, page)) - __pagevec_lru_add(plru_pvec); + __pagevec_lru_add_file(plru_pvec); data += PAGE_CACHE_SIZE; } return; @@ -1925,7 +1925,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, bytes_read = 0; } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); /* need to free smb_read_data buf before exit */ if (smb_read_data) { diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2ab70d46ecbc..efdba2e802d7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1517,7 +1517,7 @@ static int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *sym if (!add_to_page_cache(page, dentry->d_inode->i_mapping, 0, GFP_KERNEL)) { pagevec_add(&lru_pvec, page); - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); SetPageUptodate(page); unlock_page(page); } else diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index d020866d4232..3140a4429af1 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -439,7 +439,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, pages[nr] = *cached_page; page_cache_get(*cached_page); if (unlikely(!pagevec_add(lru_pvec, *cached_page))) - __pagevec_lru_add(lru_pvec); + __pagevec_lru_add_file(lru_pvec); *cached_page = NULL; } index++; @@ -2084,7 +2084,7 @@ err_out: OSYNC_METADATA|OSYNC_DATA); } } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, (long)status); diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 59ea42e1ef03..b8edb2860557 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -136,6 +136,8 @@ static int meminfo_read_proc(char *page, char **start, off_t off, unsigned long allowed; struct vmalloc_info vmi; long cached; + unsigned long pages[NR_LRU_LISTS]; + int lru; /* * display in kilobytes. @@ -154,51 +156,62 @@ static int meminfo_read_proc(char *page, char **start, off_t off, get_vmalloc_info(&vmi); + for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) + pages[lru] = global_page_state(NR_LRU_BASE + lru); + /* * Tagged format, for easy grepping and expansion. */ len = sprintf(page, - "MemTotal: %8lu kB\n" - "MemFree: %8lu kB\n" - "Buffers: %8lu kB\n" - "Cached: %8lu kB\n" - "SwapCached: %8lu kB\n" - "Active: %8lu kB\n" - "Inactive: %8lu kB\n" + "MemTotal: %8lu kB\n" + "MemFree: %8lu kB\n" + "Buffers: %8lu kB\n" + "Cached: %8lu kB\n" + "SwapCached: %8lu kB\n" + "Active: %8lu kB\n" + "Inactive: %8lu kB\n" + "Active(anon): %8lu kB\n" + "Inactive(anon): %8lu kB\n" + "Active(file): %8lu kB\n" + "Inactive(file): %8lu kB\n" #ifdef CONFIG_HIGHMEM - "HighTotal: %8lu kB\n" - "HighFree: %8lu kB\n" - "LowTotal: %8lu kB\n" - "LowFree: %8lu kB\n" + "HighTotal: %8lu kB\n" + "HighFree: %8lu kB\n" + "LowTotal: %8lu kB\n" + "LowFree: %8lu kB\n" #endif - "SwapTotal: %8lu kB\n" - "SwapFree: %8lu kB\n" - "Dirty: %8lu kB\n" - "Writeback: %8lu kB\n" - "AnonPages: %8lu kB\n" - "Mapped: %8lu kB\n" - "Slab: %8lu kB\n" - "SReclaimable: %8lu kB\n" - "SUnreclaim: %8lu kB\n" - "PageTables: %8lu kB\n" + "SwapTotal: %8lu kB\n" + "SwapFree: %8lu kB\n" + "Dirty: %8lu kB\n" + "Writeback: %8lu kB\n" + "AnonPages: %8lu kB\n" + "Mapped: %8lu kB\n" + "Slab: %8lu kB\n" + "SReclaimable: %8lu kB\n" + "SUnreclaim: %8lu kB\n" + "PageTables: %8lu kB\n" #ifdef CONFIG_QUICKLIST - "Quicklists: %8lu kB\n" + "Quicklists: %8lu kB\n" #endif - "NFS_Unstable: %8lu kB\n" - "Bounce: %8lu kB\n" - "WritebackTmp: %8lu kB\n" - "CommitLimit: %8lu kB\n" - "Committed_AS: %8lu kB\n" - "VmallocTotal: %8lu kB\n" - "VmallocUsed: %8lu kB\n" - "VmallocChunk: %8lu kB\n", + "NFS_Unstable: %8lu kB\n" + "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" + "CommitLimit: %8lu kB\n" + "Committed_AS: %8lu kB\n" + "VmallocTotal: %8lu kB\n" + "VmallocUsed: %8lu kB\n" + "VmallocChunk: %8lu kB\n", K(i.totalram), K(i.freeram), K(i.bufferram), K(cached), K(total_swapcache_pages), - K(global_page_state(NR_ACTIVE)), - K(global_page_state(NR_INACTIVE)), + K(pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]), + K(pages[LRU_ACTIVE_ANON]), + K(pages[LRU_INACTIVE_ANON]), + K(pages[LRU_ACTIVE_FILE]), + K(pages[LRU_INACTIVE_FILE]), #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 5145cb9125af..76acdbc34611 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -112,12 +112,12 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) goto add_error; if (!pagevec_add(&lru_pvec, page)) - __pagevec_lru_add(&lru_pvec); + __pagevec_lru_add_file(&lru_pvec); unlock_page(page); } - pagevec_lru_add(&lru_pvec); + pagevec_lru_add_file(&lru_pvec); return 0; fsize_exceeded: diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0a24d5550eb3..bee52abb8a4d 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -175,6 +175,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_READ_MAP: Can be mapped for reading * BDI_CAP_WRITE_MAP: Can be mapped for writing * BDI_CAP_EXEC_MAP: Can be mapped for execution + * + * BDI_CAP_SWAP_BACKED: Count shmem/tmpfs objects as swap-backed. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -184,6 +186,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_WRITE_MAP 0x00000020 #define BDI_CAP_EXEC_MAP 0x00000040 #define BDI_CAP_NO_ACCT_WB 0x00000080 +#define BDI_CAP_SWAP_BACKED 0x00000100 #define BDI_CAP_VMFLAGS \ (BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP) @@ -248,6 +251,11 @@ static inline bool bdi_cap_account_writeback(struct backing_dev_info *bdi) BDI_CAP_NO_WRITEBACK)); } +static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_SWAP_BACKED; +} + static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); @@ -258,4 +266,9 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping) return bdi_cap_account_dirty(mapping->backing_dev_info); } +static inline bool mapping_cap_swap_backed(struct address_space *mapping) +{ + return bdi_cap_swap_backed(mapping->backing_dev_info); +} + #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index a6ac0d491fe6..8d8f05c1515a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -44,7 +44,7 @@ extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 96e970485b6c..2eb599465d56 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -5,7 +5,7 @@ * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test * - * Returns !0 if @page is page cache page backed by a regular filesystem, + * Returns LRU_FILE if @page is page cache page backed by a regular filesystem, * or 0 if @page is anonymous, tmpfs or otherwise ram or swap backed. * Used by functions that manipulate the LRU lists, to sort a page * onto the right LRU list. @@ -20,7 +20,7 @@ static inline int page_is_file_cache(struct page *page) return 0; /* The page is page cache backed by a normal filesystem. */ - return 1; + return LRU_FILE; } static inline void @@ -38,39 +38,64 @@ del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) } static inline void -add_page_to_active_list(struct zone *zone, struct page *page) +add_page_to_inactive_anon_list(struct zone *zone, struct page *page) { - add_page_to_lru_list(zone, page, LRU_ACTIVE); + add_page_to_lru_list(zone, page, LRU_INACTIVE_ANON); } static inline void -add_page_to_inactive_list(struct zone *zone, struct page *page) +add_page_to_active_anon_list(struct zone *zone, struct page *page) { - add_page_to_lru_list(zone, page, LRU_INACTIVE); + add_page_to_lru_list(zone, page, LRU_ACTIVE_ANON); } static inline void -del_page_from_active_list(struct zone *zone, struct page *page) +add_page_to_inactive_file_list(struct zone *zone, struct page *page) { - del_page_from_lru_list(zone, page, LRU_ACTIVE); + add_page_to_lru_list(zone, page, LRU_INACTIVE_FILE); } static inline void -del_page_from_inactive_list(struct zone *zone, struct page *page) +add_page_to_active_file_list(struct zone *zone, struct page *page) { - del_page_from_lru_list(zone, page, LRU_INACTIVE); + add_page_to_lru_list(zone, page, LRU_ACTIVE_FILE); +} + +static inline void +del_page_from_inactive_anon_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_INACTIVE_ANON); +} + +static inline void +del_page_from_active_anon_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_ACTIVE_ANON); +} + +static inline void +del_page_from_inactive_file_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE); +} + +static inline void +del_page_from_active_file_list(struct zone *zone, struct page *page) +{ + del_page_from_lru_list(zone, page, LRU_INACTIVE_FILE); } static inline void del_page_from_lru(struct zone *zone, struct page *page) { - enum lru_list l = LRU_INACTIVE; + enum lru_list l = LRU_BASE; list_del(&page->lru); if (PageActive(page)) { __ClearPageActive(page); - l = LRU_ACTIVE; + l += LRU_ACTIVE; } + l += page_is_file_cache(page); __dec_zone_state(zone, NR_LRU_BASE + l); } @@ -87,6 +112,7 @@ static inline enum lru_list page_lru(struct page *page) if (PageActive(page)) lru += LRU_ACTIVE; + lru += page_is_file_cache(page); return lru; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 156e18f3919b..59a4c8fd6ebd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -82,21 +82,23 @@ enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_LRU_BASE, - NR_INACTIVE = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ - NR_ACTIVE, /* " " " " " */ + NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ + NR_ACTIVE_ANON, /* " " " " " */ + NR_INACTIVE_FILE, /* " " " " " */ + NR_ACTIVE_FILE, /* " " " " " */ NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, - /* Second 128 byte cacheline */ NR_SLAB_RECLAIMABLE, NR_SLAB_UNRECLAIMABLE, NR_PAGETABLE, /* used for pagetables */ NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_BOUNCE, NR_VMSCAN_WRITE, + /* Second 128 byte cacheline */ NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ @@ -108,17 +110,36 @@ enum zone_stat_item { #endif NR_VM_ZONE_STAT_ITEMS }; +/* + * We do arithmetic on the LRU lists in various places in the code, + * so it is important to keep the active lists LRU_ACTIVE higher in + * the array than the corresponding inactive lists, and to keep + * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. + * + * This has to be kept in sync with the statistics in zone_stat_item + * above and the descriptions in vmstat_text in mm/vmstat.c + */ +#define LRU_BASE 0 +#define LRU_ACTIVE 1 +#define LRU_FILE 2 + enum lru_list { - LRU_BASE, - LRU_INACTIVE=LRU_BASE, /* must match order of NR_[IN]ACTIVE */ - LRU_ACTIVE, /* " " " " " */ + LRU_INACTIVE_ANON = LRU_BASE, + LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, + LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, + LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, NR_LRU_LISTS }; #define for_each_lru(l) for (l = 0; l < NR_LRU_LISTS; l++) +static inline int is_file_lru(enum lru_list l) +{ + return (l == LRU_INACTIVE_FILE || l == LRU_ACTIVE_FILE); +} + static inline int is_active_lru(enum lru_list l) { - return (l == LRU_ACTIVE); + return (l == LRU_ACTIVE_ANON || l == LRU_ACTIVE_FILE); } struct per_cpu_pages { @@ -269,6 +290,18 @@ struct zone { struct list_head list; unsigned long nr_scan; } lru[NR_LRU_LISTS]; + + /* + * The pageout code in vmscan.c keeps track of how many of the + * mem/swap backed and file backed pages are refeferenced. + * The higher the rotated/scanned ratio, the more valuable + * that cache is. + * + * The anon LRU stats live in [0], file LRU stats in [1] + */ + unsigned long recent_rotated[2]; + unsigned long recent_scanned[2]; + unsigned long pages_scanned; /* since last reclaim */ unsigned long flags; /* zone flags, see below */ diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index fea3a982ee55..5fc96a4e760f 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -81,20 +81,37 @@ static inline void pagevec_free(struct pagevec *pvec) __pagevec_free(pvec); } -static inline void __pagevec_lru_add(struct pagevec *pvec) +static inline void __pagevec_lru_add_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_INACTIVE); + ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); } -static inline void __pagevec_lru_add_active(struct pagevec *pvec) +static inline void __pagevec_lru_add_active_anon(struct pagevec *pvec) { - ____pagevec_lru_add(pvec, LRU_ACTIVE); + ____pagevec_lru_add(pvec, LRU_ACTIVE_ANON); } -static inline void pagevec_lru_add(struct pagevec *pvec) +static inline void __pagevec_lru_add_file(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_INACTIVE_FILE); +} + +static inline void __pagevec_lru_add_active_file(struct pagevec *pvec) +{ + ____pagevec_lru_add(pvec, LRU_ACTIVE_FILE); +} + + +static inline void pagevec_lru_add_file(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_lru_add_file(pvec); +} + +static inline void pagevec_lru_add_anon(struct pagevec *pvec) { if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); + __pagevec_lru_add_anon(pvec); } #endif /* _LINUX_PAGEVEC_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 833be56ad835..7d09d79997a4 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -184,14 +184,24 @@ extern void swap_setup(void); * lru_cache_add: add a page to the page lists * @page: the page to add */ -static inline void lru_cache_add(struct page *page) +static inline void lru_cache_add_anon(struct page *page) { - __lru_cache_add(page, LRU_INACTIVE); + __lru_cache_add(page, LRU_INACTIVE_ANON); } -static inline void lru_cache_add_active(struct page *page) +static inline void lru_cache_add_active_anon(struct page *page) { - __lru_cache_add(page, LRU_ACTIVE); + __lru_cache_add(page, LRU_ACTIVE_ANON); +} + +static inline void lru_cache_add_file(struct page *page) +{ + __lru_cache_add(page, LRU_INACTIVE_FILE); +} + +static inline void lru_cache_add_active_file(struct page *page) +{ + __lru_cache_add(page, LRU_ACTIVE_FILE); } /* linux/mm/vmscan.c */ @@ -199,7 +209,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask); extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem, gfp_t gfp_mask); -extern int __isolate_lru_page(struct page *page, int mode); +extern int __isolate_lru_page(struct page *page, int mode, int file); extern unsigned long shrink_all_memory(unsigned long nr_pages); extern int vm_swappiness; extern int remove_mapping(struct address_space *mapping, struct page *page); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 58334d439516..ff5179f2b153 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -159,6 +159,16 @@ static inline unsigned long zone_page_state(struct zone *zone, return x; } +extern unsigned long global_lru_pages(void); + +static inline unsigned long zone_lru_pages(struct zone *zone) +{ + return (zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_FILE)); +} + #ifdef CONFIG_NUMA /* * Determine the per node value of a stat item. This function diff --git a/mm/filemap.c b/mm/filemap.c index 903bf316912a..a1ddd2557af2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include #include /* for BUG_ON(!in_atomic()) only */ #include +#include /* for page_is_file_cache() */ #include "internal.h" /* @@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add(page); + int ret; + + /* + * Splice_read and readahead add shmem/tmpfs pages into the page cache + * before shmem_readpage has a chance to mark them as SwapBacked: they + * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * (called in add_to_page_cache) needs to know where they're going too. + */ + if (mapping_cap_swap_backed(mapping)) + SetPageSwapBacked(page); + + ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) { + if (page_is_file_cache(page)) + lru_cache_add_file(page); + else + lru_cache_add_active_anon(page); + } return ret; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 38633864a93e..2fc7fddd9b1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1459,11 +1459,11 @@ int hugetlb_report_meminfo(char *buf) { struct hstate *h = &default_hstate; return sprintf(buf, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %5lu kB\n", + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", h->nr_huge_pages, h->free_huge_pages, h->resv_huge_pages, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c0cbd7790c51..27e9e75f4eab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -162,6 +162,7 @@ struct page_cgroup { }; #define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ #define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ +#define PAGE_CGROUP_FLAG_FILE (0x4) /* page is file system backed */ static int page_cgroup_nid(struct page_cgroup *pc) { @@ -177,6 +178,7 @@ enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ + MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ }; /* @@ -288,8 +290,12 @@ static void unlock_page_cgroup(struct page *page) static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; - int lru = !!from; + int lru = LRU_BASE; + + if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE) + lru += LRU_ACTIVE; + if (pc->flags & PAGE_CGROUP_FLAG_FILE) + lru += LRU_FILE; MEM_CGROUP_ZSTAT(mz, lru) -= 1; @@ -300,10 +306,12 @@ static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int lru = LRU_INACTIVE; + int lru = LRU_BASE; if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE) lru += LRU_ACTIVE; + if (pc->flags & PAGE_CGROUP_FLAG_FILE) + lru += LRU_FILE; MEM_CGROUP_ZSTAT(mz, lru) += 1; list_add(&pc->lru, &mz->lists[lru]); @@ -314,10 +322,9 @@ static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) { struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); - int lru = LRU_INACTIVE; - - if (pc->flags & PAGE_CGROUP_FLAG_ACTIVE) - lru += LRU_ACTIVE; + int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; + int file = pc->flags & PAGE_CGROUP_FLAG_FILE; + int lru = LRU_FILE * !!file + !!from; MEM_CGROUP_ZSTAT(mz, lru) -= 1; @@ -326,7 +333,7 @@ static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) else pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; - lru = !!active; + lru = LRU_FILE * !!file + !!active; MEM_CGROUP_ZSTAT(mz, lru) += 1; list_move(&pc->lru, &mz->lists[lru]); } @@ -390,21 +397,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) return (int)((rss * 100L) / total); } -/* - * This function is called from vmscan.c. In page reclaiming loop. balance - * between active and inactive list is calculated. For memory controller - * page reclaiming, we should use using mem_cgroup's imbalance rather than - * zone's global lru imbalance. - */ -long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - unsigned long active, inactive; - /* active and inactive are the number of pages. 'long' is ok.*/ - active = mem_cgroup_get_all_zonestat(mem, LRU_ACTIVE); - inactive = mem_cgroup_get_all_zonestat(mem, LRU_INACTIVE); - return (long) (active / (inactive + 1)); -} - /* * prev_priority control...this will be used in memory reclaim path. */ @@ -450,7 +442,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { unsigned long nr_taken = 0; struct page *page; @@ -461,7 +453,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; - int lru = !!active; + int lru = LRU_FILE * !!file + !!active; BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); @@ -477,6 +469,9 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, if (unlikely(!PageLRU(page))) continue; + /* + * TODO: play better with lumpy reclaim, grabbing anything. + */ if (PageActive(page) && !active) { __mem_cgroup_move_lists(pc, true); continue; @@ -489,7 +484,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, scan++; list_move(&pc->lru, &pc_list); - if (__isolate_lru_page(page, mode) == 0) { + if (__isolate_lru_page(page, mode, file) == 0) { list_move(&page->lru, dst); nr_taken++; } @@ -575,10 +570,16 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, * If a page is accounted as a page cache, insert to inactive list. * If anon, insert to active list. */ - if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) + if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) { pc->flags = PAGE_CGROUP_FLAG_CACHE; - else + if (page_is_file_cache(page)) + pc->flags |= PAGE_CGROUP_FLAG_FILE; + else + pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; + } else if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) pc->flags = PAGE_CGROUP_FLAG_ACTIVE; + else /* MEM_CGROUP_CHARGE_TYPE_SHMEM */ + pc->flags = PAGE_CGROUP_FLAG_CACHE | PAGE_CGROUP_FLAG_ACTIVE; lock_page_cgroup(page); if (unlikely(page_get_page_cgroup(page))) { @@ -737,8 +738,12 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) if (pc) { mem = pc->mem_cgroup; css_get(&mem->css); - if (pc->flags & PAGE_CGROUP_FLAG_CACHE) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + if (pc->flags & PAGE_CGROUP_FLAG_CACHE) { + if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + } } unlock_page_cgroup(page); if (mem) { @@ -982,14 +987,21 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, } /* showing # of active pages */ { - unsigned long active, inactive; - - inactive = mem_cgroup_get_all_zonestat(mem_cont, - LRU_INACTIVE); - active = mem_cgroup_get_all_zonestat(mem_cont, - LRU_ACTIVE); - cb->fill(cb, "active", (active) * PAGE_SIZE); - cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); + unsigned long active_anon, inactive_anon; + unsigned long active_file, inactive_file; + + inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_ANON); + active_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_ANON); + inactive_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_FILE); + active_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_FILE); + cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); + cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); + cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); + cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); } return 0; } diff --git a/mm/memory.c b/mm/memory.c index 7512933dcc10..71cdefd1ef14 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1889,7 +1889,7 @@ gotten: set_pte_at(mm, address, page_table, entry); update_mmu_cache(vma, address, entry); SetPageSwapBacked(new_page); - lru_cache_add_active(new_page); + lru_cache_add_active_anon(new_page); page_add_new_anon_rmap(new_page, vma, address); if (old_page) { @@ -2384,7 +2384,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto release; inc_mm_counter(mm, anon_rss); SetPageSwapBacked(page); - lru_cache_add_active(page); + lru_cache_add_active_anon(page); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2526,7 +2526,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (anon) { inc_mm_counter(mm, anon_rss); SetPageSwapBacked(page); - lru_cache_add_active(page); + lru_cache_add_active_anon(page); page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b40f6d5f8fe9..2970e35fd03f 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) - + zone_page_state(z, NR_INACTIVE) - + zone_page_state(z, NR_ACTIVE); + x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) - + global_page_state(NR_INACTIVE) - + global_page_state(NR_ACTIVE); + x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2099904d6cc4..740a16a32c22 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1864,10 +1864,13 @@ void show_free_areas(void) } } - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" + printk("Active_anon:%lu active_file:%lu inactive_anon%lu\n" + " inactive_file:%lu dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", - global_page_state(NR_ACTIVE), - global_page_state(NR_INACTIVE), + global_page_state(NR_ACTIVE_ANON), + global_page_state(NR_ACTIVE_FILE), + global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_INACTIVE_FILE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1890,8 +1893,10 @@ void show_free_areas(void) " min:%lukB" " low:%lukB" " high:%lukB" - " active:%lukB" - " inactive:%lukB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -1901,8 +1906,10 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone_page_state(zone, NR_ACTIVE)), - K(zone_page_state(zone, NR_INACTIVE)), + K(zone_page_state(zone, NR_ACTIVE_ANON)), + K(zone_page_state(zone, NR_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ACTIVE_FILE)), + K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") @@ -3472,6 +3479,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, INIT_LIST_HEAD(&zone->lru[l].list); zone->lru[l].nr_scan = 0; } + zone->recent_rotated[0] = 0; + zone->recent_rotated[1] = 0; + zone->recent_scanned[0] = 0; + zone->recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) diff --git a/mm/readahead.c b/mm/readahead.c index 6cbd9a72fde2..bec83c15a78f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } diff --git a/mm/shmem.c b/mm/shmem.c index fd421ed703ed..fc2ccf79a776 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -199,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = default_unplug_io_fn, }; diff --git a/mm/swap.c b/mm/swap.c index 88a394872677..0b1974a08974 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -116,7 +116,8 @@ static void pagevec_move_tail(struct pagevec *pvec) spin_lock(&zone->lru_lock); } if (PageLRU(page) && !PageActive(page)) { - list_move_tail(&page->lru, &zone->lru[LRU_INACTIVE].list); + int lru = page_is_file_cache(page); + list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } } @@ -157,11 +158,18 @@ void activate_page(struct page *page) spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(zone, page); + int file = page_is_file_cache(page); + int lru = LRU_BASE + file; + del_page_from_lru_list(zone, page, lru); + SetPageActive(page); - add_page_to_active_list(zone, page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); mem_cgroup_move_lists(page, true); + + zone->recent_rotated[!!file]++; + zone->recent_scanned[!!file]++; } spin_unlock_irq(&zone->lru_lock); } diff --git a/mm/swap_state.c b/mm/swap_state.c index 7a3ece0b5a3b..ea62084ed402 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; @@ -310,7 +310,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * Initiate read into locked page and return. */ - lru_cache_add_active(new_page); + lru_cache_add_active_anon(new_page); swap_readpage(NULL, new_page); return new_page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index e656035d3406..d10d2f9a33f3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -78,7 +78,7 @@ struct scan_control { unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -680,7 +680,7 @@ keep: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, int mode, int file) { int ret = -EINVAL; @@ -696,6 +696,9 @@ int __isolate_lru_page(struct page *page, int mode) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; + if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + return ret; + ret = -EBUSY; if (likely(get_page_unless_zero(page))) { /* @@ -726,12 +729,13 @@ int __isolate_lru_page(struct page *page, int mode) * @scanned: The number of pages that were scanned. * @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode) + unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; unsigned long scan; @@ -748,7 +752,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON(!PageLRU(page)); - switch (__isolate_lru_page(page, mode)) { + switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); nr_taken++; @@ -791,10 +795,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; cursor_page = pfn_to_page(pfn); + /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode)) { + switch (__isolate_lru_page(cursor_page, mode, file)) { case 0: list_move(&cursor_page->lru, dst); nr_taken++; @@ -819,30 +824,37 @@ static unsigned long isolate_pages_global(unsigned long nr, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { + int lru = LRU_BASE; if (active) - return isolate_lru_pages(nr, &z->lru[LRU_ACTIVE].list, dst, - scanned, order, mode); - else - return isolate_lru_pages(nr, &z->lru[LRU_INACTIVE].list, dst, - scanned, order, mode); + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, + mode, !!file); } /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. */ -static unsigned long clear_active_flags(struct list_head *page_list) +static unsigned long clear_active_flags(struct list_head *page_list, + unsigned int *count) { int nr_active = 0; + int lru; struct page *page; - list_for_each_entry(page, page_list, lru) + list_for_each_entry(page, page_list, lru) { + lru = page_is_file_cache(page); if (PageActive(page)) { + lru += LRU_ACTIVE; ClearPageActive(page); nr_active++; } + count[lru]++; + } return nr_active; } @@ -880,12 +892,12 @@ int isolate_lru_page(struct page *page) spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && get_page_unless_zero(page)) { + int lru = LRU_BASE; ret = 0; ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); + + lru += page_is_file_cache(page) + !!PageActive(page); + del_page_from_lru_list(zone, page, lru); } spin_unlock_irq(&zone->lru_lock); } @@ -897,7 +909,7 @@ int isolate_lru_page(struct page *page) * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc) + struct zone *zone, struct scan_control *sc, int file) { LIST_HEAD(page_list); struct pagevec pvec; @@ -914,20 +926,32 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scan; unsigned long nr_freed; unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + int mode = (sc->order > PAGE_ALLOC_COSTLY_ORDER) ? + ISOLATE_BOTH : ISOLATE_INACTIVE; nr_taken = sc->isolate_pages(sc->swap_cluster_max, - &page_list, &nr_scan, sc->order, - (sc->order > PAGE_ALLOC_COSTLY_ORDER)? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, 0); - nr_active = clear_active_flags(&page_list); + &page_list, &nr_scan, sc->order, mode, + zone, sc->mem_cgroup, 0, file); + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); - __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); - __mod_zone_page_state(zone, NR_INACTIVE, - -(nr_taken - nr_active)); - if (scan_global_lru(sc)) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + if (scan_global_lru(sc)) { zone->pages_scanned += nr_scan; + zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + } spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -947,7 +971,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * The attempt at page out may have made some * of the pages active, mark them inactive again. */ - nr_active = clear_active_flags(&page_list); + nr_active = clear_active_flags(&page_list, count); count_vm_events(PGDEACTIVATE, nr_active); nr_freed += shrink_page_list(&page_list, sc, @@ -977,6 +1001,10 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); list_del(&page->lru); add_page_to_lru_list(zone, page, page_lru(page)); + if (PageActive(page) && scan_global_lru(sc)) { + int file = !!page_is_file_cache(page); + zone->recent_rotated[file]++; + } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -1007,115 +1035,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) static inline int zone_is_near_oom(struct zone *zone) { - return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE))*3; -} - -/* - * Determine we should try to reclaim mapped pages. - * This is called only when sc->mem_cgroup is NULL. - */ -static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, - int priority) -{ - long mapped_ratio; - long distress; - long swap_tendency; - long imbalance; - int reclaim_mapped = 0; - int prev_priority; - - if (scan_global_lru(sc) && zone_is_near_oom(zone)) - return 1; - /* - * `distress' is a measure of how much trouble we're having - * reclaiming pages. 0 -> no problems. 100 -> great trouble. - */ - if (scan_global_lru(sc)) - prev_priority = zone->prev_priority; - else - prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); - - distress = 100 >> min(prev_priority, priority); - - /* - * The point of this algorithm is to decide when to start - * reclaiming mapped memory instead of just pagecache. Work out - * how much memory - * is mapped. - */ - if (scan_global_lru(sc)) - mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; - else - mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); - - /* - * Now decide how much we really want to unmap some pages. The - * mapped ratio is downgraded - just because there's a lot of - * mapped memory doesn't necessarily mean that page reclaim - * isn't succeeding. - * - * The distress ratio is important - we don't want to start - * going oom. - * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; - - /* - * If there's huge imbalance between active and inactive - * (think active 100 times larger than inactive) we should - * become more permissive, or the system will take too much - * cpu before it start swapping during memory pressure. - * Distress is about avoiding early-oom, this is about - * making swappiness graceful despite setting it to low - * values. - * - * Avoid div by zero with nr_inactive+1, and max resulting - * value is vm_total_pages. - */ - if (scan_global_lru(sc)) { - imbalance = zone_page_state(zone, NR_ACTIVE); - imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; - } else - imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); - - /* - * Reduce the effect of imbalance if swappiness is low, - * this means for a swappiness very low, the imbalance - * must be much higher than 100 for this logic to make - * the difference. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= (vm_swappiness + 1); - imbalance /= 100; - - /* - * If not much of the ram is mapped, makes the imbalance - * less relevant, it's high priority we refill the inactive - * list with mapped pages only in presence of high ratio of - * mapped pages. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= mapped_ratio; - imbalance /= 100; - - /* apply imbalance feedback to swap_tendency */ - swap_tendency += imbalance; - - /* - * Now use this metric to decide whether to start moving mapped - * memory onto the inactive list. - */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - - return reclaim_mapped; + return zone->pages_scanned >= (zone_lru_pages(zone) * 3); } /* @@ -1138,7 +1058,7 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority) + struct scan_control *sc, int priority, int file) { unsigned long pgmoved; int pgdeactivate = 0; @@ -1148,43 +1068,42 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_inactive); struct page *page; struct pagevec pvec; - int reclaim_mapped = 0; - - if (sc->may_swap) - reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + enum lru_list lru; lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1); + sc->mem_cgroup, 1, file); /* * zone->pages_scanned is used for detect zone's oom * mem_cgroup remembers nr_scan by itself. */ - if (scan_global_lru(sc)) + if (scan_global_lru(sc)) { zone->pages_scanned += pgscanned; + zone->recent_scanned[!!file] += pgmoved; + } - __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); + if (file) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + else + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->mem_cgroup)) { - list_add(&page->lru, &l_active); - continue; - } - } list_add(&page->lru, &l_inactive); } + /* + * Now put the pages back on the appropriate [file or anon] inactive + * and active lists. + */ pagevec_init(&pvec, 1); pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); @@ -1194,11 +1113,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, VM_BUG_ON(!PageActive(page)); ClearPageActive(page); - list_move(&page->lru, &zone->lru[LRU_INACTIVE].list); + list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_move_lists(page, false); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -1208,7 +1127,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_lock_irq(&zone->lru_lock); } } - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); @@ -1217,6 +1136,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } pgmoved = 0; + lru = LRU_ACTIVE + file * LRU_FILE; while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); @@ -1224,11 +1144,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, SetPageLRU(page); VM_BUG_ON(!PageActive(page)); - list_move(&page->lru, &zone->lru[LRU_ACTIVE].list); + list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_move_lists(page, true); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgmoved = 0; spin_unlock_irq(&zone->lru_lock); if (vm_swap_full()) @@ -1237,7 +1157,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_lock_irq(&zone->lru_lock); } } - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + zone->recent_rotated[!!file] += pgmoved; __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); @@ -1248,16 +1169,103 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, pagevec_release(&pvec); } -static unsigned long shrink_list(enum lru_list l, unsigned long nr_to_scan, +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { - if (l == LRU_ACTIVE) { - shrink_active_list(nr_to_scan, zone, sc, priority); + int file = is_file_lru(lru); + + if (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - return shrink_inactive_list(nr_to_scan, zone, sc); + return shrink_inactive_list(nr_to_scan, zone, sc, file); +} + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined + * by looking at the fraction of the pages scanned we did rotate back + * onto the active list instead of evict. + * + * percent[0] specifies how much pressure to put on ram/swap backed + * memory, while percent[1] determines pressure on the file LRUs. + */ +static void get_scan_ratio(struct zone *zone, struct scan_control *sc, + unsigned long *percent) +{ + unsigned long anon, file, free; + unsigned long anon_prio, file_prio; + unsigned long ap, fp; + + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + + /* If we have no swap space, do not bother scanning anon pages. */ + if (nr_swap_pages <= 0) { + percent[0] = 0; + percent[1] = 100; + return; + } + + /* If we have very few page cache pages, force-scan anon pages. */ + if (unlikely(file + free <= zone->pages_high)) { + percent[0] = 100; + percent[1] = 0; + return; + } + + /* + * OK, so we have swap space and a fair amount of page cache + * pages. We use the recently rotated / recently scanned + * ratios to determine how valuable each cache is. + * + * Because workloads change over time (and to avoid overflow) + * we keep these statistics as a floating average, which ends + * up weighing recent references more than old ones. + * + * anon in [0], file in [1] + */ + if (unlikely(zone->recent_scanned[0] > anon / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[0] /= 2; + zone->recent_rotated[0] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + if (unlikely(zone->recent_scanned[1] > file / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[1] /= 2; + zone->recent_rotated[1] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* + * anon recent_rotated[0] + * %anon = 100 * ----------- / ----------------- * IO cost + * anon + file rotate_sum + */ + ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); + ap /= zone->recent_rotated[0] + 1; + + fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); + fp /= zone->recent_rotated[1] + 1; + + /* Normalize to percentages */ + percent[0] = 100 * ap / (ap + fp + 1); + percent[1] = 100 - percent[0]; } + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1267,36 +1275,43 @@ static unsigned long shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; unsigned long nr_reclaimed = 0; + unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; - if (scan_global_lru(sc)) { - /* - * Add one to nr_to_scan just to make sure that the kernel - * will slowly sift through the active list. - */ - for_each_lru(l) { - zone->lru[l].nr_scan += (zone_page_state(zone, - NR_LRU_BASE + l) >> priority) + 1; + get_scan_ratio(zone, sc, percent); + + for_each_lru(l) { + if (scan_global_lru(sc)) { + int file = is_file_lru(l); + int scan; + /* + * Add one to nr_to_scan just to make sure that the + * kernel will slowly sift through each list. + */ + scan = zone_page_state(zone, NR_LRU_BASE + l); + if (priority) { + scan >>= priority; + scan = (scan * percent[file]) / 100; + } + zone->lru[l].nr_scan += scan + 1; nr[l] = zone->lru[l].nr_scan; if (nr[l] >= sc->swap_cluster_max) zone->lru[l].nr_scan = 0; else nr[l] = 0; + } else { + /* + * This reclaim occurs not because zone memory shortage + * but because memory controller hits its limit. + * Don't modify zone reclaim related data. + */ + nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, + priority, l); } - } else { - /* - * This reclaim occurs not because zone memory shortage but - * because memory controller hits its limit. - * Then, don't modify zone reclaim related data. - */ - nr[LRU_ACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup, - zone, priority, LRU_ACTIVE); - - nr[LRU_INACTIVE] = mem_cgroup_calc_reclaim(sc->mem_cgroup, - zone, priority, LRU_INACTIVE); } - while (nr[LRU_ACTIVE] || nr[LRU_INACTIVE]) { + while (nr[LRU_ACTIVE_ANON] || nr[LRU_INACTIVE_ANON] || + nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_lru(l) { if (nr[l]) { nr_to_scan = min(nr[l], @@ -1369,7 +1384,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, return nr_reclaimed; } - + /* * This is the main entry point to direct page reclaim. * @@ -1412,8 +1427,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } } @@ -1615,8 +1629,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } /* @@ -1660,8 +1673,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE)) * 6) + (zone_lru_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1715,7 +1727,7 @@ out: /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1809,6 +1821,14 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } +unsigned long global_lru_pages(void) +{ + return global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_INACTIVE_FILE); +} + #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages @@ -1834,7 +1854,8 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, for_each_lru(l) { /* For pass = 0 we don't shrink the active list */ - if (pass == 0 && l == LRU_ACTIVE) + if (pass == 0 && + (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) continue; zone->lru[l].nr_scan += @@ -1856,11 +1877,6 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, return ret; } -static unsigned long count_lru_pages(void) -{ - return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); -} - /* * Try to free `nr_pages' of memory, system-wide, and return the number of * freed pages. @@ -1886,7 +1902,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = count_lru_pages(); + lru_pages = global_lru_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -1929,7 +1945,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - count_lru_pages()); + global_lru_pages()); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -1946,7 +1962,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 52c0335c1b71..27400b7da7c4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -619,8 +619,10 @@ const struct seq_operations pagetypeinfo_op = { static const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", - "nr_inactive", - "nr_active", + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -688,7 +690,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (a: %lu i: %lu)" + "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), @@ -696,8 +698,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->pages_low, zone->pages_high, zone->pages_scanned, - zone->lru[LRU_ACTIVE].nr_scan, - zone->lru[LRU_INACTIVE].nr_scan, + zone->lru[LRU_ACTIVE_ANON].nr_scan, + zone->lru[LRU_INACTIVE_ANON].nr_scan, + zone->lru[LRU_ACTIVE_FILE].nr_scan, + zone->lru[LRU_INACTIVE_FILE].nr_scan, zone->spanned_pages, zone->present_pages); -- cgit v1.2.3 From 7b854121eb3e5ba0241882ff939e2c485228c9c5 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Sat, 18 Oct 2008 20:26:40 -0700 Subject: Unevictable LRU Page Statistics Report unevictable pages per zone and system wide. Kosaki Motohiro added support for memory controller unevictable statistics. [riel@redhat.com: fix printk in show_free_areas()] [akpm@linux-foundation.org: fix units in /proc/vmstats] Signed-off-by: Lee Schermerhorn Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Debugged-by: Hiroshi Shimamoto Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 6 ++++++ fs/proc/proc_misc.c | 6 ++++++ mm/memcontrol.c | 6 ++++++ mm/page_alloc.c | 18 ++++++++++++++++-- mm/vmstat.c | 3 +++ 5 files changed, 37 insertions(+), 2 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index fc7e9bf0cdbc..11a9a05cf554 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -70,6 +70,9 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d Inactive(anon): %8lu kB\n" "Node %d Active(file): %8lu kB\n" "Node %d Inactive(file): %8lu kB\n" +#ifdef CONFIG_UNEVICTABLE_LRU + "Node %d Noreclaim: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM "Node %d HighTotal: %8lu kB\n" "Node %d HighFree: %8lu kB\n" @@ -99,6 +102,9 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_INACTIVE_ANON)), nid, K(node_page_state(nid, NR_ACTIVE_FILE)), nid, K(node_page_state(nid, NR_INACTIVE_FILE)), +#ifdef CONFIG_UNEVICTABLE_LRU + nid, K(node_page_state(nid, NR_UNEVICTABLE)), +#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index b8edb2860557..6dd60eaea997 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -174,6 +174,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Inactive(anon): %8lu kB\n" "Active(file): %8lu kB\n" "Inactive(file): %8lu kB\n" +#ifdef CONFIG_UNEVICTABLE_LRU + "Unevictable: %8lu kB\n" +#endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" @@ -212,6 +215,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(pages[LRU_INACTIVE_ANON]), K(pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_FILE]), +#ifdef CONFIG_UNEVICTABLE_LRU + K(pages[LRU_UNEVICTABLE]), +#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 82c065e7551e..e93a4db93fbe 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1006,6 +1006,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, { unsigned long active_anon, inactive_anon; unsigned long active_file, inactive_file; + unsigned long unevictable; inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, LRU_INACTIVE_ANON); @@ -1015,10 +1016,15 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, LRU_INACTIVE_FILE); active_file = mem_cgroup_get_all_zonestat(mem_cont, LRU_ACTIVE_FILE); + unevictable = mem_cgroup_get_all_zonestat(mem_cont, + LRU_UNEVICTABLE); + cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); + cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + } return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 79c0981b1d32..4125230a1b2c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1864,13 +1864,21 @@ void show_free_areas(void) } } - printk("Active_anon:%lu active_file:%lu inactive_anon%lu\n" - " inactive_file:%lu dirty:%lu writeback:%lu unstable:%lu\n" + printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" + " inactive_file:%lu" +//TODO: check/adjust line lengths +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lu" +#endif + " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), +#ifdef CONFIG_UNEVICTABLE_LRU + global_page_state(NR_UNEVICTABLE), +#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1897,6 +1905,9 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lukB" +#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -1910,6 +1921,9 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), +#ifdef CONFIG_UNEVICTABLE_LRU + K(zone_page_state(zone, NR_UNEVICTABLE)), +#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") diff --git a/mm/vmstat.c b/mm/vmstat.c index 6cb08cdd4f03..6db2f6319313 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -623,6 +623,9 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", +#ifdef CONFIG_UNEVICTABLE_LRU + "nr_unevictable", +#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", -- cgit v1.2.3 From 5344b7e648980cc2ca613ec03a56a8222ff48820 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:26:51 -0700 Subject: vmstat: mlocked pages statistics Add NR_MLOCK zone page state, which provides a (conservative) count of mlocked pages (actually, the number of mlocked pages moved off the LRU). Reworked by lts to fit in with the modified mlock page support in the Reclaim Scalability series. [kosaki.motohiro@jp.fujitsu.com: fix incorrect Mlocked field of /proc/meminfo] [lee.schermerhorn@hp.com: mlocked-pages: add event counting with statistics] Signed-off-by: Nick Piggin Signed-off-by: Lee Schermerhorn Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 +++- fs/proc/proc_misc.c | 2 ++ include/linux/mmzone.h | 2 ++ include/linux/vmstat.h | 4 ++++ mm/internal.h | 16 +++++++++++++--- mm/mlock.c | 41 ++++++++++++++++++++++++++++++++++++----- mm/vmstat.c | 5 +++++ 7 files changed, 65 insertions(+), 9 deletions(-) (limited to 'fs/proc/proc_misc.c') diff --git a/drivers/base/node.c b/drivers/base/node.c index 11a9a05cf554..fb45d88a2446 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -71,7 +71,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d Active(file): %8lu kB\n" "Node %d Inactive(file): %8lu kB\n" #ifdef CONFIG_UNEVICTABLE_LRU - "Node %d Noreclaim: %8lu kB\n" + "Node %d Unevictable: %8lu kB\n" + "Node %d Mlocked: %8lu kB\n" #endif #ifdef CONFIG_HIGHMEM "Node %d HighTotal: %8lu kB\n" @@ -104,6 +105,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_INACTIVE_FILE)), #ifdef CONFIG_UNEVICTABLE_LRU nid, K(node_page_state(nid, NR_UNEVICTABLE)), + nid, K(node_page_state(nid, NR_MLOCK)), #endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 6dd60eaea997..61b25f4eabe6 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -176,6 +176,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Inactive(file): %8lu kB\n" #ifdef CONFIG_UNEVICTABLE_LRU "Unevictable: %8lu kB\n" + "Mlocked: %8lu kB\n" #endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" @@ -217,6 +218,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(pages[LRU_INACTIVE_FILE]), #ifdef CONFIG_UNEVICTABLE_LRU K(pages[LRU_UNEVICTABLE]), + K(global_page_state(NR_MLOCK)), #endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d1f60d5fe2ea..da2d053a95f1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -88,8 +88,10 @@ enum zone_stat_item { NR_ACTIVE_FILE, /* " " " " " */ #ifdef CONFIG_UNEVICTABLE_LRU NR_UNEVICTABLE, /* " " " " " */ + NR_MLOCK, /* mlock()ed pages found and moved off LRU */ #else NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ + NR_MLOCK = NR_ACTIVE_FILE, #endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 135840cd7feb..05b805020be2 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -45,6 +45,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ + UNEVICTABLE_PGMLOCKED, + UNEVICTABLE_PGMUNLOCKED, + UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ + UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ #endif NR_VM_EVENT_ITEMS }; diff --git a/mm/internal.h b/mm/internal.h index 48e32f790571..1cfbf2e2bc9e 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -101,7 +101,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) return 0; - SetPageMlocked(page); + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } return 1; } @@ -128,12 +131,19 @@ static inline void clear_page_mlock(struct page *page) /* * mlock_migrate_page - called only from migrate_page_copy() to - * migrate the Mlocked page flag + * migrate the Mlocked page flag; update statistics. */ static inline void mlock_migrate_page(struct page *newpage, struct page *page) { - if (TestClearPageMlocked(page)) + if (TestClearPageMlocked(page)) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); SetPageMlocked(newpage); + __inc_zone_page_state(newpage, NR_MLOCK); + local_irq_restore(flags); + } } diff --git a/mm/mlock.c b/mm/mlock.c index 8b478350a2a1..bce1b22c36c2 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -60,6 +60,8 @@ void __clear_page_mlock(struct page *page) return; } + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); if (!isolate_lru_page(page)) { putback_lru_page(page); } else { @@ -69,6 +71,9 @@ void __clear_page_mlock(struct page *page) lru_add_drain_all(); if (!isolate_lru_page(page)) putback_lru_page(page); + else if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + } } @@ -80,8 +85,12 @@ void mlock_vma_page(struct page *page) { BUG_ON(!PageLocked(page)); - if (!TestSetPageMlocked(page) && !isolate_lru_page(page)) - putback_lru_page(page); + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + if (!isolate_lru_page(page)) + putback_lru_page(page); + } } /* @@ -106,9 +115,31 @@ static void munlock_vma_page(struct page *page) { BUG_ON(!PageLocked(page)); - if (TestClearPageMlocked(page) && !isolate_lru_page(page)) { - try_to_munlock(page); - putback_lru_page(page); + if (TestClearPageMlocked(page)) { + dec_zone_page_state(page, NR_MLOCK); + if (!isolate_lru_page(page)) { + int ret = try_to_munlock(page); + /* + * did try_to_unlock() succeed or punt? + */ + if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); + } else { + /* + * We lost the race. let try_to_unmap() deal + * with it. At least we get the page state and + * mlock stats right. However, page is still on + * the noreclaim list. We'll fix that up when + * the page is eventually freed or we scan the + * noreclaim list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + } } } diff --git a/mm/vmstat.c b/mm/vmstat.c index 6db2f6319313..9e28abc0a0b9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -625,6 +625,7 @@ static const char * const vmstat_text[] = { "nr_active_file", #ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", + "nr_mlock", #endif "nr_anon_pages", "nr_mapped", @@ -684,6 +685,10 @@ static const char * const vmstat_text[] = { "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", + "unevictable_pgs_mlocked", + "unevictable_pgs_munlocked", + "unevictable_pgs_cleared", + "unevictable_pgs_stranded", #endif #endif }; -- cgit v1.2.3