From cb2c0233755429037462e16ea0d5497a0092738c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 7 Jul 2005 17:56:03 -0700 Subject: [PATCH] export generic_drop_inode() to modules OCFS2 wants to mark an inode which has been orphaned by another node so that during final iput it takes the correct path through the VFS and can pass through the OCFS2 delete_inode callback. Since i_nlink can get out of date with other nodes, the best way I see to accomplish this is by clearing i_nlink on those inodes at drop_inode time. Other than this small amount of work, nothing different needs to happen, so I think it would be cleanest to be able to just call generic_drop_inode at the end of the OCFS2 drop_inode callback. Signed-off-by: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 047bde30836a..302ec20838ca 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1435,6 +1435,7 @@ extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern void generic_delete_inode(struct inode *inode); +extern void generic_drop_inode(struct inode *inode); extern struct inode *ilookup5(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data); -- cgit v1.2.3 From 79b9ce311e192e9a31fd9f3cf1ee4a4edf9e2650 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 7 Jul 2005 17:56:04 -0700 Subject: [PATCH] print order information when OOM killing Dump the current allocation order when OOM killing. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/sysrq.c | 2 +- include/linux/swap.h | 2 +- mm/oom_kill.c | 4 ++-- mm/page_alloc.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index af79805b5576..12d563c648f7 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -228,7 +228,7 @@ static struct sysrq_key_op sysrq_term_op = { static void moom_callback(void *ignored) { - out_of_memory(GFP_KERNEL); + out_of_memory(GFP_KERNEL, 0); } static DECLARE_WORK(moom_work, moom_callback, NULL); diff --git a/include/linux/swap.h b/include/linux/swap.h index 2343f999e6e1..c75954f2d868 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -148,7 +148,7 @@ struct swap_list_t { #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages) /* linux/mm/oom_kill.c */ -extern void out_of_memory(unsigned int __nocast gfp_mask); +extern void out_of_memory(unsigned int __nocast gfp_mask, int order); /* linux/mm/memory.c */ extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 59666d905f19..e20d559edbaf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -253,12 +253,12 @@ static struct mm_struct *oom_kill_process(struct task_struct *p) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. 
*/ -void out_of_memory(unsigned int __nocast gfp_mask) +void out_of_memory(unsigned int __nocast gfp_mask, int order) { struct mm_struct *mm = NULL; task_t * p; - printk("oom-killer: gfp_mask=0x%x\n", gfp_mask); + printk("oom-killer: gfp_mask=0x%x, order=%d\n", gfp_mask, order); /* print memory stats */ show_mem(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3c9f7f881125..7fbd3ea8765c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -936,7 +936,7 @@ rebalance: goto got_pg; } - out_of_memory(gfp_mask); + out_of_memory(gfp_mask, order); goto restart; } -- cgit v1.2.3 From cf36680887d6d942d2119c1ff1dfb2428b0f21f4 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 7 Jul 2005 17:56:13 -0700 Subject: [PATCH] move ioprio syscalls into syscalls.h - Make ioprio syscalls return long, like set/getpriority syscalls. - Move function prototypes into syscalls.h so we can pick them up in the 32/64bit compat code. Signed-off-by: Anton Blanchard Acked-by: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ioprio.c | 4 ++-- include/linux/ioprio.h | 3 --- include/linux/syscalls.h | 3 +++ 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/ioprio.c b/fs/ioprio.c index 663e420636d6..97e1f088ba00 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -43,7 +43,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) return 0; } -asmlinkage int sys_ioprio_set(int which, int who, int ioprio) +asmlinkage long sys_ioprio_set(int which, int who, int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); int data = IOPRIO_PRIO_DATA(ioprio); @@ -115,7 +115,7 @@ asmlinkage int sys_ioprio_set(int which, int who, int ioprio) return ret; } -asmlinkage int sys_ioprio_get(int which, int who) +asmlinkage long sys_ioprio_get(int which, int who) { struct task_struct *g, *p; struct user_struct *user; diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 8a453a0b5e4b..88d5961f7a3f 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -34,9 +34,6 @@ enum { */ #define IOPRIO_BE_NR (8) -asmlinkage int sys_ioprio_set(int, int, int); -asmlinkage int sys_ioprio_get(int, int); - enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 52830b6d94e5..425f58c8ea4a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -506,4 +506,7 @@ asmlinkage long sys_request_key(const char __user *_type, asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); +asmlinkage long sys_ioprio_set(int which, int who, int ioprio); +asmlinkage long sys_ioprio_get(int which, int who); + #endif -- cgit v1.2.3 From e00d9967e3addea86dded46deefc5daec5d52e5a Mon Sep 17 00:00:00 2001 From: Bernard Blackham Date: Thu, 7 Jul 2005 17:56:42 -0700 Subject: [PATCH] pm: fix u32 vs. pm_message_t confusion in cpufreq Fix u32 vs pm_message_t confusion in cpufreq. 
Signed-off-by: Bernard Blackham Signed-off-by: Pavel Machek Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ppc/platforms/pmac_cpufreq.c | 2 +- drivers/cpufreq/cpufreq.c | 4 ++-- include/linux/cpufreq.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/ppc/platforms/pmac_cpufreq.c b/arch/ppc/platforms/pmac_cpufreq.c index 5fdd4f607a40..c0605244edda 100644 --- a/arch/ppc/platforms/pmac_cpufreq.c +++ b/arch/ppc/platforms/pmac_cpufreq.c @@ -452,7 +452,7 @@ static u32 __pmac read_gpio(struct device_node *np) return offset; } -static int __pmac pmac_cpufreq_suspend(struct cpufreq_policy *policy, u32 state) +static int __pmac pmac_cpufreq_suspend(struct cpufreq_policy *policy, pm_message_t pmsg) { /* Ok, this could be made a bit smarter, but let's be robust for now. We * always force a speed change to high speed before sleep, to make sure diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index bf62dfe4976a..7a7859dd0d98 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -869,7 +869,7 @@ EXPORT_SYMBOL(cpufreq_get); * cpufreq_suspend - let the low level driver prepare for suspend */ -static int cpufreq_suspend(struct sys_device * sysdev, u32 state) +static int cpufreq_suspend(struct sys_device * sysdev, pm_message_t pmsg) { int cpu = sysdev->id; unsigned int ret = 0; @@ -897,7 +897,7 @@ static int cpufreq_suspend(struct sys_device * sysdev, u32 state) } if (cpufreq_driver->suspend) { - ret = cpufreq_driver->suspend(cpu_policy, state); + ret = cpufreq_driver->suspend(cpu_policy, pmsg); if (ret) { printk(KERN_ERR "cpufreq: suspend failed in ->suspend " "step on CPU %u\n", cpu_policy->cpu); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 927daa86c9b3..ff7f80f48df1 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -201,7 +201,7 @@ struct cpufreq_driver { /* optional */ int (*exit) (struct cpufreq_policy *policy); - int (*suspend) (struct cpufreq_policy *policy, u32 state); + int (*suspend) (struct cpufreq_policy *policy, pm_message_t pmsg); int (*resume) (struct cpufreq_policy *policy); struct freq_attr **attr; }; -- cgit v1.2.3 From a39722034ae37f80a1803bf781fe3fe1b03e20bc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 7 Jul 2005 17:56:56 -0700 Subject: [PATCH] page_uptodate locking scalability Use a bit spin lock in the first buffer of the page to synchronise async IO buffer completions, instead of the global page_uptodate_lock, which is showing some scalability problems. Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 25 +++++++++++++++++-------- include/linux/buffer_head.h | 3 +++ 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 561e63a14966..6a25d7df89b1 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -513,8 +513,8 @@ static void free_more_memory(void) */ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) { - static DEFINE_SPINLOCK(page_uptodate_lock); unsigned long flags; + struct buffer_head *first; struct buffer_head *tmp; struct page *page; int page_uptodate = 1; @@ -536,7 +536,9 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) * two buffer heads end IO at almost the same time and both * decide that the page is now completely done. 
*/ - spin_lock_irqsave(&page_uptodate_lock, flags); + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); clear_buffer_async_read(bh); unlock_buffer(bh); tmp = bh; @@ -549,7 +551,8 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } while (tmp != bh); - spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); /* * If none of the buffers had errors and they are all @@ -561,7 +564,8 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) return; still_busy: - spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); return; } @@ -572,8 +576,8 @@ still_busy: void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; - static DEFINE_SPINLOCK(page_uptodate_lock); unsigned long flags; + struct buffer_head *first; struct buffer_head *tmp; struct page *page; @@ -594,7 +598,10 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) SetPageError(page); } - spin_lock_irqsave(&page_uptodate_lock, flags); + first = page_buffers(page); + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &first->b_state); + clear_buffer_async_write(bh); unlock_buffer(bh); tmp = bh->b_this_page; @@ -605,12 +612,14 @@ void end_buffer_async_write(struct buffer_head *bh, int uptodate) } tmp = tmp->b_this_page; } - spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); end_page_writeback(page); return; still_busy: - spin_unlock_irqrestore(&page_uptodate_lock, flags); + bit_spin_unlock(BH_Uptodate_Lock, &first->b_state); + local_irq_restore(flags); return; } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 802c91e9b3da..90828493791f 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -19,6 +19,9 @@ enum bh_state_bits { BH_Dirty, /* Is dirty */ BH_Lock, /* Is locked */ BH_Req, /* Has been submitted for I/O */ + BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise + * IO completion of other buffers in the page + */ BH_Mapped, /* Has a disk mapping */ BH_New, /* Disk mapping was newly created by get_block */ -- cgit v1.2.3 From 0db925af1db5f3dfe1691c35b39496e2baaff9c9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jul 2005 17:56:58 -0700 Subject: [PATCH] propagate __nocast annotations Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++-- include/linux/slab.h | 4 ++-- include/linux/string.h | 2 +- mm/mempool.c | 2 +- mm/slab.c | 12 +++++++----- 5 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 8d6bf608b199..7c7400137e97 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -12,8 +12,8 @@ struct vm_area_struct; * GFP bitmasks.. 
*/ /* Zone modifiers in GFP_ZONEMASK (see linux/mmzone.h - low two bits) */ -#define __GFP_DMA 0x01 -#define __GFP_HIGHMEM 0x02 +#define __GFP_DMA 0x01u +#define __GFP_HIGHMEM 0x02u /* * Action modifiers - doesn't change the zoning diff --git a/include/linux/slab.h b/include/linux/slab.h index 76cf7e60216c..4c8e552471b0 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -65,7 +65,7 @@ extern void *kmem_cache_alloc(kmem_cache_t *, unsigned int __nocast); extern void kmem_cache_free(kmem_cache_t *, void *); extern unsigned int kmem_cache_size(kmem_cache_t *); extern const char *kmem_cache_name(kmem_cache_t *); -extern kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags); +extern kmem_cache_t *kmem_find_general_cachep(size_t size, unsigned int __nocast gfpflags); /* Size description struct for general caches. */ struct cache_sizes { @@ -105,7 +105,7 @@ extern unsigned int ksize(const void *); #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node(kmem_cache_t *, int flags, int node); -extern void *kmalloc_node(size_t size, int flags, int node); +extern void *kmalloc_node(size_t size, unsigned int __nocast flags, int node); #else static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int node) { diff --git a/include/linux/string.h b/include/linux/string.h index 93994c613095..dab2652acbd8 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -88,7 +88,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t); extern void * memchr(const void *,int,__kernel_size_t); #endif -extern char *kstrdup(const char *s, int gfp); +extern char *kstrdup(const char *s, unsigned int __nocast gfp); #ifdef __cplusplus } diff --git a/mm/mempool.c b/mm/mempool.c index 9a72f7d918fa..65f2957b8d51 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -205,7 +205,7 @@ void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask) void *element; unsigned long flags; wait_queue_t wait; - int gfp_temp; + unsigned int gfp_temp; might_sleep_if(gfp_mask & __GFP_WAIT); diff --git a/mm/slab.c b/mm/slab.c index e57abd45eede..c9e706db4634 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -584,7 +584,8 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep) return cachep->array[smp_processor_id()]; } -static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) +static inline kmem_cache_t *__find_general_cachep(size_t size, + unsigned int __nocast gfpflags) { struct cache_sizes *csizep = malloc_sizes; @@ -608,7 +609,8 @@ static inline kmem_cache_t *__find_general_cachep(size_t size, int gfpflags) return csizep->cs_cachep; } -kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags) +kmem_cache_t *kmem_find_general_cachep(size_t size, + unsigned int __nocast gfpflags) { return __find_general_cachep(size, gfpflags); } @@ -2100,7 +2102,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags) #if DEBUG static void * cache_alloc_debugcheck_after(kmem_cache_t *cachep, - unsigned long flags, void *objp, void *caller) + unsigned int __nocast flags, void *objp, void *caller) { if (!objp) return objp; @@ -2442,7 +2444,7 @@ got_slabp: } EXPORT_SYMBOL(kmem_cache_alloc_node); -void *kmalloc_node(size_t size, int flags, int node) +void *kmalloc_node(size_t size, unsigned int __nocast flags, int node) { kmem_cache_t *cachep; @@ -3094,7 +3096,7 @@ unsigned int ksize(const void *objp) * @s: the string to duplicate * @gfp: the GFP mask used in the kmalloc() call when allocating memory */ -char *kstrdup(const char *s, int gfp) 
+char *kstrdup(const char *s, unsigned int __nocast gfp) { size_t len; char *buf; -- cgit v1.2.3 From 6c036527a630720063b67d9a65455e8caca2c8fa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 7 Jul 2005 17:56:59 -0700 Subject: [PATCH] mostly_read data section Add a new section called ".data.read_mostly" for data items that are read frequently and rarely written to, like cpumaps etc. If these maps are placed in the .data section then these frequently read items may end up in cachelines with data that is frequently updated. In that case all processors in an SMP system must needlessly reload the cachelines again and again containing elements of those frequently used variables. The ability to share these cachelines will allow each cpu in an SMP system to keep local copies of those shared cachelines thereby optimizing performance. Signed-off-by: Alok N Kataria Signed-off-by: Shobhit Dayal Signed-off-by: Christoph Lameter Signed-off-by: Shai Fultheim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/kernel/cpu/intel.c | 2 +- arch/i386/kernel/smpboot.c | 18 +++++++++--------- arch/i386/kernel/time.c | 2 +- arch/i386/kernel/timers/timer_hpet.c | 4 ++-- arch/i386/kernel/vmlinux.lds.S | 3 +++ arch/x86_64/kernel/vmlinux.lds.S | 4 ++++ drivers/char/random.c | 2 +- fs/bio.c | 2 +- include/linux/cache.h | 6 ++++++ kernel/profile.c | 4 ++-- lib/radix-tree.c | 2 +- 11 files changed, 31 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/cpu/intel.c b/arch/i386/kernel/cpu/intel.c index 96a75d045835..a2c33c1a46c5 100644 --- a/arch/i386/kernel/cpu/intel.c +++ b/arch/i386/kernel/cpu/intel.c @@ -25,7 +25,7 @@ extern int trap_init_f00f_bug(void); /* * Alignment at which movsl is preferred for bulk memory copies. */ -struct movsl_mask movsl_mask; +struct movsl_mask movsl_mask __read_mostly; #endif void __devinit early_intel_workaround(struct cpuinfo_x86 *c) diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index d66bf489a2e9..8ac8e9fd5614 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -68,21 +68,21 @@ EXPORT_SYMBOL(smp_num_siblings); #endif /* Package ID of each logical CPU */ -int phys_proc_id[NR_CPUS] = {[0 ... NR_CPUS-1] = BAD_APICID}; +int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; EXPORT_SYMBOL(phys_proc_id); /* Core ID of each logical CPU */ -int cpu_core_id[NR_CPUS] = {[0 ... NR_CPUS-1] = BAD_APICID}; +int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID}; EXPORT_SYMBOL(cpu_core_id); -cpumask_t cpu_sibling_map[NR_CPUS]; +cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_sibling_map); -cpumask_t cpu_core_map[NR_CPUS]; +cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); /* bitmap of online cpus */ -cpumask_t cpu_online_map; +cpumask_t cpu_online_map __read_mostly; EXPORT_SYMBOL(cpu_online_map); cpumask_t cpu_callin_map; @@ -100,7 +100,7 @@ static int __devinitdata tsc_sync_disabled; struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(cpu_data); -u8 x86_cpu_to_apicid[NR_CPUS] = +u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0xff }; EXPORT_SYMBOL(x86_cpu_to_apicid); @@ -550,10 +550,10 @@ extern struct { #ifdef CONFIG_NUMA /* which logical CPUs are on which nodes */ -cpumask_t node_2_cpu_mask[MAX_NUMNODES] = +cpumask_t node_2_cpu_mask[MAX_NUMNODES] __read_mostly = { [0 ... 
MAX_NUMNODES-1] = CPU_MASK_NONE }; /* which node each logical CPU is on */ -int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; +int cpu_2_node[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; EXPORT_SYMBOL(cpu_2_node); /* set up a mapping between cpu and node. */ @@ -581,7 +581,7 @@ static inline void unmap_cpu_to_node(int cpu) #endif /* CONFIG_NUMA */ -u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; +u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; static void map_cpu_to_logical_apicid(void) { diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 2854c357377f..0ee9dee8af06 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -91,7 +91,7 @@ EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); -struct timer_opts *cur_timer = &timer_none; +struct timer_opts *cur_timer __read_mostly = &timer_none; /* * This is a special lock that is owned by the CPU and holds the index diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index d766e0963ac1..ef8dac5dd33b 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -18,7 +18,7 @@ #include "mach_timer.h" #include -static unsigned long hpet_usec_quotient; /* convert hpet clks to usec */ +static unsigned long __read_mostly hpet_usec_quotient; /* convert hpet clks to usec */ static unsigned long tsc_hpet_quotient; /* convert tsc to hpet clks */ static unsigned long hpet_last; /* hpet counter value at last tick*/ static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */ @@ -180,7 +180,7 @@ static int __init init_hpet(char* override) /************************************************************/ /* tsc timer_opts struct */ -static struct timer_opts timer_hpet = { +static struct timer_opts timer_hpet __read_mostly = { .name = "hpet", .mark_offset = mark_offset_hpet, .get_offset = get_offset_hpet, diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 7e01a528a83a..761972f8cb6c 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -57,6 +57,9 @@ SECTIONS *(.data.cacheline_aligned) } + /* rarely changed data like cpu maps */ + . = ALIGN(32); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { *(.data.read_mostly) } _edata = .; /* End of data section */ . = ALIGN(THREAD_SIZE); /* init_task */ diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 73389f51c4e5..61c12758ca70 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -56,6 +56,10 @@ SECTIONS .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { *(.data.cacheline_aligned) } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + } #define VSYSCALL_ADDR (-10*1024*1024) #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.cacheline_aligned) + SIZEOF(.data.cacheline_aligned) + 4095) & ~(4095)) diff --git a/drivers/char/random.c b/drivers/char/random.c index 460b5d475edd..6b11d6b2129f 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -271,7 +271,7 @@ static int random_write_wakeup_thresh = 128; * samples to avoid wasting CPU time and reduce lock contention. 
*/ -static int trickle_thresh = INPUT_POOL_WORDS * 28; +static int trickle_thresh __read_mostly = INPUT_POOL_WORDS * 28; static DEFINE_PER_CPU(int, trickle_count) = 0; diff --git a/fs/bio.c b/fs/bio.c index 3a1472acc361..ca8f7a850fe3 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -52,7 +52,7 @@ struct biovec_slab { */ #define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) } -static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] = { +static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = { BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES), }; #undef BV diff --git a/include/linux/cache.h b/include/linux/cache.h index 4d767b93738a..2b66a36d85f0 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -13,6 +13,12 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES #endif +#ifdef CONFIG_X86 +#define __read_mostly __attribute__((__section__(".data.read_mostly"))) +#else +#define __read_mostly +#endif + #ifndef ____cacheline_aligned #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) #endif diff --git a/kernel/profile.c b/kernel/profile.c index ad8cbb75ffa2..f89248e6d704 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -35,11 +35,11 @@ struct profile_hit { #define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) /* Oprofile timer tick hook */ -int (*timer_hook)(struct pt_regs *); +int (*timer_hook)(struct pt_regs *) __read_mostly; static atomic_t *prof_buffer; static unsigned long prof_len, prof_shift; -static int prof_on; +static int prof_on __read_mostly; static cpumask_t prof_cpu_mask = CPU_MASK_ALL; #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 04d664377f2c..10bed1c8c3c3 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -58,7 +58,7 @@ struct radix_tree_path { #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) #define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) -static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH]; +static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly; /* * Radix tree node cache. -- cgit v1.2.3 From 1ce88cf466f7b6078b14d67d186a3d7c19dd5609 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 Jul 2005 17:57:24 -0700 Subject: [PATCH] namespace.c: fix race in mark_mounts_for_expiry() This patch fixes a race found by Ram in mark_mounts_for_expiry() in fs/namespace.c. The bug can only be triggered with simultaneous exiting of a process having a private namespace, and expiry of a mount from within that namespace. It's practically impossible to trigger, and I haven't even tried. But still, a bug is a bug. The race happens when put_namespace() is called by another task, while mark_mounts_for_expiry() is between atomic_read() and get_namespace(). In that case get_namespace() will be called on an already dead namespace with unforeseeable results. The solution was suggested by Al Viro, with his own words: Instead of screwing with atomic_read() in there, why don't we simply do the following: a) atomic_dec_and_lock() in put_namespace() b) __put_namespace() called without dropping lock c) the first thing done by __put_namespace would be struct vfsmount *root = namespace->root; namespace->root = NULL; spin_unlock(...); .... umount_tree(root); ... d) check in mark_... would be simply namespace && namespace->root. And we are all set; no screwing around with atomic_read(), no magic at all. Dying namespace gets NULL ->root. All changes of ->root happen under spinlock. 
If under a spinlock we see non-NULL ->mnt_namespace, it won't be freed until we drop the lock (we will set ->mnt_namespace to NULL under that lock before we get to freeing namespace). If under a spinlock we see non-NULL ->mnt_namespace and ->mnt_namespace->root, we can grab a reference to namespace and be sure that it won't go away. Signed-off-by: Miklos Szeredi Acked-by: Al Viro Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 7 +++++-- include/linux/namespace.h | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index a0d0ef1f1a48..9d17541ebafa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -869,7 +869,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) /* don't do anything if the namespace is dead - all the * vfsmounts from it are going away anyway */ namespace = mnt->mnt_namespace; - if (!namespace || atomic_read(&namespace->count) <= 0) + if (!namespace || !namespace->root) continue; get_namespace(namespace); @@ -1450,9 +1450,12 @@ void __init mnt_init(unsigned long mempages) void __put_namespace(struct namespace *namespace) { + struct vfsmount *root = namespace->root; + namespace->root = NULL; + spin_unlock(&vfsmount_lock); down_write(&namespace->sem); spin_lock(&vfsmount_lock); - umount_tree(namespace->root); + umount_tree(root); spin_unlock(&vfsmount_lock); up_write(&namespace->sem); kfree(namespace); diff --git a/include/linux/namespace.h b/include/linux/namespace.h index 697991b69f9b..0e5a86f13b2f 100644 --- a/include/linux/namespace.h +++ b/include/linux/namespace.h @@ -17,7 +17,8 @@ extern void __put_namespace(struct namespace *namespace); static inline void put_namespace(struct namespace *namespace) { - if (atomic_dec_and_test(&namespace->count)) + if (atomic_dec_and_lock(&namespace->count, &vfsmount_lock)) + /* releases vfsmount_lock */ __put_namespace(namespace); } -- cgit v1.2.3 From 55e700b924f9e0ba24e3a071d1097d050b05abe6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 Jul 2005 17:57:30 -0700 Subject: [PATCH] namespace: rename mnt_fslink to mnt_expire This patch renames vfsmount->mnt_fslink to something a little more descriptive: vfsmount->mnt_expire. 
Signed-off-by: Mike Waychison Signed-off-by: Miklos Szeredi Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 24 ++++++++++++------------ include/linux/mount.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/namespace.c b/fs/namespace.c index b168dc37eaab..587eb0d707ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -61,7 +61,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); - INIT_LIST_HEAD(&mnt->mnt_fslink); + INIT_LIST_HEAD(&mnt->mnt_expire); if (name) { int size = strlen(name)+1; char *newname = kmalloc(size, GFP_KERNEL); @@ -165,8 +165,8 @@ clone_mnt(struct vfsmount *old, struct dentry *root) /* stick the duplicate mount on the same expiry list * as the original if that was on one */ spin_lock(&vfsmount_lock); - if (!list_empty(&old->mnt_fslink)) - list_add(&mnt->mnt_fslink, &old->mnt_fslink); + if (!list_empty(&old->mnt_expire)) + list_add(&mnt->mnt_expire, &old->mnt_expire); spin_unlock(&vfsmount_lock); } return mnt; @@ -351,7 +351,7 @@ static void umount_tree(struct vfsmount *mnt) while (!list_empty(&kill)) { mnt = list_entry(kill.next, struct vfsmount, mnt_list); list_del_init(&mnt->mnt_list); - list_del_init(&mnt->mnt_fslink); + list_del_init(&mnt->mnt_expire); if (mnt->mnt_parent == mnt) { spin_unlock(&vfsmount_lock); } else { @@ -645,7 +645,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse) if (mnt) { /* stop bind mounts from expiring */ spin_lock(&vfsmount_lock); - list_del_init(&mnt->mnt_fslink); + list_del_init(&mnt->mnt_expire); spin_unlock(&vfsmount_lock); err = graft_tree(mnt, nd); @@ -744,7 +744,7 @@ static int do_move_mount(struct nameidata *nd, char *old_name) /* if the mount is moved, it should no longer be expire * automatically */ - list_del_init(&old_nd.mnt->mnt_fslink); + list_del_init(&old_nd.mnt->mnt_expire); out2: spin_unlock(&vfsmount_lock); out1: @@ -814,7 +814,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd, if (err == 0 && fslist) { /* add to the specified expiration list */ spin_lock(&vfsmount_lock); - list_add_tail(&newmnt->mnt_fslink, fslist); + list_add_tail(&newmnt->mnt_expire, fslist); spin_unlock(&vfsmount_lock); } @@ -869,7 +869,7 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts) * Someone brought it back to life whilst we didn't have any * locks held so return it to the expiration list */ - list_add_tail(&mnt->mnt_fslink, mounts); + list_add_tail(&mnt->mnt_expire, mounts); spin_unlock(&vfsmount_lock); } } @@ -896,13 +896,13 @@ void mark_mounts_for_expiry(struct list_head *mounts) * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ - list_for_each_entry_safe(mnt, next, mounts, mnt_fslink) { + list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!xchg(&mnt->mnt_expiry_mark, 1) || atomic_read(&mnt->mnt_count) != 1) continue; mntget(mnt); - list_move(&mnt->mnt_fslink, &graveyard); + list_move(&mnt->mnt_expire, &graveyard); } /* @@ -912,8 +912,8 @@ void mark_mounts_for_expiry(struct list_head *mounts) * - dispose of the corpse */ while (!list_empty(&graveyard)) { - mnt = list_entry(graveyard.next, struct vfsmount, mnt_fslink); - list_del_init(&mnt->mnt_fslink); + mnt = list_entry(graveyard.next, struct vfsmount, mnt_expire); + list_del_init(&mnt->mnt_expire); /* don't do anything if the namespace is dead - all the 
* vfsmounts from it are going away anyway */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 8b8d3b9beefd..196d2d6de4a3 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -34,7 +34,7 @@ struct vfsmount int mnt_expiry_mark; /* true if marked for expiry */ char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; - struct list_head mnt_fslink; /* link in fs-specific expiry list */ + struct list_head mnt_expire; /* link in fs-specific expiry list */ struct namespace *mnt_namespace; /* containing namespace */ }; -- cgit v1.2.3 From 751c404b8f63e8199d5f2f8f2bcfd69b41d11caa Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 Jul 2005 17:57:30 -0700 Subject: [PATCH] namespace: rename _mntput to mntput_no_expire This patch renames _mntput() to something a little more descriptive: mntput_no_expire(). Signed-off-by: Miklos Szeredi Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 2 +- include/linux/mount.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/namei.c b/fs/namei.c index fa8df81ce8ca..1d93cb4f7c5f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -314,7 +314,7 @@ void path_release(struct nameidata *nd) void path_release_on_umount(struct nameidata *nd) { dput(nd->dentry); - _mntput(nd->mnt); + mntput_no_expire(nd->mnt); } /* diff --git a/include/linux/mount.h b/include/linux/mount.h index 196d2d6de4a3..74b4727a4e30 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -47,7 +47,7 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt) extern void __mntput(struct vfsmount *mnt); -static inline void _mntput(struct vfsmount *mnt) +static inline void mntput_no_expire(struct vfsmount *mnt) { if (mnt) { if (atomic_dec_and_test(&mnt->mnt_count)) @@ -59,7 +59,7 @@ static inline void mntput(struct vfsmount *mnt) { if (mnt) { mnt->mnt_expiry_mark = 0; - _mntput(mnt); + mntput_no_expire(mnt); } } -- cgit v1.2.3 From a6ccbbb8865101d83c2e716f08feae1da1c48584 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 Jul 2005 17:59:11 -0700 Subject: [PATCH] nfsd4: fix sync'ing of recovery directory We need to fsync the recovery directory after writing to it, but we weren't doing this correctly. (For example, we weren't taking the i_sem when calling ->fsync().) Just reuse the existing nfsd fsync code instead. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4recover.c | 29 ++++++++--------------------- fs/nfsd/vfs.c | 2 +- include/linux/nfsd/nfsd.h | 1 + 3 files changed, 10 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 095f1740f3ae..bb40083b6b7d 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -119,25 +119,12 @@ out: return status; } -static int -nfsd4_rec_fsync(struct dentry *dentry) +static void +nfsd4_sync_rec_dir(void) { - struct file *filp; - int status = nfs_ok; - - dprintk("NFSD: nfs4_fsync_rec_dir\n"); - filp = dentry_open(dget(dentry), mntget(rec_dir.mnt), O_RDWR); - if (IS_ERR(filp)) { - status = PTR_ERR(filp); - goto out; - } - if (filp->f_op && filp->f_op->fsync) - status = filp->f_op->fsync(filp, filp->f_dentry, 0); - fput(filp); -out: - if (status) - printk("nfsd4: unable to sync recovery directory\n"); - return status; + down(&rec_dir.dentry->d_inode->i_sem); + nfsd_sync_dir(rec_dir.dentry); + up(&rec_dir.dentry->d_inode->i_sem); } int @@ -176,7 +163,7 @@ out_unlock: up(&rec_dir.dentry->d_inode->i_sem); if (status == 0) { clp->cl_firststate = 1; - status = nfsd4_rec_fsync(rec_dir.dentry); + nfsd4_sync_rec_dir(); } nfs4_reset_user(uid, gid); dprintk("NFSD: nfsd4_create_clid_dir returns %d\n", status); @@ -331,7 +318,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_user(uid, gid); if (status == 0) - status = nfsd4_rec_fsync(rec_dir.dentry); + nfsd4_sync_rec_dir(); if (status) printk("NFSD: Failed to remove expired client state directory" " %.*s\n", HEXDIR_LEN, clp->cl_recdir); @@ -362,7 +349,7 @@ nfsd4_recdir_purge_old(void) { return; status = nfsd4_list_rec_dir(rec_dir.dentry, purge_old); if (status == 0) - status = nfsd4_rec_fsync(rec_dir.dentry); + nfsd4_sync_rec_dir(); if (status) printk("nfsd4: failed to purge old clients from recovery" " directory %s\n", rec_dir.dentry->d_name.name); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index be24ead89d94..5e0bf3917607 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -733,7 +733,7 @@ nfsd_sync(struct file *filp) up(&inode->i_sem); } -static void +void nfsd_sync_dir(struct dentry *dp) { nfsd_dosync(NULL, dp, dp->d_inode->i_fop); diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 5791dfd30dd0..c2da1b62d416 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -124,6 +124,7 @@ int nfsd_statfs(struct svc_rqst *, struct svc_fh *, int nfsd_notify_change(struct inode *, struct iattr *); int nfsd_permission(struct svc_export *, struct dentry *, int); +void nfsd_sync_dir(struct dentry *dp); #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) #ifdef CONFIG_NFSD_V2_ACL -- cgit v1.2.3 From 7fb64cee34f5dc743f697041717cafda8a94b5ac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 Jul 2005 17:59:20 -0700 Subject: [PATCH] nfsd4: seqid comments Add some comments on the use of so_seqid, in an attempt to avoid some of the confusion outlined in the previous patch.... Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 8 ++++---- include/linux/nfsd/state.h | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5207068cde1a..1515c5b8096f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1210,10 +1210,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) save = resp->p; /* - * Routine for encoding the result of a - * "seqid-mutating" NFSv4 operation. This is - * where seqids are incremented, and the - * replay cache is filled. + * Routine for encoding the result of a "seqid-mutating" NFSv4 operation. This + * is where sequence id's are incremented, and the replay cache is filled. + * Note that we increment sequence id's here, at the last moment, so we're sure + * we know whether the error to be returned is a sequence id mutating error. */ #define ENCODE_SEQID_OP_TAIL(stateowner) do { \ diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index a84a3fa99be1..2d19431f47ea 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -203,7 +203,9 @@ struct nfs4_stateowner { int so_is_open_owner; /* 1=openowner,0=lockowner */ u32 so_id; struct nfs4_client * so_client; - u32 so_seqid; + /* after increment in ENCODE_SEQID_OP_TAIL, represents the next + * sequence id expected from the client: */ + u32 so_seqid; struct xdr_netobj so_owner; /* open owner name */ int so_confirmed; /* successful OPEN_CONFIRM? */ struct nfs4_replay so_replay; -- cgit v1.2.3 From b700949b781480819e53bdc38a53f053226dd75e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 Jul 2005 17:59:23 -0700 Subject: [PATCH] nfsd4: return better error on io incompatible with open mode from RFC 3530: "Share reservations are established by OPEN operations and by their nature are mandatory in that when the OPEN denies READ or WRITE operations, that denial results in such operations being rejected with error NFS4ERR_LOCKED." (Note that share_denied is really only a legal error for OPEN.) Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 2 +- include/linux/nfsd/nfsd.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b96714ae3dd7..3647c942915e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1295,7 +1295,7 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) fp = find_file(ino); if (!fp) return nfs_ok; - ret = nfserr_share_denied; + ret = nfserr_locked; /* Search for conflicting share reservations */ list_for_each_entry(stp, &fp->fi_stateids, st_perfile) { if (test_bit(deny_type, &stp->st_deny_bmap) || diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index c2da1b62d416..6d5a24f3fc6d 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -231,6 +231,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_reclaim_bad __constant_htonl(NFSERR_RECLAIM_BAD) #define nfserr_badname __constant_htonl(NFSERR_BADNAME) #define nfserr_cb_path_down __constant_htonl(NFSERR_CB_PATH_DOWN) +#define nfserr_locked __constant_htonl(NFSERR_LOCKED) /* error codes for internal use */ /* if a request fails due to kmalloc failure, it gets dropped. 
-- cgit v1.2.3 From 4c4cd222ee329025840bc2f8cebf71d36c62440c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 Jul 2005 17:59:27 -0700 Subject: [PATCH] nfsd4: check lock type against openmode. We shouldn't be allowing, e.g., write locks on files not open for write. To enforce this, we add a pointer from the lock stateid back to the open stateid it came from, so that the check will continue to be correct even after the open is upgraded or downgraded. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 49 +++++++++++++++++++++++++++++++--------------- include/linux/nfsd/state.h | 5 +++++ 2 files changed, 38 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 59b214f01b6d..b83f8fb441e1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1160,6 +1160,7 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open * stp->st_deny_bmap = 0; __set_bit(open->op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); + stp->st_openstp = NULL; } static void @@ -2158,12 +2159,18 @@ out: return status; } +static inline int +setlkflg (int type) +{ + return (type == NFS4_READW_LT || type == NFS4_READ_LT) ? + RD_STATE : WR_STATE; +} /* * Checks for sequence id mutating operations. */ static int -nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, clientid_t *lockclid) +nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) { struct nfs4_stateid *stp; struct nfs4_stateowner *sop; @@ -2201,21 +2208,31 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei goto check_replay; } - /* for new lock stateowners: - * check that the lock->v.new.open_stateid - * refers to an open stateowner - * - * check that the lockclid (nfs4_lock->v.new.clientid) is the same - * as the open_stateid->st_stateowner->so_client->clientid - */ - if (lockclid) { + if (lock) { struct nfs4_stateowner *sop = stp->st_stateowner; + clientid_t *lockclid = &lock->v.new.clientid; struct nfs4_client *clp = sop->so_client; + int lkflg = 0; + int status; + + lkflg = setlkflg(lock->lk_type); + + if (lock->lk_is_new) { + if (!sop->so_is_open_owner) + return nfserr_bad_stateid; + if (!cmp_clid(&clp->cl_clientid, lockclid)) + return nfserr_bad_stateid; + /* stp is the open stateid */ + status = nfs4_check_openmode(stp, lkflg); + if (status) + return status; + } else { + /* stp is the lock stateid */ + status = nfs4_check_openmode(stp->st_openstp, lkflg); + if (status) + return status; + } - if (!sop->so_is_open_owner) - return nfserr_bad_stateid; - if (!cmp_clid(&clp->cl_clientid, lockclid)) - return nfserr_bad_stateid; } if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) { @@ -2642,6 +2659,7 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? 
*/ stp->st_access_bmap = open_stp->st_access_bmap; stp->st_deny_bmap = open_stp->st_deny_bmap; + stp->st_openstp = open_stp; out: return stp; @@ -2697,8 +2715,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock lock->lk_new_open_seqid, &lock->lk_new_open_stateid, CHECK_FH | OPEN_STATE, - &open_sop, &open_stp, - &lock->v.new.clientid); + &open_sop, &open_stp, lock); if (status) goto out; /* create lockowner and lock stateid */ @@ -2726,7 +2743,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, CHECK_FH | LOCK_STATE, - &lock->lk_stateowner, &lock_stp, NULL); + &lock->lk_stateowner, &lock_stp, lock); if (status) goto out; } diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index 2d19431f47ea..8bf23cf8b603 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -237,6 +237,10 @@ struct nfs4_file { * st_perlockowner: (open stateid) list of lock nfs4_stateowners * st_access_bmap: used only for open stateid * st_deny_bmap: used only for open stateid +* st_openstp: open stateid lock stateid was derived from +* +* XXX: open stateids and lock stateids have diverged sufficiently that +* we should consider defining separate structs for the two cases. */ struct nfs4_stateid { @@ -250,6 +254,7 @@ struct nfs4_stateid { struct file * st_vfs_file; unsigned long st_access_bmap; unsigned long st_deny_bmap; + struct nfs4_stateid * st_openstp; }; /* flags for preprocess_seqid_op() */ -- cgit v1.2.3
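
Editor's illustration (not part of any patch above): the rule enforced by the final nfsd4 change is that a lock request is only granted against an open whose access mode covers it. The stand-alone sketch below is a simplified user-space model of that check; the names and constants are stand-ins for the kernel's NFS4_*_LT lock types, the RD_STATE/WR_STATE flags and nfs4_check_openmode(), not the real definitions.

/*
 * Simplified, illustrative model only -- not kernel code.
 * Stand-ins: lock_type ~ NFS4_*_LT, RD_STATE/WR_STATE ~ nfsd's access
 * flags, check_openmode() ~ nfs4_check_openmode().
 */
#include <stdio.h>

enum lock_type { READ_LT, READW_LT, WRITE_LT, WRITEW_LT };

#define RD_STATE 0x1u
#define WR_STATE 0x2u

/* Map a requested lock type to the open access it requires (cf. setlkflg()). */
static unsigned int required_access(enum lock_type type)
{
	return (type == READ_LT || type == READW_LT) ? RD_STATE : WR_STATE;
}

/* Return 0 if the open's access bitmap covers the lock, nonzero otherwise. */
static int check_openmode(unsigned int open_access, enum lock_type type)
{
	return (open_access & required_access(type)) ? 0 : -1;
}

int main(void)
{
	unsigned int open_rd_only = RD_STATE;	/* file opened for read only */

	printf("read lock on read-only open:  %s\n",
	       check_openmode(open_rd_only, READ_LT) ? "denied" : "allowed");
	printf("write lock on read-only open: %s\n",
	       check_openmode(open_rd_only, WRITE_LT) ? "denied" : "allowed");
	return 0;
}

Built with any C compiler, this prints that a read lock on a read-only open is allowed while a write lock on the same open is denied, mirroring the setlkflg() mapping and the nfs4_check_openmode() calls added in the patch.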