From a19cbd4bf258840ade3b6ee9e9256006d0644e09 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 8 Mar 2006 14:03:09 -0800 Subject: Mark the pipe file operations static They aren't used (nor even really usable) outside of pipe.c anyway Signed-off-by: Linus Torvalds --- include/linux/fs.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/include/linux/fs.h b/include/linux/fs.h index e059da947007..0cc34b1c42c9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1418,9 +1418,6 @@ extern int is_bad_inode(struct inode *); extern struct file_operations read_fifo_fops; extern struct file_operations write_fifo_fops; extern struct file_operations rdwr_fifo_fops; -extern struct file_operations read_pipe_fops; -extern struct file_operations write_pipe_fops; -extern struct file_operations rdwr_pipe_fops; extern int fs_may_remount_ro(struct super_block *); -- cgit v1.2.3 From 529bf6be5c04f2e869d07bfdb122e9fd98ade714 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 7 Mar 2006 21:55:35 -0800 Subject: [PATCH] fix file counting I have benchmarked this on an x86_64 NUMA system and see no significant performance difference on kernbench. Tested on both x86_64 and powerpc. The way we do file struct accounting is not very suitable for batched freeing. For scalability reasons, file accounting was constructor/destructor based. This meant that nr_files was decremented only when the object was removed from the slab cache. This is susceptible to slab fragmentation. With RCU based file structure, consequent batched freeing and a test program like Serge's, we just speed this up and end up with a very fragmented slab - llm22:~ # cat /proc/sys/fs/file-nr 587730 0 758844 At the same time, I see only a 2000+ objects in filp cache. The following patch I fixes this problem. This patch changes the file counting by removing the filp_count_lock. Instead we use a separate percpu counter, nr_files, for now and all accesses to it are through get_nr_files() api. In the sysctl handler for nr_files, we populate files_stat.nr_files before returning to user. Counting files as an when they are created and destroyed (as opposed to inside slab) allows us to correctly count open files with RCU. Signed-off-by: Dipankar Sarma Cc: "Paul E. McKenney" Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- fs/file_table.c | 87 +++++++++++++++++++++++++++++++++------------------- include/linux/file.h | 2 -- include/linux/fs.h | 1 + kernel/sysctl.c | 5 ++- net/unix/af_unix.c | 2 +- 6 files changed, 62 insertions(+), 37 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/dcache.c b/fs/dcache.c index a173bba32666..11dc83092d4a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1736,7 +1736,7 @@ void __init vfs_caches_init(unsigned long mempages) SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, filp_ctor, filp_dtor); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); dcache_init(mempages); inode_init(mempages); diff --git a/fs/file_table.c b/fs/file_table.c index 768b58167543..44fabeaa9415 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -5,6 +5,7 @@ * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) */ +#include #include #include #include @@ -19,52 +20,67 @@ #include #include #include +#include +#include + +#include /* sysctl tunables... */ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -EXPORT_SYMBOL(files_stat); /* Needed by unix.o */ - /* public. Not pretty! */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); -static DEFINE_SPINLOCK(filp_count_lock); +static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* slab constructors and destructors are called from arbitrary - * context and must be fully threaded - use a local spinlock - * to protect files_stat.nr_files - */ -void filp_ctor(void *objp, struct kmem_cache *cachep, unsigned long cflags) +static inline void file_free_rcu(struct rcu_head *head) { - if ((cflags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == - SLAB_CTOR_CONSTRUCTOR) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files++; - spin_unlock_irqrestore(&filp_count_lock, flags); - } + struct file *f = container_of(head, struct file, f_u.fu_rcuhead); + kmem_cache_free(filp_cachep, f); } -void filp_dtor(void *objp, struct kmem_cache *cachep, unsigned long dflags) +static inline void file_free(struct file *f) { - unsigned long flags; - spin_lock_irqsave(&filp_count_lock, flags); - files_stat.nr_files--; - spin_unlock_irqrestore(&filp_count_lock, flags); + percpu_counter_dec(&nr_files); + call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } -static inline void file_free_rcu(struct rcu_head *head) +/* + * Return the total number of open files in the system + */ +static int get_nr_files(void) { - struct file *f = container_of(head, struct file, f_u.fu_rcuhead); - kmem_cache_free(filp_cachep, f); + return percpu_counter_read_positive(&nr_files); } -static inline void file_free(struct file *f) +/* + * Return the maximum number of open files in the system + */ +int get_max_files(void) { - call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); + return files_stat.max_files; } +EXPORT_SYMBOL_GPL(get_max_files); + +/* + * Handle nr_files sysctl + */ +#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + files_stat.nr_files = get_nr_files(); + return proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +#else +int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} +#endif /* Find an unused file structure and return a pointer to it. * Returns NULL, if there are no more free file structures or @@ -78,14 +94,20 @@ struct file *get_empty_filp(void) /* * Privileged users can go above max_files */ - if (files_stat.nr_files >= files_stat.max_files && - !capable(CAP_SYS_ADMIN)) - goto over; + if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + /* + * percpu_counters are inaccurate. Do an expensive check before + * we go and fail. + */ + if (percpu_counter_sum(&nr_files) >= files_stat.max_files) + goto over; + } f = kmem_cache_alloc(filp_cachep, GFP_KERNEL); if (f == NULL) goto fail; + percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); if (security_file_alloc(f)) goto fail_sec; @@ -101,10 +123,10 @@ struct file *get_empty_filp(void) over: /* Ran out of filps - report that */ - if (files_stat.nr_files > old_max) { + if (get_nr_files() > old_max) { printk(KERN_INFO "VFS: file-max limit %d reached\n", - files_stat.max_files); - old_max = files_stat.nr_files; + get_max_files()); + old_max = get_nr_files(); } goto fail; @@ -276,4 +298,5 @@ void __init files_init(unsigned long mempages) if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + percpu_counter_init(&nr_files); } diff --git a/include/linux/file.h b/include/linux/file.h index 418b6101b59a..9901b850f2e4 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -60,8 +60,6 @@ extern void put_filp(struct file *); extern int get_unused_fd(void); extern void FASTCALL(put_unused_fd(unsigned int fd)); struct kmem_cache; -extern void filp_ctor(void * objp, struct kmem_cache *cachep, unsigned long cflags); -extern void filp_dtor(void * objp, struct kmem_cache *cachep, unsigned long dflags); extern struct file ** alloc_fd_array(int); extern void free_fd_array(struct file **, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index 0cc34b1c42c9..51c0c93bdf93 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -35,6 +35,7 @@ struct files_stat_struct { int max_files; /* tunable */ }; extern struct files_stat_struct files_stat; +extern int get_max_files(void); struct inodes_stat_t { int nr_inodes; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index de2d9109194e..32b48e8ee36e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,6 +50,9 @@ #include #include +extern int proc_nr_files(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ @@ -943,7 +946,7 @@ static ctl_table fs_table[] = { .data = &files_stat, .maxlen = 3*sizeof(int), .mode = 0444, - .proc_handler = &proc_dointvec, + .proc_handler = &proc_nr_files, }, { .ctl_name = FS_MAXFILE, diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1b5989b1b670..c323cc6a28b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -547,7 +547,7 @@ static struct sock * unix_create1(struct socket *sock) struct sock *sk = NULL; struct unix_sock *u; - if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + if (atomic_read(&unix_nr_socks) >= 2*get_max_files()) goto out; sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); -- cgit v1.2.3 From 0adb25d2e71ab047423d6fc63d5d184590d0a66f Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sat, 11 Mar 2006 03:27:13 -0800 Subject: [PATCH] ext3: ext3_symlink should use GFP_NOFS allocations inside This patch fixes illegal __GFP_FS allocation inside ext3 transaction in ext3_symlink(). Such allocation may re-enter ext3 code from try_to_free_pages. But JBD/ext3 code keeps a pointer to current journal handle in task_struct and, hence, is not reentrable. This bug led to "Assertion failure in journal_dirty_metadata()" messages. http://bugzilla.openvz.org/show_bug.cgi?id=115 Signed-off-by: Andrey Savochkin Signed-off-by: Kirill Korotaev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/namei.c | 3 ++- fs/namei.c | 13 +++++++++++-- include/linux/fs.h | 2 ++ 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include/linux/fs.h') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 8bd8ac077704..b8f5cd1e540d 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2141,7 +2141,8 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = page_symlink(inode, symname, l); + err = __page_symlink(inode, symname, l, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); if (err) { ext3_dec_count(handle, inode); ext3_mark_inode_dirty(handle, inode); diff --git a/fs/namei.c b/fs/namei.c index 557dcf395ca1..8dc2b038d5d9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2613,13 +2613,15 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) } } -int page_symlink(struct inode *inode, const char *symname, int len) +int __page_symlink(struct inode *inode, const char *symname, int len, + gfp_t gfp_mask) { struct address_space *mapping = inode->i_mapping; - struct page *page = grab_cache_page(mapping, 0); + struct page *page; int err = -ENOMEM; char *kaddr; + page = find_or_create_page(mapping, 0, gfp_mask); if (!page) goto fail; err = mapping->a_ops->prepare_write(NULL, page, 0, len-1); @@ -2654,6 +2656,12 @@ fail: return err; } +int page_symlink(struct inode *inode, const char *symname, int len) +{ + return __page_symlink(inode, symname, len, + mapping_gfp_mask(inode->i_mapping)); +} + struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -2672,6 +2680,7 @@ EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(page_follow_link_light); EXPORT_SYMBOL(page_put_link); EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(__page_symlink); EXPORT_SYMBOL(page_symlink); EXPORT_SYMBOL(page_symlink_inode_operations); EXPORT_SYMBOL(path_lookup); diff --git a/include/linux/fs.h b/include/linux/fs.h index 51c0c93bdf93..128d0082522c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1664,6 +1664,8 @@ extern int vfs_follow_link(struct nameidata *, const char *); extern int page_readlink(struct dentry *, char __user *, int); extern void *page_follow_link_light(struct dentry *, struct nameidata *); extern void page_put_link(struct dentry *, struct nameidata *, void *); +extern int __page_symlink(struct inode *inode, const char *symname, int len, + gfp_t gfp_mask); extern int page_symlink(struct inode *inode, const char *symname, int len); extern struct inode_operations page_symlink_inode_operations; extern int generic_readlink(struct dentry *, char __user *, int); -- cgit v1.2.3