From 6bfde05bf5c9682e255c6a2c669dc80f91af6296 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Mon, 21 Sep 2009 17:03:43 -0700 Subject: hugetlbfs: allow the creation of files suitable for MAP_PRIVATE on the vfs internal mount This patchset adds a flag to mmap that allows the user to request that an anonymous mapping be backed with huge pages. This mapping will borrow functionality from the huge page shm code to create a file on the kernel internal mount and use it to approximate an anonymous mapping. The MAP_HUGETLB flag is a modifier to MAP_ANONYMOUS and will not work without both flags being preset. A new flag is necessary because there is no other way to hook into huge pages without creating a file on a hugetlbfs mount which wouldn't be MAP_ANONYMOUS. To userspace, this mapping will behave just like an anonymous mapping because the file is not accessible outside of the kernel. This patchset is meant to simplify the programming model. Presently there is a large chunk of boiler platecode, contained in libhugetlbfs, required to create private, hugepage backed mappings. This patch set would allow use of hugepages without linking to libhugetlbfs or having hugetblfs mounted. Unification of the VM code would provide these same benefits, but it has been resisted each time that it has been suggested for several reasons: it would break PAGE_SIZE assumptions across the kernel, it makes page-table abstractions really expensive, and it does not provide any benefit on architectures that do not support huge pages, incurring fast path penalties without providing any benefit on these architectures. This patch: There are two means of creating mappings backed by huge pages: 1. mmap() a file created on hugetlbfs 2. Use shm which creates a file on an internal mount which essentially maps it MAP_SHARED The internal mount is only used for shared mappings but there is very little that stops it being used for private mappings. This patch extends hugetlbfs_file_setup() to deal with the creation of files that will be mapped MAP_PRIVATE on the internal hugetlbfs mount. This extended API is used in a subsequent patch to implement the MAP_HUGETLB mmap() flag. Signed-off-by: Eric Munson Acked-by: David Rientjes Cc: Mel Gorman Cc: Adam Litke Cc: David Gibson Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a93b885311d8..06b7c2623f99 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -507,6 +507,13 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; INIT_LIST_HEAD(&inode->i_mapping->private_list); info = HUGETLBFS_I(inode); + /* + * The policy is initialized here even if we are creating a + * private inode because initialization simply creates an + * an empty rb tree and calls spin_lock_init(), later when we + * call mpol_free_shared_policy() it will just return because + * the rb tree will still be empty. + */ mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: @@ -931,13 +938,19 @@ static struct file_system_type hugetlbfs_fs_type = { static struct vfsmount *hugetlbfs_vfsmount; -static int can_do_hugetlb_shm(void) +static int can_do_hugetlb_shm(int creat_flags) { - return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); + if (creat_flags != HUGETLB_SHMFS_INODE) + return 0; + if (capable(CAP_IPC_LOCK)) + return 1; + if (in_group_p(sysctl_hugetlb_shm_group)) + return 1; + return 0; } struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, - struct user_struct **user) + struct user_struct **user, int creat_flags) { int error = -ENOMEM; struct file *file; @@ -949,7 +962,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (!can_do_hugetlb_shm()) { + if (!can_do_hugetlb_shm(creat_flags)) { *user = current_user(); if (user_shm_lock(size, *user)) { WARN_ONCE(1, -- cgit v1.2.3 From 1fd7317d02ec03c6fdf072317841287933d06d24 Mon Sep 17 00:00:00 2001 From: Nick Black Date: Tue, 22 Sep 2009 16:43:33 -0700 Subject: Move magic numbers into magic.h Move various magic-number definitions into magic.h. Signed-off-by: Nick Black Acked-by: Pekka Enberg Cc: Al Viro Cc: "David S. Miller" Cc: Casey Schaufler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/devpts/inode.c | 3 +-- fs/hugetlbfs/inode.c | 4 +--- include/linux/magic.h | 5 +++++ net/socket.c | 3 +-- security/smack/smack_lsm.c | 8 +------- 5 files changed, 9 insertions(+), 14 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 75efb028974b..d5f8c96964be 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -18,14 +18,13 @@ #include #include #include +#include #include #include #include #include #include -#define DEVPTS_SUPER_MAGIC 0x1cd1 - #define DEVPTS_DEFAULT_MODE 0600 /* * ptmx is a new node in /dev/pts and will be unused in legacy (single- diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 06b7c2623f99..eba6d552d9c9 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -31,12 +31,10 @@ #include #include #include +#include #include -/* some random number */ -#define HUGETLBFS_MAGIC 0x958458f6 - static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; const struct file_operations hugetlbfs_file_operations; diff --git a/include/linux/magic.h b/include/linux/magic.h index 1923327b9869..bce37786a0a5 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -13,6 +13,7 @@ #define SECURITYFS_MAGIC 0x73636673 #define SELINUX_MAGIC 0xf97cff8c #define TMPFS_MAGIC 0x01021994 +#define HUGETLBFS_MAGIC 0x958458f6 /* some random number */ #define SQUASHFS_MAGIC 0x73717368 #define EFS_SUPER_MAGIC 0x414A53 #define EXT2_SUPER_MAGIC 0xEF53 @@ -53,4 +54,8 @@ #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA #define STACK_END_MAGIC 0x57AC6E9D + +#define DEVPTS_SUPER_MAGIC 0x1cd1 +#define SOCKFS_MAGIC 0x534F434B + #endif /* __LINUX_MAGIC_H__ */ diff --git a/net/socket.c b/net/socket.c index 0ad02ae61a91..49917a1cac7d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include @@ -235,8 +236,6 @@ int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, return __put_user(klen, ulen); } -#define SOCKFS_MAGIC 0x534F434B - static struct kmem_cache *sock_inode_cachep __read_mostly; static struct inode *sock_alloc_inode(struct super_block *sb) diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index acae7ef4092d..c33b6bb9b6dd 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -30,17 +30,11 @@ #include #include #include +#include #include "smack.h" #define task_security(task) (task_cred_xxx((task), security)) -/* - * I hope these are the hokeyist lines of code in the module. Casey. - */ -#define DEVPTS_SUPER_MAGIC 0x1cd1 -#define SOCKFS_MAGIC 0x534F434B -#define TMPFS_MAGIC 0x01021994 - /** * smk_fetch - Fetch the smack label from a file. * @ip: a pointer to the inode -- cgit v1.2.3 From 22fe404218156328a27e66349b1175cd0baa4990 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 18 Sep 2009 13:05:44 -0700 Subject: vfs: split generic_forget_inode() so that hugetlbfs does not have to copy it Hugetlbfs needs to do special things instead of truncate_inode_pages(). Currently, it copied generic_forget_inode() except for truncate_inode_pages() call which is asking for trouble (the code there isn't trivial). So create a separate function generic_detach_inode() which does all the list magic done in generic_forget_inode() and call it from hugetlbfs_forget_inode(). Signed-off-by: Jan Kara Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/hugetlbfs/inode.c | 33 ++++----------------------------- fs/inode.c | 21 +++++++++++++++++++-- include/linux/fs.h | 1 + 3 files changed, 24 insertions(+), 31 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index eba6d552d9c9..478a169a262d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -380,36 +380,11 @@ static void hugetlbfs_delete_inode(struct inode *inode) static void hugetlbfs_forget_inode(struct inode *inode) __releases(inode_lock) { - struct super_block *sb = inode->i_sb; - - if (!hlist_unhashed(&inode->i_hash)) { - if (!(inode->i_state & (I_DIRTY|I_SYNC))) - list_move(&inode->i_list, &inode_unused); - inodes_stat.nr_unused++; - if (!sb || (sb->s_flags & MS_ACTIVE)) { - spin_unlock(&inode_lock); - return; - } - inode->i_state |= I_WILL_FREE; - spin_unlock(&inode_lock); - /* - * write_inode_now is a noop as we set BDI_CAP_NO_WRITEBACK - * in our backing_dev_info. - */ - write_inode_now(inode, 1); - spin_lock(&inode_lock); - inode->i_state &= ~I_WILL_FREE; - inodes_stat.nr_unused--; - hlist_del_init(&inode->i_hash); + if (generic_detach_inode(inode)) { + truncate_hugepages(inode, 0); + clear_inode(inode); + destroy_inode(inode); } - list_del_init(&inode->i_list); - list_del_init(&inode->i_sb_list); - inode->i_state |= I_FREEING; - inodes_stat.nr_inodes--; - spin_unlock(&inode_lock); - truncate_hugepages(inode, 0); - clear_inode(inode); - destroy_inode(inode); } static void hugetlbfs_drop_inode(struct inode *inode) diff --git a/fs/inode.c b/fs/inode.c index 07d775ea6161..fa506d539653 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1241,7 +1241,16 @@ void generic_delete_inode(struct inode *inode) } EXPORT_SYMBOL(generic_delete_inode); -static void generic_forget_inode(struct inode *inode) +/** + * generic_detach_inode - remove inode from inode lists + * @inode: inode to remove + * + * Remove inode from inode lists, write it if it's dirty. This is just an + * internal VFS helper exported for hugetlbfs. Do not use! + * + * Returns 1 if inode should be completely destroyed. + */ +int generic_detach_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; @@ -1251,7 +1260,7 @@ static void generic_forget_inode(struct inode *inode) inodes_stat.nr_unused++; if (sb->s_flags & MS_ACTIVE) { spin_unlock(&inode_lock); - return; + return 0; } WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; @@ -1269,6 +1278,14 @@ static void generic_forget_inode(struct inode *inode) inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); + return 1; +} +EXPORT_SYMBOL_GPL(generic_detach_inode); + +static void generic_forget_inode(struct inode *inode) +{ + if (!generic_detach_inode(inode)) + return; if (inode->i_data.nrpages) truncate_inode_pages(&inode->i_data, 0); clear_inode(inode); diff --git a/include/linux/fs.h b/include/linux/fs.h index 51803528b095..955e34615cb7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2156,6 +2156,7 @@ extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); extern void generic_delete_inode(struct inode *inode); extern void generic_drop_inode(struct inode *inode); +extern int generic_detach_inode(struct inode *inode); extern struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), -- cgit v1.2.3 From ef1ff6b8c08954bc203b59e887d1e580dd91755a Mon Sep 17 00:00:00 2001 From: "From: Mel Gorman" Date: Wed, 23 Sep 2009 15:56:05 -0700 Subject: hugetlbfs: do not call user_shm_lock() for MAP_HUGETLB fix Commit 6bfde05bf5c ("hugetlbfs: allow the creation of files suitable for MAP_PRIVATE on the vfs internal mount") altered can_do_hugetlb_shm() to check if a file is being created for shared memory or mmap(). If this returns false, we then unconditionally call user_shm_lock() triggering a warning. This block should never be entered for MAP_HUGETLB. This patch partially reverts the problem and fixes the check. Signed-off-by: Eric B Munson Cc: David Rientjes Cc: Mel Gorman Cc: Adam Litke Cc: David Gibson Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs/hugetlbfs/inode.c') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index eba6d552d9c9..133335479c24 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -936,15 +936,9 @@ static struct file_system_type hugetlbfs_fs_type = { static struct vfsmount *hugetlbfs_vfsmount; -static int can_do_hugetlb_shm(int creat_flags) +static int can_do_hugetlb_shm(void) { - if (creat_flags != HUGETLB_SHMFS_INODE) - return 0; - if (capable(CAP_IPC_LOCK)) - return 1; - if (in_group_p(sysctl_hugetlb_shm_group)) - return 1; - return 0; + return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, @@ -960,7 +954,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); - if (!can_do_hugetlb_shm(creat_flags)) { + if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { *user = current_user(); if (user_shm_lock(size, *user)) { WARN_ONCE(1, -- cgit v1.2.3