From 1a7bd2265fc57f29400d57f66275cc5918e30aa6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Aug 2012 17:18:05 -0400
Subject: make get_unused_fd_flags() a function

... and get_unused_fd() a macro around it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index ba3f6053025c..5b46e9970a7a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -477,8 +477,8 @@ out:
 	return error;
 }
 
-int get_unused_fd(void)
+int get_unused_fd_flags(unsigned flags)
 {
-	return alloc_fd(0, 0);
+	return alloc_fd(0, flags);
 }
-EXPORT_SYMBOL(get_unused_fd);
+EXPORT_SYMBOL(get_unused_fd_flags);
-- 
cgit v1.2.3


From c921b40d6201f7ec7b1edf7ea9a844f93e1a27f4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Aug 2012 18:04:37 -0400
Subject: autofs4: don't open-code fd_install()

The only difference between autofs_dev_ioctl_fd_install() and
fd_install() is __set_close_on_exec() done by the latter.  Just
use get_unused_fd_flags(O_CLOEXEC) to allocate the descriptor
and be done with that...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index abf645c1703b..a16214109d31 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -221,20 +221,6 @@ static int test_by_type(struct path *path, void *p)
 	return ino && ino->sbi->type & *(unsigned *)p;
 }
 
-static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
-{
-	struct files_struct *files = current->files;
-	struct fdtable *fdt;
-
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	BUG_ON(fdt->fd[fd] != NULL);
-	rcu_assign_pointer(fdt->fd[fd], file);
-	__set_close_on_exec(fd, fdt);
-	spin_unlock(&files->file_lock);
-}
-
-
 /*
  * Open a file descriptor on the autofs mount point corresponding
  * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
@@ -243,7 +229,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
 {
 	int err, fd;
 
-	fd = get_unused_fd();
+	fd = get_unused_fd_flags(O_CLOEXEC);
 	if (likely(fd >= 0)) {
 		struct file *filp;
 		struct path path;
@@ -264,7 +250,7 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
 			goto out;
 		}
 
-		autofs_dev_ioctl_fd_install(fd, filp);
+		fd_install(fd, filp);
 	}
 
 	return fd;
-- 
cgit v1.2.3


From 5b249b1b07c42c61d0c53b8ab4fad026e39a9be9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Aug 2012 12:17:29 -0400
Subject: pipe(2) - race-free error recovery

don't mess with sys_close() if copy_to_user() fails; just postpone
fd_install() until we know it hasn't.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/pipe.c | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 8d85d7068c1e..bd3479db4b62 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1064,9 +1064,8 @@ err_inode:
 	return err;
 }
 
-int do_pipe_flags(int *fd, int flags)
+static int __do_pipe_flags(int *fd, struct file **files, int flags)
 {
-	struct file *files[2];
 	int error;
 	int fdw, fdr;
 
@@ -1088,11 +1087,8 @@ int do_pipe_flags(int *fd, int flags)
 	fdw = error;
 
 	audit_fd_pair(fdr, fdw);
-	fd_install(fdr, files[0]);
-	fd_install(fdw, files[1]);
 	fd[0] = fdr;
 	fd[1] = fdw;
-
 	return 0;
 
  err_fdr:
@@ -1103,21 +1099,38 @@ int do_pipe_flags(int *fd, int flags)
 	return error;
 }
 
+int do_pipe_flags(int *fd, int flags)
+{
+	struct file *files[2];
+	int error = __do_pipe_flags(fd, files, flags);
+	if (!error) {
+		fd_install(fd[0], files[0]);
+		fd_install(fd[1], files[1]);
+	}
+	return error;
+}
+
 /*
  * sys_pipe() is the normal C calling standard for creating
  * a pipe. It's not the way Unix traditionally does this, though.
  */
 SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
 {
+	struct file *files[2];
 	int fd[2];
 	int error;
 
-	error = do_pipe_flags(fd, flags);
+	error = __do_pipe_flags(fd, files, flags);
 	if (!error) {
-		if (copy_to_user(fildes, fd, sizeof(fd))) {
-			sys_close(fd[0]);
-			sys_close(fd[1]);
+		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
+			fput(files[0]);
+			fput(files[1]);
+			put_unused_fd(fd[0]);
+			put_unused_fd(fd[1]);
 			error = -EFAULT;
+		} else {
+			fd_install(fd[0], files[0]);
+			fd_install(fd[1], files[1]);
 		}
 	}
 	return error;
-- 
cgit v1.2.3


From 352e3b249284235e00745f3e71fc348b913e5deb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Aug 2012 12:30:45 -0400
Subject: fanotify: sanitize failure exits in copy_event_to_user()

* do copy_to_user() before prepare_for_access_response(); that kills
the need in remove_access_response().
* don't do fd_install() until we are past the last possible failure
exit.  Don't use sys_close() on cleanup side - just put_unused_fd()
and fput().  Less racy that way...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/notify/fanotify/fanotify_user.c | 59 +++++++++++++-------------------------
 1 file changed, 20 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index d43803669739..ea48693940f1 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -58,7 +58,9 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 	return fsnotify_remove_notify_event(group);
 }
 
-static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
+static int create_fd(struct fsnotify_group *group,
+			struct fsnotify_event *event,
+			struct file **file)
 {
 	int client_fd;
 	struct file *new_file;
@@ -98,7 +100,7 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 		put_unused_fd(client_fd);
 		client_fd = PTR_ERR(new_file);
 	} else {
-		fd_install(client_fd, new_file);
+		*file = new_file;
 	}
 
 	return client_fd;
@@ -106,13 +108,15 @@ static int create_fd(struct fsnotify_group *group, struct fsnotify_event *event)
 
 static int fill_event_metadata(struct fsnotify_group *group,
 				   struct fanotify_event_metadata *metadata,
-				   struct fsnotify_event *event)
+				   struct fsnotify_event *event,
+				   struct file **file)
 {
 	int ret = 0;
 
 	pr_debug("%s: group=%p metadata=%p event=%p\n", __func__,
 		 group, metadata, event);
 
+	*file = NULL;
 	metadata->event_len = FAN_EVENT_METADATA_LEN;
 	metadata->metadata_len = FAN_EVENT_METADATA_LEN;
 	metadata->vers = FANOTIFY_METADATA_VERSION;
@@ -121,7 +125,7 @@ static int fill_event_metadata(struct fsnotify_group *group,
 	if (unlikely(event->mask & FAN_Q_OVERFLOW))
 		metadata->fd = FAN_NOFD;
 	else {
-		metadata->fd = create_fd(group, event);
+		metadata->fd = create_fd(group, event, file);
 		if (metadata->fd < 0)
 			ret = metadata->fd;
 	}
@@ -220,25 +224,6 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 	return 0;
 }
 
-static void remove_access_response(struct fsnotify_group *group,
-				   struct fsnotify_event *event,
-				   __s32 fd)
-{
-	struct fanotify_response_event *re;
-
-	if (!(event->mask & FAN_ALL_PERM_EVENTS))
-		return;
-
-	re = dequeue_re(group, fd);
-	if (!re)
-		return;
-
-	BUG_ON(re->event != event);
-
-	kmem_cache_free(fanotify_response_event_cache, re);
-
-	return;
-}
 #else
 static int prepare_for_access_response(struct fsnotify_group *group,
 				       struct fsnotify_event *event,
@@ -247,12 +232,6 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 	return 0;
 }
 
-static void remove_access_response(struct fsnotify_group *group,
-				   struct fsnotify_event *event,
-				   __s32 fd)
-{
-	return;
-}
 #endif
 
 static ssize_t copy_event_to_user(struct fsnotify_group *group,
@@ -260,31 +239,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 				  char __user *buf)
 {
 	struct fanotify_event_metadata fanotify_event_metadata;
+	struct file *f;
 	int fd, ret;
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	ret = fill_event_metadata(group, &fanotify_event_metadata, event);
+	ret = fill_event_metadata(group, &fanotify_event_metadata, event, &f);
 	if (ret < 0)
 		goto out;
 
 	fd = fanotify_event_metadata.fd;
-	ret = prepare_for_access_response(group, event, fd);
-	if (ret)
-		goto out_close_fd;
-
 	ret = -EFAULT;
 	if (copy_to_user(buf, &fanotify_event_metadata,
 			 fanotify_event_metadata.event_len))
-		goto out_kill_access_response;
+		goto out_close_fd;
+
+	ret = prepare_for_access_response(group, event, fd);
+	if (ret)
+		goto out_close_fd;
 
+	fd_install(fd, f);
 	return fanotify_event_metadata.event_len;
 
-out_kill_access_response:
-	remove_access_response(group, event, fd);
 out_close_fd:
-	if (fd != FAN_NOFD)
-		sys_close(fd);
+	if (fd != FAN_NOFD) {
+		put_unused_fd(fd);
+		fput(f);
+	}
 out:
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	if (event->mask & FAN_ALL_PERM_EVENTS) {
-- 
cgit v1.2.3


From f33ff9927f42045116d738ee47ff7bc59f739bd7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Aug 2012 16:17:59 -0400
Subject: take rlimit check to callers of expand_files()

... except for one in android, where the check is different
and already done in caller.  No need to recalculate rlimit
many times in alloc_fd() either.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fcntl.c |  3 +++
 fs/file.c  | 16 +++++++++-------
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 887b5ba8c9b5..08e6af5c1b1f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -64,6 +64,9 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
 	if (unlikely(oldfd == newfd))
 		return -EINVAL;
 
+	if (newfd >= rlimit(RLIMIT_NOFILE))
+		return -EMFILE;
+
 	spin_lock(&files->file_lock);
 	err = expand_files(files, newfd);
 	file = fcheck(oldfd);
diff --git a/fs/file.c b/fs/file.c
index 5b46e9970a7a..08922af4a62c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -251,13 +251,6 @@ int expand_files(struct files_struct *files, int nr)
 
 	fdt = files_fdtable(files);
 
-	/*
-	 * N.B. For clone tasks sharing a files structure, this test
-	 * will limit the total number of files that can be opened.
-	 */
-	if (nr >= rlimit(RLIMIT_NOFILE))
-		return -EMFILE;
-
 	/* Do we need to expand? */
 	if (nr < fdt->max_fds)
 		return 0;
@@ -431,6 +424,7 @@ int alloc_fd(unsigned start, unsigned flags)
 {
 	struct files_struct *files = current->files;
 	unsigned int fd;
+	unsigned end = rlimit(RLIMIT_NOFILE);
 	int error;
 	struct fdtable *fdt;
 
@@ -444,6 +438,14 @@ repeat:
 	if (fd < fdt->max_fds)
 		fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
 
+	/*
+	 * N.B. For clone tasks sharing a files structure, this test
+	 * will limit the total number of files that can be opened.
+	 */
+	error = -EMFILE;
+	if (fd >= end)
+		goto out;
+
 	error = expand_files(files, fd);
 	if (error < 0)
 		goto out;
-- 
cgit v1.2.3


From dcfadfa4ec5a12404a99ad6426871a6b03a62b37 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 12 Aug 2012 17:27:30 -0400
Subject: new helper: __alloc_fd()

Essentially, alloc_fd() in a files_struct we own a reference to.
Most of the time wanting to use it is a sign of lousy API
design (such as android/binder).  It's *not* a general-purpose
interface; better that than open-coding its guts, but again,
playing with other process' descriptor table is a sign of bad
design.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/staging/android/binder.c | 59 ++++------------------------------------
 fs/file.c                        | 12 +++++---
 include/linux/fdtable.h          |  3 ++
 3 files changed, 16 insertions(+), 58 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index b9a534c46aac..4946d282a35c 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -362,71 +362,22 @@ struct binder_transaction {
 static void
 binder_defer_work(struct binder_proc *proc, enum binder_deferred_state defer);
 
-/*
- * copied from get_unused_fd_flags
- */
 int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
 {
 	struct files_struct *files = proc->files;
-	int fd, error;
-	struct fdtable *fdt;
 	unsigned long rlim_cur;
 	unsigned long irqs;
 
 	if (files == NULL)
 		return -ESRCH;
 
-	error = -EMFILE;
-	spin_lock(&files->file_lock);
+	if (!lock_task_sighand(proc->tsk, &irqs))
+		return -EMFILE;
 
-repeat:
-	fdt = files_fdtable(files);
-	fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, files->next_fd);
-
-	/*
-	 * N.B. For clone tasks sharing a files structure, this test
-	 * will limit the total number of files that can be opened.
-	 */
-	rlim_cur = 0;
-	if (lock_task_sighand(proc->tsk, &irqs)) {
-		rlim_cur = proc->tsk->signal->rlim[RLIMIT_NOFILE].rlim_cur;
-		unlock_task_sighand(proc->tsk, &irqs);
-	}
-	if (fd >= rlim_cur)
-		goto out;
-
-	/* Do we need to expand the fd array or fd set?  */
-	error = expand_files(files, fd);
-	if (error < 0)
-		goto out;
-
-	if (error) {
-		/*
-		 * If we needed to expand the fs array we
-		 * might have blocked - try again.
-		 */
-		error = -EMFILE;
-		goto repeat;
-	}
-
-	__set_open_fd(fd, fdt);
-	if (flags & O_CLOEXEC)
-		__set_close_on_exec(fd, fdt);
-	else
-		__clear_close_on_exec(fd, fdt);
-	files->next_fd = fd + 1;
-#if 1
-	/* Sanity check */
-	if (fdt->fd[fd] != NULL) {
-		pr_warn("get_unused_fd: slot %d not NULL!\n", fd);
-		fdt->fd[fd] = NULL;
-	}
-#endif
-	error = fd;
+	rlim_cur = task_rlimit(proc->tsk, RLIMIT_NOFILE);
+	unlock_task_sighand(proc->tsk, &irqs);
 
-out:
-	spin_unlock(&files->file_lock);
-	return error;
+	return __alloc_fd(files, 0, rlim_cur, flags);
 }
 
 /*
diff --git a/fs/file.c b/fs/file.c
index 08922af4a62c..a3a0705f8f51 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -420,11 +420,10 @@ struct files_struct init_files = {
 /*
  * allocate a file descriptor, mark it busy.
  */
-int alloc_fd(unsigned start, unsigned flags)
+int __alloc_fd(struct files_struct *files,
+	       unsigned start, unsigned end, unsigned flags)
 {
-	struct files_struct *files = current->files;
 	unsigned int fd;
-	unsigned end = rlimit(RLIMIT_NOFILE);
 	int error;
 	struct fdtable *fdt;
 
@@ -479,8 +478,13 @@ out:
 	return error;
 }
 
+int alloc_fd(unsigned start, unsigned flags)
+{
+	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
+}
+
 int get_unused_fd_flags(unsigned flags)
 {
-	return alloc_fd(0, flags);
+	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
 }
 EXPORT_SYMBOL(get_unused_fd_flags);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 158a41eed314..b84ca064f727 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -125,6 +125,9 @@ void reset_files_struct(struct files_struct *);
 int unshare_files(struct files_struct **);
 struct files_struct *dup_fd(struct files_struct *, int *);
 
+extern int __alloc_fd(struct files_struct *files,
+		      unsigned start, unsigned end, unsigned flags);
+
 extern struct kmem_cache *files_cachep;
 
 #endif /* __LINUX_FDTABLE_H */
-- 
cgit v1.2.3


From 7cf4dc3c8dbfdfde163d4636f621cf99a1f63bfb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Aug 2012 19:56:12 -0400
Subject: move files_struct-related bits from kernel/exit.c to fs/file.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c               | 100 +++++++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fdtable.h |   6 ---
 kernel/exit.c           |  93 --------------------------------------------
 3 files changed, 99 insertions(+), 100 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index a3a0705f8f51..0be423cadb26 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -84,7 +84,7 @@ static void free_fdtable_work(struct work_struct *work)
 	}
 }
 
-void free_fdtable_rcu(struct rcu_head *rcu)
+static void free_fdtable_rcu(struct rcu_head *rcu)
 {
 	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
 	struct fdtable_defer *fddef;
@@ -116,6 +116,11 @@ void free_fdtable_rcu(struct rcu_head *rcu)
 	}
 }
 
+static inline void free_fdtable(struct fdtable *fdt)
+{
+	call_rcu(&fdt->rcu, free_fdtable_rcu);
+}
+
 /*
  * Expand the fdset in the files_struct.  Called with the files spinlock
  * held for write.
@@ -388,6 +393,99 @@ out:
 	return NULL;
 }
 
+static void close_files(struct files_struct * files)
+{
+	int i, j;
+	struct fdtable *fdt;
+
+	j = 0;
+
+	/*
+	 * It is safe to dereference the fd table without RCU or
+	 * ->file_lock because this is the last reference to the
+	 * files structure.  But use RCU to shut RCU-lockdep up.
+	 */
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	rcu_read_unlock();
+	for (;;) {
+		unsigned long set;
+		i = j * BITS_PER_LONG;
+		if (i >= fdt->max_fds)
+			break;
+		set = fdt->open_fds[j++];
+		while (set) {
+			if (set & 1) {
+				struct file * file = xchg(&fdt->fd[i], NULL);
+				if (file) {
+					filp_close(file, files);
+					cond_resched();
+				}
+			}
+			i++;
+			set >>= 1;
+		}
+	}
+}
+
+struct files_struct *get_files_struct(struct task_struct *task)
+{
+	struct files_struct *files;
+
+	task_lock(task);
+	files = task->files;
+	if (files)
+		atomic_inc(&files->count);
+	task_unlock(task);
+
+	return files;
+}
+
+void put_files_struct(struct files_struct *files)
+{
+	struct fdtable *fdt;
+
+	if (atomic_dec_and_test(&files->count)) {
+		close_files(files);
+		/*
+		 * Free the fd and fdset arrays if we expanded them.
+		 * If the fdtable was embedded, pass files for freeing
+		 * at the end of the RCU grace period. Otherwise,
+		 * you can free files immediately.
+		 */
+		rcu_read_lock();
+		fdt = files_fdtable(files);
+		if (fdt != &files->fdtab)
+			kmem_cache_free(files_cachep, files);
+		free_fdtable(fdt);
+		rcu_read_unlock();
+	}
+}
+
+void reset_files_struct(struct files_struct *files)
+{
+	struct task_struct *tsk = current;
+	struct files_struct *old;
+
+	old = tsk->files;
+	task_lock(tsk);
+	tsk->files = files;
+	task_unlock(tsk);
+	put_files_struct(old);
+}
+
+void exit_files(struct task_struct *tsk)
+{
+	struct files_struct * files = tsk->files;
+
+	if (files) {
+		task_lock(tsk);
+		tsk->files = NULL;
+		task_unlock(tsk);
+		put_files_struct(files);
+	}
+}
+
 static void __devinit fdtable_defer_list_init(int cpu)
 {
 	struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index b84ca064f727..3855f4febe70 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -94,14 +94,8 @@ struct vfsmount;
 struct dentry;
 
 extern int expand_files(struct files_struct *, int nr);
-extern void free_fdtable_rcu(struct rcu_head *rcu);
 extern void __init files_defer_init(void);
 
-static inline void free_fdtable(struct fdtable *fdt)
-{
-	call_rcu(&fdt->rcu, free_fdtable_rcu);
-}
-
 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
 {
 	struct file * file = NULL;
diff --git a/kernel/exit.c b/kernel/exit.c
index f65345f9e5bb..20dfc7617c2e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -466,99 +466,6 @@ void daemonize(const char *name, ...)
 
 EXPORT_SYMBOL(daemonize);
 
-static void close_files(struct files_struct * files)
-{
-	int i, j;
-	struct fdtable *fdt;
-
-	j = 0;
-
-	/*
-	 * It is safe to dereference the fd table without RCU or
-	 * ->file_lock because this is the last reference to the
-	 * files structure.  But use RCU to shut RCU-lockdep up.
-	 */
-	rcu_read_lock();
-	fdt = files_fdtable(files);
-	rcu_read_unlock();
-	for (;;) {
-		unsigned long set;
-		i = j * BITS_PER_LONG;
-		if (i >= fdt->max_fds)
-			break;
-		set = fdt->open_fds[j++];
-		while (set) {
-			if (set & 1) {
-				struct file * file = xchg(&fdt->fd[i], NULL);
-				if (file) {
-					filp_close(file, files);
-					cond_resched();
-				}
-			}
-			i++;
-			set >>= 1;
-		}
-	}
-}
-
-struct files_struct *get_files_struct(struct task_struct *task)
-{
-	struct files_struct *files;
-
-	task_lock(task);
-	files = task->files;
-	if (files)
-		atomic_inc(&files->count);
-	task_unlock(task);
-
-	return files;
-}
-
-void put_files_struct(struct files_struct *files)
-{
-	struct fdtable *fdt;
-
-	if (atomic_dec_and_test(&files->count)) {
-		close_files(files);
-		/*
-		 * Free the fd and fdset arrays if we expanded them.
-		 * If the fdtable was embedded, pass files for freeing
-		 * at the end of the RCU grace period. Otherwise,
-		 * you can free files immediately.
-		 */
-		rcu_read_lock();
-		fdt = files_fdtable(files);
-		if (fdt != &files->fdtab)
-			kmem_cache_free(files_cachep, files);
-		free_fdtable(fdt);
-		rcu_read_unlock();
-	}
-}
-
-void reset_files_struct(struct files_struct *files)
-{
-	struct task_struct *tsk = current;
-	struct files_struct *old;
-
-	old = tsk->files;
-	task_lock(tsk);
-	tsk->files = files;
-	task_unlock(tsk);
-	put_files_struct(old);
-}
-
-void exit_files(struct task_struct *tsk)
-{
-	struct files_struct * files = tsk->files;
-
-	if (files) {
-		task_lock(tsk);
-		tsk->files = NULL;
-		task_unlock(tsk);
-		put_files_struct(files);
-	}
-}
-
 #ifdef CONFIG_MM_OWNER
 /*
  * A task is exiting.   If it owned this mm, find a new owner for the mm.
-- 
cgit v1.2.3


From b9e02af0ae0783894abb576fbab45ec29aa8e7fc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Aug 2012 20:00:58 -0400
Subject: don't bother with call_rcu() in put_files_struct()

At that point nobody can see us anyway; everything that
looks at files_fdtable(files) is separated from the
guts of put_files_struct(files) - either since files is
current->files or because we fetched it under task_lock()
and hadn't dropped that yet, or because we'd bumped
files->count while holding task_lock()...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 0be423cadb26..533fa5d56a5f 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -447,18 +447,14 @@ void put_files_struct(struct files_struct *files)
 
 	if (atomic_dec_and_test(&files->count)) {
 		close_files(files);
-		/*
-		 * Free the fd and fdset arrays if we expanded them.
-		 * If the fdtable was embedded, pass files for freeing
-		 * at the end of the RCU grace period. Otherwise,
-		 * you can free files immediately.
-		 */
+		/* not really needed, since nobody can see us */
 		rcu_read_lock();
 		fdt = files_fdtable(files);
-		if (fdt != &files->fdtab)
-			kmem_cache_free(files_cachep, files);
-		free_fdtable(fdt);
 		rcu_read_unlock();
+		/* free the arrays if they are not embedded */
+		if (fdt != &files->fdtab)
+			__free_fdtable(fdt);
+		kmem_cache_free(files_cachep, files);
 	}
 }
 
-- 
cgit v1.2.3


From 1983e781da2f7f77906f4ccc2c3dc279cd61d1ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Aug 2012 20:06:36 -0400
Subject: trim free_fdtable_rcu()

embedded case isn't hit anymore

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 533fa5d56a5f..4ce4a0fcf320 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -90,16 +90,8 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 	struct fdtable_defer *fddef;
 
 	BUG_ON(!fdt);
+	BUG_ON(fdt->max_fds <= NR_OPEN_DEFAULT);
 
-	if (fdt->max_fds <= NR_OPEN_DEFAULT) {
-		/*
-		 * This fdtable is embedded in the files structure and that
-		 * structure itself is getting destroyed.
-		 */
-		kmem_cache_free(files_cachep,
-				container_of(fdt, struct files_struct, fdtab));
-		return;
-	}
 	if (!is_vmalloc_addr(fdt->fd) && !is_vmalloc_addr(fdt->open_fds)) {
 		kfree(fdt->fd);
 		kfree(fdt->open_fds);
@@ -116,11 +108,6 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 	}
 }
 
-static inline void free_fdtable(struct fdtable *fdt)
-{
-	call_rcu(&fdt->rcu, free_fdtable_rcu);
-}
-
 /*
  * Expand the fdset in the files_struct.  Called with the files spinlock
  * held for write.
@@ -234,7 +221,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
 		copy_fdtable(new_fdt, cur_fdt);
 		rcu_assign_pointer(files->fdt, new_fdt);
 		if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
-			free_fdtable(cur_fdt);
+			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
 		/* Somebody else expanded, so undo our attempt */
 		__free_fdtable(new_fdt);
-- 
cgit v1.2.3


From 56007cae94f349387c088e738c7dcb6bc513063b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Aug 2012 21:03:26 -0400
Subject: move put_unused_fd() and fd_install() to fs/file.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 fs/open.c | 44 --------------------------------------------
 2 files changed, 44 insertions(+), 44 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 4ce4a0fcf320..78cf88f2a0e8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -569,3 +569,47 @@ int get_unused_fd_flags(unsigned flags)
 	return __alloc_fd(current->files, 0, rlimit(RLIMIT_NOFILE), flags);
 }
 EXPORT_SYMBOL(get_unused_fd_flags);
+
+static void __put_unused_fd(struct files_struct *files, unsigned int fd)
+{
+	struct fdtable *fdt = files_fdtable(files);
+	__clear_open_fd(fd, fdt);
+	if (fd < files->next_fd)
+		files->next_fd = fd;
+}
+
+void put_unused_fd(unsigned int fd)
+{
+	struct files_struct *files = current->files;
+	spin_lock(&files->file_lock);
+	__put_unused_fd(files, fd);
+	spin_unlock(&files->file_lock);
+}
+
+EXPORT_SYMBOL(put_unused_fd);
+
+/*
+ * Install a file pointer in the fd array.
+ *
+ * The VFS is full of places where we drop the files lock between
+ * setting the open_fds bitmap and installing the file in the file
+ * array.  At any such point, we are vulnerable to a dup2() race
+ * installing a file in the array before us.  We need to detect this and
+ * fput() the struct file we are about to overwrite in this case.
+ *
+ * It should never happen - if we allow dup2() do it, _really_ bad things
+ * will follow.
+ */
+
+void fd_install(unsigned int fd, struct file *file)
+{
+	struct files_struct *files = current->files;
+	struct fdtable *fdt;
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	BUG_ON(fdt->fd[fd] != NULL);
+	rcu_assign_pointer(fdt->fd[fd], file);
+	spin_unlock(&files->file_lock);
+}
+
+EXPORT_SYMBOL(fd_install);
diff --git a/fs/open.c b/fs/open.c
index e1f2cdb91a4d..c525bd0e65b6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -803,50 +803,6 @@ struct file *dentry_open(const struct path *path, int flags,
 }
 EXPORT_SYMBOL(dentry_open);
 
-static void __put_unused_fd(struct files_struct *files, unsigned int fd)
-{
-	struct fdtable *fdt = files_fdtable(files);
-	__clear_open_fd(fd, fdt);
-	if (fd < files->next_fd)
-		files->next_fd = fd;
-}
-
-void put_unused_fd(unsigned int fd)
-{
-	struct files_struct *files = current->files;
-	spin_lock(&files->file_lock);
-	__put_unused_fd(files, fd);
-	spin_unlock(&files->file_lock);
-}
-
-EXPORT_SYMBOL(put_unused_fd);
-
-/*
- * Install a file pointer in the fd array.
- *
- * The VFS is full of places where we drop the files lock between
- * setting the open_fds bitmap and installing the file in the file
- * array.  At any such point, we are vulnerable to a dup2() race
- * installing a file in the array before us.  We need to detect this and
- * fput() the struct file we are about to overwrite in this case.
- *
- * It should never happen - if we allow dup2() do it, _really_ bad things
- * will follow.
- */
-
-void fd_install(unsigned int fd, struct file *file)
-{
-	struct files_struct *files = current->files;
-	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	BUG_ON(fdt->fd[fd] != NULL);
-	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
-}
-
-EXPORT_SYMBOL(fd_install);
-
 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
-- 
cgit v1.2.3


From f869e8a7f753e3fd43d6483e796774776f645edb Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Aug 2012 21:06:33 -0400
Subject: expose a low-level variant of fd_install() for binder

Similar situation to that of __alloc_fd(); do not use unless you
really have to.  You should not touch any descriptor table other
than your own; it's a sure sign of a really bad API design.

As with __alloc_fd(), you *must* use a first-class reference to
struct files_struct; something obtained by get_files_struct(some task)
(let alone direct task->files) will not do.  It must be either
current->files, or obtained by get_files_struct(current) by the
owner of that sucker and given to you.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/staging/android/binder.c | 13 ++-----------
 fs/file.c                        | 16 ++++++++++++++--
 include/linux/fdtable.h          |  2 ++
 3 files changed, 18 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index 4946d282a35c..9e1a98a360d4 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -386,17 +386,8 @@ int task_get_unused_fd_flags(struct binder_proc *proc, int flags)
 static void task_fd_install(
 	struct binder_proc *proc, unsigned int fd, struct file *file)
 {
-	struct files_struct *files = proc->files;
-	struct fdtable *fdt;
-
-	if (files == NULL)
-		return;
-
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	BUG_ON(fdt->fd[fd] != NULL);
-	rcu_assign_pointer(fdt->fd[fd], file);
-	spin_unlock(&files->file_lock);
+	if (proc->files)
+		__fd_install(proc->files, fd, file);
 }
 
 /*
diff --git a/fs/file.c b/fs/file.c
index 78cf88f2a0e8..0d1bf0515111 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -599,11 +599,18 @@ EXPORT_SYMBOL(put_unused_fd);
  *
  * It should never happen - if we allow dup2() do it, _really_ bad things
  * will follow.
+ *
+ * NOTE: __fd_install() variant is really, really low-level; don't
+ * use it unless you are forced to by truly lousy API shoved down
+ * your throat.  'files' *MUST* be either current->files or obtained
+ * by get_files_struct(current) done by whoever had given it to you,
+ * or really bad things will happen.  Normally you want to use
+ * fd_install() instead.
  */
 
-void fd_install(unsigned int fd, struct file *file)
+void __fd_install(struct files_struct *files, unsigned int fd,
+		struct file *file)
 {
-	struct files_struct *files = current->files;
 	struct fdtable *fdt;
 	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
@@ -612,4 +619,9 @@ void fd_install(unsigned int fd, struct file *file)
 	spin_unlock(&files->file_lock);
 }
 
+void fd_install(unsigned int fd, struct file *file)
+{
+	__fd_install(current->files, fd, file);
+}
+
 EXPORT_SYMBOL(fd_install);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 3855f4febe70..59d4fc7f10c8 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -121,6 +121,8 @@ struct files_struct *dup_fd(struct files_struct *, int *);
 
 extern int __alloc_fd(struct files_struct *files,
 		      unsigned start, unsigned end, unsigned flags);
+extern void __fd_install(struct files_struct *files,
+		      unsigned int fd, struct file *file);
 
 extern struct kmem_cache *files_cachep;
 
-- 
cgit v1.2.3


From 0ee8cdfe6af052deb56dccd54838a1eb32fb4ca2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 15 Aug 2012 21:12:10 -0400
Subject: take fget() and friends to fs/file.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c       | 106 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/file_table.c | 106 --------------------------------------------------------
 2 files changed, 106 insertions(+), 106 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 0d1bf0515111..6eef55ce30c9 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -625,3 +625,109 @@ void fd_install(unsigned int fd, struct file *file)
 }
 
 EXPORT_SYMBOL(fd_install);
+
+struct file *fget(unsigned int fd)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file) {
+		/* File object ref couldn't be taken */
+		if (file->f_mode & FMODE_PATH ||
+		    !atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
+	}
+	rcu_read_unlock();
+
+	return file;
+}
+
+EXPORT_SYMBOL(fget);
+
+struct file *fget_raw(unsigned int fd)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file) {
+		/* File object ref couldn't be taken */
+		if (!atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
+	}
+	rcu_read_unlock();
+
+	return file;
+}
+
+EXPORT_SYMBOL(fget_raw);
+
+/*
+ * Lightweight file lookup - no refcnt increment if fd table isn't shared.
+ *
+ * You can use this instead of fget if you satisfy all of the following
+ * conditions:
+ * 1) You must call fput_light before exiting the syscall and returning control
+ *    to userspace (i.e. you cannot remember the returned struct file * after
+ *    returning to userspace).
+ * 2) You must not call filp_close on the returned struct file * in between
+ *    calls to fget_light and fput_light.
+ * 3) You must not clone the current task in between the calls to fget_light
+ *    and fput_light.
+ *
+ * The fput_needed flag returned by fget_light should be passed to the
+ * corresponding fput_light.
+ */
+struct file *fget_light(unsigned int fd, int *fput_needed)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	*fput_needed = 0;
+	if (atomic_read(&files->count) == 1) {
+		file = fcheck_files(files, fd);
+		if (file && (file->f_mode & FMODE_PATH))
+			file = NULL;
+	} else {
+		rcu_read_lock();
+		file = fcheck_files(files, fd);
+		if (file) {
+			if (!(file->f_mode & FMODE_PATH) &&
+			    atomic_long_inc_not_zero(&file->f_count))
+				*fput_needed = 1;
+			else
+				/* Didn't get the reference, someone's freed */
+				file = NULL;
+		}
+		rcu_read_unlock();
+	}
+
+	return file;
+}
+
+struct file *fget_raw_light(unsigned int fd, int *fput_needed)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	*fput_needed = 0;
+	if (atomic_read(&files->count) == 1) {
+		file = fcheck_files(files, fd);
+	} else {
+		rcu_read_lock();
+		file = fcheck_files(files, fd);
+		if (file) {
+			if (atomic_long_inc_not_zero(&file->f_count))
+				*fput_needed = 1;
+			else
+				/* Didn't get the reference, someone's freed */
+				file = NULL;
+		}
+		rcu_read_unlock();
+	}
+
+	return file;
+}
diff --git a/fs/file_table.c b/fs/file_table.c
index 701985e4ccda..c6780163bf3e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -339,112 +339,6 @@ void __fput_sync(struct file *file)
 
 EXPORT_SYMBOL(fput);
 
-struct file *fget(unsigned int fd)
-{
-	struct file *file;
-	struct files_struct *files = current->files;
-
-	rcu_read_lock();
-	file = fcheck_files(files, fd);
-	if (file) {
-		/* File object ref couldn't be taken */
-		if (file->f_mode & FMODE_PATH ||
-		    !atomic_long_inc_not_zero(&file->f_count))
-			file = NULL;
-	}
-	rcu_read_unlock();
-
-	return file;
-}
-
-EXPORT_SYMBOL(fget);
-
-struct file *fget_raw(unsigned int fd)
-{
-	struct file *file;
-	struct files_struct *files = current->files;
-
-	rcu_read_lock();
-	file = fcheck_files(files, fd);
-	if (file) {
-		/* File object ref couldn't be taken */
-		if (!atomic_long_inc_not_zero(&file->f_count))
-			file = NULL;
-	}
-	rcu_read_unlock();
-
-	return file;
-}
-
-EXPORT_SYMBOL(fget_raw);
-
-/*
- * Lightweight file lookup - no refcnt increment if fd table isn't shared.
- *
- * You can use this instead of fget if you satisfy all of the following
- * conditions:
- * 1) You must call fput_light before exiting the syscall and returning control
- *    to userspace (i.e. you cannot remember the returned struct file * after
- *    returning to userspace).
- * 2) You must not call filp_close on the returned struct file * in between
- *    calls to fget_light and fput_light.
- * 3) You must not clone the current task in between the calls to fget_light
- *    and fput_light.
- *
- * The fput_needed flag returned by fget_light should be passed to the
- * corresponding fput_light.
- */
-struct file *fget_light(unsigned int fd, int *fput_needed)
-{
-	struct file *file;
-	struct files_struct *files = current->files;
-
-	*fput_needed = 0;
-	if (atomic_read(&files->count) == 1) {
-		file = fcheck_files(files, fd);
-		if (file && (file->f_mode & FMODE_PATH))
-			file = NULL;
-	} else {
-		rcu_read_lock();
-		file = fcheck_files(files, fd);
-		if (file) {
-			if (!(file->f_mode & FMODE_PATH) &&
-			    atomic_long_inc_not_zero(&file->f_count))
-				*fput_needed = 1;
-			else
-				/* Didn't get the reference, someone's freed */
-				file = NULL;
-		}
-		rcu_read_unlock();
-	}
-
-	return file;
-}
-
-struct file *fget_raw_light(unsigned int fd, int *fput_needed)
-{
-	struct file *file;
-	struct files_struct *files = current->files;
-
-	*fput_needed = 0;
-	if (atomic_read(&files->count) == 1) {
-		file = fcheck_files(files, fd);
-	} else {
-		rcu_read_lock();
-		file = fcheck_files(files, fd);
-		if (file) {
-			if (atomic_long_inc_not_zero(&file->f_count))
-				*fput_needed = 1;
-			else
-				/* Didn't get the reference, someone's freed */
-				file = NULL;
-		}
-		rcu_read_unlock();
-	}
-
-	return file;
-}
-
 void put_filp(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
-- 
cgit v1.2.3


From 483ce1d4b8c3b82bc9c9a1dd9dbc44f50b3aaf5a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 19 Aug 2012 12:04:24 -0400
Subject: take descriptor-related part of close() to file.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/staging/android/binder.c | 34 ++--------------------------------
 fs/file.c                        | 26 ++++++++++++++++++++++++++
 fs/open.c                        | 22 +---------------------
 include/linux/fdtable.h          |  2 ++
 4 files changed, 31 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c
index 9e1a98a360d4..f71d624995ea 100644
--- a/drivers/staging/android/binder.c
+++ b/drivers/staging/android/binder.c
@@ -390,43 +390,17 @@ static void task_fd_install(
 		__fd_install(proc->files, fd, file);
 }
 
-/*
- * copied from __put_unused_fd in open.c
- */
-static void __put_unused_fd(struct files_struct *files, unsigned int fd)
-{
-	struct fdtable *fdt = files_fdtable(files);
-	__clear_open_fd(fd, fdt);
-	if (fd < files->next_fd)
-		files->next_fd = fd;
-}
-
 /*
  * copied from sys_close
  */
 static long task_close_fd(struct binder_proc *proc, unsigned int fd)
 {
-	struct file *filp;
-	struct files_struct *files = proc->files;
-	struct fdtable *fdt;
 	int retval;
 
-	if (files == NULL)
+	if (proc->files == NULL)
 		return -ESRCH;
 
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (fd >= fdt->max_fds)
-		goto out_unlock;
-	filp = fdt->fd[fd];
-	if (!filp)
-		goto out_unlock;
-	rcu_assign_pointer(fdt->fd[fd], NULL);
-	__clear_close_on_exec(fd, fdt);
-	__put_unused_fd(files, fd);
-	spin_unlock(&files->file_lock);
-	retval = filp_close(filp, files);
-
+	retval = __close_fd(proc->files, fd);
 	/* can't restart close syscall because file table entry was cleared */
 	if (unlikely(retval == -ERESTARTSYS ||
 		     retval == -ERESTARTNOINTR ||
@@ -435,10 +409,6 @@ static long task_close_fd(struct binder_proc *proc, unsigned int fd)
 		retval = -EINTR;
 
 	return retval;
-
-out_unlock:
-	spin_unlock(&files->file_lock);
-	return -EBADF;
 }
 
 static void binder_set_nice(long nice)
diff --git a/fs/file.c b/fs/file.c
index 6eef55ce30c9..fd4694e688ab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -626,6 +626,32 @@ void fd_install(unsigned int fd, struct file *file)
 
 EXPORT_SYMBOL(fd_install);
 
+/*
+ * The same warnings as for __alloc_fd()/__fd_install() apply here...
+ */
+int __close_fd(struct files_struct *files, unsigned fd)
+{
+	struct file *file;
+	struct fdtable *fdt;
+
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	if (fd >= fdt->max_fds)
+		goto out_unlock;
+	file = fdt->fd[fd];
+	if (!file)
+		goto out_unlock;
+	rcu_assign_pointer(fdt->fd[fd], NULL);
+	__clear_close_on_exec(fd, fdt);
+	__put_unused_fd(files, fd);
+	spin_unlock(&files->file_lock);
+	return filp_close(file, files);
+
+out_unlock:
+	spin_unlock(&files->file_lock);
+	return -EBADF;
+}
+
 struct file *fget(unsigned int fd)
 {
 	struct file *file;
diff --git a/fs/open.c b/fs/open.c
index c525bd0e65b6..30760017deed 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -994,23 +994,7 @@ EXPORT_SYMBOL(filp_close);
  */
 SYSCALL_DEFINE1(close, unsigned int, fd)
 {
-	struct file * filp;
-	struct files_struct *files = current->files;
-	struct fdtable *fdt;
-	int retval;
-
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (fd >= fdt->max_fds)
-		goto out_unlock;
-	filp = fdt->fd[fd];
-	if (!filp)
-		goto out_unlock;
-	rcu_assign_pointer(fdt->fd[fd], NULL);
-	__clear_close_on_exec(fd, fdt);
-	__put_unused_fd(files, fd);
-	spin_unlock(&files->file_lock);
-	retval = filp_close(filp, files);
+	int retval = __close_fd(current->files, fd);
 
 	/* can't restart close syscall because file table entry was cleared */
 	if (unlikely(retval == -ERESTARTSYS ||
@@ -1020,10 +1004,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
 		retval = -EINTR;
 
 	return retval;
-
-out_unlock:
-	spin_unlock(&files->file_lock);
-	return -EBADF;
 }
 EXPORT_SYMBOL(sys_close);
 
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 59d4fc7f10c8..59488f2392bc 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -123,6 +123,8 @@ extern int __alloc_fd(struct files_struct *files,
 		      unsigned start, unsigned end, unsigned flags);
 extern void __fd_install(struct files_struct *files,
 		      unsigned int fd, struct file *file);
+extern int __close_fd(struct files_struct *files,
+		      unsigned int fd);
 
 extern struct kmem_cache *files_cachep;
 
-- 
cgit v1.2.3


From 6a6d27de340c89c5323565b49f7851362619925d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 09:56:33 -0400
Subject: take close-on-exec logics to fs/file.c, clean it up a bit

... and add cond_resched() there, while we are at it.  We can
get large latencies as is...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c               | 41 ++++++-----------------------------------
 fs/file.c               | 37 +++++++++++++++++++++++++++++++++++++
 include/linux/fdtable.h |  1 +
 3 files changed, 44 insertions(+), 35 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 574cf4de4ec3..f2b6af585d4a 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1006,40 +1006,6 @@ no_thread_group:
 	return 0;
 }
 
-/*
- * These functions flushes out all traces of the currently running executable
- * so that a new one can be started
- */
-static void flush_old_files(struct files_struct * files)
-{
-	long j = -1;
-	struct fdtable *fdt;
-
-	spin_lock(&files->file_lock);
-	for (;;) {
-		unsigned long set, i;
-
-		j++;
-		i = j * BITS_PER_LONG;
-		fdt = files_fdtable(files);
-		if (i >= fdt->max_fds)
-			break;
-		set = fdt->close_on_exec[j];
-		if (!set)
-			continue;
-		fdt->close_on_exec[j] = 0;
-		spin_unlock(&files->file_lock);
-		for ( ; set ; i++,set >>= 1) {
-			if (set & 1) {
-				sys_close(i);
-			}
-		}
-		spin_lock(&files->file_lock);
-
-	}
-	spin_unlock(&files->file_lock);
-}
-
 char *get_task_comm(char *buf, struct task_struct *tsk)
 {
 	/* buf must be at least sizeof(tsk->comm) in size */
@@ -1050,6 +1016,11 @@ char *get_task_comm(char *buf, struct task_struct *tsk)
 }
 EXPORT_SYMBOL_GPL(get_task_comm);
 
+/*
+ * These functions flushes out all traces of the currently running executable
+ * so that a new one can be started
+ */
+
 void set_task_comm(struct task_struct *tsk, char *buf)
 {
 	task_lock(tsk);
@@ -1171,7 +1142,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	current->self_exec_id++;
 			
 	flush_signal_handlers(current, 0);
-	flush_old_files(current->files);
+	do_close_on_exec(current->files);
 }
 EXPORT_SYMBOL(setup_new_exec);
 
diff --git a/fs/file.c b/fs/file.c
index fd4694e688ab..92197dd9fdc8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -652,6 +652,43 @@ out_unlock:
 	return -EBADF;
 }
 
+void do_close_on_exec(struct files_struct *files)
+{
+	unsigned i;
+	struct fdtable *fdt;
+
+	/* exec unshares first */
+	BUG_ON(atomic_read(&files->count) != 1);
+	spin_lock(&files->file_lock);
+	for (i = 0; ; i++) {
+		unsigned long set;
+		unsigned fd = i * BITS_PER_LONG;
+		fdt = files_fdtable(files);
+		if (fd >= fdt->max_fds)
+			break;
+		set = fdt->close_on_exec[i];
+		if (!set)
+			continue;
+		fdt->close_on_exec[i] = 0;
+		for ( ; set ; fd++, set >>= 1) {
+			struct file *file;
+			if (!(set & 1))
+				continue;
+			file = fdt->fd[fd];
+			if (!file)
+				continue;
+			rcu_assign_pointer(fdt->fd[fd], NULL);
+			__put_unused_fd(files, fd);
+			spin_unlock(&files->file_lock);
+			filp_close(file, files);
+			cond_resched();
+			spin_lock(&files->file_lock);
+		}
+
+	}
+	spin_unlock(&files->file_lock);
+}
+
 struct file *fget(unsigned int fd)
 {
 	struct file *file;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 59488f2392bc..ef4b2137e6bc 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -118,6 +118,7 @@ void put_files_struct(struct files_struct *fs);
 void reset_files_struct(struct files_struct *);
 int unshare_files(struct files_struct **);
 struct files_struct *dup_fd(struct files_struct *, int *);
+void do_close_on_exec(struct files_struct *);
 
 extern int __alloc_fd(struct files_struct *files,
 		      unsigned start, unsigned end, unsigned flags);
-- 
cgit v1.2.3


From fe17f22d7fd0e344ef6447238f799bb49f670c6f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 11:48:11 -0400
Subject: take purely descriptor-related stuff from fcntl.c to file.c

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fcntl.c           | 131 ++------------------------------------------------
 fs/file.c            | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/file.h |   2 +
 3 files changed, 137 insertions(+), 128 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 08e6af5c1b1f..40a5bfb72cca 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -26,127 +26,6 @@
 #include <asm/siginfo.h>
 #include <asm/uaccess.h>
 
-void set_close_on_exec(unsigned int fd, int flag)
-{
-	struct files_struct *files = current->files;
-	struct fdtable *fdt;
-	spin_lock(&files->file_lock);
-	fdt = files_fdtable(files);
-	if (flag)
-		__set_close_on_exec(fd, fdt);
-	else
-		__clear_close_on_exec(fd, fdt);
-	spin_unlock(&files->file_lock);
-}
-
-static bool get_close_on_exec(unsigned int fd)
-{
-	struct files_struct *files = current->files;
-	struct fdtable *fdt;
-	bool res;
-	rcu_read_lock();
-	fdt = files_fdtable(files);
-	res = close_on_exec(fd, fdt);
-	rcu_read_unlock();
-	return res;
-}
-
-SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
-{
-	int err = -EBADF;
-	struct file * file, *tofree;
-	struct files_struct * files = current->files;
-	struct fdtable *fdt;
-
-	if ((flags & ~O_CLOEXEC) != 0)
-		return -EINVAL;
-
-	if (unlikely(oldfd == newfd))
-		return -EINVAL;
-
-	if (newfd >= rlimit(RLIMIT_NOFILE))
-		return -EMFILE;
-
-	spin_lock(&files->file_lock);
-	err = expand_files(files, newfd);
-	file = fcheck(oldfd);
-	if (unlikely(!file))
-		goto Ebadf;
-	if (unlikely(err < 0)) {
-		if (err == -EMFILE)
-			goto Ebadf;
-		goto out_unlock;
-	}
-	/*
-	 * We need to detect attempts to do dup2() over allocated but still
-	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
-	 * extra work in their equivalent of fget() - they insert struct
-	 * file immediately after grabbing descriptor, mark it larval if
-	 * more work (e.g. actual opening) is needed and make sure that
-	 * fget() treats larval files as absent.  Potentially interesting,
-	 * but while extra work in fget() is trivial, locking implications
-	 * and amount of surgery on open()-related paths in VFS are not.
-	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
-	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
-	 * scope of POSIX or SUS, since neither considers shared descriptor
-	 * tables and this condition does not arise without those.
-	 */
-	err = -EBUSY;
-	fdt = files_fdtable(files);
-	tofree = fdt->fd[newfd];
-	if (!tofree && fd_is_open(newfd, fdt))
-		goto out_unlock;
-	get_file(file);
-	rcu_assign_pointer(fdt->fd[newfd], file);
-	__set_open_fd(newfd, fdt);
-	if (flags & O_CLOEXEC)
-		__set_close_on_exec(newfd, fdt);
-	else
-		__clear_close_on_exec(newfd, fdt);
-	spin_unlock(&files->file_lock);
-
-	if (tofree)
-		filp_close(tofree, files);
-
-	return newfd;
-
-Ebadf:
-	err = -EBADF;
-out_unlock:
-	spin_unlock(&files->file_lock);
-	return err;
-}
-
-SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
-{
-	if (unlikely(newfd == oldfd)) { /* corner case */
-		struct files_struct *files = current->files;
-		int retval = oldfd;
-
-		rcu_read_lock();
-		if (!fcheck_files(files, oldfd))
-			retval = -EBADF;
-		rcu_read_unlock();
-		return retval;
-	}
-	return sys_dup3(oldfd, newfd, 0);
-}
-
-SYSCALL_DEFINE1(dup, unsigned int, fildes)
-{
-	int ret = -EBADF;
-	struct file *file = fget_raw(fildes);
-
-	if (file) {
-		ret = get_unused_fd();
-		if (ret >= 0)
-			fd_install(ret, file);
-		else
-			fput(file);
-	}
-	return ret;
-}
-
 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
 static int setfl(int fd, struct file * filp, unsigned long arg)
@@ -376,14 +255,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 
 	switch (cmd) {
 	case F_DUPFD:
+		err = f_dupfd(arg, filp, 0);
+		break;
 	case F_DUPFD_CLOEXEC:
-		if (arg >= rlimit(RLIMIT_NOFILE))
-			break;
-		err = alloc_fd(arg, cmd == F_DUPFD_CLOEXEC ? O_CLOEXEC : 0);
-		if (err >= 0) {
-			get_file(filp);
-			fd_install(err, filp);
-		}
+		err = f_dupfd(arg, filp, FD_CLOEXEC);
 		break;
 	case F_GETFD:
 		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
diff --git a/fs/file.c b/fs/file.c
index 92197dd9fdc8..7f29544755d0 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,6 +6,7 @@
  *  Manage the dynamic fd arrays in the process files_struct.
  */
 
+#include <linux/syscalls.h>
 #include <linux/export.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -794,3 +795,134 @@ struct file *fget_raw_light(unsigned int fd, int *fput_needed)
 
 	return file;
 }
+
+void set_close_on_exec(unsigned int fd, int flag)
+{
+	struct files_struct *files = current->files;
+	struct fdtable *fdt;
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	if (flag)
+		__set_close_on_exec(fd, fdt);
+	else
+		__clear_close_on_exec(fd, fdt);
+	spin_unlock(&files->file_lock);
+}
+
+bool get_close_on_exec(unsigned int fd)
+{
+	struct files_struct *files = current->files;
+	struct fdtable *fdt;
+	bool res;
+	rcu_read_lock();
+	fdt = files_fdtable(files);
+	res = close_on_exec(fd, fdt);
+	rcu_read_unlock();
+	return res;
+}
+
+SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
+{
+	int err = -EBADF;
+	struct file * file, *tofree;
+	struct files_struct * files = current->files;
+	struct fdtable *fdt;
+
+	if ((flags & ~O_CLOEXEC) != 0)
+		return -EINVAL;
+
+	if (newfd >= rlimit(RLIMIT_NOFILE))
+		return -EMFILE;
+
+	spin_lock(&files->file_lock);
+	err = expand_files(files, newfd);
+	file = fcheck(oldfd);
+	if (unlikely(!file))
+		goto Ebadf;
+	if (unlikely(err < 0)) {
+		if (err == -EMFILE)
+			goto Ebadf;
+		goto out_unlock;
+	}
+	/*
+	 * We need to detect attempts to do dup2() over allocated but still
+	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
+	 * extra work in their equivalent of fget() - they insert struct
+	 * file immediately after grabbing descriptor, mark it larval if
+	 * more work (e.g. actual opening) is needed and make sure that
+	 * fget() treats larval files as absent.  Potentially interesting,
+	 * but while extra work in fget() is trivial, locking implications
+	 * and amount of surgery on open()-related paths in VFS are not.
+	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
+	 * deadlocks in rather amusing ways, AFAICS.  All of that is out of
+	 * scope of POSIX or SUS, since neither considers shared descriptor
+	 * tables and this condition does not arise without those.
+	 */
+	err = -EBUSY;
+	fdt = files_fdtable(files);
+	tofree = fdt->fd[newfd];
+	if (!tofree && fd_is_open(newfd, fdt))
+		goto out_unlock;
+	get_file(file);
+	rcu_assign_pointer(fdt->fd[newfd], file);
+	__set_open_fd(newfd, fdt);
+	if (flags & O_CLOEXEC)
+		__set_close_on_exec(newfd, fdt);
+	else
+		__clear_close_on_exec(newfd, fdt);
+	spin_unlock(&files->file_lock);
+
+	if (tofree)
+		filp_close(tofree, files);
+
+	return newfd;
+
+Ebadf:
+	err = -EBADF;
+out_unlock:
+	spin_unlock(&files->file_lock);
+	return err;
+}
+
+SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
+{
+	if (unlikely(newfd == oldfd)) { /* corner case */
+		struct files_struct *files = current->files;
+		int retval = oldfd;
+
+		rcu_read_lock();
+		if (!fcheck_files(files, oldfd))
+			retval = -EBADF;
+		rcu_read_unlock();
+		return retval;
+	}
+	return sys_dup3(oldfd, newfd, 0);
+}
+
+SYSCALL_DEFINE1(dup, unsigned int, fildes)
+{
+	int ret = -EBADF;
+	struct file *file = fget_raw(fildes);
+
+	if (file) {
+		ret = get_unused_fd();
+		if (ret >= 0)
+			fd_install(ret, file);
+		else
+			fput(file);
+	}
+	return ret;
+}
+
+int f_dupfd(unsigned int from, struct file *file, unsigned flags)
+{
+	int err;
+	if (from >= rlimit(RLIMIT_NOFILE))
+		return -EINVAL;
+	err = alloc_fd(from, flags);
+	if (err >= 0) {
+		get_file(file);
+		fd_install(err, file);
+	}
+	return err;
+}
diff --git a/include/linux/file.h b/include/linux/file.h
index 86795ec6027d..da84fa0f4579 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -30,7 +30,9 @@ extern struct file *fget(unsigned int fd);
 extern struct file *fget_light(unsigned int fd, int *fput_needed);
 extern struct file *fget_raw(unsigned int fd);
 extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
+extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
 extern void set_close_on_exec(unsigned int fd, int flag);
+extern bool get_close_on_exec(unsigned int fd);
 extern void put_filp(struct file *);
 extern int alloc_fd(unsigned start, unsigned flags);
 extern int get_unused_fd_flags(unsigned flags);
-- 
cgit v1.2.3


From 8280d16172243702ed43432f826ca6130edb4086 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 12:11:46 -0400
Subject: new helper: replace_fd()

analog of dup2(), except that it takes struct file * as source.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c            | 11 +------
 fs/file.c            | 91 +++++++++++++++++++++++++++++++++++-----------------
 include/linux/file.h |  1 +
 3 files changed, 64 insertions(+), 39 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index f2b6af585d4a..3fc74681cc6c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2041,23 +2041,14 @@ static void wait_for_dump_helpers(struct file *file)
 static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
 {
 	struct file *files[2];
-	struct fdtable *fdt;
 	struct coredump_params *cp = (struct coredump_params *)info->data;
-	struct files_struct *cf = current->files;
 	int err = create_pipe_files(files, 0);
 	if (err)
 		return err;
 
 	cp->file = files[1];
 
-	sys_close(0);
-	fd_install(0, files[0]);
-	spin_lock(&cf->file_lock);
-	fdt = files_fdtable(cf);
-	__set_open_fd(0, fdt);
-	__clear_close_on_exec(0, fdt);
-	spin_unlock(&cf->file_lock);
-
+	replace_fd(0, files[0], 0);
 	/* and disallow core files too */
 	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
 
diff --git a/fs/file.c b/fs/file.c
index 7f29544755d0..a7bbe0324478 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -821,29 +821,12 @@ bool get_close_on_exec(unsigned int fd)
 	return res;
 }
 
-SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
+static int do_dup2(struct files_struct *files,
+	struct file *file, unsigned fd, unsigned flags)
 {
-	int err = -EBADF;
-	struct file * file, *tofree;
-	struct files_struct * files = current->files;
+	struct file *tofree;
 	struct fdtable *fdt;
 
-	if ((flags & ~O_CLOEXEC) != 0)
-		return -EINVAL;
-
-	if (newfd >= rlimit(RLIMIT_NOFILE))
-		return -EMFILE;
-
-	spin_lock(&files->file_lock);
-	err = expand_files(files, newfd);
-	file = fcheck(oldfd);
-	if (unlikely(!file))
-		goto Ebadf;
-	if (unlikely(err < 0)) {
-		if (err == -EMFILE)
-			goto Ebadf;
-		goto out_unlock;
-	}
 	/*
 	 * We need to detect attempts to do dup2() over allocated but still
 	 * not finished descriptor.  NB: OpenBSD avoids that at the price of
@@ -858,24 +841,74 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
 	 * scope of POSIX or SUS, since neither considers shared descriptor
 	 * tables and this condition does not arise without those.
 	 */
-	err = -EBUSY;
 	fdt = files_fdtable(files);
-	tofree = fdt->fd[newfd];
-	if (!tofree && fd_is_open(newfd, fdt))
-		goto out_unlock;
+	tofree = fdt->fd[fd];
+	if (!tofree && fd_is_open(fd, fdt))
+		goto Ebusy;
 	get_file(file);
-	rcu_assign_pointer(fdt->fd[newfd], file);
-	__set_open_fd(newfd, fdt);
+	rcu_assign_pointer(fdt->fd[fd], file);
+	__set_open_fd(fd, fdt);
 	if (flags & O_CLOEXEC)
-		__set_close_on_exec(newfd, fdt);
+		__set_close_on_exec(fd, fdt);
 	else
-		__clear_close_on_exec(newfd, fdt);
+		__clear_close_on_exec(fd, fdt);
 	spin_unlock(&files->file_lock);
 
 	if (tofree)
 		filp_close(tofree, files);
 
-	return newfd;
+	return fd;
+
+Ebusy:
+	spin_unlock(&files->file_lock);
+	return -EBUSY;
+}
+
+int replace_fd(unsigned fd, struct file *file, unsigned flags)
+{
+	int err;
+	struct files_struct *files = current->files;
+
+	if (!file)
+		return __close_fd(files, fd);
+
+	if (fd >= rlimit(RLIMIT_NOFILE))
+		return -EMFILE;
+
+	spin_lock(&files->file_lock);
+	err = expand_files(files, fd);
+	if (unlikely(err < 0))
+		goto out_unlock;
+	return do_dup2(files, file, fd, flags);
+
+out_unlock:
+	spin_unlock(&files->file_lock);
+	return err;
+}
+
+SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
+{
+	int err = -EBADF;
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	if ((flags & ~O_CLOEXEC) != 0)
+		return -EINVAL;
+
+	if (newfd >= rlimit(RLIMIT_NOFILE))
+		return -EMFILE;
+
+	spin_lock(&files->file_lock);
+	err = expand_files(files, newfd);
+	file = fcheck(oldfd);
+	if (unlikely(!file))
+		goto Ebadf;
+	if (unlikely(err < 0)) {
+		if (err == -EMFILE)
+			goto Ebadf;
+		goto out_unlock;
+	}
+	return do_dup2(files, file, newfd, flags);
 
 Ebadf:
 	err = -EBADF;
diff --git a/include/linux/file.h b/include/linux/file.h
index da84fa0f4579..6239591a6122 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -31,6 +31,7 @@ extern struct file *fget_light(unsigned int fd, int *fput_needed);
 extern struct file *fget_raw(unsigned int fd);
 extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
+extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
 extern void set_close_on_exec(unsigned int fd, int flag);
 extern bool get_close_on_exec(unsigned int fd);
 extern void put_filp(struct file *);
-- 
cgit v1.2.3


From b8318b01a8f7f760ae3ecae052ccc7fc123d9508 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 20:09:42 -0400
Subject: take __{set,clear}_{open_fd,close_on_exec}() into fs/file.c

nobody uses those outside anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c               | 20 ++++++++++++++++++++
 include/linux/fdtable.h | 20 --------------------
 2 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index a7bbe0324478..40ddef9fe041 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -256,6 +256,26 @@ int expand_files(struct files_struct *files, int nr)
 	return expand_fdtable(files, nr);
 }
 
+static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
+{
+	__set_bit(fd, fdt->close_on_exec);
+}
+
+static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
+{
+	__clear_bit(fd, fdt->close_on_exec);
+}
+
+static inline void __set_open_fd(int fd, struct fdtable *fdt)
+{
+	__set_bit(fd, fdt->open_fds);
+}
+
+static inline void __clear_open_fd(int fd, struct fdtable *fdt)
+{
+	__clear_bit(fd, fdt->open_fds);
+}
+
 static int count_open_files(struct fdtable *fdt)
 {
 	int size = fdt->max_fds;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index ef4b2137e6bc..9ff26319d44f 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -30,31 +30,11 @@ struct fdtable {
 	struct fdtable *next;
 };
 
-static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
-{
-	__set_bit(fd, fdt->close_on_exec);
-}
-
-static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
-{
-	__clear_bit(fd, fdt->close_on_exec);
-}
-
 static inline bool close_on_exec(int fd, const struct fdtable *fdt)
 {
 	return test_bit(fd, fdt->close_on_exec);
 }
 
-static inline void __set_open_fd(int fd, struct fdtable *fdt)
-{
-	__set_bit(fd, fdt->open_fds);
-}
-
-static inline void __clear_open_fd(int fd, struct fdtable *fdt)
-{
-	__clear_bit(fd, fdt->open_fds);
-}
-
 static inline bool fd_is_open(int fd, const struct fdtable *fdt)
 {
 	return test_bit(fd, fdt->open_fds);
-- 
cgit v1.2.3


From ad47bd7252bf402fe7dba92f5240b5ed16832ae7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 20:11:34 -0400
Subject: make expand_files() and alloc_fd() static

no callers outside of fs/file.c left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c               | 4 ++--
 include/linux/fdtable.h | 1 -
 include/linux/file.h    | 1 -
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 40ddef9fe041..967bd0dadbe5 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -238,7 +238,7 @@ static int expand_fdtable(struct files_struct *files, int nr)
  * expanded and execution may have blocked.
  * The files->file_lock should be held on entry, and will be held on exit.
  */
-int expand_files(struct files_struct *files, int nr)
+static int expand_files(struct files_struct *files, int nr)
 {
 	struct fdtable *fdt;
 
@@ -580,7 +580,7 @@ out:
 	return error;
 }
 
-int alloc_fd(unsigned start, unsigned flags)
+static int alloc_fd(unsigned start, unsigned flags)
 {
 	return __alloc_fd(current->files, start, rlimit(RLIMIT_NOFILE), flags);
 }
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index 9ff26319d44f..de2b71caa0f0 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -73,7 +73,6 @@ struct file_operations;
 struct vfsmount;
 struct dentry;
 
-extern int expand_files(struct files_struct *, int nr);
 extern void __init files_defer_init(void);
 
 static inline struct file * fcheck_files(struct files_struct *files, unsigned int fd)
diff --git a/include/linux/file.h b/include/linux/file.h
index 6239591a6122..6eee54aea279 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -35,7 +35,6 @@ extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
 extern void set_close_on_exec(unsigned int fd, int flag);
 extern bool get_close_on_exec(unsigned int fd);
 extern void put_filp(struct file *);
-extern int alloc_fd(unsigned start, unsigned flags);
 extern int get_unused_fd_flags(unsigned flags);
 #define get_unused_fd() get_unused_fd_flags(0)
 extern void put_unused_fd(unsigned int fd);
-- 
cgit v1.2.3


From c3c073f808b22dfae15ef8412b6f7b998644139a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 22:32:06 -0400
Subject: new helper: iterate_fd()

iterates through the opened files in given descriptor table,
calling a supplied function; we stop once non-zero is returned.
Callback gets struct file *, descriptor number and const void *
argument passed to iterator.  It is called with files->file_lock
held, so it is not allowed to block.

tty_io, netprio_cgroup and selinux flush_unauthorized_files()
converted to its use.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/tty/tty_io.c      | 36 +++++++++++-------------------
 fs/file.c                 | 21 +++++++++++++++++
 include/linux/fdtable.h   |  3 +++
 net/core/netprio_cgroup.c | 38 ++++++++++---------------------
 security/selinux/hooks.c  | 57 ++++++++++++++++++-----------------------------
 5 files changed, 71 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index b425c79675ad..71d95cfbabec 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -2791,6 +2791,13 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
 }
 #endif
 
+static int this_tty(const void *t, struct file *file, unsigned fd)
+{
+	if (likely(file->f_op->read != tty_read))
+		return 0;
+	return file_tty(file) != t ? 0 : fd + 1;
+}
+	
 /*
  * This implements the "Secure Attention Key" ---  the idea is to
  * prevent trojan horses by killing all processes associated with this
@@ -2818,8 +2825,6 @@ void __do_SAK(struct tty_struct *tty)
 	struct task_struct *g, *p;
 	struct pid *session;
 	int		i;
-	struct file	*filp;
-	struct fdtable *fdt;
 
 	if (!tty)
 		return;
@@ -2849,27 +2854,12 @@ void __do_SAK(struct tty_struct *tty)
 			continue;
 		}
 		task_lock(p);
-		if (p->files) {
-			/*
-			 * We don't take a ref to the file, so we must
-			 * hold ->file_lock instead.
-			 */
-			spin_lock(&p->files->file_lock);
-			fdt = files_fdtable(p->files);
-			for (i = 0; i < fdt->max_fds; i++) {
-				filp = fcheck_files(p->files, i);
-				if (!filp)
-					continue;
-				if (filp->f_op->read == tty_read &&
-				    file_tty(filp) == tty) {
-					printk(KERN_NOTICE "SAK: killed process %d"
-					    " (%s): fd#%d opened to the tty\n",
-					    task_pid_nr(p), p->comm, i);
-					force_sig(SIGKILL, p);
-					break;
-				}
-			}
-			spin_unlock(&p->files->file_lock);
+		i = iterate_fd(p->files, 0, this_tty, tty);
+		if (i != 0) {
+			printk(KERN_NOTICE "SAK: killed process %d"
+			    " (%s): fd#%d opened to the tty\n",
+				    task_pid_nr(p), p->comm, i - 1);
+			force_sig(SIGKILL, p);
 		}
 		task_unlock(p);
 	} while_each_thread(g, p);
diff --git a/fs/file.c b/fs/file.c
index 967bd0dadbe5..e6e418122587 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -979,3 +979,24 @@ int f_dupfd(unsigned int from, struct file *file, unsigned flags)
 	}
 	return err;
 }
+
+int iterate_fd(struct files_struct *files, unsigned n,
+		int (*f)(const void *, struct file *, unsigned),
+		const void *p)
+{
+	struct fdtable *fdt;
+	struct file *file;
+	int res = 0;
+	if (!files)
+		return 0;
+	spin_lock(&files->file_lock);
+	fdt = files_fdtable(files);
+	while (!res && n < fdt->max_fds) {
+		file = rcu_dereference_check_fdtable(files, fdt->fd[n++]);
+		if (file)
+			res = f(p, file, n);
+	}
+	spin_unlock(&files->file_lock);
+	return res;
+}
+EXPORT_SYMBOL(iterate_fd);
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index de2b71caa0f0..fb7dacae0522 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -98,6 +98,9 @@ void reset_files_struct(struct files_struct *);
 int unshare_files(struct files_struct **);
 struct files_struct *dup_fd(struct files_struct *, int *);
 void do_close_on_exec(struct files_struct *);
+int iterate_fd(struct files_struct *, unsigned,
+		int (*)(const void *, struct file *, unsigned),
+		const void *);
 
 extern int __alloc_fd(struct files_struct *files,
 		      unsigned start, unsigned end, unsigned flags);
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index c75e3f9d060f..5ffd084c6a83 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -272,38 +272,24 @@ out_free_devname:
 	return ret;
 }
 
+static int update_netprio(const void *v, struct file *file, unsigned n)
+{
+	int err;
+	struct socket *sock = sock_from_file(file, &err);
+	if (sock)
+		sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+	return 0;
+}
+
 void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 {
 	struct task_struct *p;
+	void *v;
 
 	cgroup_taskset_for_each(p, cgrp, tset) {
-		unsigned int fd;
-		struct fdtable *fdt;
-		struct files_struct *files;
-
 		task_lock(p);
-		files = p->files;
-		if (!files) {
-			task_unlock(p);
-			continue;
-		}
-
-		spin_lock(&files->file_lock);
-		fdt = files_fdtable(files);
-		for (fd = 0; fd < fdt->max_fds; fd++) {
-			struct file *file;
-			struct socket *sock;
-			int err;
-
-			file = fcheck_files(files, fd);
-			if (!file)
-				continue;
-
-			sock = sock_from_file(file, &err);
-			if (sock)
-				sock_update_netprioidx(sock->sk, p);
-		}
-		spin_unlock(&files->file_lock);
+		v = (void *)(unsigned long)task_netprioidx(p);
+		iterate_fd(p->files, 0, update_netprio, v);
 		task_unlock(p);
 	}
 }
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 00b50113642d..4dfbcea10eb7 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2088,15 +2088,19 @@ static int selinux_bprm_secureexec(struct linux_binprm *bprm)
 	return (atsecure || cap_bprm_secureexec(bprm));
 }
 
+static int match_file(const void *p, struct file *file, unsigned fd)
+{
+	return file_has_perm(p, file, file_to_av(file)) ? fd + 1 : 0;
+}
+
 /* Derived from fs/exec.c:flush_old_files. */
 static inline void flush_unauthorized_files(const struct cred *cred,
 					    struct files_struct *files)
 {
 	struct file *file, *devnull = NULL;
 	struct tty_struct *tty;
-	struct fdtable *fdt;
-	long j = -1;
 	int drop_tty = 0;
+	unsigned n;
 
 	tty = get_current_tty();
 	if (tty) {
@@ -2123,41 +2127,24 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 		no_tty();
 
 	/* Revalidate access to inherited open files. */
-	spin_lock(&files->file_lock);
-	for (;;) {
-		unsigned long set, i;
-		j++;
-		i = j * BITS_PER_LONG;
-		fdt = files_fdtable(files);
-		if (i >= fdt->max_fds)
-			break;
-		set = fdt->open_fds[j];
-		if (!set)
-			continue;
-		spin_unlock(&files->file_lock);
-		for ( ; set ; i++, set >>= 1) {
-			if (!(set & 1))
-				continue;
-			file = fget(i);
-			if (!file)
-				continue;
-			if (file_has_perm(cred, file, file_to_av(file))) {
-				if (devnull) {
-					get_file(devnull);
-				} else {
-					devnull = dentry_open(&selinux_null,
-								O_RDWR, cred);
-					if (IS_ERR(devnull))
-						devnull = NULL;
-				}
-				replace_fd(i, devnull, 0);
-			}
-			fput(file);
-		}
-		spin_lock(&files->file_lock);
+	n = iterate_fd(files, 0, match_file, cred);
+	if (!n) /* none found? */
+		return;
 
+	devnull = dentry_open(&selinux_null, O_RDWR, cred);
+	if (!IS_ERR(devnull)) {
+		/* replace all the matching ones with this */
+		do {
+			get_file(devnull);
+			replace_fd(n - 1, devnull, 0);
+		} while ((n = iterate_fd(files, n, match_file, cred)) != 0);
+		fput(devnull);
+	} else {
+		/* just close all the matching ones */
+		do {
+			replace_fd(n - 1, NULL, 0);
+		} while ((n = iterate_fd(files, n, match_file, cred)) != 0);
 	}
-	spin_unlock(&files->file_lock);
 }
 
 /*
-- 
cgit v1.2.3


From 179e037fc1370288188cb1f90b81156d75a3cb2d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 21 Aug 2012 22:43:47 -0400
Subject: do_coredump(): make sure that descriptor table isn't shared

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 3fc74681cc6c..beb05a95e4a3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -2066,6 +2066,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 	int retval = 0;
 	int flag = 0;
 	int ispipe;
+	struct files_struct *displaced;
 	bool need_nonrelative = false;
 	static atomic_t core_dump_count = ATOMIC_INIT(0);
 	struct coredump_params cprm = {
@@ -2219,6 +2220,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 			goto close_fail;
 	}
 
+	/* get us an unshared descriptor table; almost always a no-op */
+	retval = unshare_files(&displaced);
+	if (retval)
+		goto close_fail;
+	if (displaced)
+		put_files_struct(displaced);
 	retval = binfmt->core_dump(&cprm);
 	if (retval)
 		current->signal->group_exit_code |= 0x80;
-- 
cgit v1.2.3


From 864bdb3b6cbd9911222543fef1cfe36f88183f44 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 22 Aug 2012 18:42:10 -0400
Subject: new helper: daemonize_descriptors()

descriptor-related parts of daemonize, done right.  As the
result we simplify the locking rules for ->files - we
hold task_lock in *all* cases when we modify ->files.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c               | 6 ++++++
 include/linux/fdtable.h | 1 +
 kernel/exit.c           | 4 +---
 3 files changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index e6e418122587..15750b80e3ce 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -519,6 +519,12 @@ struct files_struct init_files = {
 	.file_lock	= __SPIN_LOCK_UNLOCKED(init_task.file_lock),
 };
 
+void daemonize_descriptors(void)
+{
+	atomic_inc(&init_files.count);
+	reset_files_struct(&init_files);
+}
+
 /*
  * allocate a file descriptor, mark it busy.
  */
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index fb7dacae0522..45052aa814c8 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -95,6 +95,7 @@ struct task_struct;
 struct files_struct *get_files_struct(struct task_struct *);
 void put_files_struct(struct files_struct *fs);
 void reset_files_struct(struct files_struct *);
+void daemonize_descriptors(void);
 int unshare_files(struct files_struct **);
 struct files_struct *dup_fd(struct files_struct *, int *);
 void do_close_on_exec(struct files_struct *);
diff --git a/kernel/exit.c b/kernel/exit.c
index 20dfc7617c2e..095113321318 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -457,9 +457,7 @@ void daemonize(const char *name, ...)
 	/* Become as one with the init task */
 
 	daemonize_fs_struct();
-	exit_files(current);
-	current->files = init_task.files;
-	atomic_inc(&current->files->count);
+	daemonize_descriptors();
 
 	reparent_to_kthreadd();
 }
-- 
cgit v1.2.3


From faf60af17f8da87e1c87a6be527344791025ce9e Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Thu, 23 Aug 2012 14:43:24 +0400
Subject: procfs: Move /proc/pid/fd[info] handling code to fd.[ch]

This patch prepares the ground for further extension of
/proc/pid/fd[info] handling code by moving fdinfo handling
code into fs/proc/fd.c.

I think such move makes both fs/proc/base.c and fs/proc/fd.c
easier to read.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Pavel Emelyanov <xemul@parallels.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: James Bottomley <jbottomley@parallels.com>
CC: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Matthew Helsley <matt.helsley@gmail.com>
CC: "J. Bruce Fields" <bfields@fieldses.org>
CC: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/Makefile   |   2 +-
 fs/proc/base.c     | 388 +----------------------------------------------------
 fs/proc/fd.c       | 351 ++++++++++++++++++++++++++++++++++++++++++++++++
 fs/proc/fd.h       |  14 ++
 fs/proc/internal.h |  48 +++++++
 5 files changed, 416 insertions(+), 387 deletions(-)
 create mode 100644 fs/proc/fd.c
 create mode 100644 fs/proc/fd.h

(limited to 'fs')

diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index c1c729335924..99349efbbc2b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -8,7 +8,7 @@ proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= mmu.o task_mmu.o
 
 proc-y       += inode.o root.o base.o generic.o array.o \
-		proc_tty.o
+		proc_tty.o fd.o
 proc-y	+= cmdline.o
 proc-y	+= consoles.o
 proc-y	+= cpuinfo.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1b6c84cbdb73..b55c3bb298e3 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -90,6 +90,7 @@
 #endif
 #include <trace/events/oom.h>
 #include "internal.h"
+#include "fd.h"
 
 /* NOTE:
  *	Implementing inode permission operations in /proc is almost
@@ -136,8 +137,6 @@ struct pid_entry {
 		NULL, &proc_single_file_operations,	\
 		{ .proc_show = show } )
 
-static int proc_fd_permission(struct inode *inode, int mask);
-
 /*
  * Count the number of hardlinks for the pid_entry table, excluding the .
  * and .. links.
@@ -1492,7 +1491,7 @@ out:
 	return error;
 }
 
-static const struct inode_operations proc_pid_link_inode_operations = {
+const struct inode_operations proc_pid_link_inode_operations = {
 	.readlink	= proc_pid_readlink,
 	.follow_link	= proc_pid_follow_link,
 	.setattr	= proc_setattr,
@@ -1501,21 +1500,6 @@ static const struct inode_operations proc_pid_link_inode_operations = {
 
 /* building an inode */
 
-static int task_dumpable(struct task_struct *task)
-{
-	int dumpable = 0;
-	struct mm_struct *mm;
-
-	task_lock(task);
-	mm = task->mm;
-	if (mm)
-		dumpable = get_dumpable(mm);
-	task_unlock(task);
-	if(dumpable == 1)
-		return 1;
-	return 0;
-}
-
 struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
 {
 	struct inode * inode;
@@ -1641,15 +1625,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags)
 	return 0;
 }
 
-static int pid_delete_dentry(const struct dentry * dentry)
-{
-	/* Is the task we represent dead?
-	 * If so, then don't put the dentry on the lru list,
-	 * kill it immediately.
-	 */
-	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
-}
-
 const struct dentry_operations pid_dentry_operations =
 {
 	.d_revalidate	= pid_revalidate,
@@ -1712,289 +1687,6 @@ end_instantiate:
 	return filldir(dirent, name, len, filp->f_pos, ino, type);
 }
 
-static unsigned name_to_int(struct dentry *dentry)
-{
-	const char *name = dentry->d_name.name;
-	int len = dentry->d_name.len;
-	unsigned n = 0;
-
-	if (len > 1 && *name == '0')
-		goto out;
-	while (len-- > 0) {
-		unsigned c = *name++ - '0';
-		if (c > 9)
-			goto out;
-		if (n >= (~0U-9)/10)
-			goto out;
-		n *= 10;
-		n += c;
-	}
-	return n;
-out:
-	return ~0U;
-}
-
-#define PROC_FDINFO_MAX 64
-
-static int proc_fd_info(struct inode *inode, struct path *path, char *info)
-{
-	struct task_struct *task = get_proc_task(inode);
-	struct files_struct *files = NULL;
-	struct file *file;
-	int fd = proc_fd(inode);
-
-	if (task) {
-		files = get_files_struct(task);
-		put_task_struct(task);
-	}
-	if (files) {
-		/*
-		 * We are not taking a ref to the file structure, so we must
-		 * hold ->file_lock.
-		 */
-		spin_lock(&files->file_lock);
-		file = fcheck_files(files, fd);
-		if (file) {
-			unsigned int f_flags;
-			struct fdtable *fdt;
-
-			fdt = files_fdtable(files);
-			f_flags = file->f_flags & ~O_CLOEXEC;
-			if (close_on_exec(fd, fdt))
-				f_flags |= O_CLOEXEC;
-
-			if (path) {
-				*path = file->f_path;
-				path_get(&file->f_path);
-			}
-			if (info)
-				snprintf(info, PROC_FDINFO_MAX,
-					 "pos:\t%lli\n"
-					 "flags:\t0%o\n",
-					 (long long) file->f_pos,
-					 f_flags);
-			spin_unlock(&files->file_lock);
-			put_files_struct(files);
-			return 0;
-		}
-		spin_unlock(&files->file_lock);
-		put_files_struct(files);
-	}
-	return -ENOENT;
-}
-
-static int proc_fd_link(struct dentry *dentry, struct path *path)
-{
-	return proc_fd_info(dentry->d_inode, path, NULL);
-}
-
-static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	struct inode *inode;
-	struct task_struct *task;
-	int fd;
-	struct files_struct *files;
-	const struct cred *cred;
-
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	inode = dentry->d_inode;
-	task = get_proc_task(inode);
-	fd = proc_fd(inode);
-
-	if (task) {
-		files = get_files_struct(task);
-		if (files) {
-			struct file *file;
-			rcu_read_lock();
-			file = fcheck_files(files, fd);
-			if (file) {
-				unsigned f_mode = file->f_mode;
-
-				rcu_read_unlock();
-				put_files_struct(files);
-
-				if (task_dumpable(task)) {
-					rcu_read_lock();
-					cred = __task_cred(task);
-					inode->i_uid = cred->euid;
-					inode->i_gid = cred->egid;
-					rcu_read_unlock();
-				} else {
-					inode->i_uid = GLOBAL_ROOT_UID;
-					inode->i_gid = GLOBAL_ROOT_GID;
-				}
-
-				if (S_ISLNK(inode->i_mode)) {
-					unsigned i_mode = S_IFLNK;
-					if (f_mode & FMODE_READ)
-						i_mode |= S_IRUSR | S_IXUSR;
-					if (f_mode & FMODE_WRITE)
-						i_mode |= S_IWUSR | S_IXUSR;
-					inode->i_mode = i_mode;
-				}
-
-				security_task_to_inode(task, inode);
-				put_task_struct(task);
-				return 1;
-			}
-			rcu_read_unlock();
-			put_files_struct(files);
-		}
-		put_task_struct(task);
-	}
-	d_drop(dentry);
-	return 0;
-}
-
-static const struct dentry_operations tid_fd_dentry_operations =
-{
-	.d_revalidate	= tid_fd_revalidate,
-	.d_delete	= pid_delete_dentry,
-};
-
-static struct dentry *proc_fd_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
-{
-	unsigned fd = (unsigned long)ptr;
- 	struct inode *inode;
- 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-ENOENT);
-
-	inode = proc_pid_make_inode(dir->i_sb, task);
-	if (!inode)
-		goto out;
-	ei = PROC_I(inode);
-	ei->fd = fd;
-
-	inode->i_mode = S_IFLNK;
-	inode->i_op = &proc_pid_link_inode_operations;
-	inode->i_size = 64;
-	ei->op.proc_get_link = proc_fd_link;
-	d_set_d_op(dentry, &tid_fd_dentry_operations);
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, 0))
-		error = NULL;
-
- out:
-	return error;
-}
-
-static struct dentry *proc_lookupfd_common(struct inode *dir,
-					   struct dentry *dentry,
-					   instantiate_t instantiate)
-{
-	struct task_struct *task = get_proc_task(dir);
-	unsigned fd = name_to_int(dentry);
-	struct dentry *result = ERR_PTR(-ENOENT);
-
-	if (!task)
-		goto out_no_task;
-	if (fd == ~0U)
-		goto out;
-
-	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
-out:
-	put_task_struct(task);
-out_no_task:
-	return result;
-}
-
-static int proc_readfd_common(struct file * filp, void * dirent,
-			      filldir_t filldir, instantiate_t instantiate)
-{
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = dentry->d_inode;
-	struct task_struct *p = get_proc_task(inode);
-	unsigned int fd, ino;
-	int retval;
-	struct files_struct * files;
-
-	retval = -ENOENT;
-	if (!p)
-		goto out_no_task;
-	retval = 0;
-
-	fd = filp->f_pos;
-	switch (fd) {
-		case 0:
-			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		case 1:
-			ino = parent_ino(dentry);
-			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
-				goto out;
-			filp->f_pos++;
-		default:
-			files = get_files_struct(p);
-			if (!files)
-				goto out;
-			rcu_read_lock();
-			for (fd = filp->f_pos-2;
-			     fd < files_fdtable(files)->max_fds;
-			     fd++, filp->f_pos++) {
-				char name[PROC_NUMBUF];
-				int len;
-				int rv;
-
-				if (!fcheck_files(files, fd))
-					continue;
-				rcu_read_unlock();
-
-				len = snprintf(name, sizeof(name), "%d", fd);
-				rv = proc_fill_cache(filp, dirent, filldir,
-						     name, len, instantiate, p,
-						     (void *)(unsigned long)fd);
-				if (rv < 0)
-					goto out_fd_loop;
-				rcu_read_lock();
-			}
-			rcu_read_unlock();
-out_fd_loop:
-			put_files_struct(files);
-	}
-out:
-	put_task_struct(p);
-out_no_task:
-	return retval;
-}
-
-static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
-				    unsigned int flags)
-{
-	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
-}
-
-static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
-{
-	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
-}
-
-static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
-				      size_t len, loff_t *ppos)
-{
-	char tmp[PROC_FDINFO_MAX];
-	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
-	if (!err)
-		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
-	return err;
-}
-
-static const struct file_operations proc_fdinfo_file_operations = {
-	.open           = nonseekable_open,
-	.read		= proc_fdinfo_read,
-	.llseek		= no_llseek,
-};
-
-static const struct file_operations proc_fd_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_readfd,
-	.llseek		= default_llseek,
-};
-
 #ifdef CONFIG_CHECKPOINT_RESTORE
 
 /*
@@ -2337,82 +2029,6 @@ static const struct file_operations proc_map_files_operations = {
 
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
-/*
- * /proc/pid/fd needs a special permission handler so that a process can still
- * access /proc/self/fd after it has executed a setuid().
- */
-static int proc_fd_permission(struct inode *inode, int mask)
-{
-	int rv = generic_permission(inode, mask);
-	if (rv == 0)
-		return 0;
-	if (task_pid(current) == proc_pid(inode))
-		rv = 0;
-	return rv;
-}
-
-/*
- * proc directories can do almost nothing..
- */
-static const struct inode_operations proc_fd_inode_operations = {
-	.lookup		= proc_lookupfd,
-	.permission	= proc_fd_permission,
-	.setattr	= proc_setattr,
-};
-
-static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
-	struct dentry *dentry, struct task_struct *task, const void *ptr)
-{
-	unsigned fd = (unsigned long)ptr;
- 	struct inode *inode;
- 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-ENOENT);
-
-	inode = proc_pid_make_inode(dir->i_sb, task);
-	if (!inode)
-		goto out;
-	ei = PROC_I(inode);
-	ei->fd = fd;
-	inode->i_mode = S_IFREG | S_IRUSR;
-	inode->i_fop = &proc_fdinfo_file_operations;
-	d_set_d_op(dentry, &tid_fd_dentry_operations);
-	d_add(dentry, inode);
-	/* Close the race of the process dying before we return the dentry */
-	if (tid_fd_revalidate(dentry, 0))
-		error = NULL;
-
- out:
-	return error;
-}
-
-static struct dentry *proc_lookupfdinfo(struct inode *dir,
-					struct dentry *dentry,
-					unsigned int flags)
-{
-	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
-}
-
-static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
-{
-	return proc_readfd_common(filp, dirent, filldir,
-				  proc_fdinfo_instantiate);
-}
-
-static const struct file_operations proc_fdinfo_operations = {
-	.read		= generic_read_dir,
-	.readdir	= proc_readfdinfo,
-	.llseek		= default_llseek,
-};
-
-/*
- * proc directories can do almost nothing..
- */
-static const struct inode_operations proc_fdinfo_inode_operations = {
-	.lookup		= proc_lookupfdinfo,
-	.setattr	= proc_setattr,
-};
-
-
 static struct dentry *proc_pident_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
new file mode 100644
index 000000000000..1b0f932b4bd9
--- /dev/null
+++ b/fs/proc/fd.c
@@ -0,0 +1,351 @@
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/dcache.h>
+#include <linux/path.h>
+#include <linux/fdtable.h>
+#include <linux/namei.h>
+#include <linux/pid.h>
+#include <linux/security.h>
+
+#include <linux/proc_fs.h>
+
+#include "internal.h"
+#include "fd.h"
+
+#define PROC_FDINFO_MAX 64
+
+static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+{
+	struct task_struct *task = get_proc_task(inode);
+	struct files_struct *files = NULL;
+	int fd = proc_fd(inode);
+	struct file *file;
+
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
+	if (files) {
+		/*
+		 * We are not taking a ref to the file structure, so we must
+		 * hold ->file_lock.
+		 */
+		spin_lock(&files->file_lock);
+		file = fcheck_files(files, fd);
+		if (file) {
+			unsigned int f_flags;
+			struct fdtable *fdt;
+
+			fdt = files_fdtable(files);
+			f_flags = file->f_flags & ~O_CLOEXEC;
+			if (close_on_exec(fd, fdt))
+				f_flags |= O_CLOEXEC;
+
+			if (path) {
+				*path = file->f_path;
+				path_get(&file->f_path);
+			}
+			if (info)
+				snprintf(info, PROC_FDINFO_MAX,
+					 "pos:\t%lli\n"
+					 "flags:\t0%o\n",
+					 (long long) file->f_pos,
+					 f_flags);
+			spin_unlock(&files->file_lock);
+			put_files_struct(files);
+			return 0;
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+	return -ENOENT;
+}
+
+static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct files_struct *files;
+	struct task_struct *task;
+	const struct cred *cred;
+	struct inode *inode;
+	int fd;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	inode = dentry->d_inode;
+	task = get_proc_task(inode);
+	fd = proc_fd(inode);
+
+	if (task) {
+		files = get_files_struct(task);
+		if (files) {
+			struct file *file;
+
+			rcu_read_lock();
+			file = fcheck_files(files, fd);
+			if (file) {
+				unsigned f_mode = file->f_mode;
+
+				rcu_read_unlock();
+				put_files_struct(files);
+
+				if (task_dumpable(task)) {
+					rcu_read_lock();
+					cred = __task_cred(task);
+					inode->i_uid = cred->euid;
+					inode->i_gid = cred->egid;
+					rcu_read_unlock();
+				} else {
+					inode->i_uid = GLOBAL_ROOT_UID;
+					inode->i_gid = GLOBAL_ROOT_GID;
+				}
+
+				if (S_ISLNK(inode->i_mode)) {
+					unsigned i_mode = S_IFLNK;
+					if (f_mode & FMODE_READ)
+						i_mode |= S_IRUSR | S_IXUSR;
+					if (f_mode & FMODE_WRITE)
+						i_mode |= S_IWUSR | S_IXUSR;
+					inode->i_mode = i_mode;
+				}
+
+				security_task_to_inode(task, inode);
+				put_task_struct(task);
+				return 1;
+			}
+			rcu_read_unlock();
+			put_files_struct(files);
+		}
+		put_task_struct(task);
+	}
+
+	d_drop(dentry);
+	return 0;
+}
+
+static const struct dentry_operations tid_fd_dentry_operations = {
+	.d_revalidate	= tid_fd_revalidate,
+	.d_delete	= pid_delete_dentry,
+};
+
+static int proc_fd_link(struct dentry *dentry, struct path *path)
+{
+	return proc_fd_info(dentry->d_inode, path, NULL);
+}
+
+static struct dentry *
+proc_fd_instantiate(struct inode *dir, struct dentry *dentry,
+		    struct task_struct *task, const void *ptr)
+{
+	struct dentry *error = ERR_PTR(-ENOENT);
+	unsigned fd = (unsigned long)ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	ei->fd = fd;
+
+	inode->i_mode = S_IFLNK;
+	inode->i_op = &proc_pid_link_inode_operations;
+	inode->i_size = 64;
+
+	ei->op.proc_get_link = proc_fd_link;
+
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
+	d_add(dentry, inode);
+
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, 0))
+		error = NULL;
+ out:
+	return error;
+}
+
+static struct dentry *proc_lookupfd_common(struct inode *dir,
+					   struct dentry *dentry,
+					   instantiate_t instantiate)
+{
+	struct task_struct *task = get_proc_task(dir);
+	struct dentry *result = ERR_PTR(-ENOENT);
+	unsigned fd = name_to_int(dentry);
+
+	if (!task)
+		goto out_no_task;
+	if (fd == ~0U)
+		goto out;
+
+	result = instantiate(dir, dentry, task, (void *)(unsigned long)fd);
+out:
+	put_task_struct(task);
+out_no_task:
+	return result;
+}
+
+static int proc_readfd_common(struct file * filp, void * dirent,
+			      filldir_t filldir, instantiate_t instantiate)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *p = get_proc_task(inode);
+	struct files_struct *files;
+	unsigned int fd, ino;
+	int retval;
+
+	retval = -ENOENT;
+	if (!p)
+		goto out_no_task;
+	retval = 0;
+
+	fd = filp->f_pos;
+	switch (fd) {
+		case 0:
+			if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
+				goto out;
+			filp->f_pos++;
+		case 1:
+			ino = parent_ino(dentry);
+			if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+				goto out;
+			filp->f_pos++;
+		default:
+			files = get_files_struct(p);
+			if (!files)
+				goto out;
+			rcu_read_lock();
+			for (fd = filp->f_pos - 2;
+			     fd < files_fdtable(files)->max_fds;
+			     fd++, filp->f_pos++) {
+				char name[PROC_NUMBUF];
+				int len;
+				int rv;
+
+				if (!fcheck_files(files, fd))
+					continue;
+				rcu_read_unlock();
+
+				len = snprintf(name, sizeof(name), "%d", fd);
+				rv = proc_fill_cache(filp, dirent, filldir,
+						     name, len, instantiate, p,
+						     (void *)(unsigned long)fd);
+				if (rv < 0)
+					goto out_fd_loop;
+				rcu_read_lock();
+			}
+			rcu_read_unlock();
+out_fd_loop:
+			put_files_struct(files);
+	}
+out:
+	put_task_struct(p);
+out_no_task:
+	return retval;
+}
+
+static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
+				size_t len, loff_t *ppos)
+{
+	char tmp[PROC_FDINFO_MAX];
+	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
+	if (!err)
+		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
+	return err;
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+	.open           = nonseekable_open,
+	.read		= proc_fdinfo_read,
+	.llseek		= no_llseek,
+};
+
+static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
+{
+	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
+}
+
+const struct file_operations proc_fd_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_readfd,
+	.llseek		= default_llseek,
+};
+
+static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
+				    unsigned int flags)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
+}
+
+/*
+ * /proc/pid/fd needs a special permission handler so that a process can still
+ * access /proc/self/fd after it has executed a setuid().
+ */
+int proc_fd_permission(struct inode *inode, int mask)
+{
+	int rv = generic_permission(inode, mask);
+	if (rv == 0)
+		return 0;
+	if (task_pid(current) == proc_pid(inode))
+		rv = 0;
+	return rv;
+}
+
+const struct inode_operations proc_fd_inode_operations = {
+	.lookup		= proc_lookupfd,
+	.permission	= proc_fd_permission,
+	.setattr	= proc_setattr,
+};
+
+static struct dentry *
+proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry,
+			struct task_struct *task, const void *ptr)
+{
+	struct dentry *error = ERR_PTR(-ENOENT);
+	unsigned fd = (unsigned long)ptr;
+	struct proc_inode *ei;
+	struct inode *inode;
+
+	inode = proc_pid_make_inode(dir->i_sb, task);
+	if (!inode)
+		goto out;
+
+	ei = PROC_I(inode);
+	ei->fd = fd;
+
+	inode->i_mode = S_IFREG | S_IRUSR;
+	inode->i_fop = &proc_fdinfo_file_operations;
+
+	d_set_d_op(dentry, &tid_fd_dentry_operations);
+	d_add(dentry, inode);
+
+	/* Close the race of the process dying before we return the dentry */
+	if (tid_fd_revalidate(dentry, 0))
+		error = NULL;
+ out:
+	return error;
+}
+
+static struct dentry *
+proc_lookupfdinfo(struct inode *dir, struct dentry *dentry, unsigned int flags)
+{
+	return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
+}
+
+static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
+{
+	return proc_readfd_common(filp, dirent, filldir,
+				  proc_fdinfo_instantiate);
+}
+
+const struct inode_operations proc_fdinfo_inode_operations = {
+	.lookup		= proc_lookupfdinfo,
+	.setattr	= proc_setattr,
+};
+
+const struct file_operations proc_fdinfo_operations = {
+	.read		= generic_read_dir,
+	.readdir	= proc_readfdinfo,
+	.llseek		= default_llseek,
+};
diff --git a/fs/proc/fd.h b/fs/proc/fd.h
new file mode 100644
index 000000000000..cbb1d47deda8
--- /dev/null
+++ b/fs/proc/fd.h
@@ -0,0 +1,14 @@
+#ifndef __PROCFS_FD_H__
+#define __PROCFS_FD_H__
+
+#include <linux/fs.h>
+
+extern const struct file_operations proc_fd_operations;
+extern const struct inode_operations proc_fd_inode_operations;
+
+extern const struct file_operations proc_fdinfo_operations;
+extern const struct inode_operations proc_fdinfo_inode_operations;
+
+extern int proc_fd_permission(struct inode *inode, int mask);
+
+#endif /* __PROCFS_FD_H__ */
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index e1167a1c9126..67925a7bd8cb 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -9,6 +9,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/sched.h>
 #include <linux/proc_fs.h>
 struct  ctl_table_header;
 
@@ -65,6 +66,7 @@ extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
 extern const struct file_operations proc_net_operations;
 extern const struct inode_operations proc_net_inode_operations;
+extern const struct inode_operations proc_pid_link_inode_operations;
 
 struct proc_maps_private {
 	struct pid *pid;
@@ -91,6 +93,52 @@ static inline int proc_fd(struct inode *inode)
 	return PROC_I(inode)->fd;
 }
 
+static inline int task_dumpable(struct task_struct *task)
+{
+	int dumpable = 0;
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm)
+		dumpable = get_dumpable(mm);
+	task_unlock(task);
+	if(dumpable == 1)
+		return 1;
+	return 0;
+}
+
+static inline int pid_delete_dentry(const struct dentry * dentry)
+{
+	/* Is the task we represent dead?
+	 * If so, then don't put the dentry on the lru list,
+	 * kill it immediately.
+	 */
+	return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
+}
+
+static inline unsigned name_to_int(struct dentry *dentry)
+{
+	const char *name = dentry->d_name.name;
+	int len = dentry->d_name.len;
+	unsigned n = 0;
+
+	if (len > 1 && *name == '0')
+		goto out;
+	while (len-- > 0) {
+		unsigned c = *name++ - '0';
+		if (c > 9)
+			goto out;
+		if (n >= (~0U-9)/10)
+			goto out;
+		n *= 10;
+		n += c;
+	}
+	return n;
+out:
+	return ~0U;
+}
+
 struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *ino,
 		struct dentry *dentry);
 int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
-- 
cgit v1.2.3


From ddd3e0771bc7b869c550687c204e21f0155d5496 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 26 Aug 2012 18:28:20 +0400
Subject: procfs: Convert /proc/pid/fdinfo/ handling routines to seq-file v2

This patch converts /proc/pid/fdinfo/ handling routines to seq-file which
is needed to extend seq operations and plug in auxiliary fdinfo provides
from subsystems like eventfd/eventpoll/fsnotify.

Note the proc_fd_link no longer call for proc_fd_info, simply because
the guts of proc_fd_info() got merged into ->show() of that seq_file

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/fd.c | 112 ++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 64 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 1b0f932b4bd9..9cef449c0f76 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -6,61 +6,68 @@
 #include <linux/namei.h>
 #include <linux/pid.h>
 #include <linux/security.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
 
 #include <linux/proc_fs.h>
 
 #include "internal.h"
 #include "fd.h"
 
-#define PROC_FDINFO_MAX 64
-
-static int proc_fd_info(struct inode *inode, struct path *path, char *info)
+static int seq_show(struct seq_file *m, void *v)
 {
-	struct task_struct *task = get_proc_task(inode);
 	struct files_struct *files = NULL;
-	int fd = proc_fd(inode);
-	struct file *file;
+	int f_flags = 0, ret = -ENOENT;
+	struct file *file = NULL;
+	struct task_struct *task;
+
+	task = get_proc_task(m->private);
+	if (!task)
+		return -ENOENT;
+
+	files = get_files_struct(task);
+	put_task_struct(task);
 
-	if (task) {
-		files = get_files_struct(task);
-		put_task_struct(task);
-	}
 	if (files) {
-		/*
-		 * We are not taking a ref to the file structure, so we must
-		 * hold ->file_lock.
-		 */
+		int fd = proc_fd(m->private);
+
 		spin_lock(&files->file_lock);
 		file = fcheck_files(files, fd);
 		if (file) {
-			unsigned int f_flags;
-			struct fdtable *fdt;
+			struct fdtable *fdt = files_fdtable(files);
 
-			fdt = files_fdtable(files);
 			f_flags = file->f_flags & ~O_CLOEXEC;
 			if (close_on_exec(fd, fdt))
 				f_flags |= O_CLOEXEC;
 
-			if (path) {
-				*path = file->f_path;
-				path_get(&file->f_path);
-			}
-			if (info)
-				snprintf(info, PROC_FDINFO_MAX,
-					 "pos:\t%lli\n"
-					 "flags:\t0%o\n",
-					 (long long) file->f_pos,
-					 f_flags);
-			spin_unlock(&files->file_lock);
-			put_files_struct(files);
-			return 0;
+			get_file(file);
+			ret = 0;
 		}
 		spin_unlock(&files->file_lock);
 		put_files_struct(files);
 	}
-	return -ENOENT;
+
+	if (!ret) {
+                seq_printf(m, "pos:\t%lli\nflags:\t0%o\n",
+			   (long long)file->f_pos, f_flags);
+		fput(file);
+	}
+
+	return ret;
 }
 
+static int seq_fdinfo_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, seq_show, inode);
+}
+
+static const struct file_operations proc_fdinfo_file_operations = {
+	.open		= seq_fdinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
 static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags)
 {
 	struct files_struct *files;
@@ -130,7 +137,32 @@ static const struct dentry_operations tid_fd_dentry_operations = {
 
 static int proc_fd_link(struct dentry *dentry, struct path *path)
 {
-	return proc_fd_info(dentry->d_inode, path, NULL);
+	struct files_struct *files = NULL;
+	struct task_struct *task;
+	int ret = -ENOENT;
+
+	task = get_proc_task(dentry->d_inode);
+	if (task) {
+		files = get_files_struct(task);
+		put_task_struct(task);
+	}
+
+	if (files) {
+		int fd = proc_fd(dentry->d_inode);
+		struct file *fd_file;
+
+		spin_lock(&files->file_lock);
+		fd_file = fcheck_files(files, fd);
+		if (fd_file) {
+			*path = fd_file->f_path;
+			path_get(&fd_file->f_path);
+			ret = 0;
+		}
+		spin_unlock(&files->file_lock);
+		put_files_struct(files);
+	}
+
+	return ret;
 }
 
 static struct dentry *
@@ -245,22 +277,6 @@ out_no_task:
 	return retval;
 }
 
-static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
-				size_t len, loff_t *ppos)
-{
-	char tmp[PROC_FDINFO_MAX];
-	int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
-	if (!err)
-		err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
-	return err;
-}
-
-static const struct file_operations proc_fdinfo_file_operations = {
-	.open           = nonseekable_open,
-	.read		= proc_fdinfo_read,
-	.llseek		= no_llseek,
-};
-
 static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
 {
 	return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
-- 
cgit v1.2.3


From c6f3d81115989e274c42a852222b80d2e14ced6f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 11:01:04 -0400
Subject: don't leak O_CLOEXEC into ->f_flags

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c    | 2 +-
 fs/proc/fd.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 30760017deed..03028d0e7487 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -814,7 +814,7 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 		op->mode = 0;
 
 	/* Must never be set by userspace */
-	flags &= ~FMODE_NONOTIFY;
+	flags &= ~FMODE_NONOTIFY & ~O_CLOEXEC;
 
 	/*
 	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 9cef449c0f76..f28a875f8779 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -36,7 +36,7 @@ static int seq_show(struct seq_file *m, void *v)
 		if (file) {
 			struct fdtable *fdt = files_fdtable(files);
 
-			f_flags = file->f_flags & ~O_CLOEXEC;
+			f_flags = file->f_flags;
 			if (close_on_exec(fd, fdt))
 				f_flags |= O_CLOEXEC;
 
-- 
cgit v1.2.3


From f6d2ac5ca79d1fd468dbc77e488aec06e86f3035 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 12:55:54 -0400
Subject: namei.c: fix BS comment

get_write_access() is needed for nfsd, not binfmt_aout (the latter
has no business doing anything of that kind, of course)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index dd1ed1b8e98e..c5b85b3523c0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3971,7 +3971,7 @@ EXPORT_SYMBOL(user_path_at);
 EXPORT_SYMBOL(follow_down_one);
 EXPORT_SYMBOL(follow_down);
 EXPORT_SYMBOL(follow_up);
-EXPORT_SYMBOL(get_write_access); /* binfmt_aout */
+EXPORT_SYMBOL(get_write_access); /* nfsd */
 EXPORT_SYMBOL(getname);
 EXPORT_SYMBOL(lock_rename);
 EXPORT_SYMBOL(lookup_one_len);
-- 
cgit v1.2.3


From bf2965d5b5950d09e934ea5d961d79d0ed1fae7e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 20:13:36 -0400
Subject: switch ftruncate(2) to fget_light

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 03028d0e7487..9f61d7269d39 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -132,16 +132,16 @@ SYSCALL_DEFINE2(truncate, const char __user *, path, long, length)
 
 static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 {
-	struct inode * inode;
+	struct inode *inode;
 	struct dentry *dentry;
-	struct file * file;
-	int error;
+	struct file *file;
+	int error, fput_needed;
 
 	error = -EINVAL;
 	if (length < 0)
 		goto out;
 	error = -EBADF;
-	file = fget(fd);
+	file = fget_light(fd, &fput_needed);
 	if (!file)
 		goto out;
 
@@ -172,7 +172,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
 	sb_end_write(inode->i_sb);
 out_putf:
-	fput(file);
+	fput_light(file, fput_needed);
 out:
 	return error;
 }
-- 
cgit v1.2.3


From 6b48c5b2079af1f81d8f249ae07a988d8c45b32f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 20:15:40 -0400
Subject: switch fallocate(2) to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 9f61d7269d39..da6d3f1ac243 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -277,12 +277,12 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
 {
 	struct file *file;
-	int error = -EBADF;
+	int error = -EBADF, fput_needed;
 
-	file = fget(fd);
+	file = fget_light(fd, &fput_needed);
 	if (file) {
 		error = do_fallocate(file, mode, offset, len);
-		fput(file);
+		fput_light(file, fput_needed);
 	}
 
 	return error;
-- 
cgit v1.2.3


From d6483b7a78438bc333560d11b69e6a6a6cf55940 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 20:22:10 -0400
Subject: switch fchmod(2) to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index da6d3f1ac243..3c741eae6b99 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -582,23 +582,21 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
 
 SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 {
-	struct file * file;
-	int error = -EBADF;
-	struct dentry * dentry;
+	struct file *file;
+	int error = -EBADF, fput_needed;
 
-	file = fget(fd);
+	file = fget_light(fd, &fput_needed);
 	if (!file)
 		goto out;
 
 	error = mnt_want_write_file(file);
 	if (error)
 		goto out_fput;
-	dentry = file->f_path.dentry;
-	audit_inode(NULL, dentry);
+	audit_inode(NULL, file->f_path.dentry);
 	error = chown_common(&file->f_path, user, group);
 	mnt_drop_write_file(file);
 out_fput:
-	fput(file);
+	fput_light(file, fput_needed);
 out:
 	return error;
 }
-- 
cgit v1.2.3


From 399c9b862f853f5c33e5a3b1f9a3c2507bdd526b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 21:00:03 -0400
Subject: ext4: close struct file leak on EXT4_IOC_MOVE_EXT

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/ioctl.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 7f7dad787603..a0ee682dc394 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,8 @@ group_extend_out:
 			       EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
 			ext4_msg(sb, KERN_ERR,
 				 "Online defrag not supported with bigalloc");
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			goto mext_out;
 		}
 
 		err = mnt_want_write_file(filp);
-- 
cgit v1.2.3


From 4557c669ef9801d96cf663331cdd1dcb8fa9c2f1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Aug 2012 10:19:41 -0400
Subject: export fget_light

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/file.c b/fs/file.c
index 15750b80e3ce..0f1bda4bebfa 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -797,6 +797,7 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 
 	return file;
 }
+EXPORT_SYMBOL(fget_light);
 
 struct file *fget_raw_light(unsigned int fd, int *fput_needed)
 {
-- 
cgit v1.2.3


From 6bdf2954016ef7c1f4d4fa07a338ee197d9c3506 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 21:01:46 -0400
Subject: switch EXT4_IOC_MOVE_EXT to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/ioctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index a0ee682dc394..39646f224514 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -234,7 +234,7 @@ group_extend_out:
 	case EXT4_IOC_MOVE_EXT: {
 		struct move_extent me;
 		struct file *donor_filp;
-		int err;
+		int err, fput_needed;
 
 		if (!(filp->f_mode & FMODE_READ) ||
 		    !(filp->f_mode & FMODE_WRITE))
@@ -245,7 +245,7 @@ group_extend_out:
 			return -EFAULT;
 		me.moved_len = 0;
 
-		donor_filp = fget(me.donor_fd);
+		donor_filp = fget_light(me.donor_fd, &fput_needed);
 		if (!donor_filp)
 			return -EBADF;
 
@@ -274,7 +274,7 @@ group_extend_out:
 				 &me, sizeof(me)))
 			err = -EFAULT;
 mext_out:
-		fput(donor_filp);
+		fput_light(donor_filp, fput_needed);
 		return err;
 	}
 
-- 
cgit v1.2.3


From ecd188159efa112770d5f8a4a62f8d3586784f48 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 21:20:24 -0400
Subject: switch btrfs_ioctl_snap_create_transid() to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/ioctl.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9df50fa8a078..3c88abb0e265 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1422,7 +1422,8 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 				     NULL, transid, readonly, inherit);
 	} else {
 		struct inode *src_inode;
-		src_file = fget(fd);
+		int fput_needed;
+		src_file = fget_light(fd, &fput_needed);
 		if (!src_file) {
 			ret = -EINVAL;
 			goto out_drop_write;
@@ -1433,13 +1434,12 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 			printk(KERN_INFO "btrfs: Snapshot src from "
 			       "another FS\n");
 			ret = -EINVAL;
-			fput(src_file);
-			goto out_drop_write;
+		} else {
+			ret = btrfs_mksubvol(&file->f_path, name, namelen,
+					     BTRFS_I(src_inode)->root,
+					     transid, readonly, inherit);
 		}
-		ret = btrfs_mksubvol(&file->f_path, name, namelen,
-				     BTRFS_I(src_inode)->root,
-				     transid, readonly, inherit);
-		fput(src_file);
+		fput_light(src_file, fput_needed);
 	}
 out_drop_write:
 	mnt_drop_write_file(file);
-- 
cgit v1.2.3


From 5e196a9cf557dbc2bf808824c6c4998150e18d06 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 21:27:40 -0400
Subject: switch epoll_wait(2) to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/eventpoll.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index eedec84c1809..567ae72e155d 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1809,7 +1809,7 @@ error_return:
 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 		int, maxevents, int, timeout)
 {
-	int error;
+	int error, fput_needed;
 	struct file *file;
 	struct eventpoll *ep;
 
@@ -1825,7 +1825,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 
 	/* Get the "struct file *" for the eventpoll file */
 	error = -EBADF;
-	file = fget(epfd);
+	file = fget_light(epfd, &fput_needed);
 	if (!file)
 		goto error_return;
 
@@ -1847,7 +1847,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 	error = ep_poll(ep, events, maxevents, timeout);
 
 error_fput:
-	fput(file);
+	fput_light(file, fput_needed);
 error_return:
 
 	return error;
-- 
cgit v1.2.3


From 4109633f4c4dcdaedf0d85ae74dba334760c577b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 26 Aug 2012 21:32:02 -0400
Subject: switch timerfd_[sg]ettime(2) to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/timerfd.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/timerfd.c b/fs/timerfd.c
index dffeb3795af1..dd91e9420c9e 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -234,15 +234,15 @@ static const struct file_operations timerfd_fops = {
 	.llseek		= noop_llseek,
 };
 
-static struct file *timerfd_fget(int fd)
+static struct file *timerfd_fget(int fd, int *fput_needed)
 {
 	struct file *file;
 
-	file = fget(fd);
+	file = fget_light(fd, fput_needed);
 	if (!file)
 		return ERR_PTR(-EBADF);
 	if (file->f_op != &timerfd_fops) {
-		fput(file);
+		fput_light(file, *fput_needed);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -287,7 +287,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	struct file *file;
 	struct timerfd_ctx *ctx;
 	struct itimerspec ktmr, kotmr;
-	int ret;
+	int ret, fput_needed;
 
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
@@ -297,7 +297,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	    !timespec_valid(&ktmr.it_interval))
 		return -EINVAL;
 
-	file = timerfd_fget(ufd);
+	file = timerfd_fget(ufd, &fput_needed);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 	ctx = file->private_data;
@@ -334,7 +334,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	ret = timerfd_setup(ctx, flags, &ktmr);
 
 	spin_unlock_irq(&ctx->wqh.lock);
-	fput(file);
+	fput_light(file, fput_needed);
 	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
 		return -EFAULT;
 
@@ -346,8 +346,9 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 	struct file *file;
 	struct timerfd_ctx *ctx;
 	struct itimerspec kotmr;
+	int fput_needed;
 
-	file = timerfd_fget(ufd);
+	file = timerfd_fget(ufd, &fput_needed);
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 	ctx = file->private_data;
@@ -362,7 +363,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
 	kotmr.it_interval = ktime_to_timespec(ctx->tintv);
 	spin_unlock_irq(&ctx->wqh.lock);
-	fput(file);
+	fput_light(file, fput_needed);
 
 	return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
 }
-- 
cgit v1.2.3


From 8319aa9127a1282b24c3ece473a058d246f35b0d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 03:18:55 -0400
Subject: switch btrfs_ioctl_clone() to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/ioctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3c88abb0e265..3494f2f44167 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2350,7 +2350,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	struct btrfs_key key;
 	u32 nritems;
 	int slot;
-	int ret;
+	int ret, fput_needed;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
 	u64 hint_byte;
@@ -2376,7 +2376,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	if (ret)
 		return ret;
 
-	src_file = fget(srcfd);
+	src_file = fget_light(srcfd, &fput_needed);
 	if (!src_file) {
 		ret = -EBADF;
 		goto out_drop_write;
@@ -2724,7 +2724,7 @@ out_unlock:
 	vfree(buf);
 	btrfs_free_path(path);
 out_fput:
-	fput(src_file);
+	fput_light(src_file, fput_needed);
 out_drop_write:
 	mnt_drop_write_file(file);
 	return ret;
-- 
cgit v1.2.3


From 78f7d75e5dd9aa1027e90d0b71d394603933c2ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 12:54:13 -0400
Subject: switch coda get_device_index() to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/coda/inode.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index f1813120d753..bd2313d106e5 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -109,41 +109,39 @@ static int get_device_index(struct coda_mount_data *data)
 {
 	struct file *file;
 	struct inode *inode;
-	int idx;
+	int idx, fput_needed;
 
-	if(data == NULL) {
+	if (data == NULL) {
 		printk("coda_read_super: Bad mount data\n");
 		return -1;
 	}
 
-	if(data->version != CODA_MOUNT_VERSION) {
+	if (data->version != CODA_MOUNT_VERSION) {
 		printk("coda_read_super: Bad mount version\n");
 		return -1;
 	}
 
-	file = fget(data->fd);
-	inode = NULL;
-	if(file)
-		inode = file->f_path.dentry->d_inode;
-	
-	if(!inode || !S_ISCHR(inode->i_mode) ||
-	   imajor(inode) != CODA_PSDEV_MAJOR) {
-		if(file)
-			fput(file);
-
-		printk("coda_read_super: Bad file\n");
-		return -1;
+	file = fget_light(data->fd, &fput_needed);
+	if (!file)
+		goto Ebadf;
+	inode = file->f_path.dentry->d_inode;
+	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
+		fput_light(file, fput_needed);
+		goto Ebadf;
 	}
 
 	idx = iminor(inode);
-	fput(file);
+	fput_light(file, fput_needed);
 
-	if(idx < 0 || idx >= MAX_CODADEVS) {
+	if (idx < 0 || idx >= MAX_CODADEVS) {
 		printk("coda_read_super: Bad minor number\n");
 		return -1;
 	}
 
 	return idx;
+Ebadf:
+	printk("coda_read_super: Bad file\n");
+	return -1;
 }
 
 static int coda_fill_super(struct super_block *sb, void *data, int silent)
-- 
cgit v1.2.3


From 1ea65c96077f9bb5c0e5e224a4da751d269c5f94 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 12:57:12 -0400
Subject: switch xfs_swapext() to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xfs/xfs_dfrag.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e00de08dc8ac..e6cdf224d7ea 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -49,10 +49,10 @@ xfs_swapext(
 {
 	xfs_inode_t     *ip, *tip;
 	struct file	*file, *tmp_file;
-	int		error = 0;
+	int		error = 0, fput_needed, fput_needed_tmp;
 
 	/* Pull information for the target fd */
-	file = fget((int)sxp->sx_fdtarget);
+	file = fget_light((int)sxp->sx_fdtarget, &fput_needed);
 	if (!file) {
 		error = XFS_ERROR(EINVAL);
 		goto out;
@@ -65,7 +65,7 @@ xfs_swapext(
 		goto out_put_file;
 	}
 
-	tmp_file = fget((int)sxp->sx_fdtmp);
+	tmp_file = fget_light((int)sxp->sx_fdtmp, &fput_needed_tmp);
 	if (!tmp_file) {
 		error = XFS_ERROR(EINVAL);
 		goto out_put_file;
@@ -105,9 +105,9 @@ xfs_swapext(
 	error = xfs_swap_extents(ip, tip, sxp);
 
  out_put_tmp_file:
-	fput(tmp_file);
+	fput_light(tmp_file, fput_needed_tmp);
  out_put_file:
-	fput(file);
+	fput_light(file, fput_needed);
  out:
 	return error;
 }
-- 
cgit v1.2.3


From 64e09fa2e1fef1696a8685c7aad7e0d3dd24ce71 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 12:59:52 -0400
Subject: switch xfs_find_handle() to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xfs/xfs_ioctl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0e0232c3b6d9..21483eac402d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -72,11 +72,11 @@ xfs_find_handle(
 	struct inode		*inode;
 	struct file		*file = NULL;
 	struct path		path;
-	int			error;
+	int			error, fput_needed;
 	struct xfs_inode	*ip;
 
 	if (cmd == XFS_IOC_FD_TO_HANDLE) {
-		file = fget(hreq->fd);
+		file = fget_light(hreq->fd, &fput_needed);
 		if (!file)
 			return -EBADF;
 		inode = file->f_path.dentry->d_inode;
@@ -134,7 +134,7 @@ xfs_find_handle(
 
  out_put:
 	if (cmd == XFS_IOC_FD_TO_HANDLE)
-		fput(file);
+		fput_light(file, fput_needed);
 	else
 		path_put(&path);
 	return error;
-- 
cgit v1.2.3


From cb0942b81249798e15c3f04eee2946ef543e8115 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 14:48:26 -0400
Subject: make get_file() return its argument

simplifies a bunch of callers...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/perfmon.c         | 4 +---
 drivers/base/dma-buf.c             | 3 +--
 drivers/staging/omapdrm/omap_gem.c | 3 +--
 drivers/tty/tty_io.c               | 9 +++------
 fs/autofs4/waitq.c                 | 3 +--
 fs/fuse/dev.c                      | 3 +--
 fs/nfsd/nfs4state.c                | 3 +--
 fs/proc/base.c                     | 3 +--
 fs/select.c                        | 3 +--
 include/linux/fs.h                 | 6 +++++-
 mm/fremap.c                        | 3 +--
 mm/mmap.c                          | 3 +--
 mm/nommu.c                         | 6 ++----
 net/compat.c                       | 3 +--
 net/core/scm.c                     | 3 +--
 security/selinux/hooks.c           | 3 +--
 16 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 79826c13b8b6..ff5d4e4c3733 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2306,7 +2306,7 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 	 * partially initialize the vma for the sampling buffer
 	 */
 	vma->vm_mm	     = mm;
-	vma->vm_file	     = filp;
+	vma->vm_file	     = get_file(filp);
 	vma->vm_flags	     = VM_READ| VM_MAYREAD |VM_RESERVED;
 	vma->vm_page_prot    = PAGE_READONLY; /* XXX may need to change */
 
@@ -2345,8 +2345,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
 		goto error;
 	}
 
-	get_file(filp);
-
 	/*
 	 * now insert the vma in the vm list for the process, must be
 	 * done with mmap lock held
diff --git a/drivers/base/dma-buf.c b/drivers/base/dma-buf.c
index c30f3e1d0efc..460e22dee36d 100644
--- a/drivers/base/dma-buf.c
+++ b/drivers/base/dma-buf.c
@@ -460,8 +460,7 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
 	if (vma->vm_file)
 		fput(vma->vm_file);
 
-	vma->vm_file = dmabuf->file;
-	get_file(vma->vm_file);
+	vma->vm_file = get_file(dmabuf->file);
 
 	vma->vm_pgoff = pgoff;
 
diff --git a/drivers/staging/omapdrm/omap_gem.c b/drivers/staging/omapdrm/omap_gem.c
index 3a0d035a9e03..2a6bb7f9ee68 100644
--- a/drivers/staging/omapdrm/omap_gem.c
+++ b/drivers/staging/omapdrm/omap_gem.c
@@ -566,9 +566,8 @@ int omap_gem_mmap_obj(struct drm_gem_object *obj,
 		 * in particular in the case of mmap'd dmabufs)
 		 */
 		fput(vma->vm_file);
-		get_file(obj->filp);
 		vma->vm_pgoff = 0;
-		vma->vm_file  = obj->filp;
+		vma->vm_file  = get_file(obj->filp);
 
 		vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	}
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index 71d95cfbabec..c7561f29d894 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -1163,10 +1163,8 @@ ssize_t redirected_tty_write(struct file *file, const char __user *buf,
 	struct file *p = NULL;
 
 	spin_lock(&redirect_lock);
-	if (redirect) {
-		get_file(redirect);
-		p = redirect;
-	}
+	if (redirect)
+		p = get_file(redirect);
 	spin_unlock(&redirect_lock);
 
 	if (p) {
@@ -2246,8 +2244,7 @@ static int tioccons(struct file *file)
 		spin_unlock(&redirect_lock);
 		return -EBUSY;
 	}
-	get_file(file);
-	redirect = file;
+	redirect = get_file(file);
 	spin_unlock(&redirect_lock);
 	return 0;
 }
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index da8876d38a7b..dce436e595c1 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -175,8 +175,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 		return;
 	}
 
-	pipe = sbi->pipe;
-	get_file(pipe);
+	pipe = get_file(sbi->pipe);
 
 	mutex_unlock(&sbi->wq_mutex);
 
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index f4246cfc8d87..8c23fa7a91e6 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -148,8 +148,7 @@ static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
 		if (ff->reserved_req) {
 			req = ff->reserved_req;
 			ff->reserved_req = NULL;
-			get_file(file);
-			req->stolen_file = file;
+			req->stolen_file = get_file(file);
 		}
 		spin_unlock(&fc->lock);
 	} while (!req);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index cc894eda385a..48a1bad37334 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2837,8 +2837,7 @@ static int nfs4_setlease(struct nfs4_delegation *dp, int flag)
 		return -ENOMEM;
 	}
 	fp->fi_lease = fl;
-	fp->fi_deleg_file = fl->fl_file;
-	get_file(fp->fi_deleg_file);
+	fp->fi_deleg_file = get_file(fl->fl_file);
 	atomic_set(&fp->fi_delegees, 1);
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	return 0;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b55c3bb298e3..f1e8438d21b5 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1979,8 +1979,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				if (++pos <= filp->f_pos)
 					continue;
 
-				get_file(vma->vm_file);
-				info.file = vma->vm_file;
+				info.file = get_file(vma->vm_file);
 				info.len = snprintf(info.name,
 						sizeof(info.name), "%lx-%lx",
 						vma->vm_start, vma->vm_end);
diff --git a/fs/select.c b/fs/select.c
index db14c781335e..ffdd16d6e691 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -220,8 +220,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
 	struct poll_table_entry *entry = poll_get_entry(pwq);
 	if (!entry)
 		return;
-	get_file(filp);
-	entry->filp = filp;
+	entry->filp = get_file(filp);
 	entry->wait_address = wait_address;
 	entry->key = p->_key;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index aa110476a95b..de1db1c12080 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1074,7 +1074,11 @@ struct file_handle {
 	unsigned char f_handle[0];
 };
 
-#define get_file(x)	atomic_long_inc(&(x)->f_count)
+static inline struct file *get_file(struct file *f)
+{
+	atomic_long_inc(&f->f_count);
+	return f;
+}
 #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)
 #define file_count(x)	atomic_long_read(&(x)->f_count)
 
diff --git a/mm/fremap.c b/mm/fremap.c
index 9ed4fd432467..048659c0c03d 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -195,10 +195,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		 */
 		if (mapping_cap_account_dirty(mapping)) {
 			unsigned long addr;
-			struct file *file = vma->vm_file;
+			struct file *file = get_file(vma->vm_file);
 
 			flags &= MAP_NONBLOCK;
-			get_file(file);
 			addr = mmap_region(file, start, size,
 					flags, vma->vm_flags, pgoff);
 			fput(file);
diff --git a/mm/mmap.c b/mm/mmap.c
index ae18a48e7e4e..872441e81914 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1301,8 +1301,7 @@ munmap_back:
 				goto free_vma;
 			correct_wcount = 1;
 		}
-		vma->vm_file = file;
-		get_file(file);
+		vma->vm_file = get_file(file);
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
diff --git a/mm/nommu.c b/mm/nommu.c
index d4b0c10872de..dee2ff89fd58 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1282,10 +1282,8 @@ unsigned long do_mmap_pgoff(struct file *file,
 	vma->vm_pgoff = pgoff;
 
 	if (file) {
-		region->vm_file = file;
-		get_file(file);
-		vma->vm_file = file;
-		get_file(file);
+		region->vm_file = get_file(file);
+		vma->vm_file = get_file(file);
 		if (vm_flags & VM_EXECUTABLE) {
 			added_exe_file_vma(current->mm);
 			vma->vm_mm = current->mm;
diff --git a/net/compat.c b/net/compat.c
index 74ed1d7a84a2..79ae88485001 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -301,8 +301,7 @@ void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm)
 			break;
 		}
 		/* Bump the usage count and install the file. */
-		get_file(fp[i]);
-		fd_install(new_fd, fp[i]);
+		fd_install(new_fd, get_file(fp[i]));
 	}
 
 	if (i > 0) {
diff --git a/net/core/scm.c b/net/core/scm.c
index 040cebeed45b..b0098d259233 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -281,11 +281,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
 			break;
 		}
 		/* Bump the usage count and install the file. */
-		get_file(fp[i]);
 		sock = sock_from_file(fp[i], &err);
 		if (sock)
 			sock_update_netprioidx(sock->sk, current);
-		fd_install(new_fd, fp[i]);
+		fd_install(new_fd, get_file(fp[i]));
 	}
 
 	if (i > 0)
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4dfbcea10eb7..651d8456611a 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2135,8 +2135,7 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 	if (!IS_ERR(devnull)) {
 		/* replace all the matching ones with this */
 		do {
-			get_file(devnull);
-			replace_fd(n - 1, devnull, 0);
+			replace_fd(n - 1, get_file(devnull), 0);
 		} while ((n = iterate_fd(files, n, match_file, cred)) != 0);
 		fput(devnull);
 	} else {
-- 
cgit v1.2.3


From 7b540d0646ce122f0ba4520412be91e530719742 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 14:55:26 -0400
Subject: proc_map_files_readdir(): don't bother with grabbing files

all we need is their ->f_mode, so just collect _that_

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 28 +++++++++-------------------
 1 file changed, 9 insertions(+), 19 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index f1e8438d21b5..df18db61d6d8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1805,7 +1805,7 @@ out:
 }
 
 struct map_files_info {
-	struct file	*file;
+	fmode_t		mode;
 	unsigned long	len;
 	unsigned char	name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
 };
@@ -1814,13 +1814,10 @@ static struct dentry *
 proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 			   struct task_struct *task, const void *ptr)
 {
-	const struct file *file = ptr;
+	fmode_t mode = (fmode_t)(unsigned long)ptr;
 	struct proc_inode *ei;
 	struct inode *inode;
 
-	if (!file)
-		return ERR_PTR(-ENOENT);
-
 	inode = proc_pid_make_inode(dir->i_sb, task);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
@@ -1832,9 +1829,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 	inode->i_size = 64;
 	inode->i_mode = S_IFLNK;
 
-	if (file->f_mode & FMODE_READ)
+	if (mode & FMODE_READ)
 		inode->i_mode |= S_IRUSR;
-	if (file->f_mode & FMODE_WRITE)
+	if (mode & FMODE_WRITE)
 		inode->i_mode |= S_IWUSR;
 
 	d_set_d_op(dentry, &tid_map_files_dentry_operations);
@@ -1878,7 +1875,8 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
 	if (!vma)
 		goto out_no_vma;
 
-	result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+	result = proc_map_files_instantiate(dir, dentry, task,
+			(void *)(unsigned long)vma->vm_file->f_mode);
 
 out_no_vma:
 	up_read(&mm->mmap_sem);
@@ -1979,7 +1977,7 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 				if (++pos <= filp->f_pos)
 					continue;
 
-				info.file = get_file(vma->vm_file);
+				info.mode = vma->vm_file->f_mode;
 				info.len = snprintf(info.name,
 						sizeof(info.name), "%lx-%lx",
 						vma->vm_start, vma->vm_end);
@@ -1994,19 +1992,11 @@ proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
 			ret = proc_fill_cache(filp, dirent, filldir,
 					      p->name, p->len,
 					      proc_map_files_instantiate,
-					      task, p->file);
+					      task,
+					      (void *)(unsigned long)p->mode);
 			if (ret)
 				break;
 			filp->f_pos++;
-			fput(p->file);
-		}
-		for (; i < nr_files; i++) {
-			/*
-			 * In case of error don't forget
-			 * to put rest of file refs.
-			 */
-			p = flex_array_get(fa, i);
-			fput(p->file);
 		}
 		if (fa)
 			flex_array_free(fa);
-- 
cgit v1.2.3


From 2a117354b7bdfe13750a64307755e75436fb157a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 27 Aug 2012 17:55:17 -0400
Subject: switch o2hb_region_dev_write() to fget_light()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/cluster/heartbeat.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a4e855e3690e..61c28ae266f5 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1750,6 +1750,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	struct inode *inode = NULL;
 	ssize_t ret = -EINVAL;
 	int live_threshold;
+	int fput_needed;
 
 	if (reg->hr_bdev)
 		goto out;
@@ -1766,7 +1767,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	if (fd < 0 || fd >= INT_MAX)
 		goto out;
 
-	filp = fget(fd);
+	filp = fget_light(fd, &fput_needed);
 	if (filp == NULL)
 		goto out;
 
@@ -1884,7 +1885,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 
 out:
 	if (filp)
-		fput(filp);
+		fput_light(filp, fput_needed);
 	if (inode)
 		iput(inode);
 	if (ret < 0) {
-- 
cgit v1.2.3


From 2903ff019b346ab8d36ebbf54853c3aaf6590608 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 28 Aug 2012 12:52:22 -0400
Subject: switch simple cases of fget_light to fdget

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c                |  15 +--
 arch/ia64/kernel/perfmon.c                 |  15 ++-
 arch/parisc/hpux/fs.c                      |  17 ++-
 arch/powerpc/platforms/cell/spu_syscalls.c |  21 ++--
 drivers/infiniband/core/ucma.c             |  12 +-
 drivers/infiniband/core/uverbs_cmd.c       |  18 +--
 drivers/infiniband/core/uverbs_main.c      |  12 +-
 drivers/vfio/vfio.c                        |  17 ++-
 drivers/video/msm/mdp.c                    |  12 +-
 fs/btrfs/ioctl.c                           |  26 ++---
 fs/coda/inode.c                            |  14 +--
 fs/compat.c                                |  90 +++++++--------
 fs/compat_ioctl.c                          |  27 ++---
 fs/eventpoll.c                             |  25 ++--
 fs/ext4/ioctl.c                            |  14 +--
 fs/fcntl.c                                 |  32 +++---
 fs/fhandle.c                               |  17 ++-
 fs/ioctl.c                                 |  25 ++--
 fs/locks.c                                 |  20 ++--
 fs/namei.c                                 |  39 +++----
 fs/notify/fanotify/fanotify_user.c         |  28 +++--
 fs/notify/inotify/inotify_user.c           |  28 ++---
 fs/ocfs2/cluster/heartbeat.c               |  39 ++++---
 fs/open.c                                  |  64 +++++------
 fs/read_write.c                            | 176 +++++++++++++----------------
 fs/readdir.c                               |  36 +++---
 fs/select.c                                |  28 ++---
 fs/signalfd.c                              |  13 +--
 fs/splice.c                                |  69 ++++++-----
 fs/stat.c                                  |  10 +-
 fs/statfs.c                                |   9 +-
 fs/sync.c                                  |  33 +++---
 fs/timerfd.c                               |  48 ++++----
 fs/utimes.c                                |  11 +-
 fs/xattr.c                                 |  52 ++++-----
 fs/xfs/xfs_dfrag.c                         |  36 +++---
 fs/xfs/xfs_ioctl.c                         |  12 +-
 include/linux/file.h                       |   5 +-
 ipc/mqueue.c                               |  84 +++++++-------
 kernel/events/core.c                       |  70 +++++-------
 kernel/sys.c                               |  16 +--
 kernel/taskstats.c                         |  11 +-
 mm/fadvise.c                               |  35 +++---
 mm/readahead.c                             |  15 ++-
 44 files changed, 633 insertions(+), 763 deletions(-)

(limited to 'fs')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index d6c49e67d3fc..f1daf7ae42e9 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -144,28 +144,25 @@ SYSCALL_DEFINE4(osf_getdirentries, unsigned int, fd,
 		struct osf_dirent __user *, dirent, unsigned int, count,
 		long __user *, basep)
 {
-	int error, fput_needed;
-	struct file *file;
+	int error;
+	struct fd arg = fdget(fd);
 	struct osf_dirent_callback buf;
 
-	error = -EBADF;
-	file = fget_light(fd, &fput_needed);
-	if (!file)
-		goto out;
+	if (!arg.file)
+		return -EBADF;
 
 	buf.dirent = dirent;
 	buf.basep = basep;
 	buf.count = count;
 	buf.error = 0;
 
-	error = vfs_readdir(file, osf_filldir, &buf);
+	error = vfs_readdir(arg.file, osf_filldir, &buf);
 	if (error >= 0)
 		error = buf.error;
 	if (count != buf.count)
 		error = count - buf.count;
 
-	fput_light(file, fput_needed);
- out:
+	fdput(arg);
 	return error;
 }
 
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index ff5d4e4c3733..e3bd7b8aceab 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -4780,7 +4780,7 @@ recheck:
 asmlinkage long
 sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
 {
-	struct file *file = NULL;
+	struct fd f = {NULL, 0};
 	pfm_context_t *ctx = NULL;
 	unsigned long flags = 0UL;
 	void *args_k = NULL;
@@ -4789,7 +4789,6 @@ sys_perfmonctl (int fd, int cmd, void __user *arg, int count)
 	int narg, completed_args = 0, call_made = 0, cmd_flags;
 	int (*func)(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs);
 	int (*getsize)(void *arg, size_t *sz);
-	int fput_needed;
 #define PFM_MAX_ARGSIZE	4096
 
 	/*
@@ -4878,17 +4877,17 @@ restart_args:
 
 	ret = -EBADF;
 
-	file = fget_light(fd, &fput_needed);
-	if (unlikely(file == NULL)) {
+	f = fdget(fd);
+	if (unlikely(f.file == NULL)) {
 		DPRINT(("invalid fd %d\n", fd));
 		goto error_args;
 	}
-	if (unlikely(PFM_IS_FILE(file) == 0)) {
+	if (unlikely(PFM_IS_FILE(f.file) == 0)) {
 		DPRINT(("fd %d not related to perfmon\n", fd));
 		goto error_args;
 	}
 
-	ctx = file->private_data;
+	ctx = f.file->private_data;
 	if (unlikely(ctx == NULL)) {
 		DPRINT(("no context for fd %d\n", fd));
 		goto error_args;
@@ -4918,8 +4917,8 @@ abort_locked:
 	if (call_made && PFM_CMD_RW_ARG(cmd) && copy_to_user(arg, args_k, base_sz*count)) ret = -EFAULT;
 
 error_args:
-	if (file)
-		fput_light(file, fput_needed);
+	if (f.file)
+		fdput(f);
 
 	kfree(args_k);
 
diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c
index 41e01832cb21..6785de7bd2a0 100644
--- a/arch/parisc/hpux/fs.c
+++ b/arch/parisc/hpux/fs.c
@@ -109,33 +109,32 @@ Efault:
 
 int hpux_getdents(unsigned int fd, struct hpux_dirent __user *dirent, unsigned int count)
 {
-	struct file * file;
+	struct fd arg;
 	struct hpux_dirent __user * lastdirent;
 	struct getdents_callback buf;
-	int error = -EBADF, fput_needed;
+	int error;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
-		goto out;
+	arg = fdget(fd);
+	if (!arg.file)
+		return -EBADF;
 
 	buf.current_dir = dirent;
 	buf.previous = NULL;
 	buf.count = count;
 	buf.error = 0;
 
-	error = vfs_readdir(file, filldir, &buf);
+	error = vfs_readdir(arg.file, filldir, &buf);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(file->f_pos, &lastdirent->d_off))
+		if (put_user(arg.file->f_pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
 	}
 
-	fput_light(file, fput_needed);
-out:
+	fdput(arg);
 	return error;
 }
 
diff --git a/arch/powerpc/platforms/cell/spu_syscalls.c b/arch/powerpc/platforms/cell/spu_syscalls.c
index 714bbfc3162c..db4e638cf408 100644
--- a/arch/powerpc/platforms/cell/spu_syscalls.c
+++ b/arch/powerpc/platforms/cell/spu_syscalls.c
@@ -69,8 +69,6 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,
 	umode_t, mode, int, neighbor_fd)
 {
 	long ret;
-	struct file *neighbor;
-	int fput_needed;
 	struct spufs_calls *calls;
 
 	calls = spufs_calls_get();
@@ -78,11 +76,11 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,
 		return -ENOSYS;
 
 	if (flags & SPU_CREATE_AFFINITY_SPU) {
+		struct fd neighbor = fdget(neighbor_fd);
 		ret = -EBADF;
-		neighbor = fget_light(neighbor_fd, &fput_needed);
-		if (neighbor) {
-			ret = calls->create_thread(name, flags, mode, neighbor);
-			fput_light(neighbor, fput_needed);
+		if (neighbor.file) {
+			ret = calls->create_thread(name, flags, mode, neighbor.file);
+			fdput(neighbor);
 		}
 	} else
 		ret = calls->create_thread(name, flags, mode, NULL);
@@ -94,8 +92,7 @@ SYSCALL_DEFINE4(spu_create, const char __user *, name, unsigned int, flags,
 asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
 {
 	long ret;
-	struct file *filp;
-	int fput_needed;
+	struct fd arg;
 	struct spufs_calls *calls;
 
 	calls = spufs_calls_get();
@@ -103,10 +100,10 @@ asmlinkage long sys_spu_run(int fd, __u32 __user *unpc, __u32 __user *ustatus)
 		return -ENOSYS;
 
 	ret = -EBADF;
-	filp = fget_light(fd, &fput_needed);
-	if (filp) {
-		ret = calls->spu_run(filp, unpc, ustatus);
-		fput_light(filp, fput_needed);
+	arg = fdget(fd);
+	if (arg.file) {
+		ret = calls->spu_run(arg.file, unpc, ustatus);
+		fdput(arg);
 	}
 
 	spufs_calls_put(calls);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 6b2ae729de92..6f28da9f4cad 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -1184,20 +1184,20 @@ static ssize_t ucma_migrate_id(struct ucma_file *new_file,
 	struct rdma_ucm_migrate_id cmd;
 	struct rdma_ucm_migrate_resp resp;
 	struct ucma_context *ctx;
-	struct file *filp;
+	struct fd f;
 	struct ucma_file *cur_file;
-	int ret = 0, fput_needed;
+	int ret = 0;
 
 	if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
 		return -EFAULT;
 
 	/* Get current fd to protect against it being closed */
-	filp = fget_light(cmd.fd, &fput_needed);
-	if (!filp)
+	f = fdget(cmd.fd);
+	if (!f.file)
 		return -ENOENT;
 
 	/* Validate current fd and prevent destruction of id. */
-	ctx = ucma_get_ctx(filp->private_data, cmd.id);
+	ctx = ucma_get_ctx(f.file->private_data, cmd.id);
 	if (IS_ERR(ctx)) {
 		ret = PTR_ERR(ctx);
 		goto file_put;
@@ -1231,7 +1231,7 @@ response:
 
 	ucma_put_ctx(ctx);
 file_put:
-	fput_light(filp, fput_needed);
+	fdput(f);
 	return ret;
 }
 
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 402679bd30a3..0cb0007724a2 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -705,9 +705,9 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
 	struct ib_udata			udata;
 	struct ib_uxrcd_object         *obj;
 	struct ib_xrcd                 *xrcd = NULL;
-	struct file                    *f = NULL;
+	struct fd			f = {NULL, 0};
 	struct inode                   *inode = NULL;
-	int				ret = 0, fput_needed;
+	int				ret = 0;
 	int				new_xrcd = 0;
 
 	if (out_len < sizeof resp)
@@ -724,13 +724,13 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
 
 	if (cmd.fd != -1) {
 		/* search for file descriptor */
-		f = fget_light(cmd.fd, &fput_needed);
-		if (!f) {
+		f = fdget(cmd.fd);
+		if (!f.file) {
 			ret = -EBADF;
 			goto err_tree_mutex_unlock;
 		}
 
-		inode = f->f_dentry->d_inode;
+		inode = f.file->f_path.dentry->d_inode;
 		xrcd = find_xrcd(file->device, inode);
 		if (!xrcd && !(cmd.oflags & O_CREAT)) {
 			/* no file descriptor. Need CREATE flag */
@@ -795,8 +795,8 @@ ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
 		goto err_copy;
 	}
 
-	if (f)
-		fput_light(f, fput_needed);
+	if (f.file)
+		fdput(f);
 
 	mutex_lock(&file->mutex);
 	list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list);
@@ -825,8 +825,8 @@ err:
 	put_uobj_write(&obj->uobject);
 
 err_tree_mutex_unlock:
-	if (f)
-		fput_light(f, fput_needed);
+	if (f.file)
+		fdput(f);
 
 	mutex_unlock(&file->device->xrcd_tree_mutex);
 
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index acf75c2cf7ef..6f2ce6fa98f8 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -541,17 +541,15 @@ struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
 struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
 {
 	struct ib_uverbs_event_file *ev_file = NULL;
-	struct file *filp;
-	int fput_needed;
+	struct fd f = fdget(fd);
 
-	filp = fget_light(fd, &fput_needed);
-	if (!filp)
+	if (!f.file)
 		return NULL;
 
-	if (filp->f_op != &uverbs_event_fops)
+	if (f.file->f_op != &uverbs_event_fops)
 		goto out;
 
-	ev_file = filp->private_data;
+	ev_file = f.file->private_data;
 	if (ev_file->is_async) {
 		ev_file = NULL;
 		goto out;
@@ -560,7 +558,7 @@ struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
 	kref_get(&ev_file->ref);
 
 out:
-	fput_light(filp, fput_needed);
+	fdput(f);
 	return ev_file;
 }
 
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 91bcd97d3061..56097c6d072d 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1014,25 +1014,25 @@ static void vfio_group_try_dissolve_container(struct vfio_group *group)
 
 static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 {
-	struct file *filep;
+	struct fd f;
 	struct vfio_container *container;
 	struct vfio_iommu_driver *driver;
-	int ret = 0, fput_needed;
+	int ret = 0;
 
 	if (atomic_read(&group->container_users))
 		return -EINVAL;
 
-	filep = fget_light(container_fd, &fput_needed);
-	if (!filep)
+	f = fdget(container_fd);
+	if (!f.file)
 		return -EBADF;
 
 	/* Sanity check, is this really our fd? */
-	if (filep->f_op != &vfio_fops) {
-		fput_light(filep, fput_needed);
+	if (f.file->f_op != &vfio_fops) {
+		fdput(f);
 		return -EINVAL;
 	}
 
-	container = filep->private_data;
+	container = f.file->private_data;
 	WARN_ON(!container); /* fget ensures we don't race vfio_release */
 
 	mutex_lock(&container->group_lock);
@@ -1054,8 +1054,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
 
 unlock_out:
 	mutex_unlock(&container->group_lock);
-	fput_light(filep, fput_needed);
-
+	fdput(f);
 	return ret;
 }
 
diff --git a/drivers/video/msm/mdp.c b/drivers/video/msm/mdp.c
index cb2ddf164c98..07c9d8ab2c3b 100644
--- a/drivers/video/msm/mdp.c
+++ b/drivers/video/msm/mdp.c
@@ -257,19 +257,17 @@ int get_img(struct mdp_img *img, struct fb_info *info,
 	    unsigned long *start, unsigned long *len,
 	    struct file **filep)
 {
-	int put_needed, ret = 0;
-	struct file *file;
-
-	file = fget_light(img->memory_id, &put_needed);
-	if (file == NULL)
+	int ret = 0;
+	struct fd f = fdget(img->memory_id);
+	if (f.file == NULL)
 		return -1;
 
-	if (MAJOR(file->f_dentry->d_inode->i_rdev) == FB_MAJOR) {
+	if (MAJOR(f.file->f_dentry->d_inode->i_rdev) == FB_MAJOR) {
 		*start = info->fix.smem_start;
 		*len = info->fix.smem_len;
 	} else
 		ret = -1;
-	fput_light(file, put_needed);
+	fdput(f);
 
 	return ret;
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 3494f2f44167..0a4f0c8bc58f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1397,7 +1397,6 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 				u64 *transid, bool readonly,
 				struct btrfs_qgroup_inherit **inherit)
 {
-	struct file *src_file;
 	int namelen;
 	int ret = 0;
 
@@ -1421,15 +1420,14 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 		ret = btrfs_mksubvol(&file->f_path, name, namelen,
 				     NULL, transid, readonly, inherit);
 	} else {
+		struct fd src = fdget(fd);
 		struct inode *src_inode;
-		int fput_needed;
-		src_file = fget_light(fd, &fput_needed);
-		if (!src_file) {
+		if (!src.file) {
 			ret = -EINVAL;
 			goto out_drop_write;
 		}
 
-		src_inode = src_file->f_path.dentry->d_inode;
+		src_inode = src.file->f_path.dentry->d_inode;
 		if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) {
 			printk(KERN_INFO "btrfs: Snapshot src from "
 			       "another FS\n");
@@ -1439,7 +1437,7 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
 					     BTRFS_I(src_inode)->root,
 					     transid, readonly, inherit);
 		}
-		fput_light(src_file, fput_needed);
+		fdput(src);
 	}
 out_drop_write:
 	mnt_drop_write_file(file);
@@ -2341,7 +2339,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 {
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct file *src_file;
+	struct fd src_file;
 	struct inode *src;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
@@ -2350,7 +2348,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	struct btrfs_key key;
 	u32 nritems;
 	int slot;
-	int ret, fput_needed;
+	int ret;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
 	u64 hint_byte;
@@ -2376,24 +2374,24 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	if (ret)
 		return ret;
 
-	src_file = fget_light(srcfd, &fput_needed);
-	if (!src_file) {
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
 		ret = -EBADF;
 		goto out_drop_write;
 	}
 
 	ret = -EXDEV;
-	if (src_file->f_path.mnt != file->f_path.mnt)
+	if (src_file.file->f_path.mnt != file->f_path.mnt)
 		goto out_fput;
 
-	src = src_file->f_dentry->d_inode;
+	src = src_file.file->f_dentry->d_inode;
 
 	ret = -EINVAL;
 	if (src == inode)
 		goto out_fput;
 
 	/* the src must be open for reading */
-	if (!(src_file->f_mode & FMODE_READ))
+	if (!(src_file.file->f_mode & FMODE_READ))
 		goto out_fput;
 
 	/* don't make the dst file partly checksummed */
@@ -2724,7 +2722,7 @@ out_unlock:
 	vfree(buf);
 	btrfs_free_path(path);
 out_fput:
-	fput_light(src_file, fput_needed);
+	fdput(src_file);
 out_drop_write:
 	mnt_drop_write_file(file);
 	return ret;
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index bd2313d106e5..d315c6c5891a 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -107,9 +107,9 @@ static const struct super_operations coda_super_operations =
 
 static int get_device_index(struct coda_mount_data *data)
 {
-	struct file *file;
+	struct fd f;
 	struct inode *inode;
-	int idx, fput_needed;
+	int idx;
 
 	if (data == NULL) {
 		printk("coda_read_super: Bad mount data\n");
@@ -121,17 +121,17 @@ static int get_device_index(struct coda_mount_data *data)
 		return -1;
 	}
 
-	file = fget_light(data->fd, &fput_needed);
-	if (!file)
+	f = fdget(data->fd);
+	if (!f.file)
 		goto Ebadf;
-	inode = file->f_path.dentry->d_inode;
+	inode = f.file->f_path.dentry->d_inode;
 	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
-		fput_light(file, fput_needed);
+		fdput(f);
 		goto Ebadf;
 	}
 
 	idx = iminor(inode);
-	fput_light(file, fput_needed);
+	fdput(f);
 
 	if (idx < 0 || idx >= MAX_CODADEVS) {
 		printk("coda_read_super: Bad minor number\n");
diff --git a/fs/compat.c b/fs/compat.c
index 1bdb350ea5d3..d72d51e19025 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -870,22 +870,20 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
 	struct compat_old_linux_dirent __user *dirent, unsigned int count)
 {
 	int error;
-	struct file *file;
-	int fput_needed;
+	struct fd f = fdget(fd);
 	struct compat_readdir_callback buf;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		return -EBADF;
 
 	buf.result = 0;
 	buf.dirent = dirent;
 
-	error = vfs_readdir(file, compat_fillonedir, &buf);
+	error = vfs_readdir(f.file, compat_fillonedir, &buf);
 	if (buf.result)
 		error = buf.result;
 
-	fput_light(file, fput_needed);
+	fdput(f);
 	return error;
 }
 
@@ -949,17 +947,16 @@ efault:
 asmlinkage long compat_sys_getdents(unsigned int fd,
 		struct compat_linux_dirent __user *dirent, unsigned int count)
 {
-	struct file * file;
+	struct fd f;
 	struct compat_linux_dirent __user * lastdirent;
 	struct compat_getdents_callback buf;
-	int fput_needed;
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return -EBADF;
 
 	buf.current_dir = dirent;
@@ -967,17 +964,17 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
 	buf.count = count;
 	buf.error = 0;
 
-	error = vfs_readdir(file, compat_filldir, &buf);
+	error = vfs_readdir(f.file, compat_filldir, &buf);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(file->f_pos, &lastdirent->d_off))
+		if (put_user(f.file->f_pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
 	}
-	fput_light(file, fput_needed);
+	fdput(f);
 	return error;
 }
 
@@ -1035,17 +1032,16 @@ efault:
 asmlinkage long compat_sys_getdents64(unsigned int fd,
 		struct linux_dirent64 __user * dirent, unsigned int count)
 {
-	struct file * file;
+	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
 	struct compat_getdents_callback64 buf;
-	int fput_needed;
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return -EBADF;
 
 	buf.current_dir = dirent;
@@ -1053,18 +1049,18 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
 	buf.count = count;
 	buf.error = 0;
 
-	error = vfs_readdir(file, compat_filldir64, &buf);
+	error = vfs_readdir(f.file, compat_filldir64, &buf);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = file->f_pos;
+		typeof(lastdirent->d_off) d_off = f.file->f_pos;
 		if (__put_user_unaligned(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
 	}
-	fput_light(file, fput_needed);
+	fdput(f);
 	return error;
 }
 #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
@@ -1152,18 +1148,16 @@ asmlinkage ssize_t
 compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec,
 		 unsigned long vlen)
 {
-	struct file *file;
-	int fput_needed;
+	struct fd f = fdget(fd);
 	ssize_t ret;
 	loff_t pos;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		return -EBADF;
-	pos = file->f_pos;
-	ret = compat_readv(file, vec, vlen, &pos);
-	file->f_pos = pos;
-	fput_light(file, fput_needed);
+	pos = f.file->f_pos;
+	ret = compat_readv(f.file, vec, vlen, &pos);
+	f.file->f_pos = pos;
+	fdput(f);
 	return ret;
 }
 
@@ -1171,19 +1165,18 @@ asmlinkage ssize_t
 compat_sys_preadv64(unsigned long fd, const struct compat_iovec __user *vec,
 		    unsigned long vlen, loff_t pos)
 {
-	struct file *file;
-	int fput_needed;
+	struct fd f;
 	ssize_t ret;
 
 	if (pos < 0)
 		return -EINVAL;
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return -EBADF;
 	ret = -ESPIPE;
-	if (file->f_mode & FMODE_PREAD)
-		ret = compat_readv(file, vec, vlen, &pos);
-	fput_light(file, fput_needed);
+	if (f.file->f_mode & FMODE_PREAD)
+		ret = compat_readv(f.file, vec, vlen, &pos);
+	fdput(f);
 	return ret;
 }
 
@@ -1221,18 +1214,16 @@ asmlinkage ssize_t
 compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec,
 		  unsigned long vlen)
 {
-	struct file *file;
-	int fput_needed;
+	struct fd f = fdget(fd);
 	ssize_t ret;
 	loff_t pos;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		return -EBADF;
-	pos = file->f_pos;
-	ret = compat_writev(file, vec, vlen, &pos);
-	file->f_pos = pos;
-	fput_light(file, fput_needed);
+	pos = f.file->f_pos;
+	ret = compat_writev(f.file, vec, vlen, &pos);
+	f.file->f_pos = pos;
+	fdput(f);
 	return ret;
 }
 
@@ -1240,19 +1231,18 @@ asmlinkage ssize_t
 compat_sys_pwritev64(unsigned long fd, const struct compat_iovec __user *vec,
 		     unsigned long vlen, loff_t pos)
 {
-	struct file *file;
-	int fput_needed;
+	struct fd f;
 	ssize_t ret;
 
 	if (pos < 0)
 		return -EINVAL;
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return -EBADF;
 	ret = -ESPIPE;
-	if (file->f_mode & FMODE_PWRITE)
-		ret = compat_writev(file, vec, vlen, &pos);
-	fput_light(file, fput_needed);
+	if (f.file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(f.file, vec, vlen, &pos);
+	fdput(f);
 	return ret;
 }
 
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index debdfe0fc809..48f1987bec34 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1531,16 +1531,13 @@ static int compat_ioctl_check_table(unsigned int xcmd)
 asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 				unsigned long arg)
 {
-	struct file *filp;
+	struct fd f = fdget(fd);
 	int error = -EBADF;
-	int fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (!filp)
+	if (!f.file)
 		goto out;
 
 	/* RED-PEN how should LSM module know it's handling 32bit? */
-	error = security_file_ioctl(filp, cmd, arg);
+	error = security_file_ioctl(f.file, cmd, arg);
 	if (error)
 		goto out_fput;
 
@@ -1560,30 +1557,30 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 #if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
 	case FS_IOC_RESVSP_32:
 	case FS_IOC_RESVSP64_32:
-		error = compat_ioctl_preallocate(filp, compat_ptr(arg));
+		error = compat_ioctl_preallocate(f.file, compat_ptr(arg));
 		goto out_fput;
 #else
 	case FS_IOC_RESVSP:
 	case FS_IOC_RESVSP64:
-		error = ioctl_preallocate(filp, compat_ptr(arg));
+		error = ioctl_preallocate(f.file, compat_ptr(arg));
 		goto out_fput;
 #endif
 
 	case FIBMAP:
 	case FIGETBSZ:
 	case FIONREAD:
-		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
+		if (S_ISREG(f.file->f_path.dentry->d_inode->i_mode))
 			break;
 		/*FALL THROUGH*/
 
 	default:
-		if (filp->f_op && filp->f_op->compat_ioctl) {
-			error = filp->f_op->compat_ioctl(filp, cmd, arg);
+		if (f.file->f_op && f.file->f_op->compat_ioctl) {
+			error = f.file->f_op->compat_ioctl(f.file, cmd, arg);
 			if (error != -ENOIOCTLCMD)
 				goto out_fput;
 		}
 
-		if (!filp->f_op || !filp->f_op->unlocked_ioctl)
+		if (!f.file->f_op || !f.file->f_op->unlocked_ioctl)
 			goto do_ioctl;
 		break;
 	}
@@ -1591,7 +1588,7 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
 	if (compat_ioctl_check_table(XFORM(cmd)))
 		goto found_handler;
 
-	error = do_ioctl_trans(fd, cmd, arg, filp);
+	error = do_ioctl_trans(fd, cmd, arg, f.file);
 	if (error == -ENOIOCTLCMD)
 		error = -ENOTTY;
 
@@ -1600,9 +1597,9 @@ asmlinkage long compat_sys_ioctl(unsigned int fd, unsigned int cmd,
  found_handler:
 	arg = (unsigned long)compat_ptr(arg);
  do_ioctl:
-	error = do_vfs_ioctl(filp, fd, cmd, arg);
+	error = do_vfs_ioctl(f.file, fd, cmd, arg);
  out_fput:
-	fput_light(filp, fput_needed);
+	fdput(f);
  out:
 	return error;
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 567ae72e155d..cd96649bfe62 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1809,8 +1809,8 @@ error_return:
 SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 		int, maxevents, int, timeout)
 {
-	int error, fput_needed;
-	struct file *file;
+	int error;
+	struct fd f;
 	struct eventpoll *ep;
 
 	/* The maximum number of event must be greater than zero */
@@ -1818,38 +1818,33 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 		return -EINVAL;
 
 	/* Verify that the area passed by the user is writeable */
-	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
-		error = -EFAULT;
-		goto error_return;
-	}
+	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event)))
+		return -EFAULT;
 
 	/* Get the "struct file *" for the eventpoll file */
-	error = -EBADF;
-	file = fget_light(epfd, &fput_needed);
-	if (!file)
-		goto error_return;
+	f = fdget(epfd);
+	if (!f.file)
+		return -EBADF;
 
 	/*
 	 * We have to check that the file structure underneath the fd
 	 * the user passed to us _is_ an eventpoll file.
 	 */
 	error = -EINVAL;
-	if (!is_file_epoll(file))
+	if (!is_file_epoll(f.file))
 		goto error_fput;
 
 	/*
 	 * At this point it is safe to assume that the "private_data" contains
 	 * our own data structure.
 	 */
-	ep = file->private_data;
+	ep = f.file->private_data;
 
 	/* Time to fish for events ... */
 	error = ep_poll(ep, events, maxevents, timeout);
 
 error_fput:
-	fput_light(file, fput_needed);
-error_return:
-
+	fdput(f);
 	return error;
 }
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 39646f224514..5439d6a56e99 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -233,8 +233,8 @@ group_extend_out:
 
 	case EXT4_IOC_MOVE_EXT: {
 		struct move_extent me;
-		struct file *donor_filp;
-		int err, fput_needed;
+		struct fd donor;
+		int err;
 
 		if (!(filp->f_mode & FMODE_READ) ||
 		    !(filp->f_mode & FMODE_WRITE))
@@ -245,11 +245,11 @@ group_extend_out:
 			return -EFAULT;
 		me.moved_len = 0;
 
-		donor_filp = fget_light(me.donor_fd, &fput_needed);
-		if (!donor_filp)
+		donor = fdget(me.donor_fd);
+		if (!donor.file)
 			return -EBADF;
 
-		if (!(donor_filp->f_mode & FMODE_WRITE)) {
+		if (!(donor.file->f_mode & FMODE_WRITE)) {
 			err = -EBADF;
 			goto mext_out;
 		}
@@ -266,7 +266,7 @@ group_extend_out:
 		if (err)
 			goto mext_out;
 
-		err = ext4_move_extents(filp, donor_filp, me.orig_start,
+		err = ext4_move_extents(filp, donor.file, me.orig_start,
 					me.donor_start, me.len, &me.moved_len);
 		mnt_drop_write_file(filp);
 
@@ -274,7 +274,7 @@ group_extend_out:
 				 &me, sizeof(me)))
 			err = -EFAULT;
 mext_out:
-		fput_light(donor_filp, fput_needed);
+		fdput(donor);
 		return err;
 	}
 
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 40a5bfb72cca..91af39a33af7 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -348,25 +348,23 @@ static int check_fcntl_cmd(unsigned cmd)
 
 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {	
-	struct file *filp;
-	int fput_needed;
+	struct fd f = fdget_raw(fd);
 	long err = -EBADF;
 
-	filp = fget_raw_light(fd, &fput_needed);
-	if (!filp)
+	if (!f.file)
 		goto out;
 
-	if (unlikely(filp->f_mode & FMODE_PATH)) {
+	if (unlikely(f.file->f_mode & FMODE_PATH)) {
 		if (!check_fcntl_cmd(cmd))
 			goto out1;
 	}
 
-	err = security_file_fcntl(filp, cmd, arg);
+	err = security_file_fcntl(f.file, cmd, arg);
 	if (!err)
-		err = do_fcntl(fd, cmd, arg, filp);
+		err = do_fcntl(fd, cmd, arg, f.file);
 
 out1:
- 	fput_light(filp, fput_needed);
+ 	fdput(f);
 out:
 	return err;
 }
@@ -375,38 +373,36 @@ out:
 SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 		unsigned long, arg)
 {	
-	struct file * filp;
+	struct fd f = fdget_raw(fd);
 	long err = -EBADF;
-	int fput_needed;
 
-	filp = fget_raw_light(fd, &fput_needed);
-	if (!filp)
+	if (!f.file)
 		goto out;
 
-	if (unlikely(filp->f_mode & FMODE_PATH)) {
+	if (unlikely(f.file->f_mode & FMODE_PATH)) {
 		if (!check_fcntl_cmd(cmd))
 			goto out1;
 	}
 
-	err = security_file_fcntl(filp, cmd, arg);
+	err = security_file_fcntl(f.file, cmd, arg);
 	if (err)
 		goto out1;
 	
 	switch (cmd) {
 		case F_GETLK64:
-			err = fcntl_getlk64(filp, (struct flock64 __user *) arg);
+			err = fcntl_getlk64(f.file, (struct flock64 __user *) arg);
 			break;
 		case F_SETLK64:
 		case F_SETLKW64:
-			err = fcntl_setlk64(fd, filp, cmd,
+			err = fcntl_setlk64(fd, f.file, cmd,
 					(struct flock64 __user *) arg);
 			break;
 		default:
-			err = do_fcntl(fd, cmd, arg, filp);
+			err = do_fcntl(fd, cmd, arg, f.file);
 			break;
 	}
 out1:
-	fput_light(filp, fput_needed);
+	fdput(f);
 out:
 	return err;
 }
diff --git a/fs/fhandle.c b/fs/fhandle.c
index a48e4a139be1..f775bfdd6e4a 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -113,24 +113,21 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 
 static struct vfsmount *get_vfsmount_from_fd(int fd)
 {
-	struct path path;
+	struct vfsmount *mnt;
 
 	if (fd == AT_FDCWD) {
 		struct fs_struct *fs = current->fs;
 		spin_lock(&fs->lock);
-		path = fs->pwd;
-		mntget(path.mnt);
+		mnt = mntget(fs->pwd.mnt);
 		spin_unlock(&fs->lock);
 	} else {
-		int fput_needed;
-		struct file *file = fget_light(fd, &fput_needed);
-		if (!file)
+		struct fd f = fdget(fd);
+		if (!f.file)
 			return ERR_PTR(-EBADF);
-		path = file->f_path;
-		mntget(path.mnt);
-		fput_light(file, fput_needed);
+		mnt = mntget(f.file->f_path.mnt);
+		fdput(f);
 	}
-	return path.mnt;
+	return mnt;
 }
 
 static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 29167bebe874..3bdad6d1f268 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -603,21 +603,14 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 
 SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {
-	struct file *filp;
-	int error = -EBADF;
-	int fput_needed;
-
-	filp = fget_light(fd, &fput_needed);
-	if (!filp)
-		goto out;
-
-	error = security_file_ioctl(filp, cmd, arg);
-	if (error)
-		goto out_fput;
-
-	error = do_vfs_ioctl(filp, fd, cmd, arg);
- out_fput:
-	fput_light(filp, fput_needed);
- out:
+	int error;
+	struct fd f = fdget(fd);
+
+	if (!f.file)
+		return -EBADF;
+	error = security_file_ioctl(f.file, cmd, arg);
+	if (!error)
+		error = do_vfs_ioctl(f.file, fd, cmd, arg);
+	fdput(f);
 	return error;
 }
diff --git a/fs/locks.c b/fs/locks.c
index 7e81bfc75164..abc7dc6c490b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1625,15 +1625,13 @@ EXPORT_SYMBOL(flock_lock_file_wait);
  */
 SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 {
-	struct file *filp;
-	int fput_needed;
+	struct fd f = fdget(fd);
 	struct file_lock *lock;
 	int can_sleep, unlock;
 	int error;
 
 	error = -EBADF;
-	filp = fget_light(fd, &fput_needed);
-	if (!filp)
+	if (!f.file)
 		goto out;
 
 	can_sleep = !(cmd & LOCK_NB);
@@ -1641,31 +1639,31 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 	unlock = (cmd == LOCK_UN);
 
 	if (!unlock && !(cmd & LOCK_MAND) &&
-	    !(filp->f_mode & (FMODE_READ|FMODE_WRITE)))
+	    !(f.file->f_mode & (FMODE_READ|FMODE_WRITE)))
 		goto out_putf;
 
-	error = flock_make_lock(filp, &lock, cmd);
+	error = flock_make_lock(f.file, &lock, cmd);
 	if (error)
 		goto out_putf;
 	if (can_sleep)
 		lock->fl_flags |= FL_SLEEP;
 
-	error = security_file_lock(filp, lock->fl_type);
+	error = security_file_lock(f.file, lock->fl_type);
 	if (error)
 		goto out_free;
 
-	if (filp->f_op && filp->f_op->flock)
-		error = filp->f_op->flock(filp,
+	if (f.file->f_op && f.file->f_op->flock)
+		error = f.file->f_op->flock(f.file,
 					  (can_sleep) ? F_SETLKW : F_SETLK,
 					  lock);
 	else
-		error = flock_lock_file_wait(filp, lock);
+		error = flock_lock_file_wait(f.file, lock);
 
  out_free:
 	locks_free_lock(lock);
 
  out_putf:
-	fput_light(filp, fput_needed);
+	fdput(f);
  out:
 	return error;
 }
diff --git a/fs/namei.c b/fs/namei.c
index c5b85b3523c0..e1c7072c7afa 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1797,8 +1797,6 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 		     struct nameidata *nd, struct file **fp)
 {
 	int retval = 0;
-	int fput_needed;
-	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED;
@@ -1850,44 +1848,41 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 			get_fs_pwd(current->fs, &nd->path);
 		}
 	} else {
+		struct fd f = fdget_raw(dfd);
 		struct dentry *dentry;
 
-		file = fget_raw_light(dfd, &fput_needed);
-		retval = -EBADF;
-		if (!file)
-			goto out_fail;
+		if (!f.file)
+			return -EBADF;
 
-		dentry = file->f_path.dentry;
+		dentry = f.file->f_path.dentry;
 
 		if (*name) {
-			retval = -ENOTDIR;
-			if (!S_ISDIR(dentry->d_inode->i_mode))
-				goto fput_fail;
+			if (!S_ISDIR(dentry->d_inode->i_mode)) {
+				fdput(f);
+				return -ENOTDIR;
+			}
 
 			retval = inode_permission(dentry->d_inode, MAY_EXEC);
-			if (retval)
-				goto fput_fail;
+			if (retval) {
+				fdput(f);
+				return retval;
+			}
 		}
 
-		nd->path = file->f_path;
+		nd->path = f.file->f_path;
 		if (flags & LOOKUP_RCU) {
-			if (fput_needed)
-				*fp = file;
+			if (f.need_put)
+				*fp = f.file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			lock_rcu_walk();
 		} else {
-			path_get(&file->f_path);
-			fput_light(file, fput_needed);
+			path_get(&nd->path);
+			fdput(f);
 		}
 	}
 
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;
-
-fput_fail:
-	fput_light(file, fput_needed);
-out_fail:
-	return retval;
 }
 
 static inline int lookup_last(struct nameidata *nd, struct path *path)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index ea48693940f1..721d692fa8d4 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -451,24 +451,22 @@ static int fanotify_find_path(int dfd, const char __user *filename,
 		 dfd, filename, flags);
 
 	if (filename == NULL) {
-		struct file *file;
-		int fput_needed;
+		struct fd f = fdget(dfd);
 
 		ret = -EBADF;
-		file = fget_light(dfd, &fput_needed);
-		if (!file)
+		if (!f.file)
 			goto out;
 
 		ret = -ENOTDIR;
 		if ((flags & FAN_MARK_ONLYDIR) &&
-		    !(S_ISDIR(file->f_path.dentry->d_inode->i_mode))) {
-			fput_light(file, fput_needed);
+		    !(S_ISDIR(f.file->f_path.dentry->d_inode->i_mode))) {
+			fdput(f);
 			goto out;
 		}
 
-		*path = file->f_path;
+		*path = f.file->f_path;
 		path_get(path);
-		fput_light(file, fput_needed);
+		fdput(f);
 	} else {
 		unsigned int lookup_flags = 0;
 
@@ -748,9 +746,9 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 	struct inode *inode = NULL;
 	struct vfsmount *mnt = NULL;
 	struct fsnotify_group *group;
-	struct file *filp;
+	struct fd f;
 	struct path path;
-	int ret, fput_needed;
+	int ret;
 
 	pr_debug("%s: fanotify_fd=%d flags=%x dfd=%d pathname=%p mask=%llx\n",
 		 __func__, fanotify_fd, flags, dfd, pathname, mask);
@@ -784,15 +782,15 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 #endif
 		return -EINVAL;
 
-	filp = fget_light(fanotify_fd, &fput_needed);
-	if (unlikely(!filp))
+	f = fdget(fanotify_fd);
+	if (unlikely(!f.file))
 		return -EBADF;
 
 	/* verify that this is indeed an fanotify instance */
 	ret = -EINVAL;
-	if (unlikely(filp->f_op != &fanotify_fops))
+	if (unlikely(f.file->f_op != &fanotify_fops))
 		goto fput_and_out;
-	group = filp->private_data;
+	group = f.file->private_data;
 
 	/*
 	 * group->priority == FS_PRIO_0 == FAN_CLASS_NOTIF.  These are not
@@ -839,7 +837,7 @@ SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags,
 
 	path_put(&path);
 fput_and_out:
-	fput_light(filp, fput_needed);
+	fdput(f);
 	return ret;
 }
 
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 8445fbc8985c..c311dda054a3 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -757,16 +757,16 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	struct fsnotify_group *group;
 	struct inode *inode;
 	struct path path;
-	struct file *filp;
-	int ret, fput_needed;
+	struct fd f;
+	int ret;
 	unsigned flags = 0;
 
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
+	f = fdget(fd);
+	if (unlikely(!f.file))
 		return -EBADF;
 
 	/* verify that this is indeed an inotify instance */
-	if (unlikely(filp->f_op != &inotify_fops)) {
+	if (unlikely(f.file->f_op != &inotify_fops)) {
 		ret = -EINVAL;
 		goto fput_and_out;
 	}
@@ -782,13 +782,13 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 
 	/* inode held in place by reference to path; group by fget on fd */
 	inode = path.dentry->d_inode;
-	group = filp->private_data;
+	group = f.file->private_data;
 
 	/* create/update an inode mark */
 	ret = inotify_update_watch(group, inode, mask);
 	path_put(&path);
 fput_and_out:
-	fput_light(filp, fput_needed);
+	fdput(f);
 	return ret;
 }
 
@@ -796,19 +796,19 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
 	struct fsnotify_group *group;
 	struct inotify_inode_mark *i_mark;
-	struct file *filp;
-	int ret = 0, fput_needed;
+	struct fd f;
+	int ret = 0;
 
-	filp = fget_light(fd, &fput_needed);
-	if (unlikely(!filp))
+	f = fdget(fd);
+	if (unlikely(!f.file))
 		return -EBADF;
 
 	/* verify that this is indeed an inotify instance */
 	ret = -EINVAL;
-	if (unlikely(filp->f_op != &inotify_fops))
+	if (unlikely(f.file->f_op != &inotify_fops))
 		goto out;
 
-	group = filp->private_data;
+	group = f.file->private_data;
 
 	ret = -EINVAL;
 	i_mark = inotify_idr_find(group, wd);
@@ -823,7 +823,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 	fsnotify_put_mark(&i_mark->fsn_mark);
 
 out:
-	fput_light(filp, fput_needed);
+	fdput(f);
 	return ret;
 }
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 61c28ae266f5..f7c648d7d6bf 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1746,11 +1746,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	long fd;
 	int sectsize;
 	char *p = (char *)page;
-	struct file *filp = NULL;
-	struct inode *inode = NULL;
+	struct fd f;
+	struct inode *inode;
 	ssize_t ret = -EINVAL;
 	int live_threshold;
-	int fput_needed;
 
 	if (reg->hr_bdev)
 		goto out;
@@ -1767,26 +1766,26 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	if (fd < 0 || fd >= INT_MAX)
 		goto out;
 
-	filp = fget_light(fd, &fput_needed);
-	if (filp == NULL)
+	f = fdget(fd);
+	if (f.file == NULL)
 		goto out;
 
 	if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
 	    reg->hr_block_bytes == 0)
-		goto out;
+		goto out2;
 
-	inode = igrab(filp->f_mapping->host);
+	inode = igrab(f.file->f_mapping->host);
 	if (inode == NULL)
-		goto out;
+		goto out2;
 
 	if (!S_ISBLK(inode->i_mode))
-		goto out;
+		goto out3;
 
-	reg->hr_bdev = I_BDEV(filp->f_mapping->host);
+	reg->hr_bdev = I_BDEV(f.file->f_mapping->host);
 	ret = blkdev_get(reg->hr_bdev, FMODE_WRITE | FMODE_READ, NULL);
 	if (ret) {
 		reg->hr_bdev = NULL;
-		goto out;
+		goto out3;
 	}
 	inode = NULL;
 
@@ -1798,7 +1797,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		     "blocksize %u incorrect for device, expected %d",
 		     reg->hr_block_bytes, sectsize);
 		ret = -EINVAL;
-		goto out;
+		goto out3;
 	}
 
 	o2hb_init_region_params(reg);
@@ -1812,13 +1811,13 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	ret = o2hb_map_slot_data(reg);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out3;
 	}
 
 	ret = o2hb_populate_slot_data(reg);
 	if (ret) {
 		mlog_errno(ret);
-		goto out;
+		goto out3;
 	}
 
 	INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
@@ -1848,7 +1847,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 	if (IS_ERR(hb_task)) {
 		ret = PTR_ERR(hb_task);
 		mlog_errno(ret);
-		goto out;
+		goto out3;
 	}
 
 	spin_lock(&o2hb_live_lock);
@@ -1864,7 +1863,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 
 	if (reg->hr_aborted_start) {
 		ret = -EIO;
-		goto out;
+		goto out3;
 	}
 
 	/* Ok, we were woken.  Make sure it wasn't by drop_item() */
@@ -1883,11 +1882,11 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 		printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
 		       config_item_name(&reg->hr_item), reg->hr_dev_name);
 
+out3:
+	iput(inode);
+out2:
+	fdput(f);
 out:
-	if (filp)
-		fput_light(filp, fput_needed);
-	if (inode)
-		iput(inode);
 	if (ret < 0) {
 		if (reg->hr_bdev) {
 			blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE);
diff --git a/fs/open.c b/fs/open.c
index 3c741eae6b99..85603262d8db 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -134,25 +134,25 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 {
 	struct inode *inode;
 	struct dentry *dentry;
-	struct file *file;
-	int error, fput_needed;
+	struct fd f;
+	int error;
 
 	error = -EINVAL;
 	if (length < 0)
 		goto out;
 	error = -EBADF;
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		goto out;
 
 	/* explicitly opened as large or we are on 64-bit box */
-	if (file->f_flags & O_LARGEFILE)
+	if (f.file->f_flags & O_LARGEFILE)
 		small = 0;
 
-	dentry = file->f_path.dentry;
+	dentry = f.file->f_path.dentry;
 	inode = dentry->d_inode;
 	error = -EINVAL;
-	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
+	if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
 		goto out_putf;
 
 	error = -EINVAL;
@@ -165,14 +165,14 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 		goto out_putf;
 
 	sb_start_write(inode->i_sb);
-	error = locks_verify_truncate(inode, file, length);
+	error = locks_verify_truncate(inode, f.file, length);
 	if (!error)
-		error = security_path_truncate(&file->f_path);
+		error = security_path_truncate(&f.file->f_path);
 	if (!error)
-		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
+		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
 	sb_end_write(inode->i_sb);
 out_putf:
-	fput_light(file, fput_needed);
+	fdput(f);
 out:
 	return error;
 }
@@ -276,15 +276,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
 {
-	struct file *file;
-	int error = -EBADF, fput_needed;
+	struct fd f = fdget(fd);
+	int error = -EBADF;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		error = do_fallocate(file, mode, offset, len);
-		fput_light(file, fput_needed);
+	if (f.file) {
+		error = do_fallocate(f.file, mode, offset, len);
+		fdput(f);
 	}
-
 	return error;
 }
 
@@ -400,16 +398,15 @@ out:
 
 SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 {
-	struct file *file;
+	struct fd f = fdget_raw(fd);
 	struct inode *inode;
-	int error, fput_needed;
+	int error = -EBADF;
 
 	error = -EBADF;
-	file = fget_raw_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		goto out;
 
-	inode = file->f_path.dentry->d_inode;
+	inode = f.file->f_path.dentry->d_inode;
 
 	error = -ENOTDIR;
 	if (!S_ISDIR(inode->i_mode))
@@ -417,9 +414,9 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
 
 	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 	if (!error)
-		set_fs_pwd(current->fs, &file->f_path);
+		set_fs_pwd(current->fs, &f.file->f_path);
 out_putf:
-	fput_light(file, fput_needed);
+	fdput(f);
 out:
 	return error;
 }
@@ -582,21 +579,20 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
 
 SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 {
-	struct file *file;
-	int error = -EBADF, fput_needed;
+	struct fd f = fdget(fd);
+	int error = -EBADF;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		goto out;
 
-	error = mnt_want_write_file(file);
+	error = mnt_want_write_file(f.file);
 	if (error)
 		goto out_fput;
-	audit_inode(NULL, file->f_path.dentry);
-	error = chown_common(&file->f_path, user, group);
-	mnt_drop_write_file(file);
+	audit_inode(NULL, f.file->f_path.dentry);
+	error = chown_common(&f.file->f_path, user, group);
+	mnt_drop_write_file(f.file);
 out_fput:
-	fput_light(file, fput_needed);
+	fdput(f);
 out:
 	return error;
 }
diff --git a/fs/read_write.c b/fs/read_write.c
index 1adfb691e4f1..28b38279a219 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -232,23 +232,18 @@ EXPORT_SYMBOL(vfs_llseek);
 SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, origin)
 {
 	off_t retval;
-	struct file * file;
-	int fput_needed;
-
-	retval = -EBADF;
-	file = fget_light(fd, &fput_needed);
-	if (!file)
-		goto bad;
+	struct fd f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
 
 	retval = -EINVAL;
 	if (origin <= SEEK_MAX) {
-		loff_t res = vfs_llseek(file, offset, origin);
+		loff_t res = vfs_llseek(f.file, offset, origin);
 		retval = res;
 		if (res != (loff_t)retval)
 			retval = -EOVERFLOW;	/* LFS: should only happen on 32 bit platforms */
 	}
-	fput_light(file, fput_needed);
-bad:
+	fdput(f);
 	return retval;
 }
 
@@ -258,20 +253,17 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 		unsigned int, origin)
 {
 	int retval;
-	struct file * file;
+	struct fd f = fdget(fd);
 	loff_t offset;
-	int fput_needed;
 
-	retval = -EBADF;
-	file = fget_light(fd, &fput_needed);
-	if (!file)
-		goto bad;
+	if (!f.file)
+		return -EBADF;
 
 	retval = -EINVAL;
 	if (origin > SEEK_MAX)
 		goto out_putf;
 
-	offset = vfs_llseek(file, ((loff_t) offset_high << 32) | offset_low,
+	offset = vfs_llseek(f.file, ((loff_t) offset_high << 32) | offset_low,
 			origin);
 
 	retval = (int)offset;
@@ -281,8 +273,7 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
 			retval = 0;
 	}
 out_putf:
-	fput_light(file, fput_needed);
-bad:
+	fdput(f);
 	return retval;
 }
 #endif
@@ -461,34 +452,29 @@ static inline void file_pos_write(struct file *file, loff_t pos)
 
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
-	struct file *file;
+	struct fd f = fdget(fd);
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		loff_t pos = file_pos_read(file);
-		ret = vfs_read(file, buf, count, &pos);
-		file_pos_write(file, pos);
-		fput_light(file, fput_needed);
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_read(f.file, buf, count, &pos);
+		file_pos_write(f.file, pos);
+		fdput(f);
 	}
-
 	return ret;
 }
 
 SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 		size_t, count)
 {
-	struct file *file;
+	struct fd f = fdget(fd);
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		loff_t pos = file_pos_read(file);
-		ret = vfs_write(file, buf, count, &pos);
-		file_pos_write(file, pos);
-		fput_light(file, fput_needed);
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_write(f.file, buf, count, &pos);
+		file_pos_write(f.file, pos);
+		fdput(f);
 	}
 
 	return ret;
@@ -497,19 +483,18 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
 SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf,
 			size_t count, loff_t pos)
 {
-	struct file *file;
+	struct fd f;
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
 	if (pos < 0)
 		return -EINVAL;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
+	f = fdget(fd);
+	if (f.file) {
 		ret = -ESPIPE;
-		if (file->f_mode & FMODE_PREAD)
-			ret = vfs_read(file, buf, count, &pos);
-		fput_light(file, fput_needed);
+		if (f.file->f_mode & FMODE_PREAD)
+			ret = vfs_read(f.file, buf, count, &pos);
+		fdput(f);
 	}
 
 	return ret;
@@ -526,19 +511,18 @@ SYSCALL_ALIAS(sys_pread64, SyS_pread64);
 SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf,
 			 size_t count, loff_t pos)
 {
-	struct file *file;
+	struct fd f;
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
 	if (pos < 0)
 		return -EINVAL;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
+	f = fdget(fd);
+	if (f.file) {
 		ret = -ESPIPE;
-		if (file->f_mode & FMODE_PWRITE)  
-			ret = vfs_write(file, buf, count, &pos);
-		fput_light(file, fput_needed);
+		if (f.file->f_mode & FMODE_PWRITE)  
+			ret = vfs_write(f.file, buf, count, &pos);
+		fdput(f);
 	}
 
 	return ret;
@@ -789,16 +773,14 @@ EXPORT_SYMBOL(vfs_writev);
 SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct file *file;
+	struct fd f = fdget(fd);
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		loff_t pos = file_pos_read(file);
-		ret = vfs_readv(file, vec, vlen, &pos);
-		file_pos_write(file, pos);
-		fput_light(file, fput_needed);
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_readv(f.file, vec, vlen, &pos);
+		file_pos_write(f.file, pos);
+		fdput(f);
 	}
 
 	if (ret > 0)
@@ -810,16 +792,14 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec,
 SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen)
 {
-	struct file *file;
+	struct fd f = fdget(fd);
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		loff_t pos = file_pos_read(file);
-		ret = vfs_writev(file, vec, vlen, &pos);
-		file_pos_write(file, pos);
-		fput_light(file, fput_needed);
+	if (f.file) {
+		loff_t pos = file_pos_read(f.file);
+		ret = vfs_writev(f.file, vec, vlen, &pos);
+		file_pos_write(f.file, pos);
+		fdput(f);
 	}
 
 	if (ret > 0)
@@ -838,19 +818,18 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 {
 	loff_t pos = pos_from_hilo(pos_h, pos_l);
-	struct file *file;
+	struct fd f;
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
 	if (pos < 0)
 		return -EINVAL;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
+	f = fdget(fd);
+	if (f.file) {
 		ret = -ESPIPE;
-		if (file->f_mode & FMODE_PREAD)
-			ret = vfs_readv(file, vec, vlen, &pos);
-		fput_light(file, fput_needed);
+		if (f.file->f_mode & FMODE_PREAD)
+			ret = vfs_readv(f.file, vec, vlen, &pos);
+		fdput(f);
 	}
 
 	if (ret > 0)
@@ -863,19 +842,18 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 		unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h)
 {
 	loff_t pos = pos_from_hilo(pos_h, pos_l);
-	struct file *file;
+	struct fd f;
 	ssize_t ret = -EBADF;
-	int fput_needed;
 
 	if (pos < 0)
 		return -EINVAL;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
+	f = fdget(fd);
+	if (f.file) {
 		ret = -ESPIPE;
-		if (file->f_mode & FMODE_PWRITE)
-			ret = vfs_writev(file, vec, vlen, &pos);
-		fput_light(file, fput_needed);
+		if (f.file->f_mode & FMODE_PWRITE)
+			ret = vfs_writev(f.file, vec, vlen, &pos);
+		fdput(f);
 	}
 
 	if (ret > 0)
@@ -887,28 +865,28 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 			   size_t count, loff_t max)
 {
-	struct file * in_file, * out_file;
-	struct inode * in_inode, * out_inode;
+	struct fd in, out;
+	struct inode *in_inode, *out_inode;
 	loff_t pos;
 	ssize_t retval;
-	int fput_needed_in, fput_needed_out, fl;
+	int fl;
 
 	/*
 	 * Get input file, and verify that it is ok..
 	 */
 	retval = -EBADF;
-	in_file = fget_light(in_fd, &fput_needed_in);
-	if (!in_file)
+	in = fdget(in_fd);
+	if (!in.file)
 		goto out;
-	if (!(in_file->f_mode & FMODE_READ))
+	if (!(in.file->f_mode & FMODE_READ))
 		goto fput_in;
 	retval = -ESPIPE;
 	if (!ppos)
-		ppos = &in_file->f_pos;
+		ppos = &in.file->f_pos;
 	else
-		if (!(in_file->f_mode & FMODE_PREAD))
+		if (!(in.file->f_mode & FMODE_PREAD))
 			goto fput_in;
-	retval = rw_verify_area(READ, in_file, ppos, count);
+	retval = rw_verify_area(READ, in.file, ppos, count);
 	if (retval < 0)
 		goto fput_in;
 	count = retval;
@@ -917,15 +895,15 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	 * Get output file, and verify that it is ok..
 	 */
 	retval = -EBADF;
-	out_file = fget_light(out_fd, &fput_needed_out);
-	if (!out_file)
+	out = fdget(out_fd);
+	if (!out.file)
 		goto fput_in;
-	if (!(out_file->f_mode & FMODE_WRITE))
+	if (!(out.file->f_mode & FMODE_WRITE))
 		goto fput_out;
 	retval = -EINVAL;
-	in_inode = in_file->f_path.dentry->d_inode;
-	out_inode = out_file->f_path.dentry->d_inode;
-	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
+	in_inode = in.file->f_path.dentry->d_inode;
+	out_inode = out.file->f_path.dentry->d_inode;
+	retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count);
 	if (retval < 0)
 		goto fput_out;
 	count = retval;
@@ -949,10 +927,10 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	 * and the application is arguably buggy if it doesn't expect
 	 * EAGAIN on a non-blocking file descriptor.
 	 */
-	if (in_file->f_flags & O_NONBLOCK)
+	if (in.file->f_flags & O_NONBLOCK)
 		fl = SPLICE_F_NONBLOCK;
 #endif
-	retval = do_splice_direct(in_file, ppos, out_file, count, fl);
+	retval = do_splice_direct(in.file, ppos, out.file, count, fl);
 
 	if (retval > 0) {
 		add_rchar(current, retval);
@@ -965,9 +943,9 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		retval = -EOVERFLOW;
 
 fput_out:
-	fput_light(out_file, fput_needed_out);
+	fdput(out);
 fput_in:
-	fput_light(in_file, fput_needed_in);
+	fdput(in);
 out:
 	return retval;
 }
diff --git a/fs/readdir.c b/fs/readdir.c
index 39e3370d79cf..5e69ef533b77 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -106,22 +106,20 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
 		struct old_linux_dirent __user *, dirent, unsigned int, count)
 {
 	int error;
-	struct file * file;
+	struct fd f = fdget(fd);
 	struct readdir_callback buf;
-	int fput_needed;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		return -EBADF;
 
 	buf.result = 0;
 	buf.dirent = dirent;
 
-	error = vfs_readdir(file, fillonedir, &buf);
+	error = vfs_readdir(f.file, fillonedir, &buf);
 	if (buf.result)
 		error = buf.result;
 
-	fput_light(file, fput_needed);
+	fdput(f);
 	return error;
 }
 
@@ -191,17 +189,16 @@ efault:
 SYSCALL_DEFINE3(getdents, unsigned int, fd,
 		struct linux_dirent __user *, dirent, unsigned int, count)
 {
-	struct file * file;
+	struct fd f;
 	struct linux_dirent __user * lastdirent;
 	struct getdents_callback buf;
-	int fput_needed;
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return -EBADF;
 
 	buf.current_dir = dirent;
@@ -209,17 +206,17 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
 	buf.count = count;
 	buf.error = 0;
 
-	error = vfs_readdir(file, filldir, &buf);
+	error = vfs_readdir(f.file, filldir, &buf);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		if (put_user(file->f_pos, &lastdirent->d_off))
+		if (put_user(f.file->f_pos, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
 	}
-	fput_light(file, fput_needed);
+	fdput(f);
 	return error;
 }
 
@@ -272,17 +269,16 @@ efault:
 SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 		struct linux_dirent64 __user *, dirent, unsigned int, count)
 {
-	struct file * file;
+	struct fd f;
 	struct linux_dirent64 __user * lastdirent;
 	struct getdents_callback64 buf;
-	int fput_needed;
 	int error;
 
 	if (!access_ok(VERIFY_WRITE, dirent, count))
 		return -EFAULT;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return -EBADF;
 
 	buf.current_dir = dirent;
@@ -290,17 +286,17 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
 	buf.count = count;
 	buf.error = 0;
 
-	error = vfs_readdir(file, filldir64, &buf);
+	error = vfs_readdir(f.file, filldir64, &buf);
 	if (error >= 0)
 		error = buf.error;
 	lastdirent = buf.previous;
 	if (lastdirent) {
-		typeof(lastdirent->d_off) d_off = file->f_pos;
+		typeof(lastdirent->d_off) d_off = f.file->f_pos;
 		if (__put_user(d_off, &lastdirent->d_off))
 			error = -EFAULT;
 		else
 			error = count - buf.count;
 	}
-	fput_light(file, fput_needed);
+	fdput(f);
 	return error;
 }
diff --git a/fs/select.c b/fs/select.c
index ffdd16d6e691..2ef72d965036 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -428,8 +428,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
 			unsigned long in, out, ex, all_bits, bit = 1, mask, j;
 			unsigned long res_in = 0, res_out = 0, res_ex = 0;
-			const struct file_operations *f_op = NULL;
-			struct file *file = NULL;
 
 			in = *inp++; out = *outp++; ex = *exp++;
 			all_bits = in | out | ex;
@@ -439,20 +437,21 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
 			}
 
 			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
-				int fput_needed;
+				struct fd f;
 				if (i >= n)
 					break;
 				if (!(bit & all_bits))
 					continue;
-				file = fget_light(i, &fput_needed);
-				if (file) {
-					f_op = file->f_op;
+				f = fdget(i);
+				if (f.file) {
+					const struct file_operations *f_op;
+					f_op = f.file->f_op;
 					mask = DEFAULT_POLLMASK;
 					if (f_op && f_op->poll) {
 						wait_key_set(wait, in, out, bit);
-						mask = (*f_op->poll)(file, wait);
+						mask = (*f_op->poll)(f.file, wait);
 					}
-					fput_light(file, fput_needed);
+					fdput(f);
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
 						retval++;
@@ -725,20 +724,17 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 	mask = 0;
 	fd = pollfd->fd;
 	if (fd >= 0) {
-		int fput_needed;
-		struct file * file;
-
-		file = fget_light(fd, &fput_needed);
+		struct fd f = fdget(fd);
 		mask = POLLNVAL;
-		if (file != NULL) {
+		if (f.file) {
 			mask = DEFAULT_POLLMASK;
-			if (file->f_op && file->f_op->poll) {
+			if (f.file->f_op && f.file->f_op->poll) {
 				pwait->_key = pollfd->events|POLLERR|POLLHUP;
-				mask = file->f_op->poll(file, pwait);
+				mask = f.file->f_op->poll(f.file, pwait);
 			}
 			/* Mask out unneeded events. */
 			mask &= pollfd->events | POLLERR | POLLHUP;
-			fput_light(file, fput_needed);
+			fdput(f);
 		}
 	}
 	pollfd->revents = mask;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 9f35a37173de..8bee4e570911 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -269,13 +269,12 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
 		if (ufd < 0)
 			kfree(ctx);
 	} else {
-		int fput_needed;
-		struct file *file = fget_light(ufd, &fput_needed);
-		if (!file)
+		struct fd f = fdget(ufd);
+		if (!f.file)
 			return -EBADF;
-		ctx = file->private_data;
-		if (file->f_op != &signalfd_fops) {
-			fput_light(file, fput_needed);
+		ctx = f.file->private_data;
+		if (f.file->f_op != &signalfd_fops) {
+			fdput(f);
 			return -EINVAL;
 		}
 		spin_lock_irq(&current->sighand->siglock);
@@ -283,7 +282,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
 		spin_unlock_irq(&current->sighand->siglock);
 
 		wake_up(&current->sighand->signalfd_wqh);
-		fput_light(file, fput_needed);
+		fdput(f);
 	}
 
 	return ufd;
diff --git a/fs/splice.c b/fs/splice.c
index 41514dd89462..13e5b4776e7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1666,9 +1666,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov,
 SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
 		unsigned long, nr_segs, unsigned int, flags)
 {
-	struct file *file;
+	struct fd f;
 	long error;
-	int fput;
 
 	if (unlikely(nr_segs > UIO_MAXIOV))
 		return -EINVAL;
@@ -1676,14 +1675,14 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov,
 		return 0;
 
 	error = -EBADF;
-	file = fget_light(fd, &fput);
-	if (file) {
-		if (file->f_mode & FMODE_WRITE)
-			error = vmsplice_to_pipe(file, iov, nr_segs, flags);
-		else if (file->f_mode & FMODE_READ)
-			error = vmsplice_to_user(file, iov, nr_segs, flags);
-
-		fput_light(file, fput);
+	f = fdget(fd);
+	if (f.file) {
+		if (f.file->f_mode & FMODE_WRITE)
+			error = vmsplice_to_pipe(f.file, iov, nr_segs, flags);
+		else if (f.file->f_mode & FMODE_READ)
+			error = vmsplice_to_user(f.file, iov, nr_segs, flags);
+
+		fdput(f);
 	}
 
 	return error;
@@ -1693,30 +1692,27 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
 		int, fd_out, loff_t __user *, off_out,
 		size_t, len, unsigned int, flags)
 {
+	struct fd in, out;
 	long error;
-	struct file *in, *out;
-	int fput_in, fput_out;
 
 	if (unlikely(!len))
 		return 0;
 
 	error = -EBADF;
-	in = fget_light(fd_in, &fput_in);
-	if (in) {
-		if (in->f_mode & FMODE_READ) {
-			out = fget_light(fd_out, &fput_out);
-			if (out) {
-				if (out->f_mode & FMODE_WRITE)
-					error = do_splice(in, off_in,
-							  out, off_out,
+	in = fdget(fd_in);
+	if (in.file) {
+		if (in.file->f_mode & FMODE_READ) {
+			out = fdget(fd_out);
+			if (out.file) {
+				if (out.file->f_mode & FMODE_WRITE)
+					error = do_splice(in.file, off_in,
+							  out.file, off_out,
 							  len, flags);
-				fput_light(out, fput_out);
+				fdput(out);
 			}
 		}
-
-		fput_light(in, fput_in);
+		fdput(in);
 	}
-
 	return error;
 }
 
@@ -2027,26 +2023,25 @@ static long do_tee(struct file *in, struct file *out, size_t len,
 
 SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
 {
-	struct file *in;
-	int error, fput_in;
+	struct fd in;
+	int error;
 
 	if (unlikely(!len))
 		return 0;
 
 	error = -EBADF;
-	in = fget_light(fdin, &fput_in);
-	if (in) {
-		if (in->f_mode & FMODE_READ) {
-			int fput_out;
-			struct file *out = fget_light(fdout, &fput_out);
-
-			if (out) {
-				if (out->f_mode & FMODE_WRITE)
-					error = do_tee(in, out, len, flags);
-				fput_light(out, fput_out);
+	in = fdget(fdin);
+	if (in.file) {
+		if (in.file->f_mode & FMODE_READ) {
+			struct fd out = fdget(fdout);
+			if (out.file) {
+				if (out.file->f_mode & FMODE_WRITE)
+					error = do_tee(in.file, out.file,
+							len, flags);
+				fdput(out);
 			}
 		}
- 		fput_light(in, fput_in);
+ 		fdput(in);
  	}
 
 	return error;
diff --git a/fs/stat.c b/fs/stat.c
index 40780229a032..ee18fa122ae0 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -57,13 +57,13 @@ EXPORT_SYMBOL(vfs_getattr);
 
 int vfs_fstat(unsigned int fd, struct kstat *stat)
 {
-	int fput_needed;
-	struct file *f = fget_raw_light(fd, &fput_needed);
+	struct fd f = fdget_raw(fd);
 	int error = -EBADF;
 
-	if (f) {
-		error = vfs_getattr(f->f_path.mnt, f->f_path.dentry, stat);
-		fput_light(f, fput_needed);
+	if (f.file) {
+		error = vfs_getattr(f.file->f_path.mnt, f.file->f_path.dentry,
+				    stat);
+		fdput(f);
 	}
 	return error;
 }
diff --git a/fs/statfs.c b/fs/statfs.c
index 95ad5c0e586c..f8e832e6f0a2 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -87,12 +87,11 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)
 
 int fd_statfs(int fd, struct kstatfs *st)
 {
-	int fput_needed;
-	struct file *file = fget_light(fd, &fput_needed);
+	struct fd f = fdget(fd);
 	int error = -EBADF;
-	if (file) {
-		error = vfs_statfs(&file->f_path, st);
-		fput_light(file, fput_needed);
+	if (f.file) {
+		error = vfs_statfs(&f.file->f_path, st);
+		fdput(f);
 	}
 	return error;
 }
diff --git a/fs/sync.c b/fs/sync.c
index eb8722dc556f..14eefeb44636 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -148,21 +148,19 @@ void emergency_sync(void)
  */
 SYSCALL_DEFINE1(syncfs, int, fd)
 {
-	struct file *file;
+	struct fd f = fdget(fd);
 	struct super_block *sb;
 	int ret;
-	int fput_needed;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		return -EBADF;
-	sb = file->f_dentry->d_sb;
+	sb = f.file->f_dentry->d_sb;
 
 	down_read(&sb->s_umount);
 	ret = sync_filesystem(sb);
 	up_read(&sb->s_umount);
 
-	fput_light(file, fput_needed);
+	fdput(f);
 	return ret;
 }
 
@@ -201,14 +199,12 @@ EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
 {
-	struct file *file;
+	struct fd f = fdget(fd);
 	int ret = -EBADF;
-	int fput_needed;
 
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		ret = vfs_fsync(file, datasync);
-		fput_light(file, fput_needed);
+	if (f.file) {
+		ret = vfs_fsync(f.file, datasync);
+		fdput(f);
 	}
 	return ret;
 }
@@ -291,10 +287,9 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
 				unsigned int flags)
 {
 	int ret;
-	struct file *file;
+	struct fd f;
 	struct address_space *mapping;
 	loff_t endbyte;			/* inclusive */
-	int fput_needed;
 	umode_t i_mode;
 
 	ret = -EINVAL;
@@ -333,17 +328,17 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
 		endbyte--;		/* inclusive */
 
 	ret = -EBADF;
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		goto out;
 
-	i_mode = file->f_path.dentry->d_inode->i_mode;
+	i_mode = f.file->f_path.dentry->d_inode->i_mode;
 	ret = -ESPIPE;
 	if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) &&
 			!S_ISLNK(i_mode))
 		goto out_put;
 
-	mapping = file->f_mapping;
+	mapping = f.file->f_mapping;
 	if (!mapping) {
 		ret = -EINVAL;
 		goto out_put;
@@ -366,7 +361,7 @@ SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes,
 		ret = filemap_fdatawait_range(mapping, offset, endbyte);
 
 out_put:
-	fput_light(file, fput_needed);
+	fdput(f);
 out:
 	return ret;
 }
diff --git a/fs/timerfd.c b/fs/timerfd.c
index dd91e9420c9e..d03822bbf190 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -234,19 +234,17 @@ static const struct file_operations timerfd_fops = {
 	.llseek		= noop_llseek,
 };
 
-static struct file *timerfd_fget(int fd, int *fput_needed)
+static int timerfd_fget(int fd, struct fd *p)
 {
-	struct file *file;
-
-	file = fget_light(fd, fput_needed);
-	if (!file)
-		return ERR_PTR(-EBADF);
-	if (file->f_op != &timerfd_fops) {
-		fput_light(file, *fput_needed);
-		return ERR_PTR(-EINVAL);
+	struct fd f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+	if (f.file->f_op != &timerfd_fops) {
+		fdput(f);
+		return -EINVAL;
 	}
-
-	return file;
+	*p = f;
+	return 0;
 }
 
 SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
@@ -284,10 +282,10 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 		const struct itimerspec __user *, utmr,
 		struct itimerspec __user *, otmr)
 {
-	struct file *file;
+	struct fd f;
 	struct timerfd_ctx *ctx;
 	struct itimerspec ktmr, kotmr;
-	int ret, fput_needed;
+	int ret;
 
 	if (copy_from_user(&ktmr, utmr, sizeof(ktmr)))
 		return -EFAULT;
@@ -297,10 +295,10 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	    !timespec_valid(&ktmr.it_interval))
 		return -EINVAL;
 
-	file = timerfd_fget(ufd, &fput_needed);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-	ctx = file->private_data;
+	ret = timerfd_fget(ufd, &f);
+	if (ret)
+		return ret;
+	ctx = f.file->private_data;
 
 	timerfd_setup_cancel(ctx, flags);
 
@@ -334,7 +332,7 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 	ret = timerfd_setup(ctx, flags, &ktmr);
 
 	spin_unlock_irq(&ctx->wqh.lock);
-	fput_light(file, fput_needed);
+	fdput(f);
 	if (otmr && copy_to_user(otmr, &kotmr, sizeof(kotmr)))
 		return -EFAULT;
 
@@ -343,15 +341,13 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags,
 
 SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 {
-	struct file *file;
+	struct fd f;
 	struct timerfd_ctx *ctx;
 	struct itimerspec kotmr;
-	int fput_needed;
-
-	file = timerfd_fget(ufd, &fput_needed);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
-	ctx = file->private_data;
+	int ret = timerfd_fget(ufd, &f);
+	if (ret)
+		return ret;
+	ctx = f.file->private_data;
 
 	spin_lock_irq(&ctx->wqh.lock);
 	if (ctx->expired && ctx->tintv.tv64) {
@@ -363,7 +359,7 @@ SYSCALL_DEFINE2(timerfd_gettime, int, ufd, struct itimerspec __user *, otmr)
 	kotmr.it_value = ktime_to_timespec(timerfd_get_remaining(ctx));
 	kotmr.it_interval = ktime_to_timespec(ctx->tintv);
 	spin_unlock_irq(&ctx->wqh.lock);
-	fput_light(file, fput_needed);
+	fdput(f);
 
 	return copy_to_user(otmr, &kotmr, sizeof(kotmr)) ? -EFAULT: 0;
 }
diff --git a/fs/utimes.c b/fs/utimes.c
index fa4dbe451e27..bb0696a41735 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -140,19 +140,18 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
 		goto out;
 
 	if (filename == NULL && dfd != AT_FDCWD) {
-		int fput_needed;
-		struct file *file;
+		struct fd f;
 
 		if (flags & AT_SYMLINK_NOFOLLOW)
 			goto out;
 
-		file = fget_light(dfd, &fput_needed);
+		f = fdget(dfd);
 		error = -EBADF;
-		if (!file)
+		if (!f.file)
 			goto out;
 
-		error = utimes_common(&file->f_path, times);
-		fput_light(file, fput_needed);
+		error = utimes_common(&f.file->f_path, times);
+		fdput(f);
 	} else {
 		struct path path;
 		int lookup_flags = 0;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4d45b7189e7e..14a7e2544fe3 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -399,22 +399,20 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
 SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 		const void __user *,value, size_t, size, int, flags)
 {
-	int fput_needed;
-	struct file *f;
+	struct fd f = fdget(fd);
 	struct dentry *dentry;
 	int error = -EBADF;
 
-	f = fget_light(fd, &fput_needed);
-	if (!f)
+	if (!f.file)
 		return error;
-	dentry = f->f_path.dentry;
+	dentry = f.file->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = mnt_want_write_file(f);
+	error = mnt_want_write_file(f.file);
 	if (!error) {
 		error = setxattr(dentry, name, value, size, flags);
-		mnt_drop_write_file(f);
+		mnt_drop_write_file(f.file);
 	}
-	fput_light(f, fput_needed);
+	fdput(f);
 	return error;
 }
 
@@ -495,16 +493,14 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
 SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
 		void __user *, value, size_t, size)
 {
-	int fput_needed;
-	struct file *f;
+	struct fd f = fdget(fd);
 	ssize_t error = -EBADF;
 
-	f = fget_light(fd, &fput_needed);
-	if (!f)
+	if (!f.file)
 		return error;
-	audit_inode(NULL, f->f_path.dentry);
-	error = getxattr(f->f_path.dentry, name, value, size);
-	fput_light(f, fput_needed);
+	audit_inode(NULL, f.file->f_path.dentry);
+	error = getxattr(f.file->f_path.dentry, name, value, size);
+	fdput(f);
 	return error;
 }
 
@@ -576,16 +572,14 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
 
 SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 {
-	int fput_needed;
-	struct file *f;
+	struct fd f = fdget(fd);
 	ssize_t error = -EBADF;
 
-	f = fget_light(fd, &fput_needed);
-	if (!f)
+	if (!f.file)
 		return error;
-	audit_inode(NULL, f->f_path.dentry);
-	error = listxattr(f->f_path.dentry, list, size);
-	fput_light(f, fput_needed);
+	audit_inode(NULL, f.file->f_path.dentry);
+	error = listxattr(f.file->f_path.dentry, list, size);
+	fdput(f);
 	return error;
 }
 
@@ -645,22 +639,20 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
 
 SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 {
-	int fput_needed;
-	struct file *f;
+	struct fd f = fdget(fd);
 	struct dentry *dentry;
 	int error = -EBADF;
 
-	f = fget_light(fd, &fput_needed);
-	if (!f)
+	if (!f.file)
 		return error;
-	dentry = f->f_path.dentry;
+	dentry = f.file->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = mnt_want_write_file(f);
+	error = mnt_want_write_file(f.file);
 	if (!error) {
 		error = removexattr(dentry, name);
-		mnt_drop_write_file(f);
+		mnt_drop_write_file(f.file);
 	}
-	fput_light(f, fput_needed);
+	fdput(f);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index e6cdf224d7ea..b9b8646e62db 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -48,44 +48,44 @@ xfs_swapext(
 	xfs_swapext_t	*sxp)
 {
 	xfs_inode_t     *ip, *tip;
-	struct file	*file, *tmp_file;
-	int		error = 0, fput_needed, fput_needed_tmp;
+	struct fd	f, tmp;
+	int		error = 0;
 
 	/* Pull information for the target fd */
-	file = fget_light((int)sxp->sx_fdtarget, &fput_needed);
-	if (!file) {
+	f = fdget((int)sxp->sx_fdtarget);
+	if (!f.file) {
 		error = XFS_ERROR(EINVAL);
 		goto out;
 	}
 
-	if (!(file->f_mode & FMODE_WRITE) ||
-	    !(file->f_mode & FMODE_READ) ||
-	    (file->f_flags & O_APPEND)) {
+	if (!(f.file->f_mode & FMODE_WRITE) ||
+	    !(f.file->f_mode & FMODE_READ) ||
+	    (f.file->f_flags & O_APPEND)) {
 		error = XFS_ERROR(EBADF);
 		goto out_put_file;
 	}
 
-	tmp_file = fget_light((int)sxp->sx_fdtmp, &fput_needed_tmp);
-	if (!tmp_file) {
+	tmp = fdget((int)sxp->sx_fdtmp);
+	if (!tmp.file) {
 		error = XFS_ERROR(EINVAL);
 		goto out_put_file;
 	}
 
-	if (!(tmp_file->f_mode & FMODE_WRITE) ||
-	    !(tmp_file->f_mode & FMODE_READ) ||
-	    (tmp_file->f_flags & O_APPEND)) {
+	if (!(tmp.file->f_mode & FMODE_WRITE) ||
+	    !(tmp.file->f_mode & FMODE_READ) ||
+	    (tmp.file->f_flags & O_APPEND)) {
 		error = XFS_ERROR(EBADF);
 		goto out_put_tmp_file;
 	}
 
-	if (IS_SWAPFILE(file->f_path.dentry->d_inode) ||
-	    IS_SWAPFILE(tmp_file->f_path.dentry->d_inode)) {
+	if (IS_SWAPFILE(f.file->f_path.dentry->d_inode) ||
+	    IS_SWAPFILE(tmp.file->f_path.dentry->d_inode)) {
 		error = XFS_ERROR(EINVAL);
 		goto out_put_tmp_file;
 	}
 
-	ip = XFS_I(file->f_path.dentry->d_inode);
-	tip = XFS_I(tmp_file->f_path.dentry->d_inode);
+	ip = XFS_I(f.file->f_path.dentry->d_inode);
+	tip = XFS_I(tmp.file->f_path.dentry->d_inode);
 
 	if (ip->i_mount != tip->i_mount) {
 		error = XFS_ERROR(EINVAL);
@@ -105,9 +105,9 @@ xfs_swapext(
 	error = xfs_swap_extents(ip, tip, sxp);
 
  out_put_tmp_file:
-	fput_light(tmp_file, fput_needed_tmp);
+	fdput(tmp);
  out_put_file:
-	fput_light(file, fput_needed);
+	fdput(f);
  out:
 	return error;
 }
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 21483eac402d..8305f2ac6773 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -70,16 +70,16 @@ xfs_find_handle(
 	int			hsize;
 	xfs_handle_t		handle;
 	struct inode		*inode;
-	struct file		*file = NULL;
+	struct fd		f;
 	struct path		path;
-	int			error, fput_needed;
+	int			error;
 	struct xfs_inode	*ip;
 
 	if (cmd == XFS_IOC_FD_TO_HANDLE) {
-		file = fget_light(hreq->fd, &fput_needed);
-		if (!file)
+		f = fdget(hreq->fd);
+		if (!f.file)
 			return -EBADF;
-		inode = file->f_path.dentry->d_inode;
+		inode = f.file->f_path.dentry->d_inode;
 	} else {
 		error = user_lpath((const char __user *)hreq->path, &path);
 		if (error)
@@ -134,7 +134,7 @@ xfs_find_handle(
 
  out_put:
 	if (cmd == XFS_IOC_FD_TO_HANDLE)
-		fput_light(file, fput_needed);
+		fdput(f);
 	else
 		path_put(&path);
 	return error;
diff --git a/include/linux/file.h b/include/linux/file.h
index c38bfbff4647..cbacf4faf447 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -47,6 +47,9 @@ static inline struct fd fdget(unsigned int fd)
 	return (struct fd){f,b};
 }
 
+extern struct file *fget_raw(unsigned int fd);
+extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
+
 static inline struct fd fdget_raw(unsigned int fd)
 {
 	int b;
@@ -54,8 +57,6 @@ static inline struct fd fdget_raw(unsigned int fd)
 	return (struct fd){f,b};
 }
 
-extern struct file *fget_raw(unsigned int fd);
-extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
 extern int f_dupfd(unsigned int from, struct file *file, unsigned flags);
 extern int replace_fd(unsigned fd, struct file *file, unsigned flags);
 extern void set_close_on_exec(unsigned int fd, int flag);
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index 5db1b69500fd..6d255e535d03 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -944,7 +944,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 		size_t, msg_len, unsigned int, msg_prio,
 		const struct timespec __user *, u_abs_timeout)
 {
-	struct file *filp;
+	struct fd f;
 	struct inode *inode;
 	struct ext_wait_queue wait;
 	struct ext_wait_queue *receiver;
@@ -953,7 +953,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 	ktime_t expires, *timeout = NULL;
 	struct timespec ts;
 	struct posix_msg_tree_node *new_leaf = NULL;
-	int ret = 0, fput_needed;
+	int ret = 0;
 
 	if (u_abs_timeout) {
 		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -967,21 +967,21 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 
 	audit_mq_sendrecv(mqdes, msg_len, msg_prio, timeout ? &ts : NULL);
 
-	filp = fget_light(mqdes, &fput_needed);
-	if (unlikely(!filp)) {
+	f = fdget(mqdes);
+	if (unlikely(!f.file)) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+	inode = f.file->f_path.dentry->d_inode;
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
 		ret = -EBADF;
 		goto out_fput;
 	}
 	info = MQUEUE_I(inode);
-	audit_inode(NULL, filp->f_path.dentry);
+	audit_inode(NULL, f.file->f_path.dentry);
 
-	if (unlikely(!(filp->f_mode & FMODE_WRITE))) {
+	if (unlikely(!(f.file->f_mode & FMODE_WRITE))) {
 		ret = -EBADF;
 		goto out_fput;
 	}
@@ -1023,7 +1023,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr,
 	}
 
 	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
-		if (filp->f_flags & O_NONBLOCK) {
+		if (f.file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
 		} else {
 			wait.task = current;
@@ -1056,7 +1056,7 @@ out_free:
 	if (ret)
 		free_msg(msg_ptr);
 out_fput:
-	fput_light(filp, fput_needed);
+	fdput(f);
 out:
 	return ret;
 }
@@ -1067,14 +1067,13 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 {
 	ssize_t ret;
 	struct msg_msg *msg_ptr;
-	struct file *filp;
+	struct fd f;
 	struct inode *inode;
 	struct mqueue_inode_info *info;
 	struct ext_wait_queue wait;
 	ktime_t expires, *timeout = NULL;
 	struct timespec ts;
 	struct posix_msg_tree_node *new_leaf = NULL;
-	int fput_needed;
 
 	if (u_abs_timeout) {
 		int res = prepare_timeout(u_abs_timeout, &expires, &ts);
@@ -1085,21 +1084,21 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 
 	audit_mq_sendrecv(mqdes, msg_len, 0, timeout ? &ts : NULL);
 
-	filp = fget_light(mqdes, &fput_needed);
-	if (unlikely(!filp)) {
+	f = fdget(mqdes);
+	if (unlikely(!f.file)) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+	inode = f.file->f_path.dentry->d_inode;
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
 		ret = -EBADF;
 		goto out_fput;
 	}
 	info = MQUEUE_I(inode);
-	audit_inode(NULL, filp->f_path.dentry);
+	audit_inode(NULL, f.file->f_path.dentry);
 
-	if (unlikely(!(filp->f_mode & FMODE_READ))) {
+	if (unlikely(!(f.file->f_mode & FMODE_READ))) {
 		ret = -EBADF;
 		goto out_fput;
 	}
@@ -1131,7 +1130,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 	}
 
 	if (info->attr.mq_curmsgs == 0) {
-		if (filp->f_flags & O_NONBLOCK) {
+		if (f.file->f_flags & O_NONBLOCK) {
 			spin_unlock(&info->lock);
 			ret = -EAGAIN;
 		} else {
@@ -1161,7 +1160,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr,
 		free_msg(msg_ptr);
 	}
 out_fput:
-	fput_light(filp, fput_needed);
+	fdput(f);
 out:
 	return ret;
 }
@@ -1174,8 +1173,8 @@ out:
 SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
 		const struct sigevent __user *, u_notification)
 {
-	int ret, fput_needed;
-	struct file *filp;
+	int ret;
+	struct fd f;
 	struct sock *sock;
 	struct inode *inode;
 	struct sigevent notification;
@@ -1221,13 +1220,13 @@ SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes,
 			skb_put(nc, NOTIFY_COOKIE_LEN);
 			/* and attach it to the socket */
 retry:
-			filp = fget_light(notification.sigev_signo, &fput_needed);
-			if (!filp) {
+			f = fdget(notification.sigev_signo);
+			if (!f.file) {
 				ret = -EBADF;
 				goto out;
 			}
-			sock = netlink_getsockbyfilp(filp);
-			fput_light(filp, fput_needed);
+			sock = netlink_getsockbyfilp(f.file);
+			fdput(f);
 			if (IS_ERR(sock)) {
 				ret = PTR_ERR(sock);
 				sock = NULL;
@@ -1246,14 +1245,14 @@ retry:
 		}
 	}
 
-	filp = fget_light(mqdes, &fput_needed);
-	if (!filp) {
+	f = fdget(mqdes);
+	if (!f.file) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+	inode = f.file->f_path.dentry->d_inode;
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
 		ret = -EBADF;
 		goto out_fput;
 	}
@@ -1293,7 +1292,7 @@ retry:
 	}
 	spin_unlock(&info->lock);
 out_fput:
-	fput_light(filp, fput_needed);
+	fdput(f);
 out:
 	if (sock) {
 		netlink_detachskb(sock, nc);
@@ -1309,8 +1308,7 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
 {
 	int ret;
 	struct mq_attr mqstat, omqstat;
-	int fput_needed;
-	struct file *filp;
+	struct fd f;
 	struct inode *inode;
 	struct mqueue_inode_info *info;
 
@@ -1321,14 +1319,14 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
 			return -EINVAL;
 	}
 
-	filp = fget_light(mqdes, &fput_needed);
-	if (!filp) {
+	f = fdget(mqdes);
+	if (!f.file) {
 		ret = -EBADF;
 		goto out;
 	}
 
-	inode = filp->f_path.dentry->d_inode;
-	if (unlikely(filp->f_op != &mqueue_file_operations)) {
+	inode = f.file->f_path.dentry->d_inode;
+	if (unlikely(f.file->f_op != &mqueue_file_operations)) {
 		ret = -EBADF;
 		goto out_fput;
 	}
@@ -1337,15 +1335,15 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
 	spin_lock(&info->lock);
 
 	omqstat = info->attr;
-	omqstat.mq_flags = filp->f_flags & O_NONBLOCK;
+	omqstat.mq_flags = f.file->f_flags & O_NONBLOCK;
 	if (u_mqstat) {
 		audit_mq_getsetattr(mqdes, &mqstat);
-		spin_lock(&filp->f_lock);
+		spin_lock(&f.file->f_lock);
 		if (mqstat.mq_flags & O_NONBLOCK)
-			filp->f_flags |= O_NONBLOCK;
+			f.file->f_flags |= O_NONBLOCK;
 		else
-			filp->f_flags &= ~O_NONBLOCK;
-		spin_unlock(&filp->f_lock);
+			f.file->f_flags &= ~O_NONBLOCK;
+		spin_unlock(&f.file->f_lock);
 
 		inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	}
@@ -1358,7 +1356,7 @@ SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes,
 		ret = -EFAULT;
 
 out_fput:
-	fput_light(filp, fput_needed);
+	fdput(f);
 out:
 	return ret;
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2bd199bfaef8..bd9c5bca42ae 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -467,14 +467,13 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 {
 	struct perf_cgroup *cgrp;
 	struct cgroup_subsys_state *css;
-	struct file *file;
-	int ret = 0, fput_needed;
+	struct fd f = fdget(fd);
+	int ret = 0;
 
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	if (!f.file)
 		return -EBADF;
 
-	css = cgroup_css_from_dir(file, perf_subsys_id);
+	css = cgroup_css_from_dir(f.file, perf_subsys_id);
 	if (IS_ERR(css)) {
 		ret = PTR_ERR(css);
 		goto out;
@@ -500,7 +499,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
 		ret = -EINVAL;
 	}
 out:
-	fput_light(file, fput_needed);
+	fdput(f);
 	return ret;
 }
 
@@ -3233,21 +3232,18 @@ unlock:
 
 static const struct file_operations perf_fops;
 
-static struct file *perf_fget_light(int fd, int *fput_needed)
+static inline int perf_fget_light(int fd, struct fd *p)
 {
-	struct file *file;
-
-	file = fget_light(fd, fput_needed);
-	if (!file)
-		return ERR_PTR(-EBADF);
+	struct fd f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
 
-	if (file->f_op != &perf_fops) {
-		fput_light(file, *fput_needed);
-		*fput_needed = 0;
-		return ERR_PTR(-EBADF);
+	if (f.file->f_op != &perf_fops) {
+		fdput(f);
+		return -EBADF;
 	}
-
-	return file;
+	*p = f;
+	return 0;
 }
 
 static int perf_event_set_output(struct perf_event *event,
@@ -3279,22 +3275,19 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	case PERF_EVENT_IOC_SET_OUTPUT:
 	{
-		struct file *output_file = NULL;
-		struct perf_event *output_event = NULL;
-		int fput_needed = 0;
 		int ret;
-
 		if (arg != -1) {
-			output_file = perf_fget_light(arg, &fput_needed);
-			if (IS_ERR(output_file))
-				return PTR_ERR(output_file);
-			output_event = output_file->private_data;
+			struct perf_event *output_event;
+			struct fd output;
+			ret = perf_fget_light(arg, &output);
+			if (ret)
+				return ret;
+			output_event = output.file->private_data;
+			ret = perf_event_set_output(event, output_event);
+			fdput(output);
+		} else {
+			ret = perf_event_set_output(event, NULL);
 		}
-
-		ret = perf_event_set_output(event, output_event);
-		if (output_event)
-			fput_light(output_file, fput_needed);
-
 		return ret;
 	}
 
@@ -6229,12 +6222,11 @@ SYSCALL_DEFINE5(perf_event_open,
 	struct perf_event_attr attr;
 	struct perf_event_context *ctx;
 	struct file *event_file = NULL;
-	struct file *group_file = NULL;
+	struct fd group = {NULL, 0};
 	struct task_struct *task = NULL;
 	struct pmu *pmu;
 	int event_fd;
 	int move_group = 0;
-	int fput_needed = 0;
 	int err;
 
 	/* for future expandability... */
@@ -6269,12 +6261,10 @@ SYSCALL_DEFINE5(perf_event_open,
 		return event_fd;
 
 	if (group_fd != -1) {
-		group_file = perf_fget_light(group_fd, &fput_needed);
-		if (IS_ERR(group_file)) {
-			err = PTR_ERR(group_file);
+		err = perf_fget_light(group_fd, &group);
+		if (err)
 			goto err_fd;
-		}
-		group_leader = group_file->private_data;
+		group_leader = group.file->private_data;
 		if (flags & PERF_FLAG_FD_OUTPUT)
 			output_event = group_leader;
 		if (flags & PERF_FLAG_FD_NO_GROUP)
@@ -6450,7 +6440,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	 * of the group leader will find the pointer to itself in
 	 * perf_group_detach().
 	 */
-	fput_light(group_file, fput_needed);
+	fdput(group);
 	fd_install(event_fd, event_file);
 	return event_fd;
 
@@ -6464,7 +6454,7 @@ err_task:
 	if (task)
 		put_task_struct(task);
 err_group_fd:
-	fput_light(group_file, fput_needed);
+	fdput(group);
 err_fd:
 	put_unused_fd(event_fd);
 	return err;
diff --git a/kernel/sys.c b/kernel/sys.c
index 0cb4283df884..f9492284e5d2 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1788,15 +1788,15 @@ SYSCALL_DEFINE1(umask, int, mask)
 #ifdef CONFIG_CHECKPOINT_RESTORE
 static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
-	struct file *exe_file;
+	struct fd exe;
 	struct dentry *dentry;
-	int err, fput_needed;
+	int err;
 
-	exe_file = fget_light(fd, &fput_needed);
-	if (!exe_file)
+	exe = fdget(fd);
+	if (!exe.file)
 		return -EBADF;
 
-	dentry = exe_file->f_path.dentry;
+	dentry = exe.file->f_path.dentry;
 
 	/*
 	 * Because the original mm->exe_file points to executable file, make
@@ -1805,7 +1805,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 	 */
 	err = -EACCES;
 	if (!S_ISREG(dentry->d_inode->i_mode)	||
-	    exe_file->f_path.mnt->mnt_flags & MNT_NOEXEC)
+	    exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
 		goto exit;
 
 	err = inode_permission(dentry->d_inode, MAY_EXEC);
@@ -1839,12 +1839,12 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 		goto exit_unlock;
 
 	err = 0;
-	set_mm_exe_file(mm, exe_file);	/* this grabs a reference to exe_file */
+	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
 exit_unlock:
 	up_write(&mm->mmap_sem);
 
 exit:
-	fput_light(exe_file, fput_needed);
+	fdput(exe);
 	return err;
 }
 
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index d0a32796550f..5116b7e5962e 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -415,16 +415,15 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	struct nlattr *na;
 	size_t size;
 	u32 fd;
-	struct file *file;
-	int fput_needed;
+	struct fd f;
 
 	na = info->attrs[CGROUPSTATS_CMD_ATTR_FD];
 	if (!na)
 		return -EINVAL;
 
 	fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
-	file = fget_light(fd, &fput_needed);
-	if (!file)
+	f = fdget(fd);
+	if (!f.file)
 		return 0;
 
 	size = nla_total_size(sizeof(struct cgroupstats));
@@ -444,7 +443,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	stats = nla_data(na);
 	memset(stats, 0, sizeof(*stats));
 
-	rc = cgroupstats_build(stats, file->f_dentry);
+	rc = cgroupstats_build(stats, f.file->f_dentry);
 	if (rc < 0) {
 		nlmsg_free(rep_skb);
 		goto err;
@@ -453,7 +452,7 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
 	rc = send_reply(rep_skb, info);
 
 err:
-	fput_light(file, fput_needed);
+	fdput(f);
 	return rc;
 }
 
diff --git a/mm/fadvise.c b/mm/fadvise.c
index a83245763cf8..a47f0f50c89f 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -26,8 +26,7 @@
  */
 SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 {
-	int fput_needed;
-	struct file *file = fget_light(fd, &fput_needed);
+	struct fd f = fdget(fd);
 	struct address_space *mapping;
 	struct backing_dev_info *bdi;
 	loff_t endbyte;			/* inclusive */
@@ -36,15 +35,15 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 	unsigned long nrpages;
 	int ret = 0;
 
-	if (!file)
+	if (!f.file)
 		return -EBADF;
 
-	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+	if (S_ISFIFO(f.file->f_path.dentry->d_inode->i_mode)) {
 		ret = -ESPIPE;
 		goto out;
 	}
 
-	mapping = file->f_mapping;
+	mapping = f.file->f_mapping;
 	if (!mapping || len < 0) {
 		ret = -EINVAL;
 		goto out;
@@ -77,21 +76,21 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
-		file->f_ra.ra_pages = bdi->ra_pages;
-		spin_lock(&file->f_lock);
-		file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&file->f_lock);
+		f.file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
-		spin_lock(&file->f_lock);
-		file->f_mode |= FMODE_RANDOM;
-		spin_unlock(&file->f_lock);
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
-		file->f_ra.ra_pages = bdi->ra_pages * 2;
-		spin_lock(&file->f_lock);
-		file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&file->f_lock);
+		f.file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&f.file->f_lock);
+		f.file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&f.file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
 		/* First and last PARTIAL page! */
@@ -107,7 +106,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		 * Ignore return value because fadvise() shall return
 		 * success even if filesystem can't retrieve a hint,
 		 */
-		force_page_cache_readahead(mapping, file, start_index,
+		force_page_cache_readahead(mapping, f.file, start_index,
 					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
@@ -129,7 +128,7 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 		ret = -EINVAL;
 	}
 out:
-	fput_light(file, fput_needed);
+	fdput(f);
 	return ret;
 }
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
diff --git a/mm/readahead.c b/mm/readahead.c
index 1011111e2bf4..7963f2391236 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -579,20 +579,19 @@ do_readahead(struct address_space *mapping, struct file *filp,
 SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
 {
 	ssize_t ret;
-	struct file *file;
-	int fput_needed;
+	struct fd f;
 
 	ret = -EBADF;
-	file = fget_light(fd, &fput_needed);
-	if (file) {
-		if (file->f_mode & FMODE_READ) {
-			struct address_space *mapping = file->f_mapping;
+	f = fdget(fd);
+	if (f.file) {
+		if (f.file->f_mode & FMODE_READ) {
+			struct address_space *mapping = f.file->f_mapping;
 			pgoff_t start = offset >> PAGE_CACHE_SHIFT;
 			pgoff_t end = (offset + count - 1) >> PAGE_CACHE_SHIFT;
 			unsigned long len = end - start + 1;
-			ret = do_readahead(mapping, file, start, len);
+			ret = do_readahead(mapping, f.file, start, len);
 		}
-		fput_light(file, fput_needed);
+		fdput(f);
 	}
 	return ret;
 }
-- 
cgit v1.2.3


From 1fe0c0230a7c2d5f4061e681a3f3be9512446d23 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 19 Sep 2012 15:49:51 +0100
Subject: vfs: delete surplus inode NULL check

Each iteration of d_delete we reload inode from dentry->d_inode and
then call S_ISDIR(inode-i_mode), so inode cannot possibly be NULL
shortly afterwards unless something went horribly wrong.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 16521a9f2038..fbee67b92651 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2109,7 +2109,7 @@ again:
 	inode = dentry->d_inode;
 	isdir = S_ISDIR(inode->i_mode);
 	if (dentry->d_count == 1) {
-		if (inode && !spin_trylock(&inode->i_lock)) {
+		if (!spin_trylock(&inode->i_lock)) {
 			spin_unlock(&dentry->d_lock);
 			cpu_relax();
 			goto again;
-- 
cgit v1.2.3


From 2744c171dbaa0b1ec7639e7d0ff817fba9461a38 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Sep 2012 21:41:05 -0400
Subject: ceph: don't abuse d_delete() on failure exits

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ceph/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 4b5762ef7c2b..ba95eea201bf 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1104,7 +1104,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 				pr_err("fill_trace bad get_inode "
 				       "%llx.%llx\n", vino.ino, vino.snap);
 				err = PTR_ERR(in);
-				d_delete(dn);
+				d_drop(dn);
 				goto done;
 			}
 			dn = splice_dentry(dn, in, &have_lease, true);
@@ -1277,7 +1277,7 @@ retry_lookup:
 			in = ceph_get_inode(parent->d_sb, vino);
 			if (IS_ERR(in)) {
 				dout("new_inode badness\n");
-				d_delete(dn);
+				d_drop(dn);
 				dput(dn);
 				err = PTR_ERR(in);
 				goto out;
-- 
cgit v1.2.3


From 63784dd02b2ac85e867be9d6a6f5536c4ff738be Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 26 Sep 2012 21:43:05 -0400
Subject: fcntl: fix misannotations

__user * != * __user...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fcntl.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 91af39a33af7..8f704291d4ed 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -149,7 +149,7 @@ pid_t f_getown(struct file *filp)
 
 static int f_setown_ex(struct file *filp, unsigned long arg)
 {
-	struct f_owner_ex * __user owner_p = (void * __user)arg;
+	struct f_owner_ex __user *owner_p = (void __user *)arg;
 	struct f_owner_ex owner;
 	struct pid *pid;
 	int type;
@@ -189,7 +189,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
 
 static int f_getown_ex(struct file *filp, unsigned long arg)
 {
-	struct f_owner_ex * __user owner_p = (void * __user)arg;
+	struct f_owner_ex __user *owner_p = (void __user *)arg;
 	struct f_owner_ex owner;
 	int ret = 0;
 
@@ -227,7 +227,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
 static int f_getowner_uids(struct file *filp, unsigned long arg)
 {
 	struct user_namespace *user_ns = current_user_ns();
-	uid_t * __user dst = (void * __user)arg;
+	uid_t __user *dst = (void __user *)arg;
 	uid_t src[2];
 	int err;
 
-- 
cgit v1.2.3


From f34f9d186df35e5c39163444c43b4fc6255e39c5 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <vda.linux@googlemail.com>
Date: Wed, 26 Sep 2012 11:34:50 +1000
Subject: coredump: prevent double-free on an error path in core dumper

In !CORE_DUMP_USE_REGSET case, if elf_note_info_init fails to allocate
memory for info->fields, it frees already allocated stuff and returns
error to its caller, fill_note_info.  Which in turn returns error to its
caller, elf_core_dump.  Which jumps to cleanup label and calls
free_note_info, which will happily try to free all info->fields again.
BOOM.

This is the fix.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Venu Byravarasu <vbyravarasu@nvidia.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/binfmt_elf.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1b52956afe33..0225fddf49b7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1696,30 +1696,19 @@ static int elf_note_info_init(struct elf_note_info *info)
 		return 0;
 	info->psinfo = kmalloc(sizeof(*info->psinfo), GFP_KERNEL);
 	if (!info->psinfo)
-		goto notes_free;
+		return 0;
 	info->prstatus = kmalloc(sizeof(*info->prstatus), GFP_KERNEL);
 	if (!info->prstatus)
-		goto psinfo_free;
+		return 0;
 	info->fpu = kmalloc(sizeof(*info->fpu), GFP_KERNEL);
 	if (!info->fpu)
-		goto prstatus_free;
+		return 0;
 #ifdef ELF_CORE_COPY_XFPREGS
 	info->xfpu = kmalloc(sizeof(*info->xfpu), GFP_KERNEL);
 	if (!info->xfpu)
-		goto fpu_free;
+		return 0;
 #endif
 	return 1;
-#ifdef ELF_CORE_COPY_XFPREGS
- fpu_free:
-	kfree(info->fpu);
-#endif
- prstatus_free:
-	kfree(info->prstatus);
- psinfo_free:
-	kfree(info->psinfo);
- notes_free:
-	kfree(info->notes);
-	return 0;
 }
 
 static int fill_note_info(struct elfhdr *elf, int phdrs,
-- 
cgit v1.2.3


From 10c28d937e2cca577c2d804106b50dd0562fb062 Mon Sep 17 00:00:00 2001
From: Alex Kelly <alex.page.kelly@gmail.com>
Date: Wed, 26 Sep 2012 21:52:08 -0400
Subject: coredump: move core dump functionality into its own file

This prepares for making core dump functionality optional.

The variable "suid_dumpable" and associated functions are left in fs/exec.c
because they're used elsewhere, such as in ptrace.

Signed-off-by: Alex Kelly <alex.page.kelly@gmail.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Acked-by: Kees Cook <keescook@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Makefile           |   2 +-
 fs/coredump.c         | 686 ++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/exec.c             | 645 +----------------------------------------------
 include/linux/sched.h |   1 +
 4 files changed, 689 insertions(+), 645 deletions(-)
 create mode 100644 fs/coredump.c

(limited to 'fs')

diff --git a/fs/Makefile b/fs/Makefile
index 2fb977934673..8938f8250320 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o statfs.o
+		stack.o fs_struct.o statfs.o coredump.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/coredump.c b/fs/coredump.c
new file mode 100644
index 000000000000..f045bbad6822
--- /dev/null
+++ b/fs/coredump.c
@@ -0,0 +1,686 @@
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/fdtable.h>
+#include <linux/mm.h>
+#include <linux/stat.h>
+#include <linux/fcntl.h>
+#include <linux/swap.h>
+#include <linux/string.h>
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/perf_event.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/key.h>
+#include <linux/personality.h>
+#include <linux/binfmts.h>
+#include <linux/utsname.h>
+#include <linux/pid_namespace.h>
+#include <linux/module.h>
+#include <linux/namei.h>
+#include <linux/mount.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+#include <linux/tsacct_kern.h>
+#include <linux/cn_proc.h>
+#include <linux/audit.h>
+#include <linux/tracehook.h>
+#include <linux/kmod.h>
+#include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/oom.h>
+#include <linux/compat.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/tlb.h>
+#include <asm/exec.h>
+
+#include <trace/events/task.h>
+#include "internal.h"
+
+#include <trace/events/sched.h>
+
+int core_uses_pid;
+char core_pattern[CORENAME_MAX_SIZE] = "core";
+unsigned int core_pipe_limit;
+
+struct core_name {
+	char *corename;
+	int used, size;
+};
+static atomic_t call_count = ATOMIC_INIT(1);
+
+/* The maximal length of core_pattern is also specified in sysctl.c */
+
+static int expand_corename(struct core_name *cn)
+{
+	char *old_corename = cn->corename;
+
+	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
+	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
+
+	if (!cn->corename) {
+		kfree(old_corename);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int cn_printf(struct core_name *cn, const char *fmt, ...)
+{
+	char *cur;
+	int need;
+	int ret;
+	va_list arg;
+
+	va_start(arg, fmt);
+	need = vsnprintf(NULL, 0, fmt, arg);
+	va_end(arg);
+
+	if (likely(need < cn->size - cn->used - 1))
+		goto out_printf;
+
+	ret = expand_corename(cn);
+	if (ret)
+		goto expand_fail;
+
+out_printf:
+	cur = cn->corename + cn->used;
+	va_start(arg, fmt);
+	vsnprintf(cur, need + 1, fmt, arg);
+	va_end(arg);
+	cn->used += need;
+	return 0;
+
+expand_fail:
+	return ret;
+}
+
+static void cn_escape(char *str)
+{
+	for (; *str; str++)
+		if (*str == '/')
+			*str = '!';
+}
+
+static int cn_print_exe_file(struct core_name *cn)
+{
+	struct file *exe_file;
+	char *pathbuf, *path;
+	int ret;
+
+	exe_file = get_mm_exe_file(current->mm);
+	if (!exe_file) {
+		char *commstart = cn->corename + cn->used;
+		ret = cn_printf(cn, "%s (path unknown)", current->comm);
+		cn_escape(commstart);
+		return ret;
+	}
+
+	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
+	if (!pathbuf) {
+		ret = -ENOMEM;
+		goto put_exe_file;
+	}
+
+	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
+	if (IS_ERR(path)) {
+		ret = PTR_ERR(path);
+		goto free_buf;
+	}
+
+	cn_escape(path);
+
+	ret = cn_printf(cn, "%s", path);
+
+free_buf:
+	kfree(pathbuf);
+put_exe_file:
+	fput(exe_file);
+	return ret;
+}
+
+/* format_corename will inspect the pattern parameter, and output a
+ * name into corename, which must have space for at least
+ * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
+ */
+static int format_corename(struct core_name *cn, long signr)
+{
+	const struct cred *cred = current_cred();
+	const char *pat_ptr = core_pattern;
+	int ispipe = (*pat_ptr == '|');
+	int pid_in_pattern = 0;
+	int err = 0;
+
+	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
+	cn->corename = kmalloc(cn->size, GFP_KERNEL);
+	cn->used = 0;
+
+	if (!cn->corename)
+		return -ENOMEM;
+
+	/* Repeat as long as we have more pattern to process and more output
+	   space */
+	while (*pat_ptr) {
+		if (*pat_ptr != '%') {
+			if (*pat_ptr == 0)
+				goto out;
+			err = cn_printf(cn, "%c", *pat_ptr++);
+		} else {
+			switch (*++pat_ptr) {
+			/* single % at the end, drop that */
+			case 0:
+				goto out;
+			/* Double percent, output one percent */
+			case '%':
+				err = cn_printf(cn, "%c", '%');
+				break;
+			/* pid */
+			case 'p':
+				pid_in_pattern = 1;
+				err = cn_printf(cn, "%d",
+					      task_tgid_vnr(current));
+				break;
+			/* uid */
+			case 'u':
+				err = cn_printf(cn, "%d", cred->uid);
+				break;
+			/* gid */
+			case 'g':
+				err = cn_printf(cn, "%d", cred->gid);
+				break;
+			/* signal that caused the coredump */
+			case 's':
+				err = cn_printf(cn, "%ld", signr);
+				break;
+			/* UNIX time of coredump */
+			case 't': {
+				struct timeval tv;
+				do_gettimeofday(&tv);
+				err = cn_printf(cn, "%lu", tv.tv_sec);
+				break;
+			}
+			/* hostname */
+			case 'h': {
+				char *namestart = cn->corename + cn->used;
+				down_read(&uts_sem);
+				err = cn_printf(cn, "%s",
+					      utsname()->nodename);
+				up_read(&uts_sem);
+				cn_escape(namestart);
+				break;
+			}
+			/* executable */
+			case 'e': {
+				char *commstart = cn->corename + cn->used;
+				err = cn_printf(cn, "%s", current->comm);
+				cn_escape(commstart);
+				break;
+			}
+			case 'E':
+				err = cn_print_exe_file(cn);
+				break;
+			/* core limit size */
+			case 'c':
+				err = cn_printf(cn, "%lu",
+					      rlimit(RLIMIT_CORE));
+				break;
+			default:
+				break;
+			}
+			++pat_ptr;
+		}
+
+		if (err)
+			return err;
+	}
+
+	/* Backward compatibility with core_uses_pid:
+	 *
+	 * If core_pattern does not include a %p (as is the default)
+	 * and core_uses_pid is set, then .%pid will be appended to
+	 * the filename. Do not do this for piped commands. */
+	if (!ispipe && !pid_in_pattern && core_uses_pid) {
+		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
+		if (err)
+			return err;
+	}
+out:
+	return ispipe;
+}
+
+static int zap_process(struct task_struct *start, int exit_code)
+{
+	struct task_struct *t;
+	int nr = 0;
+
+	start->signal->flags = SIGNAL_GROUP_EXIT;
+	start->signal->group_exit_code = exit_code;
+	start->signal->group_stop_count = 0;
+
+	t = start;
+	do {
+		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
+		if (t != current && t->mm) {
+			sigaddset(&t->pending.signal, SIGKILL);
+			signal_wake_up(t, 1);
+			nr++;
+		}
+	} while_each_thread(start, t);
+
+	return nr;
+}
+
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+				struct core_state *core_state, int exit_code)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+	int nr = -EAGAIN;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!signal_group_exit(tsk->signal)) {
+		mm->core_state = core_state;
+		nr = zap_process(tsk, exit_code);
+	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	if (unlikely(nr < 0))
+		return nr;
+
+	if (atomic_read(&mm->mm_users) == nr + 1)
+		goto done;
+	/*
+	 * We should find and kill all tasks which use this mm, and we should
+	 * count them correctly into ->nr_threads. We don't take tasklist
+	 * lock, but this is safe wrt:
+	 *
+	 * fork:
+	 *	None of sub-threads can fork after zap_process(leader). All
+	 *	processes which were created before this point should be
+	 *	visible to zap_threads() because copy_process() adds the new
+	 *	process to the tail of init_task.tasks list, and lock/unlock
+	 *	of ->siglock provides a memory barrier.
+	 *
+	 * do_exit:
+	 *	The caller holds mm->mmap_sem. This means that the task which
+	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
+	 *	its ->mm.
+	 *
+	 * de_thread:
+	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
+	 *	we must see either old or new leader, this does not matter.
+	 *	However, it can change p->sighand, so lock_task_sighand(p)
+	 *	must be used. Since p->mm != NULL and we hold ->mmap_sem
+	 *	it can't fail.
+	 *
+	 *	Note also that "g" can be the old leader with ->mm == NULL
+	 *	and already unhashed and thus removed from ->thread_group.
+	 *	This is OK, __unhash_process()->list_del_rcu() does not
+	 *	clear the ->next pointer, we will find the new leader via
+	 *	next_thread().
+	 */
+	rcu_read_lock();
+	for_each_process(g) {
+		if (g == tsk->group_leader)
+			continue;
+		if (g->flags & PF_KTHREAD)
+			continue;
+		p = g;
+		do {
+			if (p->mm) {
+				if (unlikely(p->mm == mm)) {
+					lock_task_sighand(p, &flags);
+					nr += zap_process(p, exit_code);
+					unlock_task_sighand(p, &flags);
+				}
+				break;
+			}
+		} while_each_thread(g, p);
+	}
+	rcu_read_unlock();
+done:
+	atomic_set(&core_state->nr_threads, nr);
+	return nr;
+}
+
+static int coredump_wait(int exit_code, struct core_state *core_state)
+{
+	struct task_struct *tsk = current;
+	struct mm_struct *mm = tsk->mm;
+	int core_waiters = -EBUSY;
+
+	init_completion(&core_state->startup);
+	core_state->dumper.task = tsk;
+	core_state->dumper.next = NULL;
+
+	down_write(&mm->mmap_sem);
+	if (!mm->core_state)
+		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+	up_write(&mm->mmap_sem);
+
+	if (core_waiters > 0) {
+		struct core_thread *ptr;
+
+		wait_for_completion(&core_state->startup);
+		/*
+		 * Wait for all the threads to become inactive, so that
+		 * all the thread context (extended register state, like
+		 * fpu etc) gets copied to the memory.
+		 */
+		ptr = core_state->dumper.next;
+		while (ptr != NULL) {
+			wait_task_inactive(ptr->task, 0);
+			ptr = ptr->next;
+		}
+	}
+
+	return core_waiters;
+}
+
+static void coredump_finish(struct mm_struct *mm)
+{
+	struct core_thread *curr, *next;
+	struct task_struct *task;
+
+	next = mm->core_state->dumper.next;
+	while ((curr = next) != NULL) {
+		next = curr->next;
+		task = curr->task;
+		/*
+		 * see exit_mm(), curr->task must not see
+		 * ->task == NULL before we read ->next.
+		 */
+		smp_mb();
+		curr->task = NULL;
+		wake_up_process(task);
+	}
+
+	mm->core_state = NULL;
+}
+
+static void wait_for_dump_helpers(struct file *file)
+{
+	struct pipe_inode_info *pipe;
+
+	pipe = file->f_path.dentry->d_inode->i_pipe;
+
+	pipe_lock(pipe);
+	pipe->readers++;
+	pipe->writers--;
+
+	while ((pipe->readers > 1) && (!signal_pending(current))) {
+		wake_up_interruptible_sync(&pipe->wait);
+		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+		pipe_wait(pipe);
+	}
+
+	pipe->readers--;
+	pipe->writers++;
+	pipe_unlock(pipe);
+
+}
+
+/*
+ * umh_pipe_setup
+ * helper function to customize the process used
+ * to collect the core in userspace.  Specifically
+ * it sets up a pipe and installs it as fd 0 (stdin)
+ * for the process.  Returns 0 on success, or
+ * PTR_ERR on failure.
+ * Note that it also sets the core limit to 1.  This
+ * is a special value that we use to trap recursive
+ * core dumps
+ */
+static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
+{
+	struct file *files[2];
+	struct coredump_params *cp = (struct coredump_params *)info->data;
+	int err = create_pipe_files(files, 0);
+	if (err)
+		return err;
+
+	cp->file = files[1];
+
+	replace_fd(0, files[0], 0);
+	/* and disallow core files too */
+	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+
+	return 0;
+}
+
+void do_coredump(long signr, int exit_code, struct pt_regs *regs)
+{
+	struct core_state core_state;
+	struct core_name cn;
+	struct mm_struct *mm = current->mm;
+	struct linux_binfmt * binfmt;
+	const struct cred *old_cred;
+	struct cred *cred;
+	int retval = 0;
+	int flag = 0;
+	int ispipe;
+	struct files_struct *displaced;
+	bool need_nonrelative = false;
+	static atomic_t core_dump_count = ATOMIC_INIT(0);
+	struct coredump_params cprm = {
+		.signr = signr,
+		.regs = regs,
+		.limit = rlimit(RLIMIT_CORE),
+		/*
+		 * We must use the same mm->flags while dumping core to avoid
+		 * inconsistency of bit flags, since this flag is not protected
+		 * by any locks.
+		 */
+		.mm_flags = mm->flags,
+	};
+
+	audit_core_dumps(signr);
+
+	binfmt = mm->binfmt;
+	if (!binfmt || !binfmt->core_dump)
+		goto fail;
+	if (!__get_dumpable(cprm.mm_flags))
+		goto fail;
+
+	cred = prepare_creds();
+	if (!cred)
+		goto fail;
+	/*
+	 * We cannot trust fsuid as being the "true" uid of the process
+	 * nor do we know its entire history. We only know it was tainted
+	 * so we dump it as root in mode 2, and only into a controlled
+	 * environment (pipe handler or fully qualified path).
+	 */
+	if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
+		/* Setuid core dump mode */
+		flag = O_EXCL;		/* Stop rewrite attacks */
+		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
+		need_nonrelative = true;
+	}
+
+	retval = coredump_wait(exit_code, &core_state);
+	if (retval < 0)
+		goto fail_creds;
+
+	old_cred = override_creds(cred);
+
+	/*
+	 * Clear any false indication of pending signals that might
+	 * be seen by the filesystem code called to write the core file.
+	 */
+	clear_thread_flag(TIF_SIGPENDING);
+
+	ispipe = format_corename(&cn, signr);
+
+ 	if (ispipe) {
+		int dump_count;
+		char **helper_argv;
+
+		if (ispipe < 0) {
+			printk(KERN_WARNING "format_corename failed\n");
+			printk(KERN_WARNING "Aborting core\n");
+			goto fail_corename;
+		}
+
+		if (cprm.limit == 1) {
+			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
+			 *
+			 * Normally core limits are irrelevant to pipes, since
+			 * we're not writing to the file system, but we use
+			 * cprm.limit of 1 here as a speacial value, this is a
+			 * consistent way to catch recursive crashes.
+			 * We can still crash if the core_pattern binary sets
+			 * RLIM_CORE = !1, but it runs as root, and can do
+			 * lots of stupid things.
+			 *
+			 * Note that we use task_tgid_vnr here to grab the pid
+			 * of the process group leader.  That way we get the
+			 * right pid if a thread in a multi-threaded
+			 * core_pattern process dies.
+			 */
+			printk(KERN_WARNING
+				"Process %d(%s) has RLIMIT_CORE set to 1\n",
+				task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Aborting core\n");
+			goto fail_unlock;
+		}
+		cprm.limit = RLIM_INFINITY;
+
+		dump_count = atomic_inc_return(&core_dump_count);
+		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
+			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
+			       task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Skipping core dump\n");
+			goto fail_dropcount;
+		}
+
+		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
+		if (!helper_argv) {
+			printk(KERN_WARNING "%s failed to allocate memory\n",
+			       __func__);
+			goto fail_dropcount;
+		}
+
+		retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+					NULL, UMH_WAIT_EXEC, umh_pipe_setup,
+					NULL, &cprm);
+		argv_free(helper_argv);
+		if (retval) {
+ 			printk(KERN_INFO "Core dump to %s pipe failed\n",
+			       cn.corename);
+			goto close_fail;
+ 		}
+	} else {
+		struct inode *inode;
+
+		if (cprm.limit < binfmt->min_coredump)
+			goto fail_unlock;
+
+		if (need_nonrelative && cn.corename[0] != '/') {
+			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
+				"to fully qualified path!\n",
+				task_tgid_vnr(current), current->comm);
+			printk(KERN_WARNING "Skipping core dump\n");
+			goto fail_unlock;
+		}
+
+		cprm.file = filp_open(cn.corename,
+				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
+				 0600);
+		if (IS_ERR(cprm.file))
+			goto fail_unlock;
+
+		inode = cprm.file->f_path.dentry->d_inode;
+		if (inode->i_nlink > 1)
+			goto close_fail;
+		if (d_unhashed(cprm.file->f_path.dentry))
+			goto close_fail;
+		/*
+		 * AK: actually i see no reason to not allow this for named
+		 * pipes etc, but keep the previous behaviour for now.
+		 */
+		if (!S_ISREG(inode->i_mode))
+			goto close_fail;
+		/*
+		 * Dont allow local users get cute and trick others to coredump
+		 * into their pre-created files.
+		 */
+		if (!uid_eq(inode->i_uid, current_fsuid()))
+			goto close_fail;
+		if (!cprm.file->f_op || !cprm.file->f_op->write)
+			goto close_fail;
+		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+			goto close_fail;
+	}
+
+	/* get us an unshared descriptor table; almost always a no-op */
+	retval = unshare_files(&displaced);
+	if (retval)
+		goto close_fail;
+	if (displaced)
+		put_files_struct(displaced);
+	retval = binfmt->core_dump(&cprm);
+	if (retval)
+		current->signal->group_exit_code |= 0x80;
+
+	if (ispipe && core_pipe_limit)
+		wait_for_dump_helpers(cprm.file);
+close_fail:
+	if (cprm.file)
+		filp_close(cprm.file, NULL);
+fail_dropcount:
+	if (ispipe)
+		atomic_dec(&core_dump_count);
+fail_unlock:
+	kfree(cn.corename);
+fail_corename:
+	coredump_finish(mm);
+	revert_creds(old_cred);
+fail_creds:
+	put_cred(cred);
+fail:
+	return;
+}
+
+/*
+ * Core dumping helper functions.  These are the only things you should
+ * do on a core-file: use only these functions to write out all the
+ * necessary info.
+ */
+int dump_write(struct file *file, const void *addr, int nr)
+{
+	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
+}
+EXPORT_SYMBOL(dump_write);
+
+int dump_seek(struct file *file, loff_t off)
+{
+	int ret = 1;
+
+	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
+		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
+			return 0;
+	} else {
+		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
+
+		if (!buf)
+			return 0;
+		while (off > 0) {
+			unsigned long n = off;
+
+			if (n > PAGE_SIZE)
+				n = PAGE_SIZE;
+			if (!dump_write(file, buf, n)) {
+				ret = 0;
+				break;
+			}
+			off -= n;
+		}
+		free_page((unsigned long)buf);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(dump_seek);
diff --git a/fs/exec.c b/fs/exec.c
index beb05a95e4a3..48fb26ef8a1b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -66,19 +66,8 @@
 
 #include <trace/events/sched.h>
 
-int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
-unsigned int core_pipe_limit;
 int suid_dumpable = 0;
 
-struct core_name {
-	char *corename;
-	int used, size;
-};
-static atomic_t call_count = ATOMIC_INIT(1);
-
-/* The maximal length of core_pattern is also specified in sysctl.c */
-
 static LIST_HEAD(formats);
 static DEFINE_RWLOCK(binfmt_lock);
 
@@ -1603,353 +1592,6 @@ void set_binfmt(struct linux_binfmt *new)
 
 EXPORT_SYMBOL(set_binfmt);
 
-static int expand_corename(struct core_name *cn)
-{
-	char *old_corename = cn->corename;
-
-	cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
-	cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
-
-	if (!cn->corename) {
-		kfree(old_corename);
-		return -ENOMEM;
-	}
-
-	return 0;
-}
-
-static int cn_printf(struct core_name *cn, const char *fmt, ...)
-{
-	char *cur;
-	int need;
-	int ret;
-	va_list arg;
-
-	va_start(arg, fmt);
-	need = vsnprintf(NULL, 0, fmt, arg);
-	va_end(arg);
-
-	if (likely(need < cn->size - cn->used - 1))
-		goto out_printf;
-
-	ret = expand_corename(cn);
-	if (ret)
-		goto expand_fail;
-
-out_printf:
-	cur = cn->corename + cn->used;
-	va_start(arg, fmt);
-	vsnprintf(cur, need + 1, fmt, arg);
-	va_end(arg);
-	cn->used += need;
-	return 0;
-
-expand_fail:
-	return ret;
-}
-
-static void cn_escape(char *str)
-{
-	for (; *str; str++)
-		if (*str == '/')
-			*str = '!';
-}
-
-static int cn_print_exe_file(struct core_name *cn)
-{
-	struct file *exe_file;
-	char *pathbuf, *path;
-	int ret;
-
-	exe_file = get_mm_exe_file(current->mm);
-	if (!exe_file) {
-		char *commstart = cn->corename + cn->used;
-		ret = cn_printf(cn, "%s (path unknown)", current->comm);
-		cn_escape(commstart);
-		return ret;
-	}
-
-	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
-	if (!pathbuf) {
-		ret = -ENOMEM;
-		goto put_exe_file;
-	}
-
-	path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
-	if (IS_ERR(path)) {
-		ret = PTR_ERR(path);
-		goto free_buf;
-	}
-
-	cn_escape(path);
-
-	ret = cn_printf(cn, "%s", path);
-
-free_buf:
-	kfree(pathbuf);
-put_exe_file:
-	fput(exe_file);
-	return ret;
-}
-
-/* format_corename will inspect the pattern parameter, and output a
- * name into corename, which must have space for at least
- * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
- */
-static int format_corename(struct core_name *cn, long signr)
-{
-	const struct cred *cred = current_cred();
-	const char *pat_ptr = core_pattern;
-	int ispipe = (*pat_ptr == '|');
-	int pid_in_pattern = 0;
-	int err = 0;
-
-	cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
-	cn->corename = kmalloc(cn->size, GFP_KERNEL);
-	cn->used = 0;
-
-	if (!cn->corename)
-		return -ENOMEM;
-
-	/* Repeat as long as we have more pattern to process and more output
-	   space */
-	while (*pat_ptr) {
-		if (*pat_ptr != '%') {
-			if (*pat_ptr == 0)
-				goto out;
-			err = cn_printf(cn, "%c", *pat_ptr++);
-		} else {
-			switch (*++pat_ptr) {
-			/* single % at the end, drop that */
-			case 0:
-				goto out;
-			/* Double percent, output one percent */
-			case '%':
-				err = cn_printf(cn, "%c", '%');
-				break;
-			/* pid */
-			case 'p':
-				pid_in_pattern = 1;
-				err = cn_printf(cn, "%d",
-					      task_tgid_vnr(current));
-				break;
-			/* uid */
-			case 'u':
-				err = cn_printf(cn, "%d", cred->uid);
-				break;
-			/* gid */
-			case 'g':
-				err = cn_printf(cn, "%d", cred->gid);
-				break;
-			/* signal that caused the coredump */
-			case 's':
-				err = cn_printf(cn, "%ld", signr);
-				break;
-			/* UNIX time of coredump */
-			case 't': {
-				struct timeval tv;
-				do_gettimeofday(&tv);
-				err = cn_printf(cn, "%lu", tv.tv_sec);
-				break;
-			}
-			/* hostname */
-			case 'h': {
-				char *namestart = cn->corename + cn->used;
-				down_read(&uts_sem);
-				err = cn_printf(cn, "%s",
-					      utsname()->nodename);
-				up_read(&uts_sem);
-				cn_escape(namestart);
-				break;
-			}
-			/* executable */
-			case 'e': {
-				char *commstart = cn->corename + cn->used;
-				err = cn_printf(cn, "%s", current->comm);
-				cn_escape(commstart);
-				break;
-			}
-			case 'E':
-				err = cn_print_exe_file(cn);
-				break;
-			/* core limit size */
-			case 'c':
-				err = cn_printf(cn, "%lu",
-					      rlimit(RLIMIT_CORE));
-				break;
-			default:
-				break;
-			}
-			++pat_ptr;
-		}
-
-		if (err)
-			return err;
-	}
-
-	/* Backward compatibility with core_uses_pid:
-	 *
-	 * If core_pattern does not include a %p (as is the default)
-	 * and core_uses_pid is set, then .%pid will be appended to
-	 * the filename. Do not do this for piped commands. */
-	if (!ispipe && !pid_in_pattern && core_uses_pid) {
-		err = cn_printf(cn, ".%d", task_tgid_vnr(current));
-		if (err)
-			return err;
-	}
-out:
-	return ispipe;
-}
-
-static int zap_process(struct task_struct *start, int exit_code)
-{
-	struct task_struct *t;
-	int nr = 0;
-
-	start->signal->flags = SIGNAL_GROUP_EXIT;
-	start->signal->group_exit_code = exit_code;
-	start->signal->group_stop_count = 0;
-
-	t = start;
-	do {
-		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-		if (t != current && t->mm) {
-			sigaddset(&t->pending.signal, SIGKILL);
-			signal_wake_up(t, 1);
-			nr++;
-		}
-	} while_each_thread(start, t);
-
-	return nr;
-}
-
-static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
-				struct core_state *core_state, int exit_code)
-{
-	struct task_struct *g, *p;
-	unsigned long flags;
-	int nr = -EAGAIN;
-
-	spin_lock_irq(&tsk->sighand->siglock);
-	if (!signal_group_exit(tsk->signal)) {
-		mm->core_state = core_state;
-		nr = zap_process(tsk, exit_code);
-	}
-	spin_unlock_irq(&tsk->sighand->siglock);
-	if (unlikely(nr < 0))
-		return nr;
-
-	if (atomic_read(&mm->mm_users) == nr + 1)
-		goto done;
-	/*
-	 * We should find and kill all tasks which use this mm, and we should
-	 * count them correctly into ->nr_threads. We don't take tasklist
-	 * lock, but this is safe wrt:
-	 *
-	 * fork:
-	 *	None of sub-threads can fork after zap_process(leader). All
-	 *	processes which were created before this point should be
-	 *	visible to zap_threads() because copy_process() adds the new
-	 *	process to the tail of init_task.tasks list, and lock/unlock
-	 *	of ->siglock provides a memory barrier.
-	 *
-	 * do_exit:
-	 *	The caller holds mm->mmap_sem. This means that the task which
-	 *	uses this mm can't pass exit_mm(), so it can't exit or clear
-	 *	its ->mm.
-	 *
-	 * de_thread:
-	 *	It does list_replace_rcu(&leader->tasks, &current->tasks),
-	 *	we must see either old or new leader, this does not matter.
-	 *	However, it can change p->sighand, so lock_task_sighand(p)
-	 *	must be used. Since p->mm != NULL and we hold ->mmap_sem
-	 *	it can't fail.
-	 *
-	 *	Note also that "g" can be the old leader with ->mm == NULL
-	 *	and already unhashed and thus removed from ->thread_group.
-	 *	This is OK, __unhash_process()->list_del_rcu() does not
-	 *	clear the ->next pointer, we will find the new leader via
-	 *	next_thread().
-	 */
-	rcu_read_lock();
-	for_each_process(g) {
-		if (g == tsk->group_leader)
-			continue;
-		if (g->flags & PF_KTHREAD)
-			continue;
-		p = g;
-		do {
-			if (p->mm) {
-				if (unlikely(p->mm == mm)) {
-					lock_task_sighand(p, &flags);
-					nr += zap_process(p, exit_code);
-					unlock_task_sighand(p, &flags);
-				}
-				break;
-			}
-		} while_each_thread(g, p);
-	}
-	rcu_read_unlock();
-done:
-	atomic_set(&core_state->nr_threads, nr);
-	return nr;
-}
-
-static int coredump_wait(int exit_code, struct core_state *core_state)
-{
-	struct task_struct *tsk = current;
-	struct mm_struct *mm = tsk->mm;
-	int core_waiters = -EBUSY;
-
-	init_completion(&core_state->startup);
-	core_state->dumper.task = tsk;
-	core_state->dumper.next = NULL;
-
-	down_write(&mm->mmap_sem);
-	if (!mm->core_state)
-		core_waiters = zap_threads(tsk, mm, core_state, exit_code);
-	up_write(&mm->mmap_sem);
-
-	if (core_waiters > 0) {
-		struct core_thread *ptr;
-
-		wait_for_completion(&core_state->startup);
-		/*
-		 * Wait for all the threads to become inactive, so that
-		 * all the thread context (extended register state, like
-		 * fpu etc) gets copied to the memory.
-		 */
-		ptr = core_state->dumper.next;
-		while (ptr != NULL) {
-			wait_task_inactive(ptr->task, 0);
-			ptr = ptr->next;
-		}
-	}
-
-	return core_waiters;
-}
-
-static void coredump_finish(struct mm_struct *mm)
-{
-	struct core_thread *curr, *next;
-	struct task_struct *task;
-
-	next = mm->core_state->dumper.next;
-	while ((curr = next) != NULL) {
-		next = curr->next;
-		task = curr->task;
-		/*
-		 * see exit_mm(), curr->task must not see
-		 * ->task == NULL before we read ->next.
-		 */
-		smp_mb();
-		curr->task = NULL;
-		wake_up_process(task);
-	}
-
-	mm->core_state = NULL;
-}
-
 /*
  * set_dumpable converts traditional three-value dumpable to two flags and
  * stores them into mm->flags.  It modifies lower two bits of mm->flags, but
@@ -1991,7 +1633,7 @@ void set_dumpable(struct mm_struct *mm, int value)
 	}
 }
 
-static int __get_dumpable(unsigned long mm_flags)
+int __get_dumpable(unsigned long mm_flags)
 {
 	int ret;
 
@@ -2003,288 +1645,3 @@ int get_dumpable(struct mm_struct *mm)
 {
 	return __get_dumpable(mm->flags);
 }
-
-static void wait_for_dump_helpers(struct file *file)
-{
-	struct pipe_inode_info *pipe;
-
-	pipe = file->f_path.dentry->d_inode->i_pipe;
-
-	pipe_lock(pipe);
-	pipe->readers++;
-	pipe->writers--;
-
-	while ((pipe->readers > 1) && (!signal_pending(current))) {
-		wake_up_interruptible_sync(&pipe->wait);
-		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
-		pipe_wait(pipe);
-	}
-
-	pipe->readers--;
-	pipe->writers++;
-	pipe_unlock(pipe);
-
-}
-
-
-/*
- * umh_pipe_setup
- * helper function to customize the process used
- * to collect the core in userspace.  Specifically
- * it sets up a pipe and installs it as fd 0 (stdin)
- * for the process.  Returns 0 on success, or
- * PTR_ERR on failure.
- * Note that it also sets the core limit to 1.  This
- * is a special value that we use to trap recursive
- * core dumps
- */
-static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
-{
-	struct file *files[2];
-	struct coredump_params *cp = (struct coredump_params *)info->data;
-	int err = create_pipe_files(files, 0);
-	if (err)
-		return err;
-
-	cp->file = files[1];
-
-	replace_fd(0, files[0], 0);
-	/* and disallow core files too */
-	current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
-
-	return 0;
-}
-
-void do_coredump(long signr, int exit_code, struct pt_regs *regs)
-{
-	struct core_state core_state;
-	struct core_name cn;
-	struct mm_struct *mm = current->mm;
-	struct linux_binfmt * binfmt;
-	const struct cred *old_cred;
-	struct cred *cred;
-	int retval = 0;
-	int flag = 0;
-	int ispipe;
-	struct files_struct *displaced;
-	bool need_nonrelative = false;
-	static atomic_t core_dump_count = ATOMIC_INIT(0);
-	struct coredump_params cprm = {
-		.signr = signr,
-		.regs = regs,
-		.limit = rlimit(RLIMIT_CORE),
-		/*
-		 * We must use the same mm->flags while dumping core to avoid
-		 * inconsistency of bit flags, since this flag is not protected
-		 * by any locks.
-		 */
-		.mm_flags = mm->flags,
-	};
-
-	audit_core_dumps(signr);
-
-	binfmt = mm->binfmt;
-	if (!binfmt || !binfmt->core_dump)
-		goto fail;
-	if (!__get_dumpable(cprm.mm_flags))
-		goto fail;
-
-	cred = prepare_creds();
-	if (!cred)
-		goto fail;
-	/*
-	 * We cannot trust fsuid as being the "true" uid of the process
-	 * nor do we know its entire history. We only know it was tainted
-	 * so we dump it as root in mode 2, and only into a controlled
-	 * environment (pipe handler or fully qualified path).
-	 */
-	if (__get_dumpable(cprm.mm_flags) == SUID_DUMPABLE_SAFE) {
-		/* Setuid core dump mode */
-		flag = O_EXCL;		/* Stop rewrite attacks */
-		cred->fsuid = GLOBAL_ROOT_UID;	/* Dump root private */
-		need_nonrelative = true;
-	}
-
-	retval = coredump_wait(exit_code, &core_state);
-	if (retval < 0)
-		goto fail_creds;
-
-	old_cred = override_creds(cred);
-
-	/*
-	 * Clear any false indication of pending signals that might
-	 * be seen by the filesystem code called to write the core file.
-	 */
-	clear_thread_flag(TIF_SIGPENDING);
-
-	ispipe = format_corename(&cn, signr);
-
- 	if (ispipe) {
-		int dump_count;
-		char **helper_argv;
-
-		if (ispipe < 0) {
-			printk(KERN_WARNING "format_corename failed\n");
-			printk(KERN_WARNING "Aborting core\n");
-			goto fail_corename;
-		}
-
-		if (cprm.limit == 1) {
-			/* See umh_pipe_setup() which sets RLIMIT_CORE = 1.
-			 *
-			 * Normally core limits are irrelevant to pipes, since
-			 * we're not writing to the file system, but we use
-			 * cprm.limit of 1 here as a speacial value, this is a
-			 * consistent way to catch recursive crashes.
-			 * We can still crash if the core_pattern binary sets
-			 * RLIM_CORE = !1, but it runs as root, and can do
-			 * lots of stupid things.
-			 *
-			 * Note that we use task_tgid_vnr here to grab the pid
-			 * of the process group leader.  That way we get the
-			 * right pid if a thread in a multi-threaded
-			 * core_pattern process dies.
-			 */
-			printk(KERN_WARNING
-				"Process %d(%s) has RLIMIT_CORE set to 1\n",
-				task_tgid_vnr(current), current->comm);
-			printk(KERN_WARNING "Aborting core\n");
-			goto fail_unlock;
-		}
-		cprm.limit = RLIM_INFINITY;
-
-		dump_count = atomic_inc_return(&core_dump_count);
-		if (core_pipe_limit && (core_pipe_limit < dump_count)) {
-			printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
-			       task_tgid_vnr(current), current->comm);
-			printk(KERN_WARNING "Skipping core dump\n");
-			goto fail_dropcount;
-		}
-
-		helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
-		if (!helper_argv) {
-			printk(KERN_WARNING "%s failed to allocate memory\n",
-			       __func__);
-			goto fail_dropcount;
-		}
-
-		retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
-					NULL, UMH_WAIT_EXEC, umh_pipe_setup,
-					NULL, &cprm);
-		argv_free(helper_argv);
-		if (retval) {
- 			printk(KERN_INFO "Core dump to %s pipe failed\n",
-			       cn.corename);
-			goto close_fail;
- 		}
-	} else {
-		struct inode *inode;
-
-		if (cprm.limit < binfmt->min_coredump)
-			goto fail_unlock;
-
-		if (need_nonrelative && cn.corename[0] != '/') {
-			printk(KERN_WARNING "Pid %d(%s) can only dump core "\
-				"to fully qualified path!\n",
-				task_tgid_vnr(current), current->comm);
-			printk(KERN_WARNING "Skipping core dump\n");
-			goto fail_unlock;
-		}
-
-		cprm.file = filp_open(cn.corename,
-				 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
-				 0600);
-		if (IS_ERR(cprm.file))
-			goto fail_unlock;
-
-		inode = cprm.file->f_path.dentry->d_inode;
-		if (inode->i_nlink > 1)
-			goto close_fail;
-		if (d_unhashed(cprm.file->f_path.dentry))
-			goto close_fail;
-		/*
-		 * AK: actually i see no reason to not allow this for named
-		 * pipes etc, but keep the previous behaviour for now.
-		 */
-		if (!S_ISREG(inode->i_mode))
-			goto close_fail;
-		/*
-		 * Dont allow local users get cute and trick others to coredump
-		 * into their pre-created files.
-		 */
-		if (!uid_eq(inode->i_uid, current_fsuid()))
-			goto close_fail;
-		if (!cprm.file->f_op || !cprm.file->f_op->write)
-			goto close_fail;
-		if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
-			goto close_fail;
-	}
-
-	/* get us an unshared descriptor table; almost always a no-op */
-	retval = unshare_files(&displaced);
-	if (retval)
-		goto close_fail;
-	if (displaced)
-		put_files_struct(displaced);
-	retval = binfmt->core_dump(&cprm);
-	if (retval)
-		current->signal->group_exit_code |= 0x80;
-
-	if (ispipe && core_pipe_limit)
-		wait_for_dump_helpers(cprm.file);
-close_fail:
-	if (cprm.file)
-		filp_close(cprm.file, NULL);
-fail_dropcount:
-	if (ispipe)
-		atomic_dec(&core_dump_count);
-fail_unlock:
-	kfree(cn.corename);
-fail_corename:
-	coredump_finish(mm);
-	revert_creds(old_cred);
-fail_creds:
-	put_cred(cred);
-fail:
-	return;
-}
-
-/*
- * Core dumping helper functions.  These are the only things you should
- * do on a core-file: use only these functions to write out all the
- * necessary info.
- */
-int dump_write(struct file *file, const void *addr, int nr)
-{
-	return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
-}
-EXPORT_SYMBOL(dump_write);
-
-int dump_seek(struct file *file, loff_t off)
-{
-	int ret = 1;
-
-	if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
-		if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
-			return 0;
-	} else {
-		char *buf = (char *)get_zeroed_page(GFP_KERNEL);
-
-		if (!buf)
-			return 0;
-		while (off > 0) {
-			unsigned long n = off;
-
-			if (n > PAGE_SIZE)
-				n = PAGE_SIZE;
-			if (!dump_write(file, buf, n)) {
-				ret = 0;
-				break;
-			}
-			off -= n;
-		}
-		free_page((unsigned long)buf);
-	}
-	return ret;
-}
-EXPORT_SYMBOL(dump_seek);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 23bddac4bad8..78041f4c7584 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -405,6 +405,7 @@ static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
 
 extern void set_dumpable(struct mm_struct *mm, int value);
 extern int get_dumpable(struct mm_struct *mm);
+extern int __get_dumpable(unsigned long mm_flags);
 
 /* get/set_dumpable() values */
 #define SUID_DUMPABLE_DISABLED	0
-- 
cgit v1.2.3


From 99621b44aa194eab594e1f17217231c02b519211 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 29 Aug 2012 16:31:33 -0400
Subject: btrfs: reada_extent doesn't need kref for refcount

All increments and decrements are under the same spinlock - have to be,
since they need to protect the radix_tree it's found in.  Just use
int, no need to wank with kref...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/reada.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 48a4882d8ad5..a955669519a2 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -68,7 +68,7 @@ struct reada_extent {
 	u32			blocksize;
 	int			err;
 	struct list_head	extctl;
-	struct kref		refcnt;
+	int 			refcnt;
 	spinlock_t		lock;
 	struct reada_zone	*zones[BTRFS_MAX_MIRRORS];
 	int			nzones;
@@ -126,7 +126,7 @@ static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree, index);
 	if (re)
-		kref_get(&re->refcnt);
+		re->refcnt++;
 	spin_unlock(&fs_info->reada_lock);
 
 	if (!re)
@@ -336,7 +336,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree, index);
 	if (re)
-		kref_get(&re->refcnt);
+		re->refcnt++;
 	spin_unlock(&fs_info->reada_lock);
 
 	if (re)
@@ -352,7 +352,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	re->top = *top;
 	INIT_LIST_HEAD(&re->extctl);
 	spin_lock_init(&re->lock);
-	kref_init(&re->refcnt);
+	re->refcnt = 1;
 
 	/*
 	 * map block
@@ -398,7 +398,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	if (ret == -EEXIST) {
 		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
 		BUG_ON(!re_exist);
-		kref_get(&re_exist->refcnt);
+		re_exist->refcnt++;
 		spin_unlock(&fs_info->reada_lock);
 		goto error;
 	}
@@ -465,10 +465,6 @@ error:
 	return re_exist;
 }
 
-static void reada_kref_dummy(struct kref *kr)
-{
-}
-
 static void reada_extent_put(struct btrfs_fs_info *fs_info,
 			     struct reada_extent *re)
 {
@@ -476,7 +472,7 @@ static void reada_extent_put(struct btrfs_fs_info *fs_info,
 	unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
 
 	spin_lock(&fs_info->reada_lock);
-	if (!kref_put(&re->refcnt, reada_kref_dummy)) {
+	if (--re->refcnt) {
 		spin_unlock(&fs_info->reada_lock);
 		return;
 	}
@@ -671,7 +667,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
 		return 0;
 	}
 	dev->reada_next = re->logical + re->blocksize;
-	kref_get(&re->refcnt);
+	re->refcnt++;
 
 	spin_unlock(&fs_info->reada_lock);
 
-- 
cgit v1.2.3


From 8c0a85377048b64c880e76ec7368904fe46d0b94 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Wed, 26 Sep 2012 11:33:07 +1000
Subject: fs: push rcu_barrier() from deactivate_locked_super() to filesystems

There's no reason to call rcu_barrier() on every
deactivate_locked_super().  We only need to make sure that all delayed rcu
free inodes are flushed before we destroy related cache.

Removing rcu_barrier() from deactivate_locked_super() affects some fast
paths.  E.g.  on my machine exit_group() of a last process in IPC
namespace takes 0.07538s.  rcu_barrier() takes 0.05188s of that time.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/v9fs.c             | 5 +++++
 fs/adfs/super.c          | 5 +++++
 fs/affs/super.c          | 5 +++++
 fs/afs/super.c           | 5 +++++
 fs/befs/linuxvfs.c       | 5 +++++
 fs/bfs/inode.c           | 5 +++++
 fs/btrfs/extent_io.c     | 6 ++++++
 fs/btrfs/inode.c         | 5 +++++
 fs/ceph/super.c          | 5 +++++
 fs/cifs/cifsfs.c         | 5 +++++
 fs/coda/inode.c          | 5 +++++
 fs/ecryptfs/main.c       | 6 ++++++
 fs/efs/super.c           | 5 +++++
 fs/exofs/super.c         | 5 +++++
 fs/ext2/super.c          | 5 +++++
 fs/ext3/super.c          | 5 +++++
 fs/ext4/super.c          | 5 +++++
 fs/fat/inode.c           | 5 +++++
 fs/freevxfs/vxfs_super.c | 5 +++++
 fs/fuse/inode.c          | 6 ++++++
 fs/hfs/super.c           | 6 ++++++
 fs/hfsplus/super.c       | 6 ++++++
 fs/hpfs/super.c          | 5 +++++
 fs/hugetlbfs/inode.c     | 5 +++++
 fs/isofs/inode.c         | 5 +++++
 fs/jffs2/super.c         | 6 ++++++
 fs/jfs/super.c           | 6 ++++++
 fs/logfs/inode.c         | 5 +++++
 fs/minix/inode.c         | 5 +++++
 fs/ncpfs/inode.c         | 5 +++++
 fs/nfs/inode.c           | 5 +++++
 fs/nilfs2/super.c        | 6 ++++++
 fs/ntfs/super.c          | 6 ++++++
 fs/ocfs2/dlmfs/dlmfs.c   | 5 +++++
 fs/ocfs2/super.c         | 5 +++++
 fs/openpromfs/inode.c    | 5 +++++
 fs/qnx4/inode.c          | 5 +++++
 fs/qnx6/inode.c          | 5 +++++
 fs/reiserfs/super.c      | 5 +++++
 fs/romfs/super.c         | 5 +++++
 fs/squashfs/super.c      | 5 +++++
 fs/super.c               | 6 ------
 fs/sysv/inode.c          | 5 +++++
 fs/ubifs/super.c         | 6 ++++++
 fs/udf/super.c           | 5 +++++
 fs/ufs/super.c           | 5 +++++
 fs/xfs/xfs_super.c       | 5 +++++
 47 files changed, 240 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index b85efa773949..392c5dac1981 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -560,6 +560,11 @@ static int v9fs_init_inode_cache(void)
  */
 static void v9fs_destroy_inode_cache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(v9fs_inode_cache);
 }
 
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index bdaec92353c2..c830c857c663 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -275,6 +275,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(adfs_inode_cachep);
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index c70f1e5fc024..2f57053bf26c 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -147,6 +147,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(affs_inode_cachep);
 }
 
diff --git a/fs/afs/super.c b/fs/afs/super.c
index df8c6047c2a1..43165009428d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -123,6 +123,11 @@ void __exit afs_fs_exit(void)
 		BUG();
 	}
 
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(afs_inode_cachep);
 	_leave("");
 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index cf7f3c67c8b7..962b4f8f7994 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -454,6 +454,11 @@ befs_init_inodecache(void)
 static void
 befs_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(befs_inode_cachep);
 }
 
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 9870417c26e7..d5fc598d6e4a 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -280,6 +280,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(bfs_inode_cachep);
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4c878476bb91..b08ea4717e9d 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -107,6 +107,12 @@ void extent_io_exit(void)
 		list_del(&eb->leak_list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
+
+	/*
+	 * Make sure all delayed rcu free are flushed before we
+	 * destroy caches.
+	 */
+	rcu_barrier();
 	if (extent_state_cache)
 		kmem_cache_destroy(extent_state_cache);
 	if (extent_buffer_cache)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ec154f954646..cf03a91d806f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7076,6 +7076,11 @@ static void init_once(void *foo)
 
 void btrfs_destroy_cachep(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	if (btrfs_inode_cachep)
 		kmem_cache_destroy(btrfs_inode_cachep);
 	if (btrfs_trans_handle_cachep)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index b982239f38f9..3a42d9326378 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -603,6 +603,11 @@ bad_cap:
 
 static void destroy_caches(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ceph_inode_cachep);
 	kmem_cache_destroy(ceph_cap_cachep);
 	kmem_cache_destroy(ceph_dentry_cachep);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index db8a404a51dd..d4ce77a02327 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -977,6 +977,11 @@ cifs_init_inodecache(void)
 static void
 cifs_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(cifs_inode_cachep);
 }
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index d315c6c5891a..be2aa4909487 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -85,6 +85,11 @@ int coda_init_inodecache(void)
 
 void coda_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(coda_inode_cachep);
 }
 
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 9b627c15010a..34fcde765d24 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -710,6 +710,12 @@ static void ecryptfs_free_kmem_caches(void)
 {
 	int i;
 
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
 	for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
 		struct ecryptfs_cache_info *info;
 
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e755ec746c69..2002431ef9a0 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -96,6 +96,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(efs_inode_cachep);
 }
 
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index dde41a75c7c8..59e3bbfac0b1 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -206,6 +206,11 @@ static int init_inodecache(void)
  */
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(exofs_inode_cachep);
 }
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index af74d9e27b71..6c205d0c565b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -206,6 +206,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ext2_inode_cachep);
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 8c892e93d8e7..8d41c8889eee 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -532,6 +532,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ext3_inode_cachep);
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c6e0cb3d1f4a..455b7d8c6d62 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1019,6 +1019,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ext4_inode_cachep);
 }
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 05e897fe9866..fd8e47cd898b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -521,6 +521,11 @@ static int __init fat_init_inodecache(void)
 
 static void __exit fat_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(fat_inode_cachep);
 }
 
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index d4fabd26084e..fed2c8afb3a9 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -279,6 +279,11 @@ static void __exit
 vxfs_cleanup(void)
 {
 	unregister_filesystem(&vxfs_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(vxfs_inode_cachep);
 }
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index fca222dabe3c..f0eda124cffb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1197,6 +1197,12 @@ static void fuse_fs_cleanup(void)
 {
 	unregister_filesystem(&fuse_fs_type);
 	unregister_fuseblk();
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(fuse_inode_cachep);
 }
 
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4eb873e0c07b..941d7a8c2197 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -482,6 +482,12 @@ static int __init init_hfs_fs(void)
 static void __exit exit_hfs_fs(void)
 {
 	unregister_filesystem(&hfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(hfs_inode_cachep);
 }
 
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index fdafb2d71654..811a84d2d964 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -635,6 +635,12 @@ static int __init init_hfsplus_fs(void)
 static void __exit exit_hfsplus_fs(void)
 {
 	unregister_filesystem(&hfsplus_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(hfsplus_inode_cachep);
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 706a12c083ea..3cb1da56eb73 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -210,6 +210,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(hpfs_inode_cachep);
 }
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 8349a899912e..c4b85d064e6b 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1042,6 +1042,11 @@ static int __init init_hugetlbfs_fs(void)
 
 static void __exit exit_hugetlbfs_fs(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
 	kern_unmount(hugetlbfs_vfsmount);
 	unregister_filesystem(&hugetlbfs_fs_type);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 29037c365ba4..f94cde4527e8 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -114,6 +114,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(isofs_inode_cachep);
 }
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 61ea41389f90..ff487954cd96 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -418,6 +418,12 @@ static void __exit exit_jffs2_fs(void)
 	unregister_filesystem(&jffs2_fs_type);
 	jffs2_destroy_slab_caches();
 	jffs2_compressors_exit();
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(jffs2_inode_cachep);
 }
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index c55c7452d285..3735347fd5f6 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -903,6 +903,12 @@ static void __exit exit_jfs_fs(void)
 	jfs_proc_clean();
 #endif
 	unregister_filesystem(&jfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(jfs_inode_cachep);
 }
 
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index 6984562738d3..121bba2cf6f2 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -417,5 +417,10 @@ int logfs_init_inode_cache(void)
 
 void logfs_destroy_inode_cache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(logfs_inode_cache);
 }
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 2a503ad020d5..dc8d3629c20a 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -100,6 +100,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(minix_inode_cachep);
 }
 
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 333df07ae3bd..0c62c55b25d7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -89,6 +89,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ncp_inode_cachep);
 }
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 9b47610338f5..e4c716d374a8 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1571,6 +1571,11 @@ static int __init nfs_init_inodecache(void)
 
 static void nfs_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(nfs_inode_cachep);
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6a10812711c1..3c991dc84f2f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1382,6 +1382,12 @@ static void nilfs_segbuf_init_once(void *obj)
 
 static void nilfs_destroy_cachep(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
 	if (nilfs_inode_cachep)
 		kmem_cache_destroy(nilfs_inode_cachep);
 	if (nilfs_transaction_cachep)
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 2bc149d6a784..fe08d4afa106 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3168,6 +3168,12 @@ static void __exit exit_ntfs_fs(void)
 	ntfs_debug("Unregistering NTFS driver.");
 
 	unregister_filesystem(&ntfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ntfs_big_inode_cache);
 	kmem_cache_destroy(ntfs_inode_cache);
 	kmem_cache_destroy(ntfs_name_cache);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 83b6f98e0665..16b712d260d4 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -691,6 +691,11 @@ static void __exit exit_dlmfs_fs(void)
 	flush_workqueue(user_dlm_worker);
 	destroy_workqueue(user_dlm_worker);
 
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(dlmfs_inode_cache);
 
 	bdi_destroy(&dlmfs_backing_dev_info);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 68f4541c2db9..0e91ec22a940 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1818,6 +1818,11 @@ static int ocfs2_initialize_mem_caches(void)
 
 static void ocfs2_free_mem_caches(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	if (ocfs2_inode_cachep)
 		kmem_cache_destroy(ocfs2_inode_cachep);
 	ocfs2_inode_cachep = NULL;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 4a3477949bca..2ad080faca34 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -463,6 +463,11 @@ static int __init init_openprom_fs(void)
 static void __exit exit_openprom_fs(void)
 {
 	unregister_filesystem(&openprom_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(op_inode_cachep);
 }
 
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 552e994e3aa1..9534b4f76579 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -391,6 +391,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(qnx4_inode_cachep);
 }
 
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 2049c814bda4..1b37fff7b5ff 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -651,6 +651,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(qnx6_inode_cachep);
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 7a37dabf5a96..1078ae179993 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -608,6 +608,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(reiserfs_inode_cachep);
 }
 
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 77c5f2173983..fd7c5f60b46b 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -648,6 +648,11 @@ error_register:
 static void __exit exit_romfs_fs(void)
 {
 	unregister_filesystem(&romfs_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(romfs_inode_cachep);
 }
 
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 29cd014ed3a1..260e3928d4f5 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -425,6 +425,11 @@ static int __init init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(squashfs_inode_cachep);
 }
 
diff --git a/fs/super.c b/fs/super.c
index 0902cfa6a12e..5fdf7ff32c4e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -307,12 +307,6 @@ void deactivate_locked_super(struct super_block *s)
 
 		/* caches are now gone, we can safely kill the shrinker now */
 		unregister_shrinker(&s->s_shrink);
-
-		/*
-		 * We need to call rcu_barrier so all the delayed rcu free
-		 * inodes are flushed before we release the fs module.
-		 */
-		rcu_barrier();
 		put_filesystem(fs);
 		put_super(s);
 	} else {
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 80e1e2b18df1..0d0c50bd3321 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -360,5 +360,10 @@ int __init sysv_init_icache(void)
 
 void sysv_destroy_icache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(sysv_inode_cachep);
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 71a197f0f93d..36e09ca9130b 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2298,6 +2298,12 @@ static void __exit ubifs_exit(void)
 	dbg_debugfs_exit();
 	ubifs_compressors_exit();
 	unregister_shrinker(&ubifs_shrinker_info);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ubifs_inode_slab);
 	unregister_filesystem(&ubifs_fs_type);
 }
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 18fc038a438d..b8d27642ab06 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -171,6 +171,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(udf_inode_cachep);
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 444927e5706b..f7cfecfe1cab 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1466,6 +1466,11 @@ static int init_inodecache(void)
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ufs_inode_cachep);
 }
 
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19e2380fb867..83d36e473d2f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1506,6 +1506,11 @@ xfs_init_zones(void)
 STATIC void
 xfs_destroy_zones(void)
 {
+	/*
+	 * Make sure all delayed rcu free are flushed before we
+	 * destroy caches.
+	 */
+	rcu_barrier();
 	kmem_zone_destroy(xfs_ili_zone);
 	kmem_zone_destroy(xfs_inode_zone);
 	kmem_zone_destroy(xfs_efi_zone);
-- 
cgit v1.2.3


From 8f9c0119d7ba94c3ad13876acc240d7f12b6d8e1 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Wed, 19 Sep 2012 12:01:52 +0100
Subject: compat: fs: Generic compat_sys_sendfile implementation

This function is used by sparc, powerpc and arm64 for compat support.
The patch adds a generic implementation which calls do_sendfile()
directly and avoids set_fs().

The sparc architecture has wrappers for the sign extensions while
powerpc relies on the compiler to do the this. The patch adds wrappers
for powerpc to handle the u32->int type conversion.

compat_sys_sendfile64() can be replaced by a sys_sendfile() call since
compat_loff_t has the same size as off_t on a 64-bit system.

On powerpc, the patch also changes the 64-bit sendfile call from
sys_sendile64 to sys_sendfile.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/include/asm/systbl.h |  4 ++--
 arch/powerpc/include/asm/unistd.h |  1 +
 arch/powerpc/kernel/sys_ppc32.c   | 45 ++++++--------------------------------
 arch/sparc/include/asm/unistd.h   |  1 +
 arch/sparc/kernel/sys32.S         |  2 +-
 arch/sparc/kernel/sys_sparc32.c   | 46 ---------------------------------------
 fs/compat.c                       | 22 +++++++++++++++++++
 fs/read_write.c                   |  4 ++--
 fs/read_write.h                   |  2 ++
 include/linux/compat.h            |  3 +++
 10 files changed, 41 insertions(+), 89 deletions(-)

(limited to 'fs')

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 559ae1ee6706..840838769853 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -189,7 +189,7 @@ SYSCALL_SPU(getcwd)
 SYSCALL_SPU(capget)
 SYSCALL_SPU(capset)
 COMPAT_SYS(sigaltstack)
-SYSX_SPU(sys_sendfile64,compat_sys_sendfile,sys_sendfile)
+SYSX_SPU(sys_sendfile,compat_sys_sendfile_wrapper,sys_sendfile)
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
 PPC_SYS(vfork)
@@ -229,7 +229,7 @@ COMPAT_SYS_SPU(sched_setaffinity)
 COMPAT_SYS_SPU(sched_getaffinity)
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
-SYS32ONLY(sendfile64)
+SYSX(sys_ni_syscall,compat_sys_sendfile64_wrapper,sys_sendfile64)
 COMPAT_SYS_SPU(io_setup)
 SYSCALL_SPU(io_destroy)
 COMPAT_SYS_SPU(io_getevents)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index bd377a368611..c683fa350add 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -419,6 +419,7 @@
 #define __ARCH_WANT_COMPAT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
 #define __ARCH_WANT_SYS_NEWFSTATAT
+#define __ARCH_WANT_COMPAT_SYS_SENDFILE
 #endif
 
 /*
diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c
index 81c570633ead..abd1112da54f 100644
--- a/arch/powerpc/kernel/sys_ppc32.c
+++ b/arch/powerpc/kernel/sys_ppc32.c
@@ -143,48 +143,17 @@ long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t pt
  * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode)
  * and the register representation of a signed int (msr in 64-bit mode) is performed.
  */
-asmlinkage long compat_sys_sendfile(u32 out_fd, u32 in_fd, compat_off_t __user * offset, u32 count)
+asmlinkage long compat_sys_sendfile_wrapper(u32 out_fd, u32 in_fd,
+					    compat_off_t __user *offset, u32 count)
 {
-	mm_segment_t old_fs = get_fs();
-	int ret;
-	off_t of;
-	off_t __user *up;
-
-	if (offset && get_user(of, offset))
-		return -EFAULT;
-
-	/* The __user pointer cast is valid because of the set_fs() */		
-	set_fs(KERNEL_DS);
-	up = offset ? (off_t __user *) &of : NULL;
-	ret = sys_sendfile((int)out_fd, (int)in_fd, up, count);
-	set_fs(old_fs);
-	
-	if (offset && put_user(of, offset))
-		return -EFAULT;
-		
-	return ret;
+	return compat_sys_sendfile((int)out_fd, (int)in_fd, offset, count);
 }
 
-asmlinkage int compat_sys_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, s32 count)
+asmlinkage long compat_sys_sendfile64_wrapper(u32 out_fd, u32 in_fd,
+					      compat_loff_t __user *offset, u32 count)
 {
-	mm_segment_t old_fs = get_fs();
-	int ret;
-	loff_t lof;
-	loff_t __user *up;
-	
-	if (offset && get_user(lof, offset))
-		return -EFAULT;
-		
-	/* The __user pointer cast is valid because of the set_fs() */		
-	set_fs(KERNEL_DS);
-	up = offset ? (loff_t __user *) &lof : NULL;
-	ret = sys_sendfile64(out_fd, in_fd, up, count);
-	set_fs(old_fs);
-	
-	if (offset && put_user(lof, offset))
-		return -EFAULT;
-		
-	return ret;
+	return sys_sendfile((int)out_fd, (int)in_fd,
+			    (off_t __user *)offset, count);
 }
 
 long compat_sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h
index fb2693464807..d9a677c51926 100644
--- a/arch/sparc/include/asm/unistd.h
+++ b/arch/sparc/include/asm/unistd.h
@@ -447,6 +447,7 @@
 #else
 #define __ARCH_WANT_COMPAT_SYS_TIME
 #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+#define __ARCH_WANT_COMPAT_SYS_SENDFILE
 #endif
 
 /*
diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S
index d97f3eb72e06..44025f4ba41f 100644
--- a/arch/sparc/kernel/sys32.S
+++ b/arch/sparc/kernel/sys32.S
@@ -90,7 +90,7 @@ SIGN1(sys32_mkdir, sys_mkdir, %o1)
 SIGN3(sys32_futex, compat_sys_futex, %o1, %o2, %o5)
 SIGN1(sys32_sysfs, compat_sys_sysfs, %o0)
 SIGN2(sys32_sendfile, compat_sys_sendfile, %o0, %o1)
-SIGN2(sys32_sendfile64, compat_sys_sendfile64, %o0, %o1)
+SIGN2(sys32_sendfile64, sys_sendfile, %o0, %o1)
 SIGN1(sys32_prctl, sys_prctl, %o0)
 SIGN1(sys32_sched_rr_get_interval, compat_sys_sched_rr_get_interval, %o0)
 SIGN2(sys32_waitpid, sys_waitpid, %o0, %o2)
diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c
index f7392336961f..d862499eb01c 100644
--- a/arch/sparc/kernel/sys_sparc32.c
+++ b/arch/sparc/kernel/sys_sparc32.c
@@ -506,52 +506,6 @@ long compat_sys_fadvise64_64(int fd,
 				advice);
 }
 
-asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
-				    compat_off_t __user *offset,
-				    compat_size_t count)
-{
-	mm_segment_t old_fs = get_fs();
-	int ret;
-	off_t of;
-	
-	if (offset && get_user(of, offset))
-		return -EFAULT;
-		
-	set_fs(KERNEL_DS);
-	ret = sys_sendfile(out_fd, in_fd,
-			   offset ? (off_t __user *) &of : NULL,
-			   count);
-	set_fs(old_fs);
-	
-	if (offset && put_user(of, offset))
-		return -EFAULT;
-		
-	return ret;
-}
-
-asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd,
-				      compat_loff_t __user *offset,
-				      compat_size_t count)
-{
-	mm_segment_t old_fs = get_fs();
-	int ret;
-	loff_t lof;
-	
-	if (offset && get_user(lof, offset))
-		return -EFAULT;
-		
-	set_fs(KERNEL_DS);
-	ret = sys_sendfile64(out_fd, in_fd,
-			     offset ? (loff_t __user *) &lof : NULL,
-			     count);
-	set_fs(old_fs);
-	
-	if (offset && put_user(lof, offset))
-		return -EFAULT;
-		
-	return ret;
-}
-
 /* This is just a version for 32-bit applications which does
  * not force O_LARGEFILE on.
  */
diff --git a/fs/compat.c b/fs/compat.c
index d72d51e19025..b7a24d0ca30d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1792,3 +1792,25 @@ compat_sys_open_by_handle_at(int mountdirfd,
 	return do_handle_open(mountdirfd, handle, flags);
 }
 #endif
+
+#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE
+asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
+				    compat_off_t __user *offset, compat_size_t count)
+{
+	loff_t pos;
+	off_t off;
+	ssize_t ret;
+
+	if (offset) {
+		if (unlikely(get_user(off, offset)))
+			return -EFAULT;
+		pos = off;
+		ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS);
+		if (unlikely(put_user(pos, offset)))
+			return -EFAULT;
+		return ret;
+	}
+
+	return do_sendfile(out_fd, in_fd, NULL, count, 0);
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */
diff --git a/fs/read_write.c b/fs/read_write.c
index 28b38279a219..d06534857e9e 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -862,8 +862,8 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec,
 	return ret;
 }
 
-static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
-			   size_t count, loff_t max)
+ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
+		    loff_t max)
 {
 	struct fd in, out;
 	struct inode *in_inode, *out_inode;
diff --git a/fs/read_write.h b/fs/read_write.h
index d07b954c6e0c..d3e00ef67420 100644
--- a/fs/read_write.h
+++ b/fs/read_write.h
@@ -12,3 +12,5 @@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 		unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn);
 ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
 		unsigned long nr_segs, loff_t *ppos, io_fn_t fn);
+ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count,
+		    loff_t max);
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 09b28b7369d7..fd4e29956d1c 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -590,6 +590,9 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid,
 		unsigned long liovcnt, const struct compat_iovec __user *rvec,
 		unsigned long riovcnt, unsigned long flags);
 
+asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
+				    compat_off_t __user *offset, compat_size_t count);
+
 #else
 
 #define is_compat_task() (0)
-- 
cgit v1.2.3