From a358324466b171e145df20bdb74fe81759906de6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 11 Nov 2008 15:01:42 -0500
Subject: ring-buffer: buffer record on/off switch

Impact: enable/disable ring buffer recording API added

Several kernel developers have requested that there be a way to stop
recording into the ring buffers with a simple switch that can also
be enabled from userspace. This patch addes a new kernel API to the
ring buffers called:

 tracing_on()
 tracing_off()

When tracing_off() is called, all ring buffers will not be able to record
into their buffers.

tracing_on() will enable the ring buffers again.

These two act like an on/off switch. That is, there is no counting of the
number of times tracing_off or tracing_on has been called.

A new file is added to the debugfs/tracing directory called

  tracing_on

This allows for userspace applications to also flip the switch.

  echo 0 > debugfs/tracing/tracing_on

disables the tracing.

  echo 1 > /debugfs/tracing/tracing_on

enables it.

Note, this does not disable or enable any tracers. It only sets or clears
a flag that needs to be set in order for the ring buffers to write to
their buffers. It is a global flag, and affects all ring buffers.

The buffers start out with tracing_on enabled.

There are now three flags that control recording into the buffers:

 tracing_on: which affects all ring buffer tracers.

 buffer->record_disabled: which affects an allocated buffer, which may be set
     if an anomaly is detected, and tracing is disabled.

 cpu_buffer->record_disabled: which is set by tracing_stop() or if an
     anomaly is detected. tracing_start can not reenable this if
     an anomaly occurred.

The userspace debugfs/tracing/tracing_enabled is implemented with
tracing_stop() but the user space code can not enable it if the kernel
called tracing_stop().

Userspace can enable the tracing_on even if the kernel disabled it.
It is just a switch used to stop tracing if a condition was hit.
tracing_on is not for protecting critical areas in the kernel nor is
it for stopping tracing if an anomaly occurred. This is because userspace
can reenable it at any time.

Side effect: With this patch, I discovered a dead variable in ftrace.c
  called tracing_on. This patch removes it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ring_buffer.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 536b0ca46a03..e097c2e6b6dc 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -120,6 +120,9 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 u64 ring_buffer_time_stamp(int cpu);
 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
+void tracing_on(void);
+void tracing_off(void);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
-- 
cgit v1.2.3


From ba32929a91fe2c0628f5be62d1597b379c8d3062 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 10 Nov 2008 15:29:58 +0900
Subject: block: make add_partition() return pointer to hd_struct

Make add_partition() return pointer to the new hd_struct on success
and ERR_PTR() value on failure.  This change will be used to fix md
autodetection bug.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/ioctl.c         |  7 +++----
 fs/partitions/check.c | 25 +++++++++++++------------
 include/linux/genhd.h |  4 +++-
 3 files changed, 19 insertions(+), 17 deletions(-)

(limited to 'include/linux')

diff --git a/block/ioctl.c b/block/ioctl.c
index c832d639b6e2..d03985b04d67 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -18,7 +18,6 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 	struct disk_part_iter piter;
 	long long start, length;
 	int partno;
-	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EACCES;
@@ -61,10 +60,10 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			disk_part_iter_exit(&piter);
 
 			/* all seems OK */
-			err = add_partition(disk, partno, start, length,
-					    ADDPART_FLAG_NONE);
+			part = add_partition(disk, partno, start, length,
+					     ADDPART_FLAG_NONE);
 			mutex_unlock(&bdev->bd_mutex);
-			return err;
+			return IS_ERR(part) ? PTR_ERR(part) : 0;
 		case BLKPG_DEL_PARTITION:
 			part = disk_get_part(disk, partno);
 			if (!part)
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 90bcf136a9de..633025340239 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -348,8 +348,8 @@ static ssize_t whole_disk_show(struct device *dev,
 static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH,
 		   whole_disk_show, NULL);
 
-int add_partition(struct gendisk *disk, int partno,
-		  sector_t start, sector_t len, int flags)
+struct hd_struct *add_partition(struct gendisk *disk, int partno,
+				sector_t start, sector_t len, int flags)
 {
 	struct hd_struct *p;
 	dev_t devt = MKDEV(0, 0);
@@ -361,15 +361,15 @@ int add_partition(struct gendisk *disk, int partno,
 
 	err = disk_expand_part_tbl(disk, partno);
 	if (err)
-		return err;
+		return ERR_PTR(err);
 	ptbl = disk->part_tbl;
 
 	if (ptbl->part[partno])
-		return -EBUSY;
+		return ERR_PTR(-EBUSY);
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (!p)
-		return -ENOMEM;
+		return ERR_PTR(-EBUSY);
 
 	if (!init_part_stats(p)) {
 		err = -ENOMEM;
@@ -424,20 +424,20 @@ int add_partition(struct gendisk *disk, int partno,
 	if (!ddev->uevent_suppress)
 		kobject_uevent(&pdev->kobj, KOBJ_ADD);
 
-	return 0;
+	return p;
 
 out_free_stats:
 	free_part_stats(p);
 out_free:
 	kfree(p);
-	return err;
+	return ERR_PTR(err);
 out_del:
 	kobject_put(p->holder_dir);
 	device_del(pdev);
 out_put:
 	put_device(pdev);
 	blk_free_devt(devt);
-	return err;
+	return ERR_PTR(err);
 }
 
 /* Not exported, helper to add_disk(). */
@@ -568,10 +568,11 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
 			       disk->disk_name, p, (unsigned long long) size);
 			size = get_capacity(disk) - from;
 		}
-		res = add_partition(disk, p, from, size, state->parts[p].flags);
-		if (res) {
-			printk(KERN_ERR " %s: p%d could not be added: %d\n",
-				disk->disk_name, p, -res);
+		part = add_partition(disk, p, from, size,
+				     state->parts[p].flags);
+		if (IS_ERR(part)) {
+			printk(KERN_ERR " %s: p%d could not be added: %ld\n",
+			       disk->disk_name, p, -PTR_ERR(part));
 			continue;
 		}
 #ifdef CONFIG_BLK_DEV_MD
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e439e6aed832..3df7742ce246 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -522,7 +522,9 @@ extern char *disk_name (struct gendisk *hd, int partno, char *buf);
 
 extern int disk_expand_part_tbl(struct gendisk *disk, int target);
 extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
-extern int __must_check add_partition(struct gendisk *, int, sector_t, sector_t, int);
+extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
+						     int partno, sector_t start,
+						     sector_t len, int flags);
 extern void delete_partition(struct gendisk *, int);
 extern void printk_all_partitions(void);
 
-- 
cgit v1.2.3


From de11defebf00007677fb7ee91d9b089b78786fbb Mon Sep 17 00:00:00 2001
From: Ulrich Drepper <drepper@redhat.com>
Date: Wed, 19 Nov 2008 15:36:14 -0800
Subject: reintroduce accept4

Introduce a new accept4() system call.  The addition of this system call
matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(),
inotify_init1(), epoll_create1(), pipe2()) which added new system calls
that differed from analogous traditional system calls in adding a flags
argument that can be used to access additional functionality.

The accept4() system call is exactly the same as accept(), except that
it adds a flags bit-mask argument.  Two flags are initially implemented.
(Most of the new system calls in 2.6.27 also had both of these flags.)

SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled
for the new file descriptor returned by accept4().  This is a useful
security feature to avoid leaking information in a multithreaded
program where one thread is doing an accept() at the same time as
another thread is doing a fork() plus exec().  More details here:
http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling",
Ulrich Drepper).

The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag
to be enabled on the new open file description created by accept4().
(This flag is merely a convenience, saving the use of additional calls
fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result.

Here's a test program.  Works on x86-32.  Should work on x86-64, but
I (mtk) don't have a system to hand to test with.

It tests accept4() with each of the four possible combinations of
SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies
that the appropriate flags are set on the file descriptor/open file
description returned by accept4().

I tested Ulrich's patch in this thread by applying against 2.6.28-rc2,
and it passes according to my test program.

/* test_accept4.c

  Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk
       <mtk.manpages@gmail.com>

  Licensed under the GNU GPLv2 or later.
*/
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

#define PORT_NUM 33333

#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)

/**********************************************************************/

/* The following is what we need until glibc gets a wrapper for
  accept4() */

/* Flags for socket(), socketpair(), accept4() */
#ifndef SOCK_CLOEXEC
#define SOCK_CLOEXEC    O_CLOEXEC
#endif
#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK   O_NONBLOCK
#endif

#ifdef __x86_64__
#define SYS_accept4 288
#elif __i386__
#define USE_SOCKETCALL 1
#define SYS_ACCEPT4 18
#else
#error "Sorry -- don't know the syscall # on this architecture"
#endif

static int
accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags)
{
   printf("Calling accept4(): flags = %x", flags);
   if (flags != 0) {
       printf(" (");
       if (flags & SOCK_CLOEXEC)
           printf("SOCK_CLOEXEC");
       if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK))
           printf(" ");
       if (flags & SOCK_NONBLOCK)
           printf("SOCK_NONBLOCK");
       printf(")");
   }
   printf("\n");

#if USE_SOCKETCALL
   long args[6];

   args[0] = fd;
   args[1] = (long) sockaddr;
   args[2] = (long) addrlen;
   args[3] = flags;

   return syscall(SYS_socketcall, SYS_ACCEPT4, args);
#else
   return syscall(SYS_accept4, fd, sockaddr, addrlen, flags);
#endif
}

/**********************************************************************/

static int
do_test(int lfd, struct sockaddr_in *conn_addr,
       int closeonexec_flag, int nonblock_flag)
{
   int connfd, acceptfd;
   int fdf, flf, fdf_pass, flf_pass;
   struct sockaddr_in claddr;
   socklen_t addrlen;

   printf("=======================================\n");

   connfd = socket(AF_INET, SOCK_STREAM, 0);
   if (connfd == -1)
       die("socket");
   if (connect(connfd, (struct sockaddr *) conn_addr,
               sizeof(struct sockaddr_in)) == -1)
       die("connect");

   addrlen = sizeof(struct sockaddr_in);
   acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen,
                      closeonexec_flag | nonblock_flag);
   if (acceptfd == -1) {
       perror("accept4()");
       close(connfd);
       return 0;
   }

   fdf = fcntl(acceptfd, F_GETFD);
   if (fdf == -1)
       die("fcntl:F_GETFD");
   fdf_pass = ((fdf & FD_CLOEXEC) != 0) ==
              ((closeonexec_flag & SOCK_CLOEXEC) != 0);
   printf("Close-on-exec flag is %sset (%s); ",
           (fdf & FD_CLOEXEC) ? "" : "not ",
           fdf_pass ? "OK" : "failed");

   flf = fcntl(acceptfd, F_GETFL);
   if (flf == -1)
       die("fcntl:F_GETFD");
   flf_pass = ((flf & O_NONBLOCK) != 0) ==
              ((nonblock_flag & SOCK_NONBLOCK) !=0);
   printf("nonblock flag is %sset (%s)\n",
           (flf & O_NONBLOCK) ? "" : "not ",
           flf_pass ? "OK" : "failed");

   close(acceptfd);
   close(connfd);

   printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL");
   return fdf_pass && flf_pass;
}

static int
create_listening_socket(int port_num)
{
   struct sockaddr_in svaddr;
   int lfd;
   int optval;

   memset(&svaddr, 0, sizeof(struct sockaddr_in));
   svaddr.sin_family = AF_INET;
   svaddr.sin_addr.s_addr = htonl(INADDR_ANY);
   svaddr.sin_port = htons(port_num);

   lfd = socket(AF_INET, SOCK_STREAM, 0);
   if (lfd == -1)
       die("socket");

   optval = 1;
   if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval,
                  sizeof(optval)) == -1)
       die("setsockopt");

   if (bind(lfd, (struct sockaddr *) &svaddr,
            sizeof(struct sockaddr_in)) == -1)
       die("bind");

   if (listen(lfd, 5) == -1)
       die("listen");

   return lfd;
}

int
main(int argc, char *argv[])
{
   struct sockaddr_in conn_addr;
   int lfd;
   int port_num;
   int passed;

   passed = 1;

   port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM;

   memset(&conn_addr, 0, sizeof(struct sockaddr_in));
   conn_addr.sin_family = AF_INET;
   conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
   conn_addr.sin_port = htons(port_num);

   lfd = create_listening_socket(port_num);

   if (!do_test(lfd, &conn_addr, 0, 0))
       passed = 0;
   if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0))
       passed = 0;
   if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK))
       passed = 0;
   if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK))
       passed = 0;

   close(lfd);

   exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
}

[mtk.manpages@gmail.com: rewrote changelog, updated test program]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/unistd_64.h |  4 +-
 include/linux/net.h              |  6 +--
 include/linux/syscalls.h         |  3 +-
 kernel/sys_ni.c                  |  2 +-
 net/compat.c                     | 50 +++----------------------
 net/socket.c                     | 80 +++++-----------------------------------
 6 files changed, 21 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 834b2c1d89fb..d2e415e6666f 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -639,8 +639,8 @@ __SYSCALL(__NR_fallocate, sys_fallocate)
 __SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
 #define __NR_timerfd_gettime			287
 __SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
-#define __NR_paccept				288
-__SYSCALL(__NR_paccept, sys_paccept)
+#define __NR_accept4				288
+__SYSCALL(__NR_accept4, sys_accept4)
 #define __NR_signalfd4				289
 __SYSCALL(__NR_signalfd4, sys_signalfd4)
 #define __NR_eventfd2				290
diff --git a/include/linux/net.h b/include/linux/net.h
index 6dc14a240042..4515efae4c39 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -40,7 +40,7 @@
 #define SYS_GETSOCKOPT	15		/* sys_getsockopt(2)		*/
 #define SYS_SENDMSG	16		/* sys_sendmsg(2)		*/
 #define SYS_RECVMSG	17		/* sys_recvmsg(2)		*/
-#define SYS_PACCEPT	18		/* sys_paccept(2)		*/
+#define SYS_ACCEPT4	18		/* sys_accept4(2)		*/
 
 typedef enum {
 	SS_FREE = 0,			/* not allocated		*/
@@ -100,7 +100,7 @@ enum sock_type {
  * remaining bits are used as flags. */
 #define SOCK_TYPE_MASK 0xf
 
-/* Flags for socket, socketpair, paccept */
+/* Flags for socket, socketpair, accept4 */
 #define SOCK_CLOEXEC	O_CLOEXEC
 #ifndef SOCK_NONBLOCK
 #define SOCK_NONBLOCK	O_NONBLOCK
@@ -223,8 +223,6 @@ extern int 	     sock_map_fd(struct socket *sock, int flags);
 extern struct socket *sockfd_lookup(int fd, int *err);
 #define		     sockfd_put(sock) fput(sock->file)
 extern int	     net_ratelimit(void);
-extern long	     do_accept(int fd, struct sockaddr __user *upeer_sockaddr,
-			       int __user *upeer_addrlen, int flags);
 
 #define net_random()		random32()
 #define net_srandom(seed)	srandom32((__force u32)seed)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index d6ff145919ca..04fb47bfb920 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -410,8 +410,7 @@ asmlinkage long sys_getsockopt(int fd, int level, int optname,
 asmlinkage long sys_bind(int, struct sockaddr __user *, int);
 asmlinkage long sys_connect(int, struct sockaddr __user *, int);
 asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *);
-asmlinkage long sys_paccept(int, struct sockaddr __user *, int __user *,
-			    const __user sigset_t *, size_t, int);
+asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
 asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *);
 asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
 asmlinkage long sys_send(int, void __user *, size_t, unsigned);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a77b27b11b04..e14a23281707 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,7 +31,7 @@ cond_syscall(sys_socketpair);
 cond_syscall(sys_bind);
 cond_syscall(sys_listen);
 cond_syscall(sys_accept);
-cond_syscall(sys_paccept);
+cond_syscall(sys_accept4);
 cond_syscall(sys_connect);
 cond_syscall(sys_getsockname);
 cond_syscall(sys_getpeername);
diff --git a/net/compat.c b/net/compat.c
index 6ce1a1cadcc0..a3a2ba0fac08 100644
--- a/net/compat.c
+++ b/net/compat.c
@@ -725,7 +725,7 @@ EXPORT_SYMBOL(compat_mc_getsockopt);
 static unsigned char nas[19]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 				AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
 				AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-				AL(6)};
+				AL(4)};
 #undef AL
 
 asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags)
@@ -738,52 +738,13 @@ asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, uns
 	return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT);
 }
 
-asmlinkage long compat_sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr,
-				   int __user *upeer_addrlen,
-				   const compat_sigset_t __user *sigmask,
-				   compat_size_t sigsetsize, int flags)
-{
-	compat_sigset_t ss32;
-	sigset_t ksigmask, sigsaved;
-	int ret;
-
-	if (sigmask) {
-		if (sigsetsize != sizeof(compat_sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ss32, sigmask, sizeof(ss32)))
-			return -EFAULT;
-		sigset_from_compat(&ksigmask, &ss32);
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
-
-	ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags);
-
-	if (ret == -ERESTARTNOHAND) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	return ret;
-}
-
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 {
 	int ret;
 	u32 a[6];
 	u32 a0, a1;
 
-	if (call < SYS_SOCKET || call > SYS_PACCEPT)
+	if (call < SYS_SOCKET || call > SYS_ACCEPT4)
 		return -EINVAL;
 	if (copy_from_user(a, args, nas[call]))
 		return -EFAULT;
@@ -804,7 +765,7 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 		ret = sys_listen(a0, a1);
 		break;
 	case SYS_ACCEPT:
-		ret = do_accept(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
+		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0);
 		break;
 	case SYS_GETSOCKNAME:
 		ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]));
@@ -844,9 +805,8 @@ asmlinkage long compat_sys_socketcall(int call, u32 __user *args)
 	case SYS_RECVMSG:
 		ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]);
 		break;
-	case SYS_PACCEPT:
-		ret = compat_sys_paccept(a0, compat_ptr(a1), compat_ptr(a[2]),
-					 compat_ptr(a[3]), a[4], a[5]);
+	case SYS_ACCEPT4:
+		ret = sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), a[3]);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/net/socket.c b/net/socket.c
index 57550c3bcabe..92764d836891 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1426,8 +1426,8 @@ asmlinkage long sys_listen(int fd, int backlog)
  *	clean when we restucture accept also.
  */
 
-long do_accept(int fd, struct sockaddr __user *upeer_sockaddr,
-	       int __user *upeer_addrlen, int flags)
+asmlinkage long sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
+			    int __user *upeer_addrlen, int flags)
 {
 	struct socket *sock, *newsock;
 	struct file *newfile;
@@ -1510,66 +1510,10 @@ out_fd:
 	goto out_put;
 }
 
-#if 0
-#ifdef HAVE_SET_RESTORE_SIGMASK
-asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr,
-			    int __user *upeer_addrlen,
-			    const sigset_t __user *sigmask,
-			    size_t sigsetsize, int flags)
-{
-	sigset_t ksigmask, sigsaved;
-	int ret;
-
-	if (sigmask) {
-		/* XXX: Don't preclude handling different sized sigset_t's.  */
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-        }
-
-	ret = do_accept(fd, upeer_sockaddr, upeer_addrlen, flags);
-
-	if (ret < 0 && signal_pending(current)) {
-		/*
-		 * Don't restore the signal mask yet. Let do_signal() deliver
-		 * the signal on the way back to userspace, before the signal
-		 * mask is restored.
-		 */
-		if (sigmask) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-			       sizeof(sigsaved));
-			set_restore_sigmask();
-		}
-	} else if (sigmask)
-		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-
-	return ret;
-}
-#else
-asmlinkage long sys_paccept(int fd, struct sockaddr __user *upeer_sockaddr,
-			    int __user *upeer_addrlen,
-			    const sigset_t __user *sigmask,
-			    size_t sigsetsize, int flags)
-{
-	/* The platform does not support restoring the signal mask in the
-	 * return path.  So we do not allow using paccept() with a signal
-	 * mask.  */
-	if (sigmask)
-		return -EINVAL;
-
-	return do_accept(fd, upeer_sockaddr, upeer_addrlen, flags);
-}
-#endif
-#endif
-
 asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr,
 			   int __user *upeer_addrlen)
 {
-	return do_accept(fd, upeer_sockaddr, upeer_addrlen, 0);
+	return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
 }
 
 /*
@@ -2096,7 +2040,7 @@ static const unsigned char nargs[19]={
 	AL(0),AL(3),AL(3),AL(3),AL(2),AL(3),
 	AL(3),AL(3),AL(4),AL(4),AL(4),AL(6),
 	AL(6),AL(2),AL(5),AL(5),AL(3),AL(3),
-	AL(6)
+	AL(4)
 };
 
 #undef AL
@@ -2115,7 +2059,7 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	unsigned long a0, a1;
 	int err;
 
-	if (call < 1 || call > SYS_PACCEPT)
+	if (call < 1 || call > SYS_ACCEPT4)
 		return -EINVAL;
 
 	/* copy_from_user should be SMP safe. */
@@ -2143,9 +2087,8 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 		err = sys_listen(a0, a1);
 		break;
 	case SYS_ACCEPT:
-		err =
-		    do_accept(a0, (struct sockaddr __user *)a1,
-			      (int __user *)a[2], 0);
+		err = sys_accept4(a0, (struct sockaddr __user *)a1,
+				  (int __user *)a[2], 0);
 		break;
 	case SYS_GETSOCKNAME:
 		err =
@@ -2192,12 +2135,9 @@ asmlinkage long sys_socketcall(int call, unsigned long __user *args)
 	case SYS_RECVMSG:
 		err = sys_recvmsg(a0, (struct msghdr __user *)a1, a[2]);
 		break;
-	case SYS_PACCEPT:
-		err =
-		    sys_paccept(a0, (struct sockaddr __user *)a1,
-			        (int __user *)a[2],
-				(const sigset_t __user *) a[3],
-				a[4], a[5]);
+	case SYS_ACCEPT4:
+		err = sys_accept4(a0, (struct sockaddr __user *)a1,
+				  (int __user *)a[2], a[3]);
 		break;
 	default:
 		err = -EINVAL;
-- 
cgit v1.2.3


From f481891fdc49d3d1b8a9674a1825d183069a805f Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 19 Nov 2008 15:36:30 -0800
Subject: cpuset: update top cpuset's mems after adding a node

After adding a node into the machine, top cpuset's mems isn't updated.

By reviewing the code, we found that the update function

  cpuset_track_online_nodes()

was invoked after node_states[N_ONLINE] changes.  It is wrong because
N_ONLINE just means node has pgdat, and if node has/added memory, we use
N_HIGH_MEMORY.  So, We should invoke the update function after
node_states[N_HIGH_MEMORY] changes, just like its commit says.

This patch fixes it.  And we use notifier of memory hotplug instead of
direct calling of cpuset_track_online_nodes().

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Acked-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Paul Menage <menage@google.com
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h |  4 ----
 kernel/cpuset.c        | 19 ++++++++++++++++---
 mm/memory_hotplug.c    |  3 ---
 3 files changed, 16 insertions(+), 10 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2691926fb506..8e540d32c9fe 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -74,8 +74,6 @@ static inline int cpuset_do_slab_mem_spread(void)
 	return current->flags & PF_SPREAD_SLAB;
 }
 
-extern void cpuset_track_online_nodes(void);
-
 extern int current_cpuset_is_being_rebound(void);
 
 extern void rebuild_sched_domains(void);
@@ -151,8 +149,6 @@ static inline int cpuset_do_slab_mem_spread(void)
 	return 0;
 }
 
-static inline void cpuset_track_online_nodes(void) {}
-
 static inline int current_cpuset_is_being_rebound(void)
 {
 	return 0;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 81fc6791a296..da7ff6137f37 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -36,6 +36,7 @@
 #include <linux/list.h>
 #include <linux/mempolicy.h>
 #include <linux/mm.h>
+#include <linux/memory.h>
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/namei.h>
@@ -2015,12 +2016,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
  * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
  * See also the previous routine cpuset_track_online_cpus().
  */
-void cpuset_track_online_nodes(void)
+static int cpuset_track_online_nodes(struct notifier_block *self,
+				unsigned long action, void *arg)
 {
 	cgroup_lock();
-	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-	scan_for_empty_cpusets(&top_cpuset);
+	switch (action) {
+	case MEM_ONLINE:
+		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+		break;
+	case MEM_OFFLINE:
+		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+		scan_for_empty_cpusets(&top_cpuset);
+		break;
+	default:
+		break;
+	}
 	cgroup_unlock();
+	return NOTIFY_OK;
 }
 #endif
 
@@ -2036,6 +2048,7 @@ void __init cpuset_init_smp(void)
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
 	hotcpu_notifier(cpuset_track_online_cpus, 0);
+	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 }
 
 /**
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6837a1014372..b5b2b15085a8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -22,7 +22,6 @@
 #include <linux/highmem.h>
 #include <linux/vmalloc.h>
 #include <linux/ioport.h>
-#include <linux/cpuset.h>
 #include <linux/delay.h>
 #include <linux/migrate.h>
 #include <linux/page-isolation.h>
@@ -498,8 +497,6 @@ int add_memory(int nid, u64 start, u64 size)
 	/* we online node here. we can't roll back from here. */
 	node_set_online(nid);
 
-	cpuset_track_online_nodes();
-
 	if (new_pgdat) {
 		ret = register_one_node(nid);
 		/*
-- 
cgit v1.2.3