From 2177905ca7419c49910d47e38e44790affd918cc Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 11 Jun 2012 23:56:09 -0700 Subject: Input: fix input.h kernel-doc warning Fix kernel-doc warning in input.h: Warning(include/linux/input.h:140): No description found for parameter 'len' Signed-off-by: Randy Dunlap Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/input.h b/include/linux/input.h index a81671453575..2740d080ec6b 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -116,6 +116,7 @@ struct input_keymap_entry { /** * EVIOCGMTSLOTS(len) - get MT slot values + * @len: size of the data buffer in bytes * * The ioctl buffer argument should be binary equivalent to * -- cgit v1.2.3 From cba6d0d64ee53772b285d0c0c288deefbeaf7775 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 2 Jul 2012 07:08:42 -0700 Subject: Revert "rcu: Move PREEMPT_RCU preemption to switch_to() invocation" This reverts commit 616c310e83b872024271c915c1b9ab505b9efad9. (Move PREEMPT_RCU preemption to switch_to() invocation). Testing by Sasha Levin showed that this can result in deadlock due to invoking the scheduler when one of the runqueue locks is held. Because this commit was simply a performance optimization, revert it. Reported-by: Sasha Levin Signed-off-by: Paul E. McKenney Tested-by: Sasha Levin --- arch/um/drivers/mconsole_kern.c | 1 - include/linux/rcupdate.h | 1 - include/linux/rcutiny.h | 6 ++++++ include/linux/sched.h | 10 ---------- kernel/rcutree.c | 1 + kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 14 +++++++++++--- kernel/sched/core.c | 1 - 8 files changed, 19 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index 88e466b159dc..43b39d61b538 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -705,7 +705,6 @@ static void stack_proc(void *arg) struct task_struct *from = current, *to = arg; to->thread.saved_task = from; - rcu_switch_from(from); switch_to(from, to, from); } diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 26d1a47591f1..9cac722b169c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -184,7 +184,6 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ extern void rcu_sched_qs(int cpu); extern void rcu_bh_qs(int cpu); -extern void rcu_preempt_note_context_switch(void); extern void rcu_check_callbacks(int cpu, int user); struct notifier_block; extern void rcu_idle_enter(void); diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 854dc4c5c271..4e56a9c69a35 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,6 +87,10 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU +static inline void rcu_preempt_note_context_switch(void) +{ +} + static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; @@ -95,6 +99,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) #else /* #ifdef CONFIG_TINY_RCU */ +void rcu_preempt_note_context_switch(void); int rcu_preempt_needs_cpu(void); static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) @@ -108,6 +113,7 @@ static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) static inline void rcu_note_context_switch(int cpu) { rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(); } /* diff --git a/include/linux/sched.h 
b/include/linux/sched.h index 4059c0f33f07..06a4c5f4f55c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,22 +1871,12 @@ static inline void rcu_copy_process(struct task_struct *p) INIT_LIST_HEAD(&p->rcu_node_entry); } -static inline void rcu_switch_from(struct task_struct *prev) -{ - if (prev->rcu_read_lock_nesting != 0) - rcu_preempt_note_context_switch(); -} - #else static inline void rcu_copy_process(struct task_struct *p) { } -static inline void rcu_switch_from(struct task_struct *prev) -{ -} - #endif #ifdef CONFIG_SMP diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 38ecdda3f55f..4b97bba7396e 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -201,6 +201,7 @@ void rcu_note_context_switch(int cpu) { trace_rcu_utilization("Start context switch"); rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); trace_rcu_utilization("End context switch"); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); diff --git a/kernel/rcutree.h b/kernel/rcutree.h index ea056495783e..19b61ac1079f 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -444,6 +444,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work); /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); #ifdef CONFIG_HOTPLUG_CPU static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 5271a020887e..3e4899459f3d 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -153,7 +153,7 @@ static void rcu_preempt_qs(int cpu) * * Caller must disable preemption. */ -void rcu_preempt_note_context_switch(void) +static void rcu_preempt_note_context_switch(int cpu) { struct task_struct *t = current; unsigned long flags; @@ -164,7 +164,7 @@ void rcu_preempt_note_context_switch(void) (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { /* Possibly blocking in an RCU read-side critical section. */ - rdp = __this_cpu_ptr(rcu_preempt_state.rda); + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); rnp = rdp->mynode; raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; @@ -228,7 +228,7 @@ void rcu_preempt_note_context_switch(void) * means that we continue to block the current grace period. */ local_irq_save(flags); - rcu_preempt_qs(smp_processor_id()); + rcu_preempt_qs(cpu); local_irq_restore(flags); } @@ -1001,6 +1001,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +/* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + /* * Because preemptible RCU does not exist, there are never any preempted * RCU readers. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..eaead2df6aa8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, #endif /* Here we just switch the register state and the stack. 
*/ - rcu_switch_from(prev); switch_to(prev, next, prev); barrier(); -- cgit v1.2.3 From d4db2935e4fffeba42540b0dc9d85e3036701221 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Fri, 29 Jun 2012 09:56:08 -0600 Subject: KVM: Pass kvm_irqfd to functions Prune this down to just the struct kvm_irqfd so we can avoid changing function definition for every flag or field we use. Signed-off-by: Alex Williamson Acked-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 4 ++-- virt/kvm/eventfd.c | 20 ++++++++++---------- virt/kvm/kvm_main.c | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index c4464356b35b..96c158a37d3e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -815,7 +815,7 @@ static inline void kvm_free_irq_routing(struct kvm *kvm) {} #ifdef CONFIG_HAVE_KVM_EVENTFD void kvm_eventfd_init(struct kvm *kvm); -int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags); +int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); void kvm_irqfd_release(struct kvm *kvm); void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); @@ -824,7 +824,7 @@ int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); static inline void kvm_eventfd_init(struct kvm *kvm) {} -static inline int kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { return -EINVAL; } diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index f59c1e8de7a2..c307c24c147a 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -198,7 +198,7 @@ static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd, } static int -kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) +kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args) { struct kvm_irq_routing_table *irq_rt; struct _irqfd *irqfd, *tmp; @@ -212,12 +212,12 @@ kvm_irqfd_assign(struct kvm *kvm, int fd, int gsi) return -ENOMEM; irqfd->kvm = kvm; - irqfd->gsi = gsi; + irqfd->gsi = args->gsi; INIT_LIST_HEAD(&irqfd->list); INIT_WORK(&irqfd->inject, irqfd_inject); INIT_WORK(&irqfd->shutdown, irqfd_shutdown); - file = eventfd_fget(fd); + file = eventfd_fget(args->fd); if (IS_ERR(file)) { ret = PTR_ERR(file); goto fail; @@ -298,19 +298,19 @@ kvm_eventfd_init(struct kvm *kvm) * shutdown any irqfd's that match fd+gsi */ static int -kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) +kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args) { struct _irqfd *irqfd, *tmp; struct eventfd_ctx *eventfd; - eventfd = eventfd_ctx_fdget(fd); + eventfd = eventfd_ctx_fdget(args->fd); if (IS_ERR(eventfd)) return PTR_ERR(eventfd); spin_lock_irq(&kvm->irqfds.lock); list_for_each_entry_safe(irqfd, tmp, &kvm->irqfds.items, list) { - if (irqfd->eventfd == eventfd && irqfd->gsi == gsi) { + if (irqfd->eventfd == eventfd && irqfd->gsi == args->gsi) { /* * This rcu_assign_pointer is needed for when * another thread calls kvm_irq_routing_update before @@ -338,12 +338,12 @@ kvm_irqfd_deassign(struct kvm *kvm, int fd, int gsi) } int -kvm_irqfd(struct kvm *kvm, int fd, int gsi, int flags) +kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) { - if (flags & KVM_IRQFD_FLAG_DEASSIGN) - return kvm_irqfd_deassign(kvm, fd, gsi); + if (args->flags & KVM_IRQFD_FLAG_DEASSIGN) + return kvm_irqfd_deassign(kvm, args); - return kvm_irqfd_assign(kvm, fd, gsi); + return kvm_irqfd_assign(kvm, args); } /* diff --git 
a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7e140683ff14..e98a5cac55c4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2047,7 +2047,7 @@ static long kvm_vm_ioctl(struct file *filp, r = -EFAULT; if (copy_from_user(&data, argp, sizeof data)) goto out; - r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags); + r = kvm_irqfd(kvm, &data); break; } case KVM_IOEVENTFD: { -- cgit v1.2.3 From 5a081caa0414b9bbb82c17ffab9d6fe66edbb72f Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Wed, 6 Jun 2012 10:09:25 +0300 Subject: rpmsg: avoid premature deallocation of endpoints When an inbound message arrives, the rpmsg core looks up its associated endpoint and invokes the registered callback. If a message arrives while its endpoint is being removed (because the rpmsg driver was removed, or a recovery of a remote processor has kicked in) we must ensure atomicity, i.e.: - Either the ept is removed before it is found or - The ept is found but will not be freed until the callback returns This is achieved by maintaining a per-ept reference count, which, when it drops to zero, triggers deallocation of the ept. With this in hand, it is now forbidden to directly deallocate epts once they have been added to the endpoints idr. Cc: stable Reported-by: Fernando Guzman Lugo Signed-off-by: Ohad Ben-Cohen --- drivers/rpmsg/virtio_rpmsg_bus.c | 36 ++++++++++++++++++++++++++++++++++-- include/linux/rpmsg.h | 3 +++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 75506ec2840e..9623327ba509 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -188,6 +188,26 @@ static int rpmsg_uevent(struct device *dev, struct kobj_uevent_env *env) rpdev->id.name); } +/** + * __ept_release() - deallocate an rpmsg endpoint + * @kref: the ept's reference count + * + * This function deallocates an ept, and is invoked when its @kref refcount + * drops to zero. + * + * Never invoke this function directly!
+ */ +static void __ept_release(struct kref *kref) +{ + struct rpmsg_endpoint *ept = container_of(kref, struct rpmsg_endpoint, + refcount); + /* + * At this point no one holds a reference to ept anymore, + * so we can directly free it + */ + kfree(ept); +} + /* for more info, see below documentation of rpmsg_create_ept() */ static struct rpmsg_endpoint *__rpmsg_create_ept(struct virtproc_info *vrp, struct rpmsg_channel *rpdev, rpmsg_rx_cb_t cb, @@ -206,6 +226,8 @@ static struct rpmsg_endpoint *__rpmsg_create_ept(struct virtproc_info *vrp, return NULL; } + kref_init(&ept->refcount); + ept->rpdev = rpdev; ept->cb = cb; ept->priv = priv; @@ -238,7 +260,7 @@ rem_idr: idr_remove(&vrp->endpoints, request); free_ept: mutex_unlock(&vrp->endpoints_lock); - kfree(ept); + kref_put(&ept->refcount, __ept_release); return NULL; } @@ -306,7 +328,7 @@ __rpmsg_destroy_ept(struct virtproc_info *vrp, struct rpmsg_endpoint *ept) idr_remove(&vrp->endpoints, ept->addr); mutex_unlock(&vrp->endpoints_lock); - kfree(ept); + kref_put(&ept->refcount, __ept_release); } /** @@ -790,7 +812,13 @@ static void rpmsg_recv_done(struct virtqueue *rvq) /* use the dst addr to fetch the callback of the appropriate user */ mutex_lock(&vrp->endpoints_lock); + ept = idr_find(&vrp->endpoints, msg->dst); + + /* let's make sure no one deallocates ept while we use it */ + if (ept) + kref_get(&ept->refcount); + mutex_unlock(&vrp->endpoints_lock); if (ept && ept->cb) @@ -798,6 +826,10 @@ static void rpmsg_recv_done(struct virtqueue *rvq) else dev_warn(dev, "msg received with no recepient\n"); + /* farewell, ept, we don't need you anymore */ + if (ept) + kref_put(&ept->refcount, __ept_release); + /* publish the real size of the buffer */ sg_init_one(&sg, msg, RPMSG_BUF_SIZE); diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index a8e50e44203c..195f373590b8 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -38,6 +38,7 @@ #include #include #include +#include /* The feature bitmap for virtio rpmsg */ #define VIRTIO_RPMSG_F_NS 0 /* RP supports name service notifications */ @@ -120,6 +121,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32); /** * struct rpmsg_endpoint - binds a local rpmsg address to its user * @rpdev: rpmsg channel device + * @refcount: when this drops to zero, the ept is deallocated * @cb: rx callback handler * @addr: local rpmsg address * @priv: private data for the driver's use @@ -140,6 +142,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32); */ struct rpmsg_endpoint { struct rpmsg_channel *rpdev; + struct kref refcount; rpmsg_rx_cb_t cb; u32 addr; void *priv; -- cgit v1.2.3 From 15fd943af50dbc5f7f4de33835795c72595f7bf4 Mon Sep 17 00:00:00 2001 From: Ohad Ben-Cohen Date: Thu, 7 Jun 2012 15:39:35 +0300 Subject: rpmsg: make sure inflight messages don't invoke just-removed callbacks When inbound messages arrive, rpmsg core looks up their associated endpoint (by destination address) and then invokes their callback. We've made sure that endpoints will never be de-allocated after they were found by rpmsg core, but we also need to protect against the (rare) scenario where the rpmsg driver was just removed, and its callback function isn't available anymore. This is achieved by introducing a callback mutex, which must be taken before the callback is invoked, and, obviously, before it is removed. 
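In distilled form, the locking pattern this patch introduces looks roughly like the sketch below (simplified types and names, not the actual kernel code; assumes <linux/mutex.h>): the receive path holds the mutex across the callback invocation, while the removal path takes the same mutex to clear the callback, so a callback that is already running is guaranteed to finish before the endpoint is torn down.

struct endpoint {
	struct mutex cb_lock;
	void (*cb)(void *data, int len);
};

static void endpoint_rx(struct endpoint *ept, void *data, int len)
{
	/* cb cannot be cleared while cb_lock is held */
	mutex_lock(&ept->cb_lock);
	if (ept->cb)
		ept->cb(data, len);
	mutex_unlock(&ept->cb_lock);
}

static void endpoint_remove(struct endpoint *ept)
{
	/* blocks until any in-flight callback has returned */
	mutex_lock(&ept->cb_lock);
	ept->cb = NULL;
	mutex_unlock(&ept->cb_lock);
}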
Cc: stable Reported-by: Fernando Guzman Lugo Signed-off-by: Ohad Ben-Cohen --- drivers/rpmsg/virtio_rpmsg_bus.c | 25 +++++++++++++++++++------ include/linux/rpmsg.h | 3 +++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c index 9623327ba509..39d3aa41adda 100644 --- a/drivers/rpmsg/virtio_rpmsg_bus.c +++ b/drivers/rpmsg/virtio_rpmsg_bus.c @@ -227,6 +227,7 @@ static struct rpmsg_endpoint *__rpmsg_create_ept(struct virtproc_info *vrp, } kref_init(&ept->refcount); + mutex_init(&ept->cb_lock); ept->rpdev = rpdev; ept->cb = cb; @@ -324,10 +325,16 @@ EXPORT_SYMBOL(rpmsg_create_ept); static void __rpmsg_destroy_ept(struct virtproc_info *vrp, struct rpmsg_endpoint *ept) { + /* make sure new inbound messages can't find this ept anymore */ mutex_lock(&vrp->endpoints_lock); idr_remove(&vrp->endpoints, ept->addr); mutex_unlock(&vrp->endpoints_lock); + /* make sure in-flight inbound messages won't invoke cb anymore */ + mutex_lock(&ept->cb_lock); + ept->cb = NULL; + mutex_unlock(&ept->cb_lock); + kref_put(&ept->refcount, __ept_release); } @@ -821,14 +828,20 @@ static void rpmsg_recv_done(struct virtqueue *rvq) mutex_unlock(&vrp->endpoints_lock); - if (ept && ept->cb) - ept->cb(ept->rpdev, msg->data, msg->len, ept->priv, msg->src); - else - dev_warn(dev, "msg received with no recepient\n"); + if (ept) { + /* make sure ept->cb doesn't go away while we use it */ + mutex_lock(&ept->cb_lock); - /* farewell, ept, we don't need you anymore */ - if (ept) + if (ept->cb) + ept->cb(ept->rpdev, msg->data, msg->len, ept->priv, + msg->src); + + mutex_unlock(&ept->cb_lock); + + /* farewell, ept, we don't need you anymore */ kref_put(&ept->refcount, __ept_release); + } else + dev_warn(dev, "msg received with no recepient\n"); /* publish the real size of the buffer */ sg_init_one(&sg, msg, RPMSG_BUF_SIZE); diff --git a/include/linux/rpmsg.h b/include/linux/rpmsg.h index 195f373590b8..82a673905edb 100644 --- a/include/linux/rpmsg.h +++ b/include/linux/rpmsg.h @@ -39,6 +39,7 @@ #include #include #include +#include /* The feature bitmap for virtio rpmsg */ #define VIRTIO_RPMSG_F_NS 0 /* RP supports name service notifications */ @@ -123,6 +124,7 @@ typedef void (*rpmsg_rx_cb_t)(struct rpmsg_channel *, void *, int, void *, u32); * @rpdev: rpmsg channel device * @refcount: when this drops to zero, the ept is deallocated * @cb: rx callback handler + * @cb_lock: must be taken before accessing/changing @cb * @addr: local rpmsg address * @priv: private data for the driver's use * @@ -144,6 +146,7 @@ struct rpmsg_endpoint { struct rpmsg_channel *rpdev; struct kref refcount; rpmsg_rx_cb_t cb; + struct mutex cb_lock; u32 addr; void *priv; }; -- cgit v1.2.3 From 2dfd06036ba7ae8e7be2daf5a2fff1dac42390bf Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Wed, 27 Jun 2012 17:09:54 +0800 Subject: aio: make kiocb->private NULL in init_sync_kiocb() Ocfs2 uses kiocb.*private as a flag of unsigned long size. In commit a11f7e6 ocfs2: serialize unaligned aio, the unaligned io flag was added to it to serialize unaligned aio. As *private is not initialized in init_sync_kiocb() of do_sync_write(), this unaligned io flag may be unexpectedly set in an aligned dio. And this will cause OCFS2_I(inode)->ip_unaligned_aio to be decreased to -1 in ocfs2_dio_end_io(), thus the following unaligned dio will hang forever at ocfs2_aiodio_wait() in ocfs2_file_aio_write().
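The failure mode is easy to picture in miniature: do_sync_write() builds its kiocb on the stack, so any field the initializer skips holds stack garbage. A hedged illustration with hypothetical names (not the actual ocfs2 code):

struct kiocb_like {
	unsigned long ki_user_data;
	void *private;			/* ocfs2-style code reads this as a flag */
};

static void init_sync_kiocb_like(struct kiocb_like *x)
{
	x->ki_user_data = 0;
	x->private = NULL;		/* the one-line fix: without it, the
					 * check below sees stack garbage */
}

static int io_is_unaligned(struct kiocb_like *x)
{
	return x->private != NULL;	/* garbage => bogus "unaligned" verdict */
}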
Signed-off-by: Junxiao Bi Cc: stable@vger.kernel.org Acked-by: Jeff Moyer Signed-off-by: Joel Becker --- include/linux/aio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/aio.h b/include/linux/aio.h index 2314ad8b3c9c..b1a520ec8b59 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -140,6 +140,7 @@ struct kiocb { (x)->ki_dtor = NULL; \ (x)->ki_obj.tsk = tsk; \ (x)->ki_user_data = 0; \ + (x)->private = NULL; \ } while (0) #define AIO_RING_MAGIC 0xa10a10a1 -- cgit v1.2.3 From f567fde24640cf6f2d6416196bfc8b3fefc8e433 Mon Sep 17 00:00:00 2001 From: Laxman Dewangan Date: Wed, 20 Jun 2012 14:14:05 +0530 Subject: gpio: fix bits conflict for gpio flags Bits 2 and 3 of the GPIO flags are allocated to OPEN_DRAIN/OPEN_SOURCE, but the same bits were reused for EXPORT/EXPORT_CHANGEABLE, creating a conflict. Fix the conflict by assigning bits 4 and 5 to EXPORT/EXPORT_CHANGEABLE. Signed-off-by: Laxman Dewangan Signed-off-by: Linus Walleij --- include/linux/gpio.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/gpio.h b/include/linux/gpio.h index f07fc2d08159..2e31e8b3a190 100644 --- a/include/linux/gpio.h +++ b/include/linux/gpio.h @@ -22,8 +22,8 @@ /* Gpio pin is open source */ #define GPIOF_OPEN_SOURCE (1 << 3) -#define GPIOF_EXPORT (1 << 2) -#define GPIOF_EXPORT_CHANGEABLE (1 << 3) +#define GPIOF_EXPORT (1 << 4) +#define GPIOF_EXPORT_CHANGEABLE (1 << 5) #define GPIOF_EXPORT_DIR_FIXED (GPIOF_EXPORT) #define GPIOF_EXPORT_DIR_CHANGEABLE (GPIOF_EXPORT | GPIOF_EXPORT_CHANGEABLE) -- cgit v1.2.3 From 5167e8d5417bf5c322a703d2927daec727ea40dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jun 2012 15:52:09 +0200 Subject: sched/nohz: Rewrite and fix load-avg computation -- again Thanks to Charles Wang for spotting the defects in the current code: - If we go idle during the sample window -- after sampling, we get a negative bias because we can negate our own sample. - If we wake up during the sample window we get a positive bias because we push the sample to a known active period. So rewrite the entire nohz load-avg muck once again, now adding copious documentation to the code.
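For reference, the fixed-point decay step at the heart of this code -- a1 = a0 * e + a * (1 - e) -- looks roughly like the sketch below, mirroring the kernel's calc_load() and its 11-bit fixed-point representation (simplified; see the real helpers in the diff):

#define FSHIFT	11			/* bits of fixed-point precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed-point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;			  /* a0 * e */
	load += active * (FIXED_1 - exp); /* + a * (1 - e) */
	return load >> FSHIFT;
}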
Reported-and-tested-by: Doug Smythies Reported-and-tested-by: Charles Wang Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1340373782.18025.74.camel@twins [ minor edits ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 ++ kernel/sched/core.c | 275 ++++++++++++++++++++++++++++++++++------------- kernel/sched/idle_task.c | 1 - kernel/sched/sched.h | 2 - kernel/time/tick-sched.c | 2 + 5 files changed, 213 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4059c0f33f07..20cb7497c59c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1909,6 +1909,14 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif +#ifdef CONFIG_NO_HZ +void calc_load_enter_idle(void); +void calc_load_exit_idle(void); +#else +static inline void calc_load_enter_idle(void) { } +static inline void calc_load_exit_idle(void) { } +#endif /* CONFIG_NO_HZ */ + #ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d5594a4268d4..bb840405335d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2161,11 +2161,73 @@ unsigned long this_cpu_load(void) } +/* + * Global load-average calculations + * + * We take a distributed and async approach to calculating the global load-avg + * in order to minimize overhead. + * + * The global load average is an exponentially decaying average of nr_running + + * nr_uninterruptible. + * + * Once every LOAD_FREQ: + * + * nr_active = 0; + * for_each_possible_cpu(cpu) + * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; + * + * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) + * + * Due to a number of reasons the above turns in the mess below: + * + * - for_each_possible_cpu() is prohibitively expensive on machines with + * serious number of cpus, therefore we need to take a distributed approach + * to calculating nr_active. + * + * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 + * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } + * + * So assuming nr_active := 0 when we start out -- true per definition, we + * can simply take per-cpu deltas and fold those into a global accumulate + * to obtain the same result. See calc_load_fold_active(). + * + * Furthermore, in order to avoid synchronizing all per-cpu delta folding + * across the machine, we assume 10 ticks is sufficient time for every + * cpu to have completed this task. + * + * This places an upper-bound on the IRQ-off latency of the machine. Then + * again, being late doesn't loose the delta, just wrecks the sample. + * + * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because + * this would add another cross-cpu cacheline miss and atomic operation + * to the wakeup path. Instead we increment on whatever cpu the task ran + * when it went into uninterruptible state and decrement on whatever cpu + * did the wakeup. This means that only the sum of nr_uninterruptible over + * all cpus yields the correct result. + * + * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 
+ */ + /* Variables and functions for calc_load */ static atomic_long_t calc_load_tasks; static unsigned long calc_load_update; unsigned long avenrun[3]; -EXPORT_SYMBOL(avenrun); +EXPORT_SYMBOL(avenrun); /* should be removed */ + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} static long calc_load_fold_active(struct rq *this_rq) { @@ -2182,6 +2244,9 @@ static long calc_load_fold_active(struct rq *this_rq) return delta; } +/* + * a1 = a0 * e + a * (1 - e) + */ static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { @@ -2193,30 +2258,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) #ifdef CONFIG_NO_HZ /* - * For NO_HZ we delay the active fold to the next LOAD_FREQ update. + * Handle NO_HZ for the global load-average. + * + * Since the above described distributed algorithm to compute the global + * load-average relies on per-cpu sampling from the tick, it is affected by + * NO_HZ. + * + * The basic idea is to fold the nr_active delta into a global idle-delta upon + * entering NO_HZ state such that we can include this as an 'extra' cpu delta + * when we read the global state. + * + * Obviously reality has to ruin such a delightfully simple scheme: + * + * - When we go NO_HZ idle during the window, we can negate our sample + * contribution, causing under-accounting. + * + * We avoid this by keeping two idle-delta counters and flipping them + * when the window starts, thus separating old and new NO_HZ load. + * + * The only trick is the slight shift in index flip for read vs write. + * + * 0s 5s 10s 15s + * +10 +10 +10 +10 + * |-|-----------|-|-----------|-|-----------|-| + * r:0 0 1 1 0 0 1 1 0 + * w:0 1 1 0 0 1 1 0 0 + * + * This ensures we'll fold the old idle contribution in this window while + * accumlating the new one. + * + * - When we wake up from NO_HZ idle during the window, we push up our + * contribution, since we effectively move our sample point to a known + * busy state. + * + * This is solved by pushing the window forward, and thus skipping the + * sample, for this cpu (effectively using the idle-delta for this cpu which + * was in effect at the time the window opened). This also solves the issue + * of having to deal with a cpu having been in NOHZ idle for multiple + * LOAD_FREQ intervals. * * When making the ILB scale, we should try to pull this in as well. */ -static atomic_long_t calc_load_tasks_idle; +static atomic_long_t calc_load_idle[2]; +static int calc_load_idx; -void calc_load_account_idle(struct rq *this_rq) +static inline int calc_load_write_idx(void) { + int idx = calc_load_idx; + + /* + * See calc_global_nohz(), if we observe the new index, we also + * need to observe the new update time. + */ + smp_rmb(); + + /* + * If the folding window started, make sure we start writing in the + * next idle-delta. 
+ */ + if (!time_before(jiffies, calc_load_update)) + idx++; + + return idx & 1; +} + +static inline int calc_load_read_idx(void) +{ + return calc_load_idx & 1; +} + +void calc_load_enter_idle(void) +{ + struct rq *this_rq = this_rq(); long delta; + /* + * We're going into NOHZ mode, if there's any pending delta, fold it + * into the pending idle delta. + */ delta = calc_load_fold_active(this_rq); - if (delta) - atomic_long_add(delta, &calc_load_tasks_idle); + if (delta) { + int idx = calc_load_write_idx(); + atomic_long_add(delta, &calc_load_idle[idx]); + } } -static long calc_load_fold_idle(void) +void calc_load_exit_idle(void) { - long delta = 0; + struct rq *this_rq = this_rq(); + + /* + * If we're still before the sample window, we're done. + */ + if (time_before(jiffies, this_rq->calc_load_update)) + return; /* - * Its got a race, we don't care... + * We woke inside or after the sample window, this means we're already + * accounted through the nohz accounting, so skip the entire deal and + * sync up for the next window. */ - if (atomic_long_read(&calc_load_tasks_idle)) - delta = atomic_long_xchg(&calc_load_tasks_idle, 0); + this_rq->calc_load_update = calc_load_update; + if (time_before(jiffies, this_rq->calc_load_update + 10)) + this_rq->calc_load_update += LOAD_FREQ; +} + +static long calc_load_fold_idle(void) +{ + int idx = calc_load_read_idx(); + long delta = 0; + + if (atomic_long_read(&calc_load_idle[idx])) + delta = atomic_long_xchg(&calc_load_idle[idx], 0); return delta; } @@ -2302,66 +2455,39 @@ static void calc_global_nohz(void) { long delta, active, n; - /* - * If we crossed a calc_load_update boundary, make sure to fold - * any pending idle changes, the respective CPUs might have - * missed the tick driven calc_load_account_active() update - * due to NO_HZ. - */ - delta = calc_load_fold_idle(); - if (delta) - atomic_long_add(delta, &calc_load_tasks); - - /* - * It could be the one fold was all it took, we done! - */ - if (time_before(jiffies, calc_load_update + 10)) - return; - - /* - * Catch-up, fold however many we are behind still - */ - delta = jiffies - calc_load_update - 10; - n = 1 + (delta / LOAD_FREQ); + if (!time_before(jiffies, calc_load_update + 10)) { + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); - active = atomic_long_read(&calc_load_tasks); - active = active > 0 ? active * FIXED_1 : 0; + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; - avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); - avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); - avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); - calc_load_update += n * LOAD_FREQ; -} -#else -void calc_load_account_idle(struct rq *this_rq) -{ -} + calc_load_update += n * LOAD_FREQ; + } -static inline long calc_load_fold_idle(void) -{ - return 0; + /* + * Flip the idle index... + * + * Make sure we first write the new time then flip the index, so that + * calc_load_write_idx() will see the new time when it reads the new + * index, this avoids a double flip messing things up. 
+ */ + smp_wmb(); + calc_load_idx++; } +#else /* !CONFIG_NO_HZ */ -static void calc_global_nohz(void) -{ -} -#endif +static inline long calc_load_fold_idle(void) { return 0; } +static inline void calc_global_nohz(void) { } -/** - * get_avenrun - get the load average array - * @loads: pointer to dest load array - * @offset: offset to add - * @shift: shift count to shift the result left - * - * These values are estimates at best, so no need for locking. - */ -void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -{ - loads[0] = (avenrun[0] + offset) << shift; - loads[1] = (avenrun[1] + offset) << shift; - loads[2] = (avenrun[2] + offset) << shift; -} +#endif /* CONFIG_NO_HZ */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2369,11 +2495,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) */ void calc_global_load(unsigned long ticks) { - long active; + long active, delta; if (time_before(jiffies, calc_load_update + 10)) return; + /* + * Fold the 'old' idle-delta to include all NO_HZ cpus. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + active = atomic_long_read(&calc_load_tasks); active = active > 0 ? active * FIXED_1 : 0; @@ -2384,12 +2517,7 @@ void calc_global_load(unsigned long ticks) calc_load_update += LOAD_FREQ; /* - * Account one period with whatever state we found before - * folding in the nohz state and ageing the entire idle period. - * - * This avoids loosing a sample when we go idle between - * calc_load_account_active() (10 ticks ago) and now and thus - * under-accounting. + * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. */ calc_global_nohz(); } @@ -2406,13 +2534,16 @@ static void calc_load_account_active(struct rq *this_rq) return; delta = calc_load_fold_active(this_rq); - delta += calc_load_fold_idle(); if (delta) atomic_long_add(delta, &calc_load_tasks); this_rq->calc_load_update += LOAD_FREQ; } +/* + * End of global load-average stuff + */ + /* * The exact cpuload at various idx values, calculated at every tick would be * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b44d604b35d1..b6baf370cae9 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - calc_load_account_idle(rq); return rq->idle; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 6d52cea7f33d..55844f24435a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -942,8 +942,6 @@ static inline u64 sched_avg_period(void) return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; } -void calc_load_account_idle(struct rq *this_rq); - #ifdef CONFIG_SCHED_HRTICK /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 869997833928..4a08472c3ca7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -406,6 +406,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) */ if (!ts->tick_stopped) { select_nohz_load_balancer(1); + calc_load_enter_idle(); ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; @@ -597,6 +598,7 @@ void tick_nohz_idle_exit(void) account_idle_ticks(ticks); #endif + calc_load_exit_idle(); touch_softlockup_watchdog(); /* * Cancel the scheduled timer and restore the tick -- cgit v1.2.3 From 
cc2caea5b6152b8ce66dc2bbe83dc72b60612da8 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Fri, 6 Jul 2012 12:02:04 +0200 Subject: mm: cma: fix condition check when setting global cma area dev_set_cma_area incorrectly assigned cma to global area on first call due to incorrect check. This patch fixes this issue. Signed-off-by: Marek Szyprowski --- include/asm-generic/dma-contiguous.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-generic/dma-contiguous.h b/include/asm-generic/dma-contiguous.h index c544356b374b..294b1e755ab2 100644 --- a/include/asm-generic/dma-contiguous.h +++ b/include/asm-generic/dma-contiguous.h @@ -18,7 +18,7 @@ static inline void dev_set_cma_area(struct device *dev, struct cma *cma) { if (dev) dev->cma_area = cma; - if (!dev || !dma_contiguous_default_area) + if (!dev && !dma_contiguous_default_area) dma_contiguous_default_area = cma; } -- cgit v1.2.3 From c540521bba5d2f24bd2c0417157bfaf8b85e2eee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Jul 2012 11:23:24 -0700 Subject: security: Minor improvements to no_new_privs documentation The documentation didn't actually mention how to enable no_new_privs. This also adds a note about possible interactions between no_new_privs and LSMs (i.e. why teaching systemd to set no_new_privs is not necessarily a good idea), and it references the new docs from include/linux/prctl.h. Suggested-by: Rob Landley Signed-off-by: Andy Lutomirski Acked-by: Kees Cook Signed-off-by: James Morris --- Documentation/prctl/no_new_privs.txt | 7 +++++++ include/linux/prctl.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'include') diff --git a/Documentation/prctl/no_new_privs.txt b/Documentation/prctl/no_new_privs.txt index cb705ec69abe..f7be84fba910 100644 --- a/Documentation/prctl/no_new_privs.txt +++ b/Documentation/prctl/no_new_privs.txt @@ -25,6 +25,13 @@ bits will no longer change the uid or gid; file capabilities will not add to the permitted set, and LSMs will not relax constraints after execve. +To set no_new_privs, use prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0). + +Be careful, though: LSMs might also not tighten constraints on exec +in no_new_privs mode. (This means that setting up a general-purpose +service launcher to set no_new_privs before execing daemons may +interfere with LSM-based sandboxing.) + Note that no_new_privs does not prevent privilege changes that do not involve execve. An appropriately privileged task can still call setuid(2) and receive SCM_RIGHTS datagrams. diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 3988012255dc..289760f424aa 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -141,6 +141,8 @@ * Changing LSM security domain is considered a new privilege. So, for example, * asking selinux for a specific new context (e.g. with runcon) will result * in execve returning -EPERM. + * + * See Documentation/prctl/no_new_privs.txt for more details. */ #define PR_SET_NO_NEW_PRIVS 38 #define PR_GET_NO_NEW_PRIVS 39 -- cgit v1.2.3 From 222a806af830fda34ad1f6bc991cd226916de060 Mon Sep 17 00:00:00 2001 From: Mark Rustad Date: Thu, 21 Jun 2012 12:23:42 -0700 Subject: [SCSI] Fix NULL dereferences in scsi_cmd_to_driver Avoid crashing if the private_data pointer happens to be NULL. 
This has been seen sometimes when a host reset happens, notably when there are many LUNs: host3: Assigned Port ID 0c1601 scsi host3: libfc: Host reset succeeded on port (0c1601) BUG: unable to handle kernel NULL pointer dereference at 0000000000000350 IP: [] scsi_send_eh_cmnd+0x58/0x3a0 Process scsi_eh_3 (pid: 4144, threadinfo ffff88030920c000, task ffff880326b160c0) Stack: 000000010372e6ba 0000000000000282 000027100920dca0 ffffffffa0038ee0 0000000000000000 0000000000030003 ffff88030920dc80 ffff88030920dc80 00000002000e0000 0000000a00004000 ffff8803242f7760 ffff88031326ed80 Call Trace: [] ? lock_timer_base+0x70/0x70 [] scsi_eh_tur+0x3e/0xc0 [] scsi_eh_test_devices+0x76/0x170 [] scsi_eh_host_reset+0x85/0x160 [] scsi_eh_ready_devs+0x91/0x110 [] scsi_unjam_host+0xed/0x1f0 [] scsi_error_handler+0x1a8/0x200 [] ? scsi_unjam_host+0x1f0/0x1f0 [] kthread+0x9e/0xb0 [] kernel_thread_helper+0x4/0x10 [] ? kthread_freezable_should_stop+0x70/0x70 [] ? gs_change+0x13/0x13 Code: 25 28 00 00 00 48 89 45 c8 31 c0 48 8b 87 80 00 00 00 48 8d b5 60 ff ff ff 89 d1 48 89 fb 41 89 d6 4c 89 fa 48 8b 80 b8 00 00 00 <48> 8b 80 50 03 00 00 48 8b 00 48 89 85 38 ff ff ff 48 8b 07 4c RIP [] scsi_send_eh_cmnd+0x58/0x3a0 RSP CR2: 0000000000000350 Signed-off-by: Mark Rustad Tested-by: Marcus Dennis Cc: Signed-off-by: James Bottomley --- include/scsi/scsi_cmnd.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 1e1198546c72..ac06cc595890 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -134,10 +134,16 @@ struct scsi_cmnd { static inline struct scsi_driver *scsi_cmd_to_driver(struct scsi_cmnd *cmd) { + struct scsi_driver **sdp; + if (!cmd->request->rq_disk) return NULL; - return *(struct scsi_driver **)cmd->request->rq_disk->private_data; + sdp = (struct scsi_driver **)cmd->request->rq_disk->private_data; + if (!sdp) + return NULL; + + return *sdp; } extern struct scsi_cmnd *scsi_get_command(struct scsi_device *, gfp_t); -- cgit v1.2.3 From 6ef1b512f4e6f936d89aa20be3d97a7ec7c290ac Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 22 Jun 2012 10:52:34 -0700 Subject: [SCSI] libsas: fix taskfile corruption in sas_ata_qc_fill_rtf fill_result_tf() grabs the taskfile flags from the originating qc which sas_ata_qc_fill_rtf() promptly overwrites. The presence of an ata_taskfile in the sata_device makes it tempting to just copy the full contents in sas_ata_qc_fill_rtf(). However, libata really only wants the fis contents and expects the other portions of the taskfile to not be touched by ->qc_fill_rtf. To that end store a fis buffer in the sata_device and use ata_tf_from_fis() like every other ->qc_fill_rtf() implementation. 
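Reduced to its essence, the change contrasts as follows (a simplified sketch around libata's ata_tf_from_fis(), not the full patch): decoding the saved response FIS fills only the shadow-register fields of qc->result_tf, whereas copying a whole cached ata_taskfile would also clobber the flags libata set up when the command was issued.

static bool qc_fill_rtf_sketch(struct ata_queued_cmd *qc, const u8 *saved_fis)
{
	/* fills only the FIS-derived register fields; qc->result_tf.flags,
	 * set up when the command was issued, stay intact */
	ata_tf_from_fis(saved_fis, &qc->result_tf);
	return true;
}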
Cc: Reported-by: Praveen Murali Tested-by: Praveen Murali Signed-off-by: Dan Williams Signed-off-by: James Bottomley --- drivers/scsi/aic94xx/aic94xx_task.c | 2 +- drivers/scsi/libsas/sas_ata.c | 12 ++++++------ include/scsi/libsas.h | 6 ++++-- 3 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/drivers/scsi/aic94xx/aic94xx_task.c b/drivers/scsi/aic94xx/aic94xx_task.c index 532d212b6b2c..393e7ce8e95a 100644 --- a/drivers/scsi/aic94xx/aic94xx_task.c +++ b/drivers/scsi/aic94xx/aic94xx_task.c @@ -201,7 +201,7 @@ static void asd_get_response_tasklet(struct asd_ascb *ascb, if (SAS_STATUS_BUF_SIZE >= sizeof(*resp)) { resp->frame_len = le16_to_cpu(*(__le16 *)(r+6)); - memcpy(&resp->ending_fis[0], r+16, 24); + memcpy(&resp->ending_fis[0], r+16, ATA_RESP_FIS_SIZE); ts->buf_valid_size = sizeof(*resp); } } diff --git a/drivers/scsi/libsas/sas_ata.c b/drivers/scsi/libsas/sas_ata.c index 441d88ad99a7..d109cc3a17b6 100644 --- a/drivers/scsi/libsas/sas_ata.c +++ b/drivers/scsi/libsas/sas_ata.c @@ -139,12 +139,12 @@ static void sas_ata_task_done(struct sas_task *task) if (stat->stat == SAS_PROTO_RESPONSE || stat->stat == SAM_STAT_GOOD || ((stat->stat == SAM_STAT_CHECK_CONDITION && dev->sata_dev.command_set == ATAPI_COMMAND_SET))) { - ata_tf_from_fis(resp->ending_fis, &dev->sata_dev.tf); + memcpy(dev->sata_dev.fis, resp->ending_fis, ATA_RESP_FIS_SIZE); if (!link->sactive) { - qc->err_mask |= ac_err_mask(dev->sata_dev.tf.command); + qc->err_mask |= ac_err_mask(dev->sata_dev.fis[2]); } else { - link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.tf.command); + link->eh_info.err_mask |= ac_err_mask(dev->sata_dev.fis[2]); if (unlikely(link->eh_info.err_mask)) qc->flags |= ATA_QCFLAG_FAILED; } @@ -161,8 +161,8 @@ static void sas_ata_task_done(struct sas_task *task) qc->flags |= ATA_QCFLAG_FAILED; } - dev->sata_dev.tf.feature = 0x04; /* status err */ - dev->sata_dev.tf.command = ATA_ERR; + dev->sata_dev.fis[3] = 0x04; /* status err */ + dev->sata_dev.fis[2] = ATA_ERR; } } @@ -269,7 +269,7 @@ static bool sas_ata_qc_fill_rtf(struct ata_queued_cmd *qc) { struct domain_device *dev = qc->ap->private_data; - memcpy(&qc->result_tf, &dev->sata_dev.tf, sizeof(qc->result_tf)); + ata_tf_from_fis(dev->sata_dev.fis, &qc->result_tf); return true; } diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index f4f1c96dca72..10ce74f589c5 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -163,6 +163,8 @@ enum ata_command_set { ATAPI_COMMAND_SET = 1, }; +#define ATA_RESP_FIS_SIZE 24 + struct sata_device { enum ata_command_set command_set; struct smp_resp rps_resp; /* report_phy_sata_resp */ @@ -171,7 +173,7 @@ struct sata_device { struct ata_port *ap; struct ata_host ata_host; - struct ata_taskfile tf; + u8 fis[ATA_RESP_FIS_SIZE]; }; enum { @@ -537,7 +539,7 @@ enum exec_status { */ struct ata_task_resp { u16 frame_len; - u8 ending_fis[24]; /* dev to host or data-in */ + u8 ending_fis[ATA_RESP_FIS_SIZE]; /* dev to host or data-in */ }; #define SAS_STATUS_BUF_SIZE 96 -- cgit v1.2.3 From 6bd0405bb4196b44f1acb7a58f11382cdaf6f7f0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 5 Jul 2012 15:42:10 +0200 Subject: netfilter: nf_ct_ecache: fix crash with multiple containers, one shutting down Hans reports that he's still hitting: BUG: unable to handle kernel NULL pointer dereference at 000000000000027c IP: [] netlink_has_listeners+0xb/0x60 PGD 0 Oops: 0000 [#3] PREEMPT SMP CPU 0 It happens when adding a number of containers with do: nfct_query(h, NFCT_Q_CREATE, ct); and most 
likely one namespace shuts down. This problem was supposed to be fixed by: 70e9942 netfilter: nf_conntrack: make event callback registration per-netns Still, it was missing one rcu_access_pointer to check if the callback is set or not. Reported-by: Hans Schillstrom Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index a88fb6939387..e1ce1048fe5f 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -78,7 +78,7 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) struct net *net = nf_ct_net(ct); struct nf_conntrack_ecache *e; - if (net->ct.nf_conntrack_event_cb == NULL) + if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) return; e = nf_ct_ecache_find(ct); -- cgit v1.2.3 From dbf0e4c7257f8d684ec1a3c919853464293de66e Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Mon, 9 Jul 2012 11:09:21 -0400 Subject: PCI: EHCI: fix crash during suspend on ASUS computers Quite a few ASUS computers experience a nasty problem, related to the EHCI controllers, when going into system suspend. It was observed that the problem didn't occur if the controllers were not put into the D3 power state before starting the suspend, and commit 151b61284776be2d6f02d48c23c3625678960b97 (USB: EHCI: fix crash during suspend on ASUS computers) was created to do this. It turned out this approach messed up other computers that didn't have the problem -- it prevented USB wakeup from working. Consequently commit c2fb8a3fa25513de8fedb38509b1f15a5bbee47b (USB: add NO_D3_DURING_SLEEP flag and revert 151b61284776be2) was merged; it reverted the earlier commit and added a whitelist of known good board names. Now we know the actual cause of the problem. Thanks to AceLan Kao for tracking it down. According to him, an engineer at ASUS explained that some of their BIOSes contain a bug that was added in an attempt to work around a problem in early versions of Windows. When the computer goes into S3 suspend, the BIOS tries to verify that the EHCI controllers were first quiesced by the OS. Nothing's wrong with this, but the BIOS does it by checking that the PCI COMMAND registers contain 0 without checking the controllers' power state. If the register isn't 0, the BIOS assumes the controller needs to be quiesced and tries to do so. This involves making various MMIO accesses to the controller, which don't work very well if the controller is already in D3. The end result is a system hang or memory corruption. Since the value in the PCI COMMAND register doesn't matter once the controller has been suspended, and since the value will be restored anyway when the controller is resumed, we can work around the BIOS bug simply by setting the register to 0 during system suspend. This patch (as1590) does so and also reverts the second commit mentioned above, which is now unnecessary. In theory we could do this for every PCI device. However to avoid introducing new problems, the patch restricts itself to EHCI host controllers. Finally the affected systems can suspend with USB wakeup working properly.
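Reduced to pseudocode, the workaround amounts to a single conditional write in the suspend path (a sketch of the logic only; the real hunk in pci_pm_suspend_noirq() is in the diff below):

	/* value is a don't-care while suspended; restored on resume */
	if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI)
		pci_write_config_word(pci_dev, PCI_COMMAND, 0);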
Reference: https://bugzilla.kernel.org/show_bug.cgi?id=37632 Reference: https://bugzilla.kernel.org/show_bug.cgi?id=42728 Based-on-patch-by: AceLan Kao Signed-off-by: Alan Stern Tested-by: Dâniel Fraga Tested-by: Javier Marcet Tested-by: Andrey Rahmatullin Tested-by: Oleksij Rempel Tested-by: Pavel Pisa Cc: stable Acked-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-driver.c | 12 ++++++++++++ drivers/pci/pci.c | 5 ----- drivers/pci/quirks.c | 26 -------------------------- include/linux/pci.h | 2 -- 4 files changed, 12 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index bf0cee629b60..099f46cd8e87 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -748,6 +748,18 @@ static int pci_pm_suspend_noirq(struct device *dev) pci_pm_set_unknown_state(pci_dev); + /* + * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's + * PCI COMMAND register isn't 0, the BIOS assumes that the controller + * hasn't been quiesced and tries to turn it off. If the controller + * is already in D3, this can hang or cause memory corruption. + * + * Since the value of the COMMAND register doesn't matter once the + * device has been suspended, we can safely set it to 0 here. + */ + if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) + pci_write_config_word(pci_dev, PCI_COMMAND, 0); + return 0; } diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 77cb54a65cde..447e83472c01 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1744,11 +1744,6 @@ int pci_prepare_to_sleep(struct pci_dev *dev) if (target_state == PCI_POWER_ERROR) return -EIO; - /* Some devices mustn't be in D3 during system sleep */ - if (target_state == PCI_D3hot && - (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP)) - return 0; - pci_enable_wake(dev, target_state, device_may_wakeup(&dev->dev)); error = pci_set_power_state(dev, target_state); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 194b243a2817..2a7521677541 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2929,32 +2929,6 @@ static void __devinit disable_igfx_irq(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0102, disable_igfx_irq); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x010a, disable_igfx_irq); -/* - * The Intel 6 Series/C200 Series chipset's EHCI controllers on many - * ASUS motherboards will cause memory corruption or a system crash - * if they are in D3 while the system is put into S3 sleep. 
- */ -static void __devinit asus_ehci_no_d3(struct pci_dev *dev) -{ - const char *sys_info; - static const char good_Asus_board[] = "P8Z68-V"; - - if (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP) - return; - if (dev->subsystem_vendor != PCI_VENDOR_ID_ASUSTEK) - return; - sys_info = dmi_get_system_info(DMI_BOARD_NAME); - if (sys_info && memcmp(sys_info, good_Asus_board, - sizeof(good_Asus_board) - 1) == 0) - return; - - dev_info(&dev->dev, "broken D3 during system sleep on ASUS\n"); - dev->dev_flags |= PCI_DEV_FLAGS_NO_D3_DURING_SLEEP; - device_set_wakeup_capable(&dev->dev, false); -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c26, asus_ehci_no_d3); -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c2d, asus_ehci_no_d3); - static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, struct pci_fixup *end) { diff --git a/include/linux/pci.h b/include/linux/pci.h index fefb4e19bf6a..d8c379dba6ad 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -176,8 +176,6 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2, /* Provide indication device is assigned by a Virtual Machine Manager */ PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4, - /* Device causes system crash if in D3 during S3 sleep */ - PCI_DEV_FLAGS_NO_D3_DURING_SLEEP = (__force pci_dev_flags_t) 8, }; enum pci_irq_reroute_variant { -- cgit v1.2.3 From f55a6faa384304c89cfef162768e88374d3312cb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Jul 2012 18:43:19 -0400 Subject: hrtimer: Provide clock_was_set_delayed() clock_was_set() cannot be called from hard interrupt context because it calls on_each_cpu(). For fixing the widely reported leap seconds issue it is necessary to call it from hard interrupt context, i.e. the timer tick code, which does the timekeeping updates. Provide a new function which denotes it in the hrtimer cpu base structure of the cpu on which it is called and raise the hrtimer softirq. We then execute the clock_was_set() notificiation from softirq context in run_hrtimer_softirq(). The hrtimer softirq is rarely used, so polling the flag there is not a performance issue. [ tglx: Made it depend on CONFIG_HIGH_RES_TIMERS. We really should get rid of all this ifdeffery ASAP ] Signed-off-by: John Stultz Reported-by: Jan Engelhardt Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1341960205-56738-2-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 9 ++++++++- kernel/hrtimer.c | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0dc30c9f15..c9ec9400ee5b 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -165,6 +165,7 @@ enum hrtimer_base_type { * @lock: lock protecting the base and associated clock bases * and timers * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set: Indicates that clock was set from irq context. 
* @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() * @hres_active: State of high resolution mode @@ -177,7 +178,8 @@ enum hrtimer_base_type { */ struct hrtimer_cpu_base { raw_spinlock_t lock; - unsigned long active_bases; + unsigned int active_bases; + unsigned int clock_was_set; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; @@ -286,6 +288,8 @@ extern void hrtimer_peek_ahead_timers(void); # define MONOTONIC_RES_NSEC HIGH_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_HIGH_RES +extern void clock_was_set_delayed(void); + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC @@ -306,6 +310,9 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; } + +static inline void clock_was_set_delayed(void) { } + #endif extern void clock_was_set(void); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index ae34bf51682b..3c24fb2c25c8 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -717,6 +717,19 @@ static int hrtimer_switch_to_hres(void) return 1; } +/* + * Called from timekeeping code to reprogramm the hrtimer interrupt + * device. If called from the timer interrupt context we defer it to + * softirq context. + */ +void clock_was_set_delayed(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + cpu_base->clock_was_set = 1; + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); +} + #else static inline int hrtimer_hres_active(void) { return 0; } @@ -1395,6 +1408,13 @@ void hrtimer_peek_ahead_timers(void) static void run_hrtimer_softirq(struct softirq_action *h) { + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + + if (cpu_base->clock_was_set) { + cpu_base->clock_was_set = 0; + clock_was_set(); + } + hrtimer_peek_ahead_timers(); } -- cgit v1.2.3 From f6c06abfb3972ad4914cef57d8348fcb2932bc3b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 10 Jul 2012 18:43:24 -0400 Subject: timekeeping: Provide hrtimer update function To finally fix the infamous leap second issue and other race windows caused by functions which change the offsets between the various time bases (CLOCK_MONOTONIC, CLOCK_REALTIME and CLOCK_BOOTTIME) we need a function which atomically gets the current monotonic time and updates the offsets of CLOCK_REALTIME and CLOCK_BOOTTIME with minimalistic overhead. The previous patch which provides ktime_t offsets allows us to make this function almost as cheap as ktime_get() which is going to be replaced in hrtimer_interrupt(). 
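The helper gets its atomicity from the timekeeper's seqlock: readers retry until they observe an unchanged sequence count, so the returned time and both offsets always come from the same timekeeping update. The generic shape of that read side (a sketch, assuming a seqlock_t guarding the timekeeper):

	unsigned int seq;

	do {
		seq = read_seqbegin(&tk_lock);
		/* snapshot xtime, offs_real and offs_boot here */
	} while (read_seqretry(&tk_lock, seq));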
Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Acked-by: Peter Zijlstra Acked-by: Prarit Bhargava Cc: stable@vger.kernel.org Signed-off-by: John Stultz Link: http://lkml.kernel.org/r/1341960205-56738-7-git-send-email-johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 1 + kernel/time/timekeeping.c | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c9ec9400ee5b..cc07d2777bbe 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -327,6 +327,7 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); extern ktime_t ktime_get_boottime(void); extern ktime_t ktime_get_monotonic_offset(void); +extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1c038dac71a2..269b1fe5f2ae 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1271,6 +1271,40 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, } while (read_seqretry(&timekeeper.lock, seq)); } +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * ktime_get_update_offsets - hrtimer helper + * @offs_real: pointer to storage for monotonic -> realtime offset + * @offs_boot: pointer to storage for monotonic -> boottime offset + * + * Returns current monotonic time and updates the offsets + * Called from hrtimer_interupt() or retrigger_next_event() + */ +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +{ + ktime_t now; + unsigned int seq; + u64 secs, nsecs; + + do { + seq = read_seqbegin(&timekeeper.lock); + + secs = timekeeper.xtime.tv_sec; + nsecs = timekeeper.xtime.tv_nsec; + nsecs += timekeeping_get_ns(); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + + *offs_real = timekeeper.offs_real; + *offs_boot = timekeeper.offs_boot; + } while (read_seqretry(&timekeeper.lock, seq)); + + now = ktime_add_ns(ktime_set(secs, 0), nsecs); + now = ktime_sub(now, *offs_real); + return now; +} +#endif + /** * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format */ -- cgit v1.2.3 From d8adde17e5f858427504725218c56aef90e90fc7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 11 Jul 2012 14:01:52 -0700 Subject: memory hotplug: fix invalid memory access caused by stale kswapd pointer kswapd_stop() is called to destroy the kswapd work thread when all memory of a NUMA node has been offlined. But kswapd_stop() only terminates the work thread without resetting NODE_DATA(nid)->kswapd to NULL. The stale pointer will prevent kswapd_run() from creating a new work thread when adding memory to the memory-less NUMA node again. Eventually the stale pointer may cause invalid memory access. An example stack dump as below. It's reproduced with 2.6.32, but latest kernel has the same issue. 
BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] exit_creds+0x12/0x78 PGD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/system/memory/memory391/state CPU 11 Modules linked in: cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq microcode fuse loop dm_mod tpm_tis rtc_cmos i2c_i801 rtc_core tpm serio_raw pcspkr sg tpm_bios igb i2c_core iTCO_wdt rtc_lib mptctl iTCO_vendor_support button dca bnx2 usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix libata thermal processor thermal_sys hwmon mptsas mptscsih mptbase scsi_transport_sas scsi_mod Pid: 7949, comm: sh Not tainted 2.6.32.12-qiuxishi-5-default #92 Tecal RH2285 RIP: 0010:exit_creds+0x12/0x78 RSP: 0018:ffff8806044f1d78 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff880604f22140 RCX: 0000000000019502 RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000 RBP: ffff880604f22150 R08: 0000000000000000 R09: ffffffff81a4dc10 R10: 00000000000032a0 R11: ffff880006202500 R12: 0000000000000000 R13: 0000000000c40000 R14: 0000000000008000 R15: 0000000000000001 FS: 00007fbc03d066f0(0000) GS:ffff8800282e0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000060f029000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process sh (pid: 7949, threadinfo ffff8806044f0000, task ffff880603d7c600) Stack: ffff880604f22140 ffffffff8103aac5 ffff880604f22140 ffffffff8104d21e ffff880006202500 0000000000008000 0000000000c38000 ffffffff810bd5b1 0000000000000000 ffff880603d7c600 00000000ffffdd29 0000000000000003 Call Trace: __put_task_struct+0x5d/0x97 kthread_stop+0x50/0x58 offline_pages+0x324/0x3da memory_block_change_state+0x179/0x1db store_mem_state+0x9e/0xbb sysfs_write_file+0xd0/0x107 vfs_write+0xad/0x169 sys_write+0x45/0x6e system_call_fastpath+0x16/0x1b Code: ff 4d 00 0f 94 c0 84 c0 74 08 48 89 ef e8 1f fd ff ff 5b 5d 31 c0 41 5c c3 53 48 8b 87 20 06 00 00 48 89 fb 48 8b bf 18 06 00 00 <8b> 00 48 c7 83 18 06 00 00 00 00 00 00 f0 ff 0f 0f 94 c0 84 c0 RIP exit_creds+0x12/0x78 RSP CR2: 0000000000000000 [akpm@linux-foundation.org: add pglist_data.kswapd locking comments] Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Acked-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- mm/vmscan.c | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2427706f78b4..68c569fcbb66 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -694,7 +694,7 @@ typedef struct pglist_data { range, including holes */ int node_id; wait_queue_head_t kswapd_wait; - struct task_struct *kswapd; + struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ int kswapd_max_order; enum zone_type classzone_idx; } pg_data_t; diff --git a/mm/vmscan.c b/mm/vmscan.c index eeb3bc9d1d36..661576324c7f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2955,14 +2955,17 @@ int kswapd_run(int nid) } /* - * Called by memory hotplug when all memory in a node is offlined. + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold lock_memory_hotplug(). 
*/ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; - if (kswapd) + if (kswapd) { kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; + } } static int __init kswapd_init(void) -- cgit v1.2.3 From 99ab7b19440a72ebdf225f99b20f8ef40decee86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:53 -0700 Subject: mm: sparse: fix usemap allocation above node descriptor section After commit f5bf18fa22f8 ("bootmem/sparsemem: remove limit constraint in alloc_bootmem_section"), usemap allocations may easily be placed outside the optimal section that holds the node descriptor, even if there is space available in that section. This results in unnecessary hotplug dependencies that need to have the node unplugged before the section holding the usemap. The reason is that the bootmem allocator doesn't guarantee a linear search starting from the passed allocation goal but may start out at a much higher address absent an upper limit. Fix this by trying the allocation with the limit at the section end, then retrying without the limit if that fails. This keeps the fix from f5bf18fa22f8 of not panicking if the allocation does not fit in the section, but still makes sure to try to stay within the section first. Signed-off-by: Yinghai Lu Signed-off-by: Johannes Weiner Cc: [3.3.x, 3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 5 +++++ mm/bootmem.c | 2 +- mm/nobootmem.c | 2 +- mm/sparse.c | 18 +++++++++++++----- 4 files changed, 20 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 324fe08ea3b1..6d6795d46a75 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -91,6 +91,11 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit); extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); diff --git a/mm/bootmem.c b/mm/bootmem.c index ec4fcb7a56c8..73096630cb35 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index d23415c001bc..0900b3910cda 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -274,7 +274,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, diff --git a/mm/sparse.c b/mm/sparse.c index e861397016a9..c7bb952400c8 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -275,8 +275,9 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - pg_data_t *host_pgdat; - unsigned long goal; + unsigned long goal, limit; + unsigned long *p; + int nid; /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while @@
-288,9 +289,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * this problem. */ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); - host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); - return __alloc_bootmem_node_nopanic(host_pgdat, size, - SMP_CACHE_BYTES, goal); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, + SMP_CACHE_BYTES, goal, limit); + if (!p && limit) { + limit = 0; + goto again; + } + return p; } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) -- cgit v1.2.3 From 29f6738609e40227dabcc63bfb3b84b3726a75bd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:56 -0700 Subject: memblock: free allocated memblock_reserved_regions later memblock_free_reserved_regions() calls memblock_free(), but memblock_free() may itself double the reserved.regions array, so we could end up freeing the old range that backs reserved.regions. Also, tj said there is another bug which could be related to this. | I don't think we're saving any noticeable | amount by doing this "free - give it to page allocator - reserve | again" dancing. We should just allocate regions aligned to page | boundaries and free them later when memblock is no longer in use. In that case, with DEBUG_PAGEALLOC enabled, we get a panic: memblock_free: [0x0000102febc080-0x0000102febf080] memblock_free_reserved_regions+0x37/0x39 BUG: unable to handle kernel paging request at ffff88102febd948 IP: [] __next_free_mem_range+0x9b/0x155 PGD 4826063 PUD cf67a067 PMD cf7fa067 PTE 800000102febd160 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU 0 Pid: 0, comm: swapper Not tainted 3.5.0-rc2-next-20120614-sasha #447 RIP: 0010:[] [] __next_free_mem_range+0x9b/0x155 See the discussion at https://lkml.org/lkml/2012/6/13/469 So try to allocate with PAGE_SIZE alignment and free it later.
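To illustrate the intended late-free sequence, a minimal sketch (mirroring the mm/nobootmem.c hunk further below): because the array is sized with PAGE_ALIGN() and placed with PAGE_SIZE alignment, the whole range can be handed back to the page allocator once memblock is no longer in use:

	/* Sketch: release the memblock reserved-regions array late.
	 * A returned size of 0 means the static init array is still
	 * in use and there is nothing to free. */
	phys_addr_t start, size;

	size = get_allocated_memblock_reserved_regions_info(&start);
	if (size)
		count += __free_memory_core(start, start + size);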
Reported-by: Sasha Levin Acked-by: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Yinghai Lu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 4 +--- mm/memblock.c | 51 ++++++++++++++++++++++-------------------- mm/nobootmem.c | 38 ++++++++++++++++++++++-------------- 3 files changed, 47 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index a6bb10235148..19dc455b4f3d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -50,9 +50,7 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); -int memblock_free_reserved_regions(void); -int memblock_reserve_reserved_regions(void); - +phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index d4382095f8bd..5cc6731b00cc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, MAX_NUMNODES); } -/* - * Free memblock.reserved.regions - */ -int __init_memblock memblock_free_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_free(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - -/* - * Reserve memblock.reserved.regions - */ -int __init_memblock memblock_reserve_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_reserve(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { type->total_size -= type->regions[r].size; @@ -184,6 +160,18 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } +phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( + phys_addr_t *addr) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + *addr = __pa(memblock.reserved.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); +} + /** * memblock_double_array - double the size of the memblock regions array * @type: memblock type of the regions array being doubled @@ -204,6 +192,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); int *in_slab; @@ -217,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* + * We need to allocate the new array aligned to PAGE_SIZE, + * so we can free it completely later.
+ */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); /* Retrieve the slab flag */ if (type == &memblock.memory) @@ -245,11 +240,11 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, addr = memblock_find_in_range(new_area_start + new_area_size, memblock.current_limit, - new_size, sizeof(phys_addr_t)); + new_alloc_size, PAGE_SIZE); if (!addr && new_area_size) addr = memblock_find_in_range(0, min(new_area_start, memblock.current_limit), - new_size, sizeof(phys_addr_t)); + new_alloc_size, PAGE_SIZE); new_array = addr ? __va(addr) : 0; } @@ -279,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_size); + memblock_free(__pa(old_array), old_alloc_size); /* Reserve the new array if that comes from the memblock. * Otherwise, we needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_size)); + BUG_ON(memblock_reserve(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 0900b3910cda..405573010f99 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn > end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + unsigned long __init free_low_memory_core_early(int nodeid) { unsigned long count = 0; - phys_addr_t start, end; + phys_addr_t start, end, size; u64 i; - /* free reserved array temporarily so that it's treated as free area */ - memblock_free_reserved_regions(); - - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - if (start_pfn < end_pfn) { - __free_pages_memory(start_pfn, end_pfn); - count += end_pfn - start_pfn; - } - } + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + count += __free_memory_core(start, end); + + /* free range that is used for reserved array if we allocated it */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); - /* put region array back? */ - memblock_reserve_reserved_regions(); return count; } -- cgit v1.2.3 From 9e33ce453f8ac8452649802bee1f410319408f4b Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sat, 7 Jul 2012 18:26:10 +0800 Subject: ipvs: fix oops on NAT reply in br_nf context IPVS should not reset skb->nf_bridge in the FORWARD hook by calling nf_reset for NAT replies. It triggers an oops in br_nf_forward_finish.
[ 579.781508] BUG: unable to handle kernel NULL pointer dereference at 0000000000000004 [ 579.781669] IP: [] br_nf_forward_finish+0x58/0x112 [ 579.781792] PGD 218f9067 PUD 0 [ 579.781865] Oops: 0000 [#1] SMP [ 579.781945] CPU 0 [ 579.781983] Modules linked in: [ 579.782047] [ 579.782080] [ 579.782114] Pid: 4644, comm: qemu Tainted: G W 3.5.0-rc5-00006-g95e69f9 #282 Hewlett-Packard /30E8 [ 579.782300] RIP: 0010:[] [] br_nf_forward_finish+0x58/0x112 [ 579.782455] RSP: 0018:ffff88007b003a98 EFLAGS: 00010287 [ 579.782541] RAX: 0000000000000008 RBX: ffff8800762ead00 RCX: 000000000001670a [ 579.782653] RDX: 0000000000000000 RSI: 000000000000000a RDI: ffff8800762ead00 [ 579.782845] RBP: ffff88007b003ac8 R08: 0000000000016630 R09: ffff88007b003a90 [ 579.782957] R10: ffff88007b0038e8 R11: ffff88002da37540 R12: ffff88002da01a02 [ 579.783066] R13: ffff88002da01a80 R14: ffff88002d83c000 R15: ffff88002d82a000 [ 579.783177] FS: 0000000000000000(0000) GS:ffff88007b000000(0063) knlGS:00000000f62d1b70 [ 579.783306] CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b [ 579.783395] CR2: 0000000000000004 CR3: 00000000218fe000 CR4: 00000000000027f0 [ 579.783505] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 579.783684] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 579.783795] Process qemu (pid: 4644, threadinfo ffff880021b20000, task ffff880021aba760) [ 579.783919] Stack: [ 579.783959] ffff88007693cedc ffff8800762ead00 ffff88002da01a02 ffff8800762ead00 [ 579.784110] ffff88002da01a02 ffff88002da01a80 ffff88007b003b18 ffffffff817b26c7 [ 579.784260] ffff880080000000 ffffffff81ef59f0 ffff8800762ead00 ffffffff81ef58b0 [ 579.784477] Call Trace: [ 579.784523] [ 579.784562] [ 579.784603] [] br_nf_forward_ip+0x275/0x2c8 [ 579.784707] [] nf_iterate+0x47/0x7d [ 579.784797] [] ? br_dev_queue_push_xmit+0xae/0xae [ 579.784906] [] nf_hook_slow+0x6d/0x102 [ 579.784995] [] ? br_dev_queue_push_xmit+0xae/0xae [ 579.785175] [] ? _raw_write_unlock_bh+0x19/0x1b [ 579.785179] [] __br_forward+0x97/0xa2 [ 579.785179] [] br_handle_frame_finish+0x1a6/0x257 [ 579.785179] [] br_nf_pre_routing_finish+0x26d/0x2cb [ 579.785179] [] br_nf_pre_routing+0x55d/0x5c1 [ 579.785179] [] nf_iterate+0x47/0x7d [ 579.785179] [] ? br_handle_local_finish+0x44/0x44 [ 579.785179] [] nf_hook_slow+0x6d/0x102 [ 579.785179] [] ? br_handle_local_finish+0x44/0x44 [ 579.785179] [] ? sky2_poll+0xb35/0xb54 [ 579.785179] [] br_handle_frame+0x213/0x229 [ 579.785179] [] ? br_handle_frame_finish+0x257/0x257 [ 579.785179] [] __netif_receive_skb+0x2b4/0x3f1 [ 579.785179] [] process_backlog+0x99/0x1e2 [ 579.785179] [] net_rx_action+0xdf/0x242 [ 579.785179] [] __do_softirq+0xc1/0x1e0 [ 579.785179] [] ? trace_hardirqs_off_thunk+0x3a/0x6c [ 579.785179] [] call_softirq+0x1c/0x30 The steps to reproduce are as follows: 1. On Host1, set up bridge br0 (192.168.1.106) 2. Boot a kvm guest (192.168.1.105) on Host1 and start httpd 3. Start the IPVS service on Host1 ipvsadm -A -t 192.168.1.106:80 -s rr ipvsadm -a -t 192.168.1.106:80 -r 192.168.1.105:80 -m 4. Run the apache benchmark on Host2 (192.168.1.101) ab -n 1000 http://192.168.1.106/ The offending code path is: ip_vs_reply4 -> ip_vs_out -> handle_response -> ip_vs_notrack -> nf_reset() { skb->nf_bridge = NULL; } Actually, in this case IPVS just wants to replace nfct with the untracked version. So replace the nf_reset(skb) call in ip_vs_notrack() with a nf_conntrack_put(skb->nfct) call.
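For context, a simplified sketch of what nf_reset() does in kernels of this era (conntrack reassembly handling omitted; illustration only, hypothetical name), which shows why it is too big a hammer on this path:

	static inline void nf_reset_sketch(struct sk_buff *skb)
	{
		nf_conntrack_put(skb->nfct);	/* dropping the conntrack ref: wanted */
		skb->nfct = NULL;
		nf_bridge_put(skb->nf_bridge);	/* dropping bridge state: NOT wanted, */
		skb->nf_bridge = NULL;		/* br_nf_forward_finish() oopses on it */
	}

The replacement below keeps only the conntrack part of this, leaving skb->nf_bridge intact for the bridge netfilter forward path.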
Signed-off-by: Lin Ming Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d6146b4811c2..95374d1696a1 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1425,7 +1425,7 @@ static inline void ip_vs_notrack(struct sk_buff *skb) struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (!ct || !nf_ct_is_untracked(ct)) { - nf_reset(skb); + nf_conntrack_put(skb->nfct); skb->nfct = &nf_ct_untracked_get()->ct_general; skb->nfctinfo = IP_CT_NEW; nf_conntrack_get(skb->nfct); -- cgit v1.2.3