From f1c6f1a7eed963ed233ba4c8b6fa8addb86c6ddc Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Wed, 26 Oct 2011 23:14:16 +0200 Subject: sched: Set the command name of the idle tasks in SMP kernels In UP systems, the idle task is initialized using the init_task structure from which the command name is taken (currently "swapper"). In SMP systems, one idle task per CPU is forked by the worker thread from which the task structure is copied. The command name is, therefore, "kworker/0:0" or "kworker/0:1", if not updated. Since such update was lacking, all idle tasks in SMP systems were incorrectly named. This longtime bug was not discovered immediately, because there is no /proc/0 entry - the bug only becomes apparent when tracing is enabled. This patch sets the command name of the idle tasks in SMP systems to the name that is used in the INIT_TASK structure suffixed by a slash and the number of the CPU. Signed-off-by: Carsten Emde Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111026211708.768925506@osadl.org Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 08ffab01e76c..b6e5b8b000e0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -126,6 +126,8 @@ extern struct cred init_cred; # define INIT_PERF_EVENTS(tsk) #endif +#define INIT_TASK_COMM "swapper" + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -162,7 +164,7 @@ extern struct cred init_cred; .group_leader = &tsk, \ RCU_INIT_POINTER(.real_cred, &init_cred), \ RCU_INIT_POINTER(.cred, &init_cred), \ - .comm = "swapper", \ + .comm = INIT_TASK_COMM, \ .thread = INIT_THREAD, \ .fs = &init_fs, \ .files = &init_files, \ -- cgit v1.2.3 From 5151412dd4338b273afdb107c3772528e9e67d92 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 23 Nov 2011 10:59:13 +0100 Subject: block: initialize request_queue's numa node during struct request_queue is allocated with __GFP_ZERO so its "node" field is zero before initialization. This causes an oops if node 0 is offline in the page allocator because its zonelists are not initialized. From Dave Young's dmesg: SRAT: Node 1 PXM 2 0-d0000000 SRAT: Node 1 PXM 2 100000000-330000000 SRAT: Node 0 PXM 1 330000000-630000000 Initmem setup node 1 0000000000000000-000000000affb000 ... Built 1 zonelists in Node order, mobility grouping on. ... BUG: unable to handle kernel paging request at 0000000000001c08 IP: [] __alloc_pages_nodemask+0xb5/0x870 and __alloc_pages_nodemask+0xb5 translates to a NULL pointer on zonelist->_zonerefs. The fix is to initialize q->node at the time of allocation so the correct node is passed to the slab allocator later. Since blk_init_allocated_queue_node() is no longer needed, merge it with blk_init_allocated_queue(). [rientjes@google.com: changelog, initializing q->node] Cc: stable@vger.kernel.org [2.6.37+] Reported-by: Dave Young Signed-off-by: Mike Snitzer Signed-off-by: David Rientjes Tested-by: Dave Young Signed-off-by: Jens Axboe --- block/blk-core.c | 14 +++----------- include/linux/blkdev.h | 3 --- 2 files changed, 3 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index ea70e6c80cd3..20d69f6beb6b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -467,6 +467,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; q->backing_dev_info.name = "block"; + q->node = node_id; err = bdi_init(&q->backing_dev_info); if (err) { @@ -551,7 +552,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) if (!uninit_q) return NULL; - q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + q = blk_init_allocated_queue(uninit_q, rfn, lock); if (!q) blk_cleanup_queue(uninit_q); @@ -562,19 +563,10 @@ EXPORT_SYMBOL(blk_init_queue_node); struct request_queue * blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, spinlock_t *lock) -{ - return blk_init_allocated_queue_node(q, rfn, lock, -1); -} -EXPORT_SYMBOL(blk_init_allocated_queue); - -struct request_queue * -blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, - spinlock_t *lock, int node_id) { if (!q) return NULL; - q->node = node_id; if (blk_init_free_list(q)) return NULL; @@ -604,7 +596,7 @@ blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, return NULL; } -EXPORT_SYMBOL(blk_init_allocated_queue_node); +EXPORT_SYMBOL(blk_init_allocated_queue); int blk_get_queue(struct request_queue *q) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c7a6d3b5bc7b..94acd8172b5b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -805,9 +805,6 @@ extern void blk_unprep_request(struct request *); */ extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id); -extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *, - request_fn_proc *, - spinlock_t *, int node_id); extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); extern struct request_queue *blk_init_allocated_queue(struct request_queue *, request_fn_proc *, spinlock_t *); -- cgit v1.2.3 From a67ba43d30bf8c1cfdc2615439455302d2408453 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 1 Dec 2011 12:54:31 -0500 Subject: asm-generic/unistd.h: support new process_vm_{readv,write} syscalls Also prototype the "compat" functions so they can be referenced from C code. Signed-off-by: Chris Metcalf Acked-by: Arnd Bergmann --- include/asm-generic/unistd.h | 8 +++++++- include/linux/compat.h | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index f4c38d8c6674..2292d1af9d70 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -685,9 +685,15 @@ __SYSCALL(__NR_syncfs, sys_syncfs) __SYSCALL(__NR_setns, sys_setns) #define __NR_sendmmsg 269 __SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg) +#define __NR_process_vm_readv 270 +__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \ + compat_sys_process_vm_readv) +#define __NR_process_vm_writev 271 +__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ + compat_sys_process_vm_writev) #undef __NR_syscalls -#define __NR_syscalls 270 +#define __NR_syscalls 272 /* * All syscalls below here should go away really, diff --git a/include/linux/compat.h b/include/linux/compat.h index 154bf5683015..66ed067fb729 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -552,5 +552,14 @@ extern ssize_t compat_rw_copy_check_uvector(int type, extern void __user *compat_alloc_user_space(unsigned long len); +asmlinkage ssize_t compat_sys_process_vm_readv(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, const struct compat_iovec __user *rvec, + unsigned long riovcnt, unsigned long flags); +asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, const struct compat_iovec __user *rvec, + unsigned long riovcnt, unsigned long flags); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ -- cgit v1.2.3 From 10c6db110d0eb4466b59812c49088ab56218fc2e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 26 Nov 2011 02:47:31 +0100 Subject: perf: Fix loss of notification with multi-event When you do: $ perf record -e cycles,cycles,cycles noploop 10 You expect about 10,000 samples for each event, i.e., 10s at 1000samples/sec. However, this is not what's happening. You get much fewer samples, maybe 3700 samples/event: $ perf report -D | tail -15 Aggregated stats: TOTAL events: 10998 MMAP events: 66 COMM events: 2 SAMPLE events: 10930 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 cycles stats: TOTAL events: 3642 SAMPLE events: 3642 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 On a Intel Nehalem or even AMD64, there are 4 counters capable of measuring cycles, so there is plenty of space to measure those events without multiplexing (even with the NMI watchdog active). And even with multiplexing, we'd expect roughly the same number of samples per event. The root of the problem was that when the event that caused the buffer to become full was not the first event passed on the cmdline, the user notification would get lost. The notification was sent to the file descriptor of the overflowed event but the perf tool was not polling on it. The perf tool aggregates all samples into a single buffer, i.e., the buffer of the first event. Consequently, it assumes notifications for any event will come via that descriptor. The seemingly straight forward solution of moving the waitq into the ringbuffer object doesn't work because of life-time issues. One could perf_event_set_output() on a fd that you're also blocking on and cause the old rb object to be freed while its waitq would still be referenced by the blocked thread -> FAIL. Therefore link all events to the ringbuffer and broadcast the wakeup from the ringbuffer object to all possible events that could be waited upon. This is rather ugly, and we're open to better solutions but it works for now. Reported-by: Stephane Eranian Finished-by: Stephane Eranian Reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111126014731.GA7030@quad Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 86 +++++++++++++++++++++++++++++++++++++++++++-- kernel/events/internal.h | 3 ++ kernel/events/ring_buffer.c | 3 ++ 4 files changed, 91 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e9ebe5e0091..b1f89122bf6a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -822,6 +822,7 @@ struct perf_event { int mmap_locked; struct user_struct *mmap_user; struct ring_buffer *rb; + struct list_head rb_entry; /* poll related */ wait_queue_head_t waitq; diff --git a/kernel/events/core.c b/kernel/events/core.c index b0c1186fd97b..600c1629b64d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -3191,12 +3194,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) struct ring_buffer *rb; unsigned int events = POLL_HUP; + /* + * Race between perf_event_set_output() and perf_poll(): perf_poll() + * grabs the rb reference but perf_event_set_output() overrides it. + * Here is the timeline for two threads T1, T2: + * t0: T1, rb = rcu_dereference(event->rb) + * t1: T2, old_rb = event->rb + * t2: T2, event->rb = new rb + * t3: T2, ring_buffer_detach(old_rb) + * t4: T1, ring_buffer_attach(rb1) + * t5: T1, poll_wait(event->waitq) + * + * To avoid this problem, we grab mmap_mutex in perf_poll() + * thereby ensuring that the assignment of the new ring buffer + * and the detachment of the old buffer appear atomic to perf_poll() + */ + mutex_lock(&event->mmap_mutex); + rcu_read_lock(); rb = rcu_dereference(event->rb); - if (rb) + if (rb) { + ring_buffer_attach(event, rb); events = atomic_xchg(&rb->poll, 0); + } rcu_read_unlock(); + mutex_unlock(&event->mmap_mutex); + poll_wait(file, &event->waitq, wait); return events; @@ -3497,6 +3521,49 @@ unlock: return ret; } +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (!list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + if (!list_empty(&event->rb_entry)) + goto unlock; + + list_add(&event->rb_entry, &rb->event_list); +unlock: + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_detach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_wakeup(struct perf_event *event) +{ + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + wake_up_all(&event->waitq); + } + rcu_read_unlock(); +} + static void rb_free_rcu(struct rcu_head *rcu_head) { struct ring_buffer *rb; @@ -3522,9 +3589,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) static void ring_buffer_put(struct ring_buffer *rb) { + struct perf_event *event, *n; + unsigned long flags; + if (!atomic_dec_and_test(&rb->refcount)) return; + spin_lock_irqsave(&rb->event_lock, flags); + list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + } + spin_unlock_irqrestore(&rb->event_lock, flags); + call_rcu(&rb->rcu_head, rb_free_rcu); } @@ -3547,6 +3624,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); ring_buffer_put(rb); @@ -3701,7 +3779,7 @@ static const struct file_operations perf_fops = { void perf_event_wakeup(struct perf_event *event) { - wake_up_all(&event->waitq); + ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(&event->fasync, SIGIO, event->pending_kill); @@ -5823,6 +5901,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->rb_entry); + init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); @@ -6029,6 +6109,8 @@ set: old_rb = event->rb; rcu_assign_pointer(event->rb, rb); + if (old_rb) + ring_buffer_detach(event, old_rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116c..64568a699375 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -22,6 +22,9 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + /* poll crap */ + spinlock_t event_lock; + struct list_head event_list; struct perf_event_mmap_page *user_page; void *data_pages[0]; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a29205cc0f..7f3011c6b57f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->writable = 1; atomic_set(&rb->refcount, 1); + + INIT_LIST_HEAD(&rb->event_list); + spin_lock_init(&rb->event_lock); } #ifndef CONFIG_PERF_USE_VMALLOC -- cgit v1.2.3 From f62ef5f3e9cff065aa845e2b7f487e1810b8e57e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 2 Dec 2011 08:21:43 +0100 Subject: x86, amd: Fix up numa_node information for AMD CPU family 15h model 0-0fh northbridge functions I've received complaints that the numa_node attribute for family 15h model 00-0fh (e.g. Interlagos) northbridge functions shows -1 instead of the proper node ID. Correct this with attached quirks (similar to quirks for other AMD CPU families used in multi-socket systems). Signed-off-by: Andreas Herrmann Cc: Frank Arnold Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20111202072143.GA31916@alberich.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 13 +++++++++++++ include/linux/pci_ids.h | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index b78643d0f9a5..03920a15a632 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -553,4 +553,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, quirk_amd_nb_node); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, + quirk_amd_nb_node); + #endif diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 172ba70306d1..2aaee0ca9da8 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -517,8 +517,12 @@ #define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302 #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 #define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 +#define PCI_DEVICE_ID_AMD_15H_NB_F0 0x1600 +#define PCI_DEVICE_ID_AMD_15H_NB_F1 0x1601 +#define PCI_DEVICE_ID_AMD_15H_NB_F2 0x1602 #define PCI_DEVICE_ID_AMD_15H_NB_F3 0x1603 #define PCI_DEVICE_ID_AMD_15H_NB_F4 0x1604 +#define PCI_DEVICE_ID_AMD_15H_NB_F5 0x1605 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 -- cgit v1.2.3 From 27b14b56af081ec7edeefb3a38b2c9577cc5ef48 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Nov 2011 09:09:35 +0800 Subject: tracing: Restore system filter behavior Though not all events have field 'prev_pid', it was allowed to do this: # echo 'prev_pid == 100' > events/sched/filter but commit 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 (tracing/filter: Swap entire filter of events) broke it without any reason. Link: http://lkml.kernel.org/r/4EAF46CF.8040408@cn.fujitsu.com Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_events_filter.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 96efa6794ea5..c3da42dd22ba 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -172,6 +172,7 @@ enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_RECORDED_CMD_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, + TRACE_EVENT_FL_NO_SET_FILTER_BIT, }; enum { @@ -179,6 +180,7 @@ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), + TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), }; struct ftrace_event_call { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d6e7926dcd26..95dc31efd6dd 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1649,7 +1649,9 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - goto fail; + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } list_for_each_entry(call, &ftrace_events, list) { @@ -1658,6 +1660,9 @@ static int replace_system_preds(struct event_subsystem *system, if (strcmp(call->class->system, system->name) != 0) continue; + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + continue; + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; -- cgit v1.2.3 From 02125a826459a6ad142f8d91c5b6357562f96615 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Dec 2011 08:43:34 -0500 Subject: fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API __d_path() API is asking for trouble and in case of apparmor d_namespace_path() getting just that. The root cause is that when __d_path() misses the root it had been told to look for, it stores the location of the most remote ancestor in *root. Without grabbing references. Sure, at the moment of call it had been pinned down by what we have in *path. And if we raced with umount -l, we could have very well stopped at vfsmount/dentry that got freed as soon as prepend_path() dropped vfsmount_lock. It is safe to compare these pointers with pre-existing (and known to be still alive) vfsmount and dentry, as long as all we are asking is "is it the same address?". Dereferencing is not safe and apparmor ended up stepping into that. d_namespace_path() really wants to examine the place where we stopped, even if it's not connected to our namespace. As the result, it looked at ->d_sb->s_magic of a dentry that might've been already freed by that point. All other callers had been careful enough to avoid that, but it's really a bad interface - it invites that kind of trouble. The fix is fairly straightforward, even though it's bigger than I'd like: * prepend_path() root argument becomes const. * __d_path() is never called with NULL/NULL root. It was a kludge to start with. Instead, we have an explicit function - d_absolute_root(). Same as __d_path(), except that it doesn't get root passed and stops where it stops. apparmor and tomoyo are using it. * __d_path() returns NULL on path outside of root. The main caller is show_mountinfo() and that's precisely what we pass root for - to skip those outside chroot jail. Those who don't want that can (and do) use d_path(). * __d_path() root argument becomes const. Everyone agrees, I hope. * apparmor does *NOT* try to use __d_path() or any of its variants when it sees that path->mnt is an internal vfsmount. In that case it's definitely not mounted anywhere and dentry_path() is exactly what we want there. Handling of sysctl()-triggered weirdness is moved to that place. * if apparmor is asked to do pathname relative to chroot jail and __d_path() tells it we it's not in that jail, the sucker just calls d_absolute_path() instead. That's the other remaining caller of __d_path(), BTW. * seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway - the normal seq_file logics will take care of growing the buffer and redoing the call of ->show() just fine). However, if it gets path not reachable from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped ignoring the return value as it used to do). Reviewed-by: John Johansen ACKed-by: John Johansen Signed-off-by: Al Viro Cc: stable@vger.kernel.org --- fs/dcache.c | 71 ++++++++++++++++++++++++++++------------------ fs/namespace.c | 20 +++++++------ fs/seq_file.c | 6 ++-- include/linux/dcache.h | 3 +- include/linux/fs.h | 1 + security/apparmor/path.c | 65 ++++++++++++++++++++++++------------------ security/tomoyo/realpath.c | 3 +- 7 files changed, 100 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 10ba92def3f6..89509b5a090e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2439,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * * Caller holds the rename_lock. - * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). */ -static int prepend_path(const struct path *path, struct path *root, +static int prepend_path(const struct path *path, + const struct path *root, char **buffer, int *buflen) { struct dentry *dentry = path->dentry; @@ -2483,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, dentry = parent; } -out: if (!error && !slash) error = prepend(buffer, buflen, "/", 1); +out: br_read_unlock(vfsmount_lock); return error; @@ -2500,15 +2498,17 @@ global_root: WARN(1, "Root dentry has weird name <%.*s>\n", (int) dentry->d_name.len, dentry->d_name.name); } - root->mnt = vfsmnt; - root->dentry = dentry; + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = vfsmnt->mnt_ns ? 1 : 2; goto out; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * @@ -2519,10 +2519,10 @@ global_root: * * "buflen" should be positive. * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). + * If the path is not reachable from the supplied root, return %NULL. */ -char *__d_path(const struct path *path, struct path *root, +char *__d_path(const struct path *path, + const struct path *root, char *buf, int buflen) { char *res = buf + buflen; @@ -2533,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) return ERR_PTR(error); return res; } @@ -2541,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, /* * same as __d_path but appends "(deleted)" for unlinked files. */ -static int path_with_deleted(const struct path *path, struct path *root, - char **buf, int *buflen) +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) { prepend(buf, buflen, "\0", 1); if (d_unlinked(path->dentry)) { @@ -2579,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; /* @@ -2594,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (error) + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) res = ERR_PTR(error); write_sequnlock(&rename_lock); path_put(&root); @@ -2617,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; if (path->dentry->d_op && path->dentry->d_op->d_dname) @@ -2625,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (!error && !path_equal(&tmp, &root)) + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); path_put(&root); @@ -2758,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - struct path tmp = root; char *cwd = page + PAGE_SIZE; int buflen = PAGE_SIZE; prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &tmp, &cwd, &buflen); + error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) goto out; /* Unreachable from current root */ - if (!path_equal(&tmp, &root)) { + if (error > 0) { error = prepend_unreachable(&cwd, &buflen); if (error) goto out; diff --git a/fs/namespace.c b/fs/namespace.c index 6d3a1963879b..cfc6d4448aa5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v) if (err) goto out; seq_putc(m, ' '); - seq_path_root(m, &mnt_path, &root, " \t\n\\"); - if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { - /* - * Mountpoint is outside root, discard that one. Ugly, - * but less so than trying to do that in iterator in a - * race-free way (due to renames). - */ - return SEQ_SKIP; - } + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (err) + goto out; + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); show_mnt_opts(m, mnt); @@ -2776,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt) } } EXPORT_SYMBOL(kern_unmount); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(mnt); +} diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e78c95..dba43c3ea3af 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path); /* * Same as seq_path, but relative to supplied root. - * - * root may be changed, see __d_path(). */ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *esc) @@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *p; p = __d_path(path, root, buf, size); + if (!p) + return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); @@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, } seq_commit(m, res); - return res < 0 ? res : 0; + return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4df926199369..ed9f74f6c519 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -339,7 +339,8 @@ extern int d_validate(struct dentry *, struct dentry *); */ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); -extern char *__d_path(const struct path *path, struct path *root, char *, int); +extern char *__d_path(const struct path *, const struct path *, char *, int); +extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *d_path_with_unreachable(const struct path *, char *, int); extern char *dentry_path_raw(struct dentry *, char *, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index e3130220ce3e..019dc558df1a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1942,6 +1942,7 @@ extern int fd_statfs(int, struct kstatfs *); extern int statfs_by_dentry(struct dentry *, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); +extern bool our_mnt(struct vfsmount *mnt); extern int current_umask(void); diff --git a/security/apparmor/path.c b/security/apparmor/path.c index 36cc0cc39e78..b566eba4a65c 100644 --- a/security/apparmor/path.c +++ b/security/apparmor/path.c @@ -57,23 +57,44 @@ static int prepend(char **buffer, int buflen, const char *str, int namelen) static int d_namespace_path(struct path *path, char *buf, int buflen, char **name, int flags) { - struct path root, tmp; char *res; - int connected, error = 0; + int error = 0; + int connected = 1; + + if (path->mnt->mnt_flags & MNT_INTERNAL) { + /* it's not mounted anywhere */ + res = dentry_path(path->dentry, buf, buflen); + *name = res; + if (IS_ERR(res)) { + *name = buf; + return PTR_ERR(res); + } + if (path->dentry->d_sb->s_magic == PROC_SUPER_MAGIC && + strncmp(*name, "/sys/", 5) == 0) { + /* TODO: convert over to using a per namespace + * control instead of hard coded /proc + */ + return prepend(name, *name - buf, "/proc", 5); + } + return 0; + } - /* Get the root we want to resolve too, released below */ + /* resolve paths relative to chroot?*/ if (flags & PATH_CHROOT_REL) { - /* resolve paths relative to chroot */ + struct path root; get_fs_root(current->fs, &root); - } else { - /* resolve paths relative to namespace */ - root.mnt = current->nsproxy->mnt_ns->root; - root.dentry = root.mnt->mnt_root; - path_get(&root); + res = __d_path(path, &root, buf, buflen); + if (res && !IS_ERR(res)) { + /* everything's fine */ + *name = res; + path_put(&root); + goto ok; + } + path_put(&root); + connected = 0; } - tmp = root; - res = __d_path(path, &tmp, buf, buflen); + res = d_absolute_path(path, buf, buflen); *name = res; /* handle error conditions - and still allow a partial path to @@ -84,7 +105,10 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, *name = buf; goto out; } + if (!our_mnt(path->mnt)) + connected = 0; +ok: /* Handle two cases: * 1. A deleted dentry && profile is not allowing mediation of deleted * 2. On some filesystems, newly allocated dentries appear to the @@ -97,10 +121,7 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, goto out; } - /* Determine if the path is connected to the expected root */ - connected = tmp.dentry == root.dentry && tmp.mnt == root.mnt; - - /* If the path is not connected, + /* If the path is not connected to the expected root, * check if it is a sysctl and handle specially else remove any * leading / that __d_path may have returned. * Unless @@ -112,17 +133,9 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, * namespace root. */ if (!connected) { - /* is the disconnect path a sysctl? */ - if (tmp.dentry->d_sb->s_magic == PROC_SUPER_MAGIC && - strncmp(*name, "/sys/", 5) == 0) { - /* TODO: convert over to using a per namespace - * control instead of hard coded /proc - */ - error = prepend(name, *name - buf, "/proc", 5); - } else if (!(flags & PATH_CONNECT_PATH) && + if (!(flags & PATH_CONNECT_PATH) && !(((flags & CHROOT_NSCONNECT) == CHROOT_NSCONNECT) && - (tmp.mnt == current->nsproxy->mnt_ns->root && - tmp.dentry == tmp.mnt->mnt_root))) { + our_mnt(path->mnt))) { /* disconnected path, don't return pathname starting * with '/' */ @@ -133,8 +146,6 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, } out: - path_put(&root); - return error; } diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index 738bbdf8d4c7..36fa7c9bedc4 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -101,9 +101,8 @@ static char *tomoyo_get_absolute_path(struct path *path, char * const buffer, { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { - struct path ns_root = { }; /* go to whatever namespace root we are under */ - pos = __d_path(path, &ns_root, buffer, buflen - 1); + pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = path->dentry->d_inode; if (inode && S_ISDIR(inode->i_mode)) { -- cgit v1.2.3 From 83aeeada7c69f35e5100b27ec354335597a7a488 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 8 Dec 2011 14:33:54 -0800 Subject: vmscan: use atomic-long for shrinker batching Use atomic-long operations instead of looping around cmpxchg(). [akpm@linux-foundation.org: massage atomic.h inclusions] Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- include/linux/mm.h | 1 + include/linux/shrinker.h | 2 +- mm/vmscan.c | 17 +++++++---------- 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 019dc558df1a..e0bc4ffb8e7f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -393,8 +393,8 @@ struct inodes_stat_t { #include #include #include -#include #include +#include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 3dc3a8c2c485..4baadd18f4ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index a83833a1f7a2..07ceb97d53fa 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -35,7 +35,7 @@ struct shrinker { /* These are for internal use */ struct list_head list; - long nr; /* objs pending delete */ + atomic_long_t nr_in_batch; /* objs pending delete */ }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ extern void register_shrinker(struct shrinker *); diff --git a/mm/vmscan.c b/mm/vmscan.c index f5255442ae2b..f54a05b7a61d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, */ void register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + atomic_long_set(&shrinker->nr_in_batch, 0); down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); @@ -264,9 +264,7 @@ unsigned long shrink_slab(struct shrink_control *shrink, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - do { - nr = shrinker->nr; - } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; @@ -328,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink, * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - do { - nr = shrinker->nr; - new_nr = total_scan + nr; - if (total_scan <= 0) - break; - } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_in_batch); + else + new_nr = atomic_long_read(&shrinker->nr_in_batch); trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } -- cgit v1.2.3 From 6de5fc9cf7de334912de4cfd2d06eb2d744d2afe Mon Sep 17 00:00:00 2001 From: Stefan Nilsson XK Date: Thu, 3 Nov 2011 09:44:12 +0100 Subject: mmc: core: Add quirk for long data read time Adds a quirk that sets the data read timeout to a fixed value instead of relying on the information in the CSD. The timeout value chosen is 300ms since that has proven enough for the problematic cards found, but could be increased if other cards require this. This patch also enables this quirk for certain Micron cards known to have this problem. Signed-off-by: Stefan Nilsson XK Signed-off-by: Ulf Hansson Acked-by: Linus Walleij Cc: Signed-off-by: Chris Ball --- drivers/mmc/card/block.c | 8 ++++++++ drivers/mmc/core/core.c | 12 ++++++++++++ include/linux/mmc/card.h | 6 ++++++ 3 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index a1cb21f95302..1e0e27cbe987 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -1606,6 +1606,14 @@ static const struct mmc_fixup blk_fixups[] = MMC_QUIRK_BLK_NO_CMD23), MMC_FIXUP("MMC32G", 0x11, CID_OEMID_ANY, add_quirk_mmc, MMC_QUIRK_BLK_NO_CMD23), + + /* + * Some Micron MMC cards needs longer data read timeout than + * indicated in CSD. + */ + MMC_FIXUP(CID_NAME_ANY, 0x13, 0x200, add_quirk_mmc, + MMC_QUIRK_LONG_READ_TIME), + END_FIXUP }; diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 5278ffb20e74..74a012ad2bab 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -529,6 +529,18 @@ void mmc_set_data_timeout(struct mmc_data *data, const struct mmc_card *card) data->timeout_clks = 0; } } + + /* + * Some cards require longer data read timeout than indicated in CSD. + * Address this by setting the read timeout to a "reasonably high" + * value. For the cards tested, 300ms has proven enough. If necessary, + * this value can be increased if other problematic cards require this. + */ + if (mmc_card_long_read_time(card) && data->flags & MMC_DATA_READ) { + data->timeout_ns = 300000000; + data->timeout_clks = 0; + } + /* * Some cards need very high timeouts if driven in SPI mode. * The worst observed timeout was 900ms after writing a diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 415f2db414e1..c8ef9bc54d50 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -218,6 +218,7 @@ struct mmc_card { #define MMC_QUIRK_INAND_CMD38 (1<<6) /* iNAND devices have broken CMD38 */ #define MMC_QUIRK_BLK_NO_CMD23 (1<<7) /* Avoid CMD23 for regular multiblock */ #define MMC_QUIRK_BROKEN_BYTE_MODE_512 (1<<8) /* Avoid sending 512 bytes in */ +#define MMC_QUIRK_LONG_READ_TIME (1<<9) /* Data read time > CSD says */ /* byte mode */ unsigned int poweroff_notify_state; /* eMMC4.5 notify feature */ #define MMC_NO_POWER_NOTIFICATION 0 @@ -433,6 +434,11 @@ static inline int mmc_card_broken_byte_mode_512(const struct mmc_card *c) return c->quirks & MMC_QUIRK_BROKEN_BYTE_MODE_512; } +static inline int mmc_card_long_read_time(const struct mmc_card *c) +{ + return c->quirks & MMC_QUIRK_LONG_READ_TIME; +} + #define mmc_card_name(c) ((c)->cid.prod_name) #define mmc_card_id(c) (dev_name(&(c)->dev)) -- cgit v1.2.3 From 13c07b0286d340275f2d97adf085cecda37ede37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 12 Dec 2011 22:06:55 -0800 Subject: linux/log2.h: Fix rounddown_pow_of_two(1) Exactly like roundup_pow_of_two(1), the rounddown version was buggy for the case of a compile-time constant '1' argument. Probably because it originated from the same code, sharing history with the roundup version from before the bugfix (for that one, see commit 1a06a52ee1b0: "Fix roundup_pow_of_two(1)"). However, unlike the roundup version, the fix for rounddown is to just remove the broken special case entirely. It's simply not needed - the generic code 1UL << ilog2(n) does the right thing for the constant '1' argment too. The only reason roundup needed that special case was because rounding up does so by subtracting one from the argument (and then adding one to the result) causing the obvious problems with "ilog2(0)". But rounddown doesn't do any of that, since ilog2() naturally truncates (ie "rounds down") to the right rounded down value. And without the ilog2(0) case, there's no reason for the special case that had the wrong value. tl;dr: rounddown_pow_of_two(1) should be 1, not 0. Acked-by: Dmitry Torokhov Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- include/linux/log2.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/log2.h b/include/linux/log2.h index 25b808631cd9..fd7ff3d91e6a 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h @@ -185,7 +185,6 @@ unsigned long __rounddown_pow_of_two(unsigned long n) #define rounddown_pow_of_two(n) \ ( \ __builtin_constant_p(n) ? ( \ - (n == 1) ? 0 : \ (1UL << ilog2(n))) : \ __rounddown_pow_of_two(n) \ ) -- cgit v1.2.3 From 8bc1f85c02a20a59956b00b3acea12c04dce9ae8 Mon Sep 17 00:00:00 2001 From: Eugeni Dodonov Date: Wed, 23 Nov 2011 16:42:14 -0200 Subject: iommu: Export intel_iommu_enabled to signal when iommu is in use In i915 driver, we do not enable either rc6 or semaphores on SNB when dmar is enabled. The new 'intel_iommu_enabled' variable signals when the iommu code is in operation. Cc: Ted Phelps Cc: Peter Cc: Lukas Hejtmanek Cc: Andrew Lutomirski CC: Daniel Vetter Cc: Eugeni Dodonov Signed-off-by: Keith Packard --- drivers/iommu/intel-iommu.c | 5 +++++ include/linux/dma_remapping.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/linux') diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index c0c7820d4c46..8dc19b8f5d45 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -405,6 +405,9 @@ int dmar_disabled = 0; int dmar_disabled = 1; #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/ +int intel_iommu_enabled = 0; +EXPORT_SYMBOL_GPL(intel_iommu_enabled); + static int dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; @@ -3647,6 +3650,8 @@ int __init intel_iommu_init(void) bus_register_notifier(&pci_bus_type, &device_nb); + intel_iommu_enabled = 1; + return 0; } diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index ef90cbd8e173..57c9a8ae4f2d 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -31,6 +31,7 @@ extern void free_dmar_iommu(struct intel_iommu *iommu); extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; +extern int intel_iommu_enabled; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { @@ -44,6 +45,7 @@ static inline void free_dmar_iommu(struct intel_iommu *iommu) { } #define dmar_disabled (1) +#define intel_iommu_enabled (0) #endif -- cgit v1.2.3 From b1b73d095084e754562961c443aa8f6587a55f8e Mon Sep 17 00:00:00 2001 From: Kusanagi Kouichi Date: Mon, 19 Dec 2011 18:13:19 +0900 Subject: time/clocksource: Fix kernel-doc warnings Fix various KernelDoc build warnings. Signed-off-by: Kusanagi Kouichi Cc: John Stultz Link: http://lkml.kernel.org/r/20111219091320.0D5AF6FC03D@msa105.auone-net.jp Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 11 ++++++++--- kernel/time/clocksource.c | 12 +++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index c86c940d1de3..081147da0564 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -71,7 +71,7 @@ struct timecounter { /** * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds - * @tc: Pointer to cycle counter. + * @cc: Pointer to cycle counter. * @cycles: Cycles * * XXX - This could use some mult_lxl_ll() asm optimization. Same code @@ -114,7 +114,7 @@ extern u64 timecounter_read(struct timecounter *tc); * time base as values returned by * timecounter_read() * @tc: Pointer to time counter. - * @cycle: a value returned by tc->cc->read() + * @cycle_tstamp: a value returned by tc->cc->read() * * Cycle counts that are converted correctly as long as they * fall into the interval [-1/2 max cycle count, +1/2 max cycle count], @@ -156,11 +156,12 @@ extern u64 timecounter_cyc2time(struct timecounter *tc, * @mult: cycle to nanosecond multiplier * @shift: cycle to nanosecond divisor (power of two) * @max_idle_ns: max idle time permitted by the clocksource (nsecs) - * @maxadj maximum adjustment value to mult (~11%) + * @maxadj: maximum adjustment value to mult (~11%) * @flags: flags describing special properties * @archdata: arch-specific data * @suspend: suspend function for the clocksource, if necessary * @resume: resume function for the clocksource, if necessary + * @cycle_last: most recent cycle counter value seen by ::read() */ struct clocksource { /* @@ -187,6 +188,7 @@ struct clocksource { void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); + /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG /* Watchdog related data, used by the framework */ struct list_head wd_list; @@ -261,6 +263,9 @@ static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant) /** * clocksource_cyc2ns - converts clocksource cycles to nanoseconds + * @cycles: cycles + * @mult: cycle to nanosecond multiplier + * @shift: cycle to nanosecond divisor (power of two) * * Converts cycles to nanoseconds, using the given mult and shift. * diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index da2f760e780c..d3ad022136e5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -647,7 +647,7 @@ static void clocksource_enqueue(struct clocksource *cs) /** * __clocksource_updatefreq_scale - Used update clocksource with new freq - * @t: clocksource to be registered + * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * @@ -699,7 +699,7 @@ EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); /** * __clocksource_register_scale - Used to install new clocksources - * @t: clocksource to be registered + * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale * @@ -727,7 +727,7 @@ EXPORT_SYMBOL_GPL(__clocksource_register_scale); /** * clocksource_register - Used to install new clocksources - * @t: clocksource to be registered + * @cs: clocksource to be registered * * Returns -EBUSY if registration fails, zero otherwise. */ @@ -761,6 +761,8 @@ static void __clocksource_change_rating(struct clocksource *cs, int rating) /** * clocksource_change_rating - Change the rating of a registered clocksource + * @cs: clocksource to be changed + * @rating: new rating */ void clocksource_change_rating(struct clocksource *cs, int rating) { @@ -772,6 +774,7 @@ EXPORT_SYMBOL(clocksource_change_rating); /** * clocksource_unregister - remove a registered clocksource + * @cs: clocksource to be unregistered */ void clocksource_unregister(struct clocksource *cs) { @@ -787,6 +790,7 @@ EXPORT_SYMBOL(clocksource_unregister); /** * sysfs_show_current_clocksources - sysfs interface for current clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing current clocksource. @@ -807,6 +811,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, /** * sysfs_override_clocksource - interface for manually overriding clocksource * @dev: unused + * @attr: unused * @buf: name of override clocksource * @count: length of buffer * @@ -842,6 +847,7 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev, /** * sysfs_show_available_clocksources - sysfs interface for listing clocksource * @dev: unused + * @attr: unused * @buf: char buffer to be filled with clocksource list * * Provides sysfs interface for listing registered clocksources -- cgit v1.2.3 From e30e2fdfe56288576ee9e04dbb06b4bd5f282203 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Thu, 22 Dec 2011 02:45:29 +0530 Subject: VFS: Fix race between CPU hotplug and lglocks Currently, the *_global_[un]lock_online() routines are not at all synchronized with CPU hotplug. Soft-lockups detected as a consequence of this race was reported earlier at https://lkml.org/lkml/2011/8/24/185. (Thanks to Cong Meng for finding out that the root-cause of this issue is the race condition between br_write_[un]lock() and CPU hotplug, which results in the lock states getting messed up). Fixing this race by just adding {get,put}_online_cpus() at appropriate places in *_global_[un]lock_online() is not a good option, because, then suddenly br_write_[un]lock() would become blocking, whereas they have been kept as non-blocking all this time, and we would want to keep them that way. So, overall, we want to ensure 3 things: 1. br_write_lock() and br_write_unlock() must remain as non-blocking. 2. The corresponding lock and unlock of the per-cpu spinlocks must not happen for different sets of CPUs. 3. Either prevent any new CPU online operation in between this lock-unlock, or ensure that the newly onlined CPU does not proceed with its corresponding per-cpu spinlock unlocked. To achieve all this: (a) We introduce a new spinlock that is taken by the *_global_lock_online() routine and released by the *_global_unlock_online() routine. (b) We register a callback for CPU hotplug notifications, and this callback takes the same spinlock as above. (c) We maintain a bitmap which is close to the cpu_online_mask, and once it is initialized in the lock_init() code, all future updates to it are done in the callback, under the above spinlock. (d) The above bitmap is used (instead of cpu_online_mask) while locking and unlocking the per-cpu locks. The callback takes the spinlock upon the CPU_UP_PREPARE event. So, if the br_write_lock-unlock sequence is in progress, the callback keeps spinning, thus preventing the CPU online operation till the lock-unlock sequence is complete. This takes care of requirement (3). The bitmap that we maintain remains unmodified throughout the lock-unlock sequence, since all updates to it are managed by the callback, which takes the same spinlock as the one taken by the lock code and released only by the unlock routine. Combining this with (d) above, satisfies requirement (2). Overall, since we use a spinlock (mentioned in (a)) to prevent CPU hotplug operations from racing with br_write_lock-unlock, requirement (1) is also taken care of. By the way, it is to be noted that a CPU offline operation can actually run in parallel with our lock-unlock sequence, because our callback doesn't react to notifications earlier than CPU_DEAD (in order to maintain our bitmap properly). And this means, since we use our own bitmap (which is stale, on purpose) during the lock-unlock sequence, we could end up unlocking the per-cpu lock of an offline CPU (because we had locked it earlier, when the CPU was online), in order to satisfy requirement (2). But this is harmless, though it looks a bit awkward. Debugged-by: Cong Meng Signed-off-by: Srivatsa S. Bhat Signed-off-by: Al Viro Cc: stable@vger.kernel.org --- include/linux/lglock.h | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lglock.h b/include/linux/lglock.h index f549056fb20b..87f402ccec55 100644 --- a/include/linux/lglock.h +++ b/include/linux/lglock.h @@ -22,6 +22,7 @@ #include #include #include +#include /* can make br locks by using local lock for read side, global lock for write */ #define br_lock_init(name) name##_lock_init() @@ -72,9 +73,31 @@ #define DEFINE_LGLOCK(name) \ \ + DEFINE_SPINLOCK(name##_cpu_lock); \ + cpumask_t name##_cpus __read_mostly; \ DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \ DEFINE_LGLOCK_LOCKDEP(name); \ \ + static int \ + name##_lg_cpu_callback(struct notifier_block *nb, \ + unsigned long action, void *hcpu) \ + { \ + switch (action & ~CPU_TASKS_FROZEN) { \ + case CPU_UP_PREPARE: \ + spin_lock(&name##_cpu_lock); \ + cpu_set((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + break; \ + case CPU_UP_CANCELED: case CPU_DEAD: \ + spin_lock(&name##_cpu_lock); \ + cpu_clear((unsigned long)hcpu, name##_cpus); \ + spin_unlock(&name##_cpu_lock); \ + } \ + return NOTIFY_OK; \ + } \ + static struct notifier_block name##_lg_cpu_notifier = { \ + .notifier_call = name##_lg_cpu_callback, \ + }; \ void name##_lock_init(void) { \ int i; \ LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \ @@ -83,6 +106,11 @@ lock = &per_cpu(name##_lock, i); \ *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \ } \ + register_hotcpu_notifier(&name##_lg_cpu_notifier); \ + get_online_cpus(); \ + for_each_online_cpu(i) \ + cpu_set(i, name##_cpus); \ + put_online_cpus(); \ } \ EXPORT_SYMBOL(name##_lock_init); \ \ @@ -124,9 +152,9 @@ \ void name##_global_lock_online(void) { \ int i; \ - preempt_disable(); \ + spin_lock(&name##_cpu_lock); \ rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_lock(lock); \ @@ -137,12 +165,12 @@ void name##_global_unlock_online(void) { \ int i; \ rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \ - for_each_online_cpu(i) { \ + for_each_cpu(i, &name##_cpus) { \ arch_spinlock_t *lock; \ lock = &per_cpu(name##_lock, i); \ arch_spin_unlock(lock); \ } \ - preempt_enable(); \ + spin_unlock(&name##_cpu_lock); \ } \ EXPORT_SYMBOL(name##_global_unlock_online); \ \ -- cgit v1.2.3