From f1c6f1a7eed963ed233ba4c8b6fa8addb86c6ddc Mon Sep 17 00:00:00 2001 From: Carsten Emde Date: Wed, 26 Oct 2011 23:14:16 +0200 Subject: sched: Set the command name of the idle tasks in SMP kernels In UP systems, the idle task is initialized using the init_task structure from which the command name is taken (currently "swapper"). In SMP systems, one idle task per CPU is forked by the worker thread from which the task structure is copied. The command name is, therefore, "kworker/0:0" or "kworker/0:1", if not updated. Since such update was lacking, all idle tasks in SMP systems were incorrectly named. This longtime bug was not discovered immediately, because there is no /proc/0 entry - the bug only becomes apparent when tracing is enabled. This patch sets the command name of the idle tasks in SMP systems to the name that is used in the INIT_TASK structure suffixed by a slash and the number of the CPU. Signed-off-by: Carsten Emde Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111026211708.768925506@osadl.org Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 08ffab01e76c..b6e5b8b000e0 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -126,6 +126,8 @@ extern struct cred init_cred; # define INIT_PERF_EVENTS(tsk) #endif +#define INIT_TASK_COMM "swapper" + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -162,7 +164,7 @@ extern struct cred init_cred; .group_leader = &tsk, \ RCU_INIT_POINTER(.real_cred, &init_cred), \ RCU_INIT_POINTER(.cred, &init_cred), \ - .comm = "swapper", \ + .comm = INIT_TASK_COMM, \ .thread = INIT_THREAD, \ .fs = &init_fs, \ .files = &init_files, \ -- cgit v1.2.3 From 5eccdf5e06eb67779716ae26142402a1ae9b012c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 21 Nov 2011 06:53:46 +0000 Subject: tc: comment spelling fixes Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index c5336705921f..7281d5acf2f9 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -30,7 +30,7 @@ */ struct tc_stats { - __u64 bytes; /* NUmber of enqueues bytes */ + __u64 bytes; /* Number of enqueued bytes */ __u32 packets; /* Number of enqueued packets */ __u32 drops; /* Packets dropped because of lack of resources */ __u32 overlimits; /* Number of throttle events when this @@ -297,7 +297,7 @@ struct tc_htb_glob { __u32 debug; /* debug flags */ /* stats */ - __u32 direct_pkts; /* count of non shapped packets */ + __u32 direct_pkts; /* count of non shaped packets */ }; enum { TCA_HTB_UNSPEC, @@ -503,7 +503,7 @@ enum { }; #define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) -/* State transition probablities for 4 state model */ +/* State transition probabilities for 4 state model */ struct tc_netem_gimodel { __u32 p13; __u32 p31; -- cgit v1.2.3 From 5cac98dd06bc43a7baab3523184f70fd359e9f35 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Sun, 27 Nov 2011 21:14:46 +0000 Subject: net: Fix corruption in /proc/*/net/dev_mcast I just hit this during my testing. Isn't there another bug lurking? BUG kmalloc-8: Redzone overwritten INFO: 0xc0000000de9dec48-0xc0000000de9dec4b. First byte 0x0 instead of 0xcc INFO: Allocated in .__seq_open_private+0x30/0xa0 age=0 cpu=5 pid=3896 .__kmalloc+0x1e0/0x2d0 .__seq_open_private+0x30/0xa0 .seq_open_net+0x60/0xe0 .dev_mc_seq_open+0x4c/0x70 .proc_reg_open+0xd8/0x260 .__dentry_open.clone.11+0x2b8/0x400 .do_last+0xf4/0x950 .path_openat+0xf8/0x480 .do_filp_open+0x48/0xc0 .do_sys_open+0x140/0x250 syscall_exit+0x0/0x40 dev_mc_seq_ops uses dev_seq_start/next/stop but only allocates sizeof(struct seq_net_private) of private data, whereas it expects sizeof(struct dev_iter_state): struct dev_iter_state { struct seq_net_private p; unsigned int pos; /* bucket << BUCKET_SPACE + offset */ }; Create dev_seq_open_ops and use it so we don't have to expose struct dev_iter_state. [ Problem added by commit f04565ddf52e4 (dev: use name hash for dev_seq_ops) -Eric ] Signed-off-by: Anton Blanchard Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 6 ++++++ net/core/dev_addr_lists.c | 3 +-- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cbeb5867cff7..a82ad4dd306a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2536,6 +2536,8 @@ extern void net_disable_timestamp(void); extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); extern void dev_seq_stop(struct seq_file *seq, void *v); +extern int dev_seq_open_ops(struct inode *inode, struct file *file, + const struct seq_operations *ops); #endif extern int netdev_class_create_file(struct class_attribute *class_attr); diff --git a/net/core/dev.c b/net/core/dev.c index 6ba50a1e404c..1482eea0bbf0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4282,6 +4282,12 @@ static int dev_seq_open(struct inode *inode, struct file *file) sizeof(struct dev_iter_state)); } +int dev_seq_open_ops(struct inode *inode, struct file *file, + const struct seq_operations *ops) +{ + return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state)); +} + static const struct file_operations dev_seq_fops = { .owner = THIS_MODULE, .open = dev_seq_open, diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 277faef9148d..febba516db62 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -696,8 +696,7 @@ static const struct seq_operations dev_mc_seq_ops = { static int dev_mc_seq_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &dev_mc_seq_ops, - sizeof(struct seq_net_private)); + return dev_seq_open_ops(inode, file, &dev_mc_seq_ops); } static const struct file_operations dev_mc_seq_fops = { -- cgit v1.2.3 From 4f718a29fe4908c2cea782f751e9805319684e2b Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 28 Nov 2011 09:44:14 +0100 Subject: firmware: Sigma: Prevent out of bounds memory access The SigmaDSP firmware loader currently does not perform enough boundary size checks when processing the firmware. As a result it is possible that a malformed firmware can cause an out of bounds memory access. This patch adds checks which ensure that both the action header and the payload are completely inside the firmware data boundaries before processing them. Signed-off-by: Lars-Peter Clausen Acked-by: Mike Frysinger Signed-off-by: Mark Brown Cc: stable@kernel.org --- drivers/firmware/sigma.c | 76 +++++++++++++++++++++++++++++++++++------------- include/linux/sigma.h | 5 ---- 2 files changed, 55 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/sigma.c b/drivers/firmware/sigma.c index f10fc521951b..c780baa59ed9 100644 --- a/drivers/firmware/sigma.c +++ b/drivers/firmware/sigma.c @@ -14,13 +14,34 @@ #include #include -/* Return: 0==OK, <0==error, =1 ==no more actions */ +static size_t sigma_action_size(struct sigma_action *sa) +{ + size_t payload = 0; + + switch (sa->instr) { + case SIGMA_ACTION_WRITEXBYTES: + case SIGMA_ACTION_WRITESINGLE: + case SIGMA_ACTION_WRITESAFELOAD: + payload = sigma_action_len(sa); + break; + default: + break; + } + + payload = ALIGN(payload, 2); + + return payload + sizeof(struct sigma_action); +} + +/* + * Returns a negative error value in case of an error, 0 if processing of + * the firmware should be stopped after this action, 1 otherwise. + */ static int -process_sigma_action(struct i2c_client *client, struct sigma_firmware *ssfw) +process_sigma_action(struct i2c_client *client, struct sigma_action *sa) { - struct sigma_action *sa = (void *)(ssfw->fw->data + ssfw->pos); size_t len = sigma_action_len(sa); - int ret = 0; + int ret; pr_debug("%s: instr:%i addr:%#x len:%zu\n", __func__, sa->instr, sa->addr, len); @@ -29,44 +50,50 @@ process_sigma_action(struct i2c_client *client, struct sigma_firmware *ssfw) case SIGMA_ACTION_WRITEXBYTES: case SIGMA_ACTION_WRITESINGLE: case SIGMA_ACTION_WRITESAFELOAD: - if (ssfw->fw->size < ssfw->pos + len) - return -EINVAL; ret = i2c_master_send(client, (void *)&sa->addr, len); if (ret < 0) return -EINVAL; break; - case SIGMA_ACTION_DELAY: - ret = 0; udelay(len); len = 0; break; - case SIGMA_ACTION_END: - return 1; - + return 0; default: return -EINVAL; } - /* when arrive here ret=0 or sent data */ - ssfw->pos += sigma_action_size(sa, len); - return ssfw->pos == ssfw->fw->size; + return 1; } static int process_sigma_actions(struct i2c_client *client, struct sigma_firmware *ssfw) { - pr_debug("%s: processing %p\n", __func__, ssfw); + struct sigma_action *sa; + size_t size; + int ret; + + while (ssfw->pos + sizeof(*sa) <= ssfw->fw->size) { + sa = (struct sigma_action *)(ssfw->fw->data + ssfw->pos); + + size = sigma_action_size(sa); + ssfw->pos += size; + if (ssfw->pos > ssfw->fw->size || size == 0) + break; + + ret = process_sigma_action(client, sa); - while (1) { - int ret = process_sigma_action(client, ssfw); pr_debug("%s: action returned %i\n", __func__, ret); - if (ret == 1) - return 0; - else if (ret) + + if (ret <= 0) return ret; } + + if (ssfw->pos != ssfw->fw->size) + return -EINVAL; + + return 0; } int process_sigma_firmware(struct i2c_client *client, const char *name) @@ -89,7 +116,14 @@ int process_sigma_firmware(struct i2c_client *client, const char *name) /* then verify the header */ ret = -EINVAL; - if (fw->size < sizeof(*ssfw_head)) + + /* + * Reject too small or unreasonable large files. The upper limit has been + * chosen a bit arbitrarily, but it should be enough for all practical + * purposes and having the limit makes it easier to avoid integer + * overflows later in the loading process. + */ + if (fw->size < sizeof(*ssfw_head) || fw->size >= 0x4000000) goto done; ssfw_head = (void *)fw->data; diff --git a/include/linux/sigma.h b/include/linux/sigma.h index e2accb3164d8..9a138c2946bb 100644 --- a/include/linux/sigma.h +++ b/include/linux/sigma.h @@ -50,11 +50,6 @@ static inline u32 sigma_action_len(struct sigma_action *sa) return (sa->len_hi << 16) | sa->len; } -static inline size_t sigma_action_size(struct sigma_action *sa, u32 payload_len) -{ - return sizeof(*sa) + payload_len + (payload_len % 2); -} - extern int process_sigma_firmware(struct i2c_client *client, const char *name); #endif -- cgit v1.2.3 From bda63586bc5929e97288cdb371bb6456504867ed Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 28 Nov 2011 09:44:16 +0100 Subject: firmware: Sigma: Fix endianess issues Currently the SigmaDSP firmware loader only works correctly on little-endian systems. Fix this by using the proper endianess conversion functions. Signed-off-by: Lars-Peter Clausen Acked-by: Mike Frysinger Signed-off-by: Mark Brown Cc: stable@kernel.org --- drivers/firmware/sigma.c | 2 +- include/linux/sigma.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/sigma.c b/drivers/firmware/sigma.c index 36265de0a9e8..1eedb6f7fdab 100644 --- a/drivers/firmware/sigma.c +++ b/drivers/firmware/sigma.c @@ -133,7 +133,7 @@ int process_sigma_firmware(struct i2c_client *client, const char *name) crc = crc32(0, fw->data + sizeof(*ssfw_head), fw->size - sizeof(*ssfw_head)); pr_debug("%s: crc=%x\n", __func__, crc); - if (crc != ssfw_head->crc) + if (crc != le32_to_cpu(ssfw_head->crc)) goto done; ssfw.pos = sizeof(*ssfw_head); diff --git a/include/linux/sigma.h b/include/linux/sigma.h index 9a138c2946bb..d0de882c0d96 100644 --- a/include/linux/sigma.h +++ b/include/linux/sigma.h @@ -24,7 +24,7 @@ struct sigma_firmware { struct sigma_firmware_header { unsigned char magic[7]; u8 version; - u32 crc; + __le32 crc; }; enum { @@ -40,14 +40,14 @@ enum { struct sigma_action { u8 instr; u8 len_hi; - u16 len; - u16 addr; + __le16 len; + __be16 addr; unsigned char payload[]; }; static inline u32 sigma_action_len(struct sigma_action *sa) { - return (sa->len_hi << 16) | sa->len; + return (sa->len_hi << 16) | le16_to_cpu(sa->len); } extern int process_sigma_firmware(struct i2c_client *client, const char *name); -- cgit v1.2.3 From a67ba43d30bf8c1cfdc2615439455302d2408453 Mon Sep 17 00:00:00 2001 From: Chris Metcalf Date: Thu, 1 Dec 2011 12:54:31 -0500 Subject: asm-generic/unistd.h: support new process_vm_{readv,write} syscalls Also prototype the "compat" functions so they can be referenced from C code. Signed-off-by: Chris Metcalf Acked-by: Arnd Bergmann --- include/asm-generic/unistd.h | 8 +++++++- include/linux/compat.h | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index f4c38d8c6674..2292d1af9d70 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -685,9 +685,15 @@ __SYSCALL(__NR_syncfs, sys_syncfs) __SYSCALL(__NR_setns, sys_setns) #define __NR_sendmmsg 269 __SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg) +#define __NR_process_vm_readv 270 +__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \ + compat_sys_process_vm_readv) +#define __NR_process_vm_writev 271 +__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \ + compat_sys_process_vm_writev) #undef __NR_syscalls -#define __NR_syscalls 270 +#define __NR_syscalls 272 /* * All syscalls below here should go away really, diff --git a/include/linux/compat.h b/include/linux/compat.h index 154bf5683015..66ed067fb729 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -552,5 +552,14 @@ extern ssize_t compat_rw_copy_check_uvector(int type, extern void __user *compat_alloc_user_space(unsigned long len); +asmlinkage ssize_t compat_sys_process_vm_readv(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, const struct compat_iovec __user *rvec, + unsigned long riovcnt, unsigned long flags); +asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, const struct compat_iovec __user *rvec, + unsigned long riovcnt, unsigned long flags); + #endif /* CONFIG_COMPAT */ #endif /* _LINUX_COMPAT_H */ -- cgit v1.2.3 From 10c6db110d0eb4466b59812c49088ab56218fc2e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 26 Nov 2011 02:47:31 +0100 Subject: perf: Fix loss of notification with multi-event When you do: $ perf record -e cycles,cycles,cycles noploop 10 You expect about 10,000 samples for each event, i.e., 10s at 1000samples/sec. However, this is not what's happening. You get much fewer samples, maybe 3700 samples/event: $ perf report -D | tail -15 Aggregated stats: TOTAL events: 10998 MMAP events: 66 COMM events: 2 SAMPLE events: 10930 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 cycles stats: TOTAL events: 3642 SAMPLE events: 3642 cycles stats: TOTAL events: 3644 SAMPLE events: 3644 On a Intel Nehalem or even AMD64, there are 4 counters capable of measuring cycles, so there is plenty of space to measure those events without multiplexing (even with the NMI watchdog active). And even with multiplexing, we'd expect roughly the same number of samples per event. The root of the problem was that when the event that caused the buffer to become full was not the first event passed on the cmdline, the user notification would get lost. The notification was sent to the file descriptor of the overflowed event but the perf tool was not polling on it. The perf tool aggregates all samples into a single buffer, i.e., the buffer of the first event. Consequently, it assumes notifications for any event will come via that descriptor. The seemingly straight forward solution of moving the waitq into the ringbuffer object doesn't work because of life-time issues. One could perf_event_set_output() on a fd that you're also blocking on and cause the old rb object to be freed while its waitq would still be referenced by the blocked thread -> FAIL. Therefore link all events to the ringbuffer and broadcast the wakeup from the ringbuffer object to all possible events that could be waited upon. This is rather ugly, and we're open to better solutions but it works for now. Reported-by: Stephane Eranian Finished-by: Stephane Eranian Reviewed-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111126014731.GA7030@quad Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/core.c | 86 +++++++++++++++++++++++++++++++++++++++++++-- kernel/events/internal.h | 3 ++ kernel/events/ring_buffer.c | 3 ++ 4 files changed, 91 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1e9ebe5e0091..b1f89122bf6a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -822,6 +822,7 @@ struct perf_event { int mmap_locked; struct user_struct *mmap_user; struct ring_buffer *rb; + struct list_head rb_entry; /* poll related */ wait_queue_head_t waitq; diff --git a/kernel/events/core.c b/kernel/events/core.c index b0c1186fd97b..600c1629b64d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -185,6 +185,9 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb); + void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -3191,12 +3194,33 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) struct ring_buffer *rb; unsigned int events = POLL_HUP; + /* + * Race between perf_event_set_output() and perf_poll(): perf_poll() + * grabs the rb reference but perf_event_set_output() overrides it. + * Here is the timeline for two threads T1, T2: + * t0: T1, rb = rcu_dereference(event->rb) + * t1: T2, old_rb = event->rb + * t2: T2, event->rb = new rb + * t3: T2, ring_buffer_detach(old_rb) + * t4: T1, ring_buffer_attach(rb1) + * t5: T1, poll_wait(event->waitq) + * + * To avoid this problem, we grab mmap_mutex in perf_poll() + * thereby ensuring that the assignment of the new ring buffer + * and the detachment of the old buffer appear atomic to perf_poll() + */ + mutex_lock(&event->mmap_mutex); + rcu_read_lock(); rb = rcu_dereference(event->rb); - if (rb) + if (rb) { + ring_buffer_attach(event, rb); events = atomic_xchg(&rb->poll, 0); + } rcu_read_unlock(); + mutex_unlock(&event->mmap_mutex); + poll_wait(file, &event->waitq, wait); return events; @@ -3497,6 +3521,49 @@ unlock: return ret; } +static void ring_buffer_attach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (!list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + if (!list_empty(&event->rb_entry)) + goto unlock; + + list_add(&event->rb_entry, &rb->event_list); +unlock: + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_detach(struct perf_event *event, + struct ring_buffer *rb) +{ + unsigned long flags; + + if (list_empty(&event->rb_entry)) + return; + + spin_lock_irqsave(&rb->event_lock, flags); + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + spin_unlock_irqrestore(&rb->event_lock, flags); +} + +static void ring_buffer_wakeup(struct perf_event *event) +{ + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + wake_up_all(&event->waitq); + } + rcu_read_unlock(); +} + static void rb_free_rcu(struct rcu_head *rcu_head) { struct ring_buffer *rb; @@ -3522,9 +3589,19 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) static void ring_buffer_put(struct ring_buffer *rb) { + struct perf_event *event, *n; + unsigned long flags; + if (!atomic_dec_and_test(&rb->refcount)) return; + spin_lock_irqsave(&rb->event_lock, flags); + list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { + list_del_init(&event->rb_entry); + wake_up_all(&event->waitq); + } + spin_unlock_irqrestore(&rb->event_lock, flags); + call_rcu(&rb->rcu_head, rb_free_rcu); } @@ -3547,6 +3624,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); ring_buffer_put(rb); @@ -3701,7 +3779,7 @@ static const struct file_operations perf_fops = { void perf_event_wakeup(struct perf_event *event) { - wake_up_all(&event->waitq); + ring_buffer_wakeup(event); if (event->pending_kill) { kill_fasync(&event->fasync, SIGIO, event->pending_kill); @@ -5823,6 +5901,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, INIT_LIST_HEAD(&event->group_entry); INIT_LIST_HEAD(&event->event_entry); INIT_LIST_HEAD(&event->sibling_list); + INIT_LIST_HEAD(&event->rb_entry); + init_waitqueue_head(&event->waitq); init_irq_work(&event->pending, perf_pending_event); @@ -6029,6 +6109,8 @@ set: old_rb = event->rb; rcu_assign_pointer(event->rb, rb); + if (old_rb) + ring_buffer_detach(event, old_rb); ret = 0; unlock: mutex_unlock(&event->mmap_mutex); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09097dd8116c..64568a699375 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -22,6 +22,9 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + /* poll crap */ + spinlock_t event_lock; + struct list_head event_list; struct perf_event_mmap_page *user_page; void *data_pages[0]; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index a2a29205cc0f..7f3011c6b57f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -209,6 +209,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->writable = 1; atomic_set(&rb->refcount, 1); + + INIT_LIST_HEAD(&rb->event_list); + spin_lock_init(&rb->event_lock); } #ifndef CONFIG_PERF_USE_VMALLOC -- cgit v1.2.3 From f62ef5f3e9cff065aa845e2b7f487e1810b8e57e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 2 Dec 2011 08:21:43 +0100 Subject: x86, amd: Fix up numa_node information for AMD CPU family 15h model 0-0fh northbridge functions I've received complaints that the numa_node attribute for family 15h model 00-0fh (e.g. Interlagos) northbridge functions shows -1 instead of the proper node ID. Correct this with attached quirks (similar to quirks for other AMD CPU families used in multi-socket systems). Signed-off-by: Andreas Herrmann Cc: Frank Arnold Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20111202072143.GA31916@alberich.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 13 +++++++++++++ include/linux/pci_ids.h | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index b78643d0f9a5..03920a15a632 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -553,4 +553,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, quirk_amd_nb_node); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, + quirk_amd_nb_node); + #endif diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 172ba70306d1..2aaee0ca9da8 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -517,8 +517,12 @@ #define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302 #define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 #define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 +#define PCI_DEVICE_ID_AMD_15H_NB_F0 0x1600 +#define PCI_DEVICE_ID_AMD_15H_NB_F1 0x1601 +#define PCI_DEVICE_ID_AMD_15H_NB_F2 0x1602 #define PCI_DEVICE_ID_AMD_15H_NB_F3 0x1603 #define PCI_DEVICE_ID_AMD_15H_NB_F4 0x1604 +#define PCI_DEVICE_ID_AMD_15H_NB_F5 0x1605 #define PCI_DEVICE_ID_AMD_CNB17H_F3 0x1703 #define PCI_DEVICE_ID_AMD_LANCE 0x2000 #define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 -- cgit v1.2.3 From 27b14b56af081ec7edeefb3a38b2c9577cc5ef48 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 1 Nov 2011 09:09:35 +0800 Subject: tracing: Restore system filter behavior Though not all events have field 'prev_pid', it was allowed to do this: # echo 'prev_pid == 100' > events/sched/filter but commit 75b8e98263fdb0bfbdeba60d4db463259f1fe8a2 (tracing/filter: Swap entire filter of events) broke it without any reason. Link: http://lkml.kernel.org/r/4EAF46CF.8040408@cn.fujitsu.com Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_events_filter.c | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 96efa6794ea5..c3da42dd22ba 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -172,6 +172,7 @@ enum { TRACE_EVENT_FL_FILTERED_BIT, TRACE_EVENT_FL_RECORDED_CMD_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, + TRACE_EVENT_FL_NO_SET_FILTER_BIT, }; enum { @@ -179,6 +180,7 @@ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), + TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), }; struct ftrace_event_call { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d6e7926dcd26..95dc31efd6dd 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1649,7 +1649,9 @@ static int replace_system_preds(struct event_subsystem *system, */ err = replace_preds(call, NULL, ps, filter_string, true); if (err) - goto fail; + call->flags |= TRACE_EVENT_FL_NO_SET_FILTER; + else + call->flags &= ~TRACE_EVENT_FL_NO_SET_FILTER; } list_for_each_entry(call, &ftrace_events, list) { @@ -1658,6 +1660,9 @@ static int replace_system_preds(struct event_subsystem *system, if (strcmp(call->class->system, system->name) != 0) continue; + if (call->flags & TRACE_EVENT_FL_NO_SET_FILTER) + continue; + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); if (!filter_item) goto fail_mem; -- cgit v1.2.3 From 02125a826459a6ad142f8d91c5b6357562f96615 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 5 Dec 2011 08:43:34 -0500 Subject: fix apparmor dereferencing potentially freed dentry, sanitize __d_path() API __d_path() API is asking for trouble and in case of apparmor d_namespace_path() getting just that. The root cause is that when __d_path() misses the root it had been told to look for, it stores the location of the most remote ancestor in *root. Without grabbing references. Sure, at the moment of call it had been pinned down by what we have in *path. And if we raced with umount -l, we could have very well stopped at vfsmount/dentry that got freed as soon as prepend_path() dropped vfsmount_lock. It is safe to compare these pointers with pre-existing (and known to be still alive) vfsmount and dentry, as long as all we are asking is "is it the same address?". Dereferencing is not safe and apparmor ended up stepping into that. d_namespace_path() really wants to examine the place where we stopped, even if it's not connected to our namespace. As the result, it looked at ->d_sb->s_magic of a dentry that might've been already freed by that point. All other callers had been careful enough to avoid that, but it's really a bad interface - it invites that kind of trouble. The fix is fairly straightforward, even though it's bigger than I'd like: * prepend_path() root argument becomes const. * __d_path() is never called with NULL/NULL root. It was a kludge to start with. Instead, we have an explicit function - d_absolute_root(). Same as __d_path(), except that it doesn't get root passed and stops where it stops. apparmor and tomoyo are using it. * __d_path() returns NULL on path outside of root. The main caller is show_mountinfo() and that's precisely what we pass root for - to skip those outside chroot jail. Those who don't want that can (and do) use d_path(). * __d_path() root argument becomes const. Everyone agrees, I hope. * apparmor does *NOT* try to use __d_path() or any of its variants when it sees that path->mnt is an internal vfsmount. In that case it's definitely not mounted anywhere and dentry_path() is exactly what we want there. Handling of sysctl()-triggered weirdness is moved to that place. * if apparmor is asked to do pathname relative to chroot jail and __d_path() tells it we it's not in that jail, the sucker just calls d_absolute_path() instead. That's the other remaining caller of __d_path(), BTW. * seq_path_root() does _NOT_ return -ENAMETOOLONG (it's stupid anyway - the normal seq_file logics will take care of growing the buffer and redoing the call of ->show() just fine). However, if it gets path not reachable from root, it returns SEQ_SKIP. The only caller adjusted (i.e. stopped ignoring the return value as it used to do). Reviewed-by: John Johansen ACKed-by: John Johansen Signed-off-by: Al Viro Cc: stable@vger.kernel.org --- fs/dcache.c | 71 ++++++++++++++++++++++++++++------------------ fs/namespace.c | 20 +++++++------ fs/seq_file.c | 6 ++-- include/linux/dcache.h | 3 +- include/linux/fs.h | 1 + security/apparmor/path.c | 65 ++++++++++++++++++++++++------------------ security/tomoyo/realpath.c | 3 +- 7 files changed, 100 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/fs/dcache.c b/fs/dcache.c index 10ba92def3f6..89509b5a090e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2439,16 +2439,14 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) /** * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buffer: pointer to the end of the buffer * @buflen: pointer to buffer length * * Caller holds the rename_lock. - * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). */ -static int prepend_path(const struct path *path, struct path *root, +static int prepend_path(const struct path *path, + const struct path *root, char **buffer, int *buflen) { struct dentry *dentry = path->dentry; @@ -2483,10 +2481,10 @@ static int prepend_path(const struct path *path, struct path *root, dentry = parent; } -out: if (!error && !slash) error = prepend(buffer, buflen, "/", 1); +out: br_read_unlock(vfsmount_lock); return error; @@ -2500,15 +2498,17 @@ global_root: WARN(1, "Root dentry has weird name <%.*s>\n", (int) dentry->d_name.len, dentry->d_name.name); } - root->mnt = vfsmnt; - root->dentry = dentry; + if (!slash) + error = prepend(buffer, buflen, "/", 1); + if (!error) + error = vfsmnt->mnt_ns ? 1 : 2; goto out; } /** * __d_path - return the path of a dentry * @path: the dentry/vfsmount to report - * @root: root vfsmnt/dentry (may be modified by this function) + * @root: root vfsmnt/dentry * @buf: buffer to return value in * @buflen: buffer length * @@ -2519,10 +2519,10 @@ global_root: * * "buflen" should be positive. * - * If path is not reachable from the supplied root, then the value of - * root is changed (without modifying refcounts). + * If the path is not reachable from the supplied root, return %NULL. */ -char *__d_path(const struct path *path, struct path *root, +char *__d_path(const struct path *path, + const struct path *root, char *buf, int buflen) { char *res = buf + buflen; @@ -2533,7 +2533,28 @@ char *__d_path(const struct path *path, struct path *root, error = prepend_path(path, root, &res, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) + return ERR_PTR(error); + if (error > 0) + return NULL; + return res; +} + +char *d_absolute_path(const struct path *path, + char *buf, int buflen) +{ + struct path root = {}; + char *res = buf + buflen; + int error; + + prepend(&res, &buflen, "\0", 1); + write_seqlock(&rename_lock); + error = prepend_path(path, &root, &res, &buflen); + write_sequnlock(&rename_lock); + + if (error > 1) + error = -EINVAL; + if (error < 0) return ERR_PTR(error); return res; } @@ -2541,8 +2562,9 @@ char *__d_path(const struct path *path, struct path *root, /* * same as __d_path but appends "(deleted)" for unlinked files. */ -static int path_with_deleted(const struct path *path, struct path *root, - char **buf, int *buflen) +static int path_with_deleted(const struct path *path, + const struct path *root, + char **buf, int *buflen) { prepend(buf, buflen, "\0", 1); if (d_unlinked(path->dentry)) { @@ -2579,7 +2601,6 @@ char *d_path(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; /* @@ -2594,9 +2615,8 @@ char *d_path(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (error) + error = path_with_deleted(path, &root, &res, &buflen); + if (error < 0) res = ERR_PTR(error); write_sequnlock(&rename_lock); path_put(&root); @@ -2617,7 +2637,6 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) { char *res = buf + buflen; struct path root; - struct path tmp; int error; if (path->dentry->d_op && path->dentry->d_op->d_dname) @@ -2625,9 +2644,8 @@ char *d_path_with_unreachable(const struct path *path, char *buf, int buflen) get_fs_root(current->fs, &root); write_seqlock(&rename_lock); - tmp = root; - error = path_with_deleted(path, &tmp, &res, &buflen); - if (!error && !path_equal(&tmp, &root)) + error = path_with_deleted(path, &root, &res, &buflen); + if (error > 0) error = prepend_unreachable(&res, &buflen); write_sequnlock(&rename_lock); path_put(&root); @@ -2758,19 +2776,18 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) write_seqlock(&rename_lock); if (!d_unlinked(pwd.dentry)) { unsigned long len; - struct path tmp = root; char *cwd = page + PAGE_SIZE; int buflen = PAGE_SIZE; prepend(&cwd, &buflen, "\0", 1); - error = prepend_path(&pwd, &tmp, &cwd, &buflen); + error = prepend_path(&pwd, &root, &cwd, &buflen); write_sequnlock(&rename_lock); - if (error) + if (error < 0) goto out; /* Unreachable from current root */ - if (!path_equal(&tmp, &root)) { + if (error > 0) { error = prepend_unreachable(&cwd, &buflen); if (error) goto out; diff --git a/fs/namespace.c b/fs/namespace.c index 6d3a1963879b..cfc6d4448aa5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1048,15 +1048,12 @@ static int show_mountinfo(struct seq_file *m, void *v) if (err) goto out; seq_putc(m, ' '); - seq_path_root(m, &mnt_path, &root, " \t\n\\"); - if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { - /* - * Mountpoint is outside root, discard that one. Ugly, - * but less so than trying to do that in iterator in a - * race-free way (due to renames). - */ - return SEQ_SKIP; - } + + /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ + err = seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (err) + goto out; + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); show_mnt_opts(m, mnt); @@ -2776,3 +2773,8 @@ void kern_unmount(struct vfsmount *mnt) } } EXPORT_SYMBOL(kern_unmount); + +bool our_mnt(struct vfsmount *mnt) +{ + return check_mnt(mnt); +} diff --git a/fs/seq_file.c b/fs/seq_file.c index 05d6b0e78c95..dba43c3ea3af 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -449,8 +449,6 @@ EXPORT_SYMBOL(seq_path); /* * Same as seq_path, but relative to supplied root. - * - * root may be changed, see __d_path(). */ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *esc) @@ -463,6 +461,8 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, char *p; p = __d_path(path, root, buf, size); + if (!p) + return SEQ_SKIP; res = PTR_ERR(p); if (!IS_ERR(p)) { char *end = mangle_path(buf, p, esc); @@ -474,7 +474,7 @@ int seq_path_root(struct seq_file *m, struct path *path, struct path *root, } seq_commit(m, res); - return res < 0 ? res : 0; + return res < 0 && res != -ENAMETOOLONG ? res : 0; } /* diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4df926199369..ed9f74f6c519 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -339,7 +339,8 @@ extern int d_validate(struct dentry *, struct dentry *); */ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); -extern char *__d_path(const struct path *path, struct path *root, char *, int); +extern char *__d_path(const struct path *, const struct path *, char *, int); +extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *d_path_with_unreachable(const struct path *, char *, int); extern char *dentry_path_raw(struct dentry *, char *, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index e3130220ce3e..019dc558df1a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1942,6 +1942,7 @@ extern int fd_statfs(int, struct kstatfs *); extern int statfs_by_dentry(struct dentry *, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); +extern bool our_mnt(struct vfsmount *mnt); extern int current_umask(void); diff --git a/security/apparmor/path.c b/security/apparmor/path.c index 36cc0cc39e78..b566eba4a65c 100644 --- a/security/apparmor/path.c +++ b/security/apparmor/path.c @@ -57,23 +57,44 @@ static int prepend(char **buffer, int buflen, const char *str, int namelen) static int d_namespace_path(struct path *path, char *buf, int buflen, char **name, int flags) { - struct path root, tmp; char *res; - int connected, error = 0; + int error = 0; + int connected = 1; + + if (path->mnt->mnt_flags & MNT_INTERNAL) { + /* it's not mounted anywhere */ + res = dentry_path(path->dentry, buf, buflen); + *name = res; + if (IS_ERR(res)) { + *name = buf; + return PTR_ERR(res); + } + if (path->dentry->d_sb->s_magic == PROC_SUPER_MAGIC && + strncmp(*name, "/sys/", 5) == 0) { + /* TODO: convert over to using a per namespace + * control instead of hard coded /proc + */ + return prepend(name, *name - buf, "/proc", 5); + } + return 0; + } - /* Get the root we want to resolve too, released below */ + /* resolve paths relative to chroot?*/ if (flags & PATH_CHROOT_REL) { - /* resolve paths relative to chroot */ + struct path root; get_fs_root(current->fs, &root); - } else { - /* resolve paths relative to namespace */ - root.mnt = current->nsproxy->mnt_ns->root; - root.dentry = root.mnt->mnt_root; - path_get(&root); + res = __d_path(path, &root, buf, buflen); + if (res && !IS_ERR(res)) { + /* everything's fine */ + *name = res; + path_put(&root); + goto ok; + } + path_put(&root); + connected = 0; } - tmp = root; - res = __d_path(path, &tmp, buf, buflen); + res = d_absolute_path(path, buf, buflen); *name = res; /* handle error conditions - and still allow a partial path to @@ -84,7 +105,10 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, *name = buf; goto out; } + if (!our_mnt(path->mnt)) + connected = 0; +ok: /* Handle two cases: * 1. A deleted dentry && profile is not allowing mediation of deleted * 2. On some filesystems, newly allocated dentries appear to the @@ -97,10 +121,7 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, goto out; } - /* Determine if the path is connected to the expected root */ - connected = tmp.dentry == root.dentry && tmp.mnt == root.mnt; - - /* If the path is not connected, + /* If the path is not connected to the expected root, * check if it is a sysctl and handle specially else remove any * leading / that __d_path may have returned. * Unless @@ -112,17 +133,9 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, * namespace root. */ if (!connected) { - /* is the disconnect path a sysctl? */ - if (tmp.dentry->d_sb->s_magic == PROC_SUPER_MAGIC && - strncmp(*name, "/sys/", 5) == 0) { - /* TODO: convert over to using a per namespace - * control instead of hard coded /proc - */ - error = prepend(name, *name - buf, "/proc", 5); - } else if (!(flags & PATH_CONNECT_PATH) && + if (!(flags & PATH_CONNECT_PATH) && !(((flags & CHROOT_NSCONNECT) == CHROOT_NSCONNECT) && - (tmp.mnt == current->nsproxy->mnt_ns->root && - tmp.dentry == tmp.mnt->mnt_root))) { + our_mnt(path->mnt))) { /* disconnected path, don't return pathname starting * with '/' */ @@ -133,8 +146,6 @@ static int d_namespace_path(struct path *path, char *buf, int buflen, } out: - path_put(&root); - return error; } diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c index 738bbdf8d4c7..36fa7c9bedc4 100644 --- a/security/tomoyo/realpath.c +++ b/security/tomoyo/realpath.c @@ -101,9 +101,8 @@ static char *tomoyo_get_absolute_path(struct path *path, char * const buffer, { char *pos = ERR_PTR(-ENOMEM); if (buflen >= 256) { - struct path ns_root = { }; /* go to whatever namespace root we are under */ - pos = __d_path(path, &ns_root, buffer, buflen - 1); + pos = d_absolute_path(path, buffer, buflen - 1); if (!IS_ERR(pos) && *pos == '/' && pos[1]) { struct inode *inode = path->dentry->d_inode; if (inode && S_ISDIR(inode->i_mode)) { -- cgit v1.2.3 From 83aeeada7c69f35e5100b27ec354335597a7a488 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 8 Dec 2011 14:33:54 -0800 Subject: vmscan: use atomic-long for shrinker batching Use atomic-long operations instead of looping around cmpxchg(). [akpm@linux-foundation.org: massage atomic.h inclusions] Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 2 +- include/linux/mm.h | 1 + include/linux/shrinker.h | 2 +- mm/vmscan.c | 17 +++++++---------- 4 files changed, 10 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 019dc558df1a..e0bc4ffb8e7f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -393,8 +393,8 @@ struct inodes_stat_t { #include #include #include -#include #include +#include #include diff --git a/include/linux/mm.h b/include/linux/mm.h index 3dc3a8c2c485..4baadd18f4ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index a83833a1f7a2..07ceb97d53fa 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -35,7 +35,7 @@ struct shrinker { /* These are for internal use */ struct list_head list; - long nr; /* objs pending delete */ + atomic_long_t nr_in_batch; /* objs pending delete */ }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ extern void register_shrinker(struct shrinker *); diff --git a/mm/vmscan.c b/mm/vmscan.c index f5255442ae2b..f54a05b7a61d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -183,7 +183,7 @@ static unsigned long zone_nr_lru_pages(struct zone *zone, */ void register_shrinker(struct shrinker *shrinker) { - shrinker->nr = 0; + atomic_long_set(&shrinker->nr_in_batch, 0); down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); @@ -264,9 +264,7 @@ unsigned long shrink_slab(struct shrink_control *shrink, * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ - do { - nr = shrinker->nr; - } while (cmpxchg(&shrinker->nr, nr, 0) != nr); + nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); total_scan = nr; delta = (4 * nr_pages_scanned) / shrinker->seeks; @@ -328,12 +326,11 @@ unsigned long shrink_slab(struct shrink_control *shrink, * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - do { - nr = shrinker->nr; - new_nr = total_scan + nr; - if (total_scan <= 0) - break; - } while (cmpxchg(&shrinker->nr, nr, new_nr) != nr); + if (total_scan > 0) + new_nr = atomic_long_add_return(total_scan, + &shrinker->nr_in_batch); + else + new_nr = atomic_long_read(&shrinker->nr_in_batch); trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); } -- cgit v1.2.3