From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a4d4ca..cf3c2f5dba51 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ -- cgit v1.2.3 From 1ae88b2e446261c038f2c0c3150ffae142b227a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Aug 2009 09:12:30 -0400 Subject: NFS: Fix an O_DIRECT Oops... We can't call nfs_readdata_release()/nfs_writedata_release() without first initialising and referencing args.context. Doing so inside nfs_direct_read_schedule_segment()/nfs_direct_write_schedule_segment() causes an Oops. We should rather be calling nfs_readdata_free()/nfs_writedata_free() in those cases. Looking at the O_DIRECT code, the "struct nfs_direct_req" is already referencing the nfs_open_context for us. Since the readdata and writedata structures carry a reference to that, we can simplify things by getting rid of the extra nfs_open_context references, so that we can replace all instances of nfs_readdata_release()/nfs_writedata_release(). Reported-by: Catalin Marinas Signed-off-by: Trond Myklebust Tested-by: Catalin Marinas Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 20 ++++++++++---------- fs/nfs/read.c | 6 ++---- fs/nfs/write.c | 6 ++---- include/linux/nfs_fs.h | 5 ++--- 4 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 489fc01a3204..e4e089a8f294 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); - nfs_readdata_release(calldata); + nfs_readdata_free(data); } static const struct rpc_call_ops nfs_read_direct_ops = { @@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->npages, 1, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_readdata_release(data); + nfs_readdata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); + nfs_readdata_free(data); break; } bytes -= pgbase; @@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); list_del(&data->pages); nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_release(data); + nfs_writedata_free(data); } } @@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata) dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); - nfs_commitdata_release(calldata); + nfs_commit_free(data); } static const struct rpc_call_ops nfs_commit_direct_ops = { @@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; - data->args.context = get_nfs_open_context(dreq->ctx); + data->args.context = dreq->ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->npages, 0, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_writedata_release(data); + nfs_writedata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); + nfs_writedata_free(data); break; } bytes -= pgbase; @@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 73ea5e8d66ce..12c9e66d3f1d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_free(struct nfs_read_data *p) +void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -void nfs_readdata_release(void *data) +static void nfs_readdata_release(struct nfs_read_data *rdata) { - struct nfs_read_data *rdata = data; - put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0a0a2ff767c3..a34fae21fe10 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -87,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_free(struct nfs_write_data *p) +void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -void nfs_writedata_release(void *data) +static void nfs_writedata_release(struct nfs_write_data *wdata) { - struct nfs_write_data *wdata = data; - put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fdffb413b192..f6b90240dd41 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -473,7 +473,6 @@ extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); -extern void nfs_writedata_release(void *); /* * Try to write back everything synchronously (but check the @@ -488,7 +487,6 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); -extern void nfs_commitdata_release(void *wdata); #else static inline int nfs_commit_inode(struct inode *inode, int how) @@ -507,6 +505,7 @@ nfs_have_writebacks(struct inode *inode) * Allocate nfs_write_data structures */ extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages); +extern void nfs_writedata_free(struct nfs_write_data *); /* * linux/fs/nfs/read.c @@ -515,7 +514,6 @@ extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); -extern void nfs_readdata_release(void *data); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); @@ -523,6 +521,7 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, * Allocate nfs_read_data structures */ extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages); +extern void nfs_readdata_free(struct nfs_read_data *); /* * linux/fs/nfs3proc.c -- cgit v1.2.3 From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a93fe8..2b36afe6ae0f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -694,6 +694,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b0b20a07f394..e26d2fcfa320 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4616,6 +4622,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v1.2.3 From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 47 ++++++-- kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++------------ 2 files changed, 238 insertions(+), 83 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b36afe6ae0f..b53f7006cc4e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -115,7 +115,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_TID = 1U << 1, PERF_SAMPLE_TIME = 1U << 2, PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, @@ -127,16 +127,32 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in attr.read_format to request that - * reads on the counter should return the indicated quantities, - * in increasing order of bit value, after the counter value. + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; */ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -343,10 +359,8 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, tid; - * u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 parent_id; } && PERF_FORMAT_ID + * + * struct read_format values; * }; */ PERF_EVENT_READ = 8, @@ -364,11 +378,22 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * - * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP + * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW * }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3dd4339589a0..b8c6b97a20a3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -static u64 perf_counter_read_tree(struct perf_counter *counter) +static int perf_counter_read_size(struct perf_counter *counter) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) { struct perf_counter *child; u64 total = 0; @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter) return total; } +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + /* * Read the performance counter - simple non blocking version for now */ static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[4]; - int n; + u64 read_format = counter->attr.read_format; + int ret; /* * Return end-of-file for a read on a counter that is in @@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read_tree(counter); - n = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); mutex_unlock(&counter->child_mutex); - if (count < n * sizeof(u64)) - return -EINVAL; - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; + return ret; } static ssize_t @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); +} + void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi, struct { u32 pid, tid; } tid_entry; - struct { - u64 id; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - if (sample_type & PERF_SAMPLE_GROUP) { - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(data->regs); @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(&handle, data->period); - /* - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. - */ - if (sample_type & PERF_SAMPLE_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; - - perf_output_put(&handle, nr); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); - - group_entry.id = primary_counter_id(sub); - group_entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, group_entry); - } - } + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (callchain) @@ -2818,8 +2964,6 @@ struct perf_read_event { u32 pid; u32 tid; - u64 value; - u64 format[3]; }; static void @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter, .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) - sizeof(event.format), + .size = sizeof(event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), - .value = atomic64_read(&counter->count), }; - int ret, i = 0; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_enabled; - } - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_running; - } - - if (counter->attr.read_format & PERF_FORMAT_ID) { - event.header.size += sizeof(u64); - event.format[i++] = primary_counter_id(counter); - } + int ret; ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); if (ret) return; - perf_output_copy(&handle, &event, event.header.size); + perf_output_put(&handle, event); + perf_output_read(&handle, counter); + perf_output_end(&handle); } @@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_SAMPLE_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; switch (attr->type) { -- cgit v1.2.3 From 9c0d90103c7e0eb6e638e5b649e9f6d8d9c1b4b3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:53:58 -0400 Subject: Capabilities: move cap_file_mmap to commoncap.c Currently we duplicate the mmap_min_addr test in cap_file_mmap and in security_file_mmap if !CONFIG_SECURITY. This patch moves cap_file_mmap into commoncap.c and then calls that function directly from security_file_mmap ifndef CONFIG_SECURITY like all of the other capability checks are done. Signed-off-by: Eric Paris Acked-by: Serge Hallyn Signed-off-by: James Morris --- include/linux/security.h | 7 ++++--- security/capability.c | 9 --------- security/commoncap.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index 5eff459b3833..ac4bc3760b46 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -66,6 +66,9 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name, extern int cap_inode_removexattr(struct dentry *dentry, const char *name); extern int cap_inode_need_killpriv(struct dentry *dentry); extern int cap_inode_killpriv(struct dentry *dentry); +extern int cap_file_mmap(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags, + unsigned long addr, unsigned long addr_only); extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5); @@ -2197,9 +2200,7 @@ static inline int security_file_mmap(struct file *file, unsigned long reqprot, unsigned long addr, unsigned long addr_only) { - if ((addr < mmap_min_addr) && !capable(CAP_SYS_RAWIO)) - return -EACCES; - return 0; + return cap_file_mmap(file, reqprot, prot, flags, addr, addr_only); } static inline int security_file_mprotect(struct vm_area_struct *vma, diff --git a/security/capability.c b/security/capability.c index 21b6cead6a8e..88f752e8152c 100644 --- a/security/capability.c +++ b/security/capability.c @@ -330,15 +330,6 @@ static int cap_file_ioctl(struct file *file, unsigned int command, return 0; } -static int cap_file_mmap(struct file *file, unsigned long reqprot, - unsigned long prot, unsigned long flags, - unsigned long addr, unsigned long addr_only) -{ - if ((addr < mmap_min_addr) && !capable(CAP_SYS_RAWIO)) - return -EACCES; - return 0; -} - static int cap_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot, unsigned long prot) { diff --git a/security/commoncap.c b/security/commoncap.c index 48b7e0228fa3..6bcf6e81e547 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -984,3 +984,33 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages) cap_sys_admin = 1; return __vm_enough_memory(mm, pages, cap_sys_admin); } + +/* + * cap_file_mmap - check if able to map given addr + * @file: unused + * @reqprot: unused + * @prot: unused + * @flags: unused + * @addr: address attempting to be mapped + * @addr_only: unused + * + * If the process is attempting to map memory below mmap_min_addr they need + * CAP_SYS_RAWIO. The other parameters to this function are unused by the + * capability security module. Returns 0 if this mapping should be allowed + * -EPERM if not. + */ +int cap_file_mmap(struct file *file, unsigned long reqprot, + unsigned long prot, unsigned long flags, + unsigned long addr, unsigned long addr_only) +{ + int ret = 0; + + if (addr < mmap_min_addr) { + ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO, + SECURITY_CAP_AUDIT); + /* set PF_SUPERPRIV if it turns out we allow the low mmap */ + if (ret == 0) + current->flags |= PF_SUPERPRIV; + } + return ret; +} -- cgit v1.2.3 From 788084aba2ab7348257597496befcbccabdc98a3 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 31 Jul 2009 12:54:11 -0400 Subject: Security/SELinux: seperate lsm specific mmap_min_addr Currently SELinux enforcement of controls on the ability to map low memory is determined by the mmap_min_addr tunable. This patch causes SELinux to ignore the tunable and instead use a seperate Kconfig option specific to how much space the LSM should protect. The tunable will now only control the need for CAP_SYS_RAWIO and SELinux permissions will always protect the amount of low memory designated by CONFIG_LSM_MMAP_MIN_ADDR. This allows users who need to disable the mmap_min_addr controls (usual reason being they run WINE as a non-root user) to do so and still have SELinux controls preventing confined domains (like a web server) from being able to map some area of low memory. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/mm.h | 15 --------------- include/linux/security.h | 17 +++++++++++++++++ kernel/sysctl.c | 7 ++++--- mm/Kconfig | 6 +++--- mm/mmap.c | 3 --- mm/nommu.c | 3 --- security/Kconfig | 16 ++++++++++++++++ security/Makefile | 2 +- security/commoncap.c | 2 +- security/min_addr.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++ security/selinux/hooks.c | 2 +- 11 files changed, 92 insertions(+), 30 deletions(-) create mode 100644 security/min_addr.c (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ba3a7cb1eaa0..9a72cc78e6b8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -34,8 +34,6 @@ extern int sysctl_legacy_va_layout; #define sysctl_legacy_va_layout 0 #endif -extern unsigned long mmap_min_addr; - #include #include #include @@ -574,19 +572,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, set_page_section(page, pfn_to_section_nr(pfn)); } -/* - * If a hint addr is less than mmap_min_addr change hint to be as - * low as possible but still greater than mmap_min_addr - */ -static inline unsigned long round_hint_to_min(unsigned long hint) -{ - hint &= PAGE_MASK; - if (((void *)hint != NULL) && - (hint < mmap_min_addr)) - return PAGE_ALIGN(mmap_min_addr); - return hint; -} - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/security.h b/include/linux/security.h index ac4bc3760b46..dc3472c1f781 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -28,6 +28,7 @@ #include #include #include +#include /* PAGE_ALIGN */ #include #include #include @@ -95,6 +96,7 @@ extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb, int cap); extern unsigned long mmap_min_addr; +extern unsigned long dac_mmap_min_addr; /* * Values used in the task_security_ops calls */ @@ -147,6 +149,21 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) opts->num_mnt_opts = 0; } +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} + +extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); /** * struct security_operations - main security structure * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 98e02328c67d..58be76017fd0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -1306,10 +1307,10 @@ static struct ctl_table vm_table[] = { { .ctl_name = CTL_UNNUMBERED, .procname = "mmap_min_addr", - .data = &mmap_min_addr, - .maxlen = sizeof(unsigned long), + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), .mode = 0644, - .proc_handler = &proc_doulongvec_minmax, + .proc_handler = &mmap_min_addr_handler, }, #ifdef CONFIG_NUMA { diff --git a/mm/Kconfig b/mm/Kconfig index c948d4ca8bde..fe5f674d7a7d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,9 +225,9 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. diff --git a/mm/mmap.c b/mm/mmap.c index 34579b23ebd5..8101de490c73 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to diff --git a/mm/nommu.c b/mm/nommu.c index 53cab10fece4..28754c40be98 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -69,9 +69,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); diff --git a/security/Kconfig b/security/Kconfig index d23c839038f0..9c60c346a91d 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -113,6 +113,22 @@ config SECURITY_ROOTPLUG If you are unsure how to answer this question, answer N. +config LSM_MMAP_MIN_ADDR + int "Low address space for LSM to from user allocation" + depends on SECURITY && SECURITY_SELINUX + default 65535 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages + can help reduce the impact of kernel NULL pointer bugs. + + For most ia64, ppc64 and x86 users with lots of address space + a value of 65536 is reasonable and should cause no problems. + On arm and other archs it should not be higher than 32768. + Programs which use vm86 functionality or have some need to map + this low address space will need the permission specific to the + systems running LSM. + source security/selinux/Kconfig source security/smack/Kconfig source security/tomoyo/Kconfig diff --git a/security/Makefile b/security/Makefile index c67557cdaa85..b56e7f9ecbc2 100644 --- a/security/Makefile +++ b/security/Makefile @@ -8,7 +8,7 @@ subdir-$(CONFIG_SECURITY_SMACK) += smack subdir-$(CONFIG_SECURITY_TOMOYO) += tomoyo # always enable default capabilities -obj-y += commoncap.o +obj-y += commoncap.o min_addr.o # Object file lists obj-$(CONFIG_SECURITY) += security.o capability.o diff --git a/security/commoncap.c b/security/commoncap.c index 6bcf6e81e547..e3097c0a1311 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -1005,7 +1005,7 @@ int cap_file_mmap(struct file *file, unsigned long reqprot, { int ret = 0; - if (addr < mmap_min_addr) { + if (addr < dac_mmap_min_addr) { ret = cap_capable(current, current_cred(), CAP_SYS_RAWIO, SECURITY_CAP_AUDIT); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ diff --git a/security/min_addr.c b/security/min_addr.c new file mode 100644 index 000000000000..14cc7b3b8d03 --- /dev/null +++ b/security/min_addr.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include + +/* amount of vm to protect from userspace access by both DAC and the LSM*/ +unsigned long mmap_min_addr; +/* amount of vm to protect from userspace using CAP_SYS_RAWIO (DAC) */ +unsigned long dac_mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; +/* amount of vm to protect from userspace using the LSM = CONFIG_LSM_MMAP_MIN_ADDR */ + +/* + * Update mmap_min_addr = max(dac_mmap_min_addr, CONFIG_LSM_MMAP_MIN_ADDR) + */ +static void update_mmap_min_addr(void) +{ +#ifdef CONFIG_LSM_MMAP_MIN_ADDR + if (dac_mmap_min_addr > CONFIG_LSM_MMAP_MIN_ADDR) + mmap_min_addr = dac_mmap_min_addr; + else + mmap_min_addr = CONFIG_LSM_MMAP_MIN_ADDR; +#else + mmap_min_addr = dac_mmap_min_addr; +#endif +} + +/* + * sysctl handler which just sets dac_mmap_min_addr = the new value and then + * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly + */ +int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + + update_mmap_min_addr(); + + return ret; +} + +int __init init_mmap_min_addr(void) +{ + update_mmap_min_addr(); + + return 0; +} +pure_initcall(init_mmap_min_addr); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index e6d1432b0800..8d8b69c5664e 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3036,7 +3036,7 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot, * at bad behaviour/exploit that we always want to get the AVC, even * if DAC would have also denied the operation. */ - if (addr < mmap_min_addr) { + if (addr < CONFIG_LSM_MMAP_MIN_ADDR) { rc = avc_has_perm(sid, sid, SECCLASS_MEMPROTECT, MEMPROTECT__MMAP_ZERO, NULL); if (rc) -- cgit v1.2.3 From 1d9959734a1949ea4f2427bd2d8b21ede6b2441c Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 7 Aug 2009 14:53:57 -0400 Subject: security: define round_hint_to_min in !CONFIG_SECURITY Fix the header files to define round_hint_to_min() and to define mmap_min_addr_handler() in the !CONFIG_SECURITY case. Built and tested with !CONFIG_SECURITY Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/security.h | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/security.h b/include/linux/security.h index dc3472c1f781..1f16eea2017b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -121,6 +121,21 @@ struct request_sock; #define LSM_UNSAFE_PTRACE 2 #define LSM_UNSAFE_PTRACE_CAP 4 +/* + * If a hint addr is less than mmap_min_addr change hint to be as + * low as possible but still greater than mmap_min_addr + */ +static inline unsigned long round_hint_to_min(unsigned long hint) +{ + hint &= PAGE_MASK; + if (((void *)hint != NULL) && + (hint < mmap_min_addr)) + return PAGE_ALIGN(mmap_min_addr); + return hint; +} +extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); + #ifdef CONFIG_SECURITY struct security_mnt_opts { @@ -149,21 +164,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) opts->num_mnt_opts = 0; } -/* - * If a hint addr is less than mmap_min_addr change hint to be as - * low as possible but still greater than mmap_min_addr - */ -static inline unsigned long round_hint_to_min(unsigned long hint) -{ - hint &= PAGE_MASK; - if (((void *)hint != NULL) && - (hint < mmap_min_addr)) - return PAGE_ALIGN(mmap_min_addr); - return hint; -} - -extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, - void __user *buffer, size_t *lenp, loff_t *ppos); /** * struct security_operations - main security structure * -- cgit v1.2.3 From c1a8f1f1c8e01eab5862c8db39b49ace814e6c66 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 16 Aug 2009 09:36:49 +0000 Subject: net: restore gnet_stats_basic to previous definition In 5e140dfc1fe87eae27846f193086724806b33c7d "net: reorder struct Qdisc for better SMP performance" the definition of struct gnet_stats_basic changed incompatibly, as copies of this struct are shipped to userland via netlink. Restoring old behavior is not welcome, for performance reason. Fix is to use a private structure for kernel, and teach gnet_stats_copy_basic() to convert from kernel to user land, using legacy structure (struct gnet_stats_basic) Based on a report and initial patch from Michael Spang. Reported-by: Michael Spang Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/gen_stats.h | 5 +++++ include/net/act_api.h | 2 +- include/net/gen_stats.h | 10 +++++----- include/net/netfilter/xt_rateest.h | 2 +- include/net/sch_generic.h | 2 +- net/core/gen_estimator.c | 12 ++++++------ net/core/gen_stats.c | 11 ++++++++--- net/netfilter/xt_RATEEST.c | 2 +- net/sched/sch_atm.c | 2 +- net/sched/sch_cbq.c | 2 +- net/sched/sch_drr.c | 2 +- net/sched/sch_hfsc.c | 2 +- net/sched/sch_htb.c | 2 +- 13 files changed, 33 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gen_stats.h b/include/linux/gen_stats.h index 0ffa41df0ee8..710e901085d0 100644 --- a/include/linux/gen_stats.h +++ b/include/linux/gen_stats.h @@ -19,6 +19,11 @@ enum { * @packets: number of seen packets */ struct gnet_stats_basic +{ + __u64 bytes; + __u32 packets; +}; +struct gnet_stats_basic_packed { __u64 bytes; __u32 packets; diff --git a/include/net/act_api.h b/include/net/act_api.h index 565eed8fe496..c05fd717c588 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -16,7 +16,7 @@ struct tcf_common { u32 tcfc_capab; int tcfc_action; struct tcf_t tcfc_tm; - struct gnet_stats_basic tcfc_bstats; + struct gnet_stats_basic_packed tcfc_bstats; struct gnet_stats_queue tcfc_qstats; struct gnet_stats_rate_est tcfc_rate_est; spinlock_t tcfc_lock; diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index d136b5240ef2..c1488553e349 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -28,7 +28,7 @@ extern int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d); extern int gnet_stats_copy_basic(struct gnet_dump *d, - struct gnet_stats_basic *b); + struct gnet_stats_basic_packed *b); extern int gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r); extern int gnet_stats_copy_queue(struct gnet_dump *d, @@ -37,14 +37,14 @@ extern int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); extern int gnet_stats_finish_copy(struct gnet_dump *d); -extern int gen_new_estimator(struct gnet_stats_basic *bstats, +extern int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct nlattr *opt); -extern void gen_kill_estimator(struct gnet_stats_basic *bstats, +extern void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est); -extern int gen_replace_estimator(struct gnet_stats_basic *bstats, +extern int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct nlattr *opt); -extern bool gen_estimator_active(const struct gnet_stats_basic *bstats, +extern bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, const struct gnet_stats_rate_est *rate_est); #endif diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h index 65d594dffbff..ddbf37e19616 100644 --- a/include/net/netfilter/xt_rateest.h +++ b/include/net/netfilter/xt_rateest.h @@ -8,7 +8,7 @@ struct xt_rateest { spinlock_t lock; struct gnet_estimator params; struct gnet_stats_rate_est rstats; - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; }; extern struct xt_rateest *xt_rateest_lookup(const char *name); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 964ffa0d8815..5482e9582f55 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -72,7 +72,7 @@ struct Qdisc */ unsigned long state; struct sk_buff_head q; - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; }; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 78e5bfc454ae..493775f4f2f1 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -81,7 +81,7 @@ struct gen_estimator { struct list_head list; - struct gnet_stats_basic *bstats; + struct gnet_stats_basic_packed *bstats; struct gnet_stats_rate_est *rate_est; spinlock_t *stats_lock; int ewma_log; @@ -165,7 +165,7 @@ static void gen_add_node(struct gen_estimator *est) } static -struct gen_estimator *gen_find_node(const struct gnet_stats_basic *bstats, +struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats, const struct gnet_stats_rate_est *rate_est) { struct rb_node *p = est_root.rb_node; @@ -202,7 +202,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic *bstats, * * NOTE: Called under rtnl_mutex */ -int gen_new_estimator(struct gnet_stats_basic *bstats, +int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct nlattr *opt) @@ -262,7 +262,7 @@ static void __gen_kill_estimator(struct rcu_head *head) * * NOTE: Called under rtnl_mutex */ -void gen_kill_estimator(struct gnet_stats_basic *bstats, +void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est) { struct gen_estimator *e; @@ -292,7 +292,7 @@ EXPORT_SYMBOL(gen_kill_estimator); * * Returns 0 on success or a negative error code. */ -int gen_replace_estimator(struct gnet_stats_basic *bstats, +int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, struct nlattr *opt) { @@ -308,7 +308,7 @@ EXPORT_SYMBOL(gen_replace_estimator); * * Returns true if estimator is active, and false if not. */ -bool gen_estimator_active(const struct gnet_stats_basic *bstats, +bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats, const struct gnet_stats_rate_est *rate_est) { ASSERT_RTNL(); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index c3d0ffeac243..8569310268ab 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -106,16 +106,21 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, * if the room in the socket buffer was not sufficient. */ int -gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b) +gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic_packed *b) { if (d->compat_tc_stats) { d->tc_stats.bytes = b->bytes; d->tc_stats.packets = b->packets; } - if (d->tail) - return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b)); + if (d->tail) { + struct gnet_stats_basic sb; + memset(&sb, 0, sizeof(sb)); + sb.bytes = b->bytes; + sb.packets = b->packets; + return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb)); + } return 0; } diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 43f5676b1af4..d80b8192e0d4 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -74,7 +74,7 @@ static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_target_param *par) { const struct xt_rateest_target_info *info = par->targinfo; - struct gnet_stats_basic *stats = &info->est->bstats; + struct gnet_stats_basic_packed *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); stats->bytes += skb->len; diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 2a8b83af7c47..ab82f145f689 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -49,7 +49,7 @@ struct atm_flow_data { struct socket *sock; /* for closing */ u32 classid; /* x:y type ID */ int ref; /* reference count */ - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; struct atm_flow_data *next; struct atm_flow_data *excess; /* flow for excess traffic; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 23a167670fd5..d5798e17a832 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -128,7 +128,7 @@ struct cbq_class long avgidle; long deficit; /* Saved deficit for WRR */ psched_time_t penalized; - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; struct tc_cbq_xstats xstats; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 7597fe146866..12b2fb04b29b 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -22,7 +22,7 @@ struct drr_class { unsigned int refcnt; unsigned int filter_cnt; - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; struct list_head alist; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 362c2811b2df..dad0144423da 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -116,7 +116,7 @@ struct hfsc_class struct Qdisc_class_common cl_common; unsigned int refcnt; /* usage count */ - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; unsigned int level; /* class level in hierarchy */ diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 88cd02626621..ec4d46399d59 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -74,7 +74,7 @@ enum htb_cmode { struct htb_class { struct Qdisc_class_common common; /* general class parameters */ - struct gnet_stats_basic bstats; + struct gnet_stats_basic_packed bstats; struct gnet_stats_queue qstats; struct gnet_stats_rate_est rate_est; struct tc_htb_xstats xstats; /* our special stats */ -- cgit v1.2.3 From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 15 +++------ fs/proc/base.c | 19 ++--------- include/linux/mm_types.h | 2 -- include/linux/sched.h | 1 + kernel/fork.c | 1 - mm/oom_kill.c | 64 +++++++++++++++++++++++--------------- 6 files changed, 48 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fad18f9456e4..ffead13f9443 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1167,13 +1167,11 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc//oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes should -be killed in an out-of-memory situation. The oom_adj value is a characteristic -of the task's mm, so all threads that share an mm with pid will have the same -oom_adj value. A high value will increase the likelihood of this process being -killed by the oom-killer. Valid values are in the range -16 to +15 as -explained below and a special value of -17, which disables oom-killing -altogether for threads sharing pid's mm. +This file can be used to adjust the score used to select which processes +should be killed in an out-of-memory situation. Giving it a high score will +increase the likelihood of this process being killed by the oom-killer. Valid +values are in the range -16 to +15, plus the special value -17, which disables +oom-killing altogether for this process. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1187,9 +1185,6 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. -/proc//oom_adj cannot be changed for kthreads since they are immune from -oom-killing already. - /proc//oom_score shows process' current badness score. The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 175db258942f..6f742f6658a9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - task_lock(task); - if (task->mm) - oom_adjust = task->mm->oom_adj; - else - oom_adjust = OOM_DISABLE; - task_unlock(task); + oom_adjust = task->oomkilladj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - task_lock(task); - if (!task->mm) { - task_unlock(task); - put_task_struct(task); - return -EINVAL; - } - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { - task_unlock(task); + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { put_task_struct(task); return -EACCES; } - task->mm->oom_adj = oom_adjust; - task_unlock(task); + task->oomkilladj = oom_adjust; put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7acc8439d9b3..0042090a4d70 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,8 +240,6 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - s8 oom_adj; /* OOM kill score adjustment (bit shift) */ - cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b8..0f1ea4a66957 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1198,6 +1198,7 @@ struct task_struct { * a short time */ unsigned char fpu_counter; + s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 021e1138556e..144326b7af50 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; - mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a99..a7b2460e922b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; task_lock(p); mm = p->mm; @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oom_adj. + * Adjust the score by oomkilladj. */ - if (oom_adj) { - if (oom_adj > 0) { + if (p->oomkilladj) { + if (p->oomkilladj > 0) { if (!points) points = 1; - points <<= oom_adj; + points <<= p->oomkilladj; } else - points >>= -(oom_adj); + points >>= -(p->oomkilladj); } #ifdef DEBUG @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; - task_lock(p); mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL) return 1; - } - task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + __oom_kill_task(p, 1); /* @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } -- cgit v1.2.3 From 1700f5fde88f9a251037bc86bde538ee32c59905 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 20 Aug 2009 22:05:53 -0700 Subject: Input: ucb1400_ts - enable ADC Filter This patch enables ADC filtering on UCB1400 codec by default. The benefit from this change is mostly on some Colibri boards where the ADCSYNC pin of the UCB1400 codec isn't connected causing the touchscreen to jitter very badly. This change has no visible effect on boards where the ADCSYNC pin is connected. Signed-off-by: Marek Vasut Tested-by: Palo Revak Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/ucb1400_ts.c | 9 +++++++++ include/linux/ucb1400.h | 4 ++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/ucb1400_ts.c b/drivers/input/touchscreen/ucb1400_ts.c index 6954f5500108..3b345f9cf0c7 100644 --- a/drivers/input/touchscreen/ucb1400_ts.c +++ b/drivers/input/touchscreen/ucb1400_ts.c @@ -345,6 +345,7 @@ static int ucb1400_ts_detect_irq(struct ucb1400_ts *ucb) static int ucb1400_ts_probe(struct platform_device *dev) { int error, x_res, y_res; + u16 fcsr; struct ucb1400_ts *ucb = dev->dev.platform_data; ucb->ts_idev = input_allocate_device(); @@ -382,6 +383,14 @@ static int ucb1400_ts_probe(struct platform_device *dev) ucb->ts_idev->evbit[0] = BIT_MASK(EV_ABS) | BIT_MASK(EV_KEY); ucb->ts_idev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); + /* + * Enable ADC filter to prevent horrible jitter on Colibri. + * This also further reduces jitter on boards where ADCSYNC + * pin is connected. + */ + fcsr = ucb1400_reg_read(ucb->ac97, UCB_FCSR); + ucb1400_reg_write(ucb->ac97, UCB_FCSR, fcsr | UCB_FCSR_AVE); + ucb1400_adc_enable(ucb->ac97); x_res = ucb1400_ts_read_xres(ucb); y_res = ucb1400_ts_read_yres(ucb); diff --git a/include/linux/ucb1400.h b/include/linux/ucb1400.h index ed889f4168f3..ae779bb8cc0f 100644 --- a/include/linux/ucb1400.h +++ b/include/linux/ucb1400.h @@ -73,6 +73,10 @@ #define UCB_ADC_DATA 0x68 #define UCB_ADC_DAT_VALID (1 << 15) + +#define UCB_FCSR 0x6c +#define UCB_FCSR_AVE (1 << 12) + #define UCB_ADC_DAT_MASK 0x3ff #define UCB_ID 0x7e -- cgit v1.2.3 From f4b0373b26567cafd421d91101852ed7a34e9e94 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 21 Aug 2009 09:26:15 -0700 Subject: Make bitmask 'and' operators return a result code When 'and'ing two bitmasks (where 'andnot' is a variation on it), some cases want to know whether the result is the empty set or not. In particular, the TLB IPI sending code wants to do cpumask operations and determine if there are any CPU's left in the final set. So this just makes the bitmask (and cpumask) functions return a boolean for whether the result has any bits set. Cc: stable@kernel.org (2.6.30, needed by TLB shootdown fix) Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 18 ++++++++---------- include/linux/cpumask.h | 20 ++++++++++---------- lib/bitmap.c | 12 ++++++++---- 3 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 2878811c6134..756d78b8c1c5 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -94,13 +94,13 @@ extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, int shift, int bits); extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); -extern void __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); -extern void __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, +extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); extern int __bitmap_intersects(const unsigned long *bitmap1, const unsigned long *bitmap2, int bits); @@ -171,13 +171,12 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } } -static inline void bitmap_and(unsigned long *dst, const unsigned long *src1, +static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { if (small_const_nbits(nbits)) - *dst = *src1 & *src2; - else - __bitmap_and(dst, src1, src2, nbits); + return (*dst = *src1 & *src2) != 0; + return __bitmap_and(dst, src1, src2, nbits); } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, @@ -198,13 +197,12 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, __bitmap_xor(dst, src1, src2, nbits); } -static inline void bitmap_andnot(unsigned long *dst, const unsigned long *src1, +static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, const unsigned long *src2, int nbits) { if (small_const_nbits(nbits)) - *dst = *src1 & ~(*src2); - else - __bitmap_andnot(dst, src1, src2, nbits); + return (*dst = *src1 & ~(*src2)) != 0; + return __bitmap_andnot(dst, src1, src2, nbits); } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index c5ac87ca7bc6..796df12091b7 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -43,10 +43,10 @@ * int cpu_isset(cpu, mask) true iff bit 'cpu' set in mask * int cpu_test_and_set(cpu, mask) test and set bit 'cpu' in mask * - * void cpus_and(dst, src1, src2) dst = src1 & src2 [intersection] + * int cpus_and(dst, src1, src2) dst = src1 & src2 [intersection] * void cpus_or(dst, src1, src2) dst = src1 | src2 [union] * void cpus_xor(dst, src1, src2) dst = src1 ^ src2 - * void cpus_andnot(dst, src1, src2) dst = src1 & ~src2 + * int cpus_andnot(dst, src1, src2) dst = src1 & ~src2 * void cpus_complement(dst, src) dst = ~src * * int cpus_equal(mask1, mask2) Does mask1 == mask2? @@ -179,10 +179,10 @@ static inline int __cpu_test_and_set(int cpu, cpumask_t *addr) } #define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) -static inline void __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, +static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p, int nbits) { - bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) @@ -201,10 +201,10 @@ static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, #define cpus_andnot(dst, src1, src2) \ __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) -static inline void __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, +static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, const cpumask_t *src2p, int nbits) { - bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); + return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS) @@ -738,11 +738,11 @@ static inline void cpumask_clear(struct cpumask *dstp) * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_and(struct cpumask *dstp, +static inline int cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { - bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), + return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } @@ -779,11 +779,11 @@ static inline void cpumask_xor(struct cpumask *dstp, * @src1p: the first input * @src2p: the second input */ -static inline void cpumask_andnot(struct cpumask *dstp, +static inline int cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { - bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), + return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } diff --git a/lib/bitmap.c b/lib/bitmap.c index 35a1f7ff4149..702565821c99 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -179,14 +179,16 @@ void __bitmap_shift_left(unsigned long *dst, } EXPORT_SYMBOL(__bitmap_shift_left); -void __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, +int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k; int nr = BITS_TO_LONGS(bits); + unsigned long result = 0; for (k = 0; k < nr; k++) - dst[k] = bitmap1[k] & bitmap2[k]; + result |= (dst[k] = bitmap1[k] & bitmap2[k]); + return result != 0; } EXPORT_SYMBOL(__bitmap_and); @@ -212,14 +214,16 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_xor); -void __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, +int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, const unsigned long *bitmap2, int bits) { int k; int nr = BITS_TO_LONGS(bits); + unsigned long result = 0; for (k = 0; k < nr; k++) - dst[k] = bitmap1[k] & ~bitmap2[k]; + result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); -- cgit v1.2.3 From 6777d773a463ac045d333b989d4e44660f8d92ad Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 21 Aug 2009 14:32:48 -0400 Subject: kernel_read: redefine offset type vfs_read() offset is defined as loff_t, but kernel_read() offset is only defined as unsigned long. Redefine kernel_read() offset as loff_t. Cc: stable@kernel.org Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/exec.c | 4 ++-- include/linux/fs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/exec.c b/fs/exec.c index 4a8849e45b21..fb4f3cdda78c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,8 +678,8 @@ exit: } EXPORT_SYMBOL(open_exec); -int kernel_read(struct file *file, unsigned long offset, - char *addr, unsigned long count) +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) { mm_segment_t old_fs; loff_t pos = offset; diff --git a/include/linux/fs.h b/include/linux/fs.h index 67888a9e0655..73e9b643e455 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2123,7 +2123,7 @@ extern struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode); extern int may_open(struct path *, int, int); -extern int kernel_read(struct file *, unsigned long, char *, unsigned long); +extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ -- cgit v1.2.3 From 353d5c30c666580347515da609dd74a2b8e9b828 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 24 Aug 2009 16:30:28 +0100 Subject: mm: fix hugetlb bug due to user_shm_unlock call 2.6.30's commit 8a0bdec194c21c8fdef840989d0d7b742bb5d4bc removed user_shm_lock() calls in hugetlb_file_setup() but left the user_shm_unlock call in shm_destroy(). In detail: Assume that can_do_hugetlb_shm() returns true and hence user_shm_lock() is not called in hugetlb_file_setup(). However, user_shm_unlock() is called in any case in shm_destroy() and in the following atomic_dec_and_lock(&up->__count) in free_uid() is executed and if up->__count gets zero, also cleanup_user_struct() is scheduled. Note that sched_destroy_user() is empty if CONFIG_USER_SCHED is not set. However, the ref counter up->__count gets unexpectedly non-positive and the corresponding structs are freed even though there are live references to them, resulting in a kernel oops after a lots of shmget(SHM_HUGETLB)/shmctl(IPC_RMID) cycles and CONFIG_USER_SCHED set. Hugh changed Stefan's suggested patch: can_do_hugetlb_shm() at the time of shm_destroy() may give a different answer from at the time of hugetlb_file_setup(). And fixed newseg()'s no_id error path, which has missed user_shm_unlock() ever since it came in 2.6.9. Reported-by: Stefan Huber Signed-off-by: Hugh Dickins Tested-by: Stefan Huber Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 20 ++++++++++++-------- include/linux/hugetlb.h | 6 ++++-- ipc/shm.c | 8 +++++--- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 941c8425c10b..cb88dac8ccaa 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -935,26 +935,28 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, + struct user_struct **user) { int error = -ENOMEM; - int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; - struct user_struct *user = current_user(); + *user = NULL; if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); if (!can_do_hugetlb_shm()) { - if (user_shm_lock(size, user)) { - unlock_shm = 1; + *user = current_user(); + if (user_shm_lock(size, *user)) { WARN_ONCE(1, "Using mlock ulimits for SHM_HUGETLB deprecated\n"); - } else + } else { + *user = NULL; return ERR_PTR(-EPERM); + } } root = hugetlbfs_vfsmount->mnt_root; @@ -996,8 +998,10 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - if (unlock_shm) - user_shm_unlock(size, user); + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } return ERR_PTR(error); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2723513a5651..5cbc620bdfe0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -10,6 +10,7 @@ #include struct ctl_table; +struct user_struct; int PageHuge(struct page *page); @@ -146,7 +147,8 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t, int); +struct file *hugetlb_file_setup(const char *name, size_t size, int acct, + struct user_struct **user); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); @@ -168,7 +170,7 @@ static inline void set_file_hugepages(struct file *file) #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() -#define hugetlb_file_setup(name,size,acctflag) ERR_PTR(-ENOSYS) +#define hugetlb_file_setup(name,size,acct,user) ERR_PTR(-ENOSYS) #endif /* !CONFIG_HUGETLBFS */ diff --git a/ipc/shm.c b/ipc/shm.c index 15dd238e5338..1bc4701ef4f0 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -174,7 +174,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) shmem_lock(shp->shm_file, 0, shp->mlock_user); - else + else if (shp->mlock_user) user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size, shp->mlock_user); fput (shp->shm_file); @@ -369,8 +369,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) /* hugetlb_file_setup applies strict accounting */ if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; - file = hugetlb_file_setup(name, size, acctflag); - shp->mlock_user = current_user(); + file = hugetlb_file_setup(name, size, acctflag, + &shp->mlock_user); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even @@ -410,6 +410,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) return error; no_id: + if (shp->mlock_user) /* shmflg & SHM_HUGETLB case */ + user_shm_unlock(size, shp->mlock_user); fput(file); no_file: security_shm_free(shp); -- cgit v1.2.3 From 8e7ee27095aee87b5db1b0061e2ceea5878a1bbd Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 26 Aug 2009 14:29:21 -0700 Subject: flex_array: declare parts member to have incomplete type The `parts' member of struct flex_array should evaluate to an incomplete type so that sizeof() cannot be used and C99 does not require the zero-length specification. Signed-off-by: David Rientjes Acked-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/flex_array.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h index 23c1ec79a31b..603160db7c98 100644 --- a/include/linux/flex_array.h +++ b/include/linux/flex_array.h @@ -21,7 +21,7 @@ struct flex_array { struct { int element_size; int total_nr_elements; - struct flex_array_part *parts[0]; + struct flex_array_part *parts[]; }; /* * This little trick makes sure that -- cgit v1.2.3 From b62e408c05228f40e69bb38a48db8961cac6cd23 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 26 Aug 2009 14:29:22 -0700 Subject: flex_array: convert element_nr formals to unsigned It's problematic to allow signed element_nr's or total's to be passed as part of the flex array API. flex_array_alloc() allows total_nr_elements to be set to a negative quantity, which is obviously erroneous. flex_array_get() and flex_array_put() allows negative array indices in dereferencing an array part, which could address memory mapped before struct flex_array. The fix is to convert all existing element_nr formals to be qualified as unsigned. Existing checks to compare it to total_nr_elements or the max array size based on element_size need not be changed. Signed-off-by: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/flex_array.h | 10 ++++++---- lib/flex_array.c | 24 +++++++++++++----------- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h index 603160db7c98..45ff18491514 100644 --- a/include/linux/flex_array.h +++ b/include/linux/flex_array.h @@ -36,12 +36,14 @@ struct flex_array { .total_nr_elements = (total), \ } } } -struct flex_array *flex_array_alloc(int element_size, int total, gfp_t flags); -int flex_array_prealloc(struct flex_array *fa, int start, int end, gfp_t flags); +struct flex_array *flex_array_alloc(int element_size, unsigned int total, + gfp_t flags); +int flex_array_prealloc(struct flex_array *fa, unsigned int start, + unsigned int end, gfp_t flags); void flex_array_free(struct flex_array *fa); void flex_array_free_parts(struct flex_array *fa); -int flex_array_put(struct flex_array *fa, int element_nr, void *src, +int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, gfp_t flags); -void *flex_array_get(struct flex_array *fa, int element_nr); +void *flex_array_get(struct flex_array *fa, unsigned int element_nr); #endif /* _FLEX_ARRAY_H */ diff --git a/lib/flex_array.c b/lib/flex_array.c index cf4e372cae90..7baed2fc3bc8 100644 --- a/lib/flex_array.c +++ b/lib/flex_array.c @@ -99,7 +99,8 @@ static inline int elements_fit_in_base(struct flex_array *fa) * capacity in the base structure. Also note that no effort is made * to efficiently pack objects across page boundaries. */ -struct flex_array *flex_array_alloc(int element_size, int total, gfp_t flags) +struct flex_array *flex_array_alloc(int element_size, unsigned int total, + gfp_t flags) { struct flex_array *ret; int max_size = nr_base_part_ptrs() * __elements_per_part(element_size); @@ -115,7 +116,8 @@ struct flex_array *flex_array_alloc(int element_size, int total, gfp_t flags) return ret; } -static int fa_element_to_part_nr(struct flex_array *fa, int element_nr) +static int fa_element_to_part_nr(struct flex_array *fa, + unsigned int element_nr) { return element_nr / __elements_per_part(fa->element_size); } @@ -143,14 +145,12 @@ void flex_array_free(struct flex_array *fa) kfree(fa); } -static int fa_index_inside_part(struct flex_array *fa, int element_nr) +static unsigned int index_inside_part(struct flex_array *fa, + unsigned int element_nr) { - return element_nr % __elements_per_part(fa->element_size); -} + unsigned int part_offset; -static int index_inside_part(struct flex_array *fa, int element_nr) -{ - int part_offset = fa_index_inside_part(fa, element_nr); + part_offset = element_nr % __elements_per_part(fa->element_size); return part_offset * fa->element_size; } @@ -185,7 +185,8 @@ __fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) * * Locking must be provided by the caller. */ -int flex_array_put(struct flex_array *fa, int element_nr, void *src, gfp_t flags) +int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src, + gfp_t flags) { int part_nr = fa_element_to_part_nr(fa, element_nr); struct flex_array_part *part; @@ -217,7 +218,8 @@ int flex_array_put(struct flex_array *fa, int element_nr, void *src, gfp_t flags * * Locking must be provided by the caller. */ -int flex_array_prealloc(struct flex_array *fa, int start, int end, gfp_t flags) +int flex_array_prealloc(struct flex_array *fa, unsigned int start, + unsigned int end, gfp_t flags) { int start_part; int end_part; @@ -248,7 +250,7 @@ int flex_array_prealloc(struct flex_array *fa, int start, int end, gfp_t flags) * * Locking must be provided by the caller. */ -void *flex_array_get(struct flex_array *fa, int element_nr) +void *flex_array_get(struct flex_array *fa, unsigned int element_nr) { int part_nr = fa_element_to_part_nr(fa, element_nr); struct flex_array_part *part; -- cgit v1.2.3 From 1a37f184fa7824982a5f434c06981ec46a66cef7 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Mon, 31 Aug 2009 13:48:16 +1000 Subject: lmb: Also remove __init from lmb_end_of_RAM() declaration in lmb.h My previous patch (commit 4f8ee2c9cc: "lmb: Remove __init from lmb_end_of_DRAM()") removed __init in lmb.c but missed the fact that it was also marked as such in the .h Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- include/linux/lmb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lmb.h b/include/linux/lmb.h index c46c89505dac..2442e3f3d033 100644 --- a/include/linux/lmb.h +++ b/include/linux/lmb.h @@ -51,7 +51,7 @@ extern u64 __init lmb_alloc_base(u64 size, extern u64 __init __lmb_alloc_base(u64 size, u64 align, u64 max_addr); extern u64 __init lmb_phys_mem_size(void); -extern u64 __init lmb_end_of_DRAM(void); +extern u64 lmb_end_of_DRAM(void); extern void __init lmb_enforce_memory_limit(u64 memory_limit); extern int __init lmb_is_reserved(u64 addr); extern int lmb_find(struct lmb_property *res); -- cgit v1.2.3 From 40bea431274c247425e7f5970d796ff7b37a2b22 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 4 Sep 2009 20:40:25 +0100 Subject: dm stripe: expose correct io hints Set sensible I/O hints for striped DM devices in the topology infrastructure added for 2.6.31 for userspace tools to obtain via sysfs. Add .io_hints to 'struct target_type' to allow the I/O hints portion (io_min and io_opt) of the 'struct queue_limits' to be set by each target and implement this for dm-stripe. Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-stripe.c | 13 ++++++++++++- drivers/md/dm-table.c | 4 ++++ include/linux/device-mapper.h | 4 ++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 4e0e5937e42a..3e563d251733 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -329,9 +329,19 @@ static int stripe_iterate_devices(struct dm_target *ti, return ret; } +static void stripe_io_hints(struct dm_target *ti, + struct queue_limits *limits) +{ + struct stripe_c *sc = ti->private; + unsigned chunk_size = (sc->chunk_mask + 1) << 9; + + blk_limits_io_min(limits, chunk_size); + limits->io_opt = chunk_size * sc->stripes; +} + static struct target_type stripe_target = { .name = "striped", - .version = {1, 2, 0}, + .version = {1, 3, 0}, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, @@ -339,6 +349,7 @@ static struct target_type stripe_target = { .end_io = stripe_end_io, .status = stripe_status, .iterate_devices = stripe_iterate_devices, + .io_hints = stripe_io_hints, }; int __init dm_stripe_init(void) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index c90e662d2802..1a6cb3c7822e 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1007,6 +1007,10 @@ int dm_calculate_queue_limits(struct dm_table *table, ti->type->iterate_devices(ti, dm_set_device_limits, &ti_limits); + /* Set I/O hints portion of queue limits */ + if (ti->type->io_hints) + ti->type->io_hints(ti, &ti_limits); + /* * Check each device area is consistent with the target's * overall queue limits. diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 655e7721580a..df7607e6dce8 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -91,6 +91,9 @@ typedef int (*dm_iterate_devices_fn) (struct dm_target *ti, iterate_devices_callout_fn fn, void *data); +typedef void (*dm_io_hints_fn) (struct dm_target *ti, + struct queue_limits *limits); + /* * Returns: * 0: The target can handle the next I/O immediately. @@ -151,6 +154,7 @@ struct target_type { dm_merge_fn merge; dm_busy_fn busy; dm_iterate_devices_fn iterate_devices; + dm_io_hints_fn io_hints; /* For internal device-mapper use. */ struct list_head list; -- cgit v1.2.3 From 7ec23d50949d5062b5b749638dd9380ed75e58e5 Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Fri, 4 Sep 2009 20:40:34 +0100 Subject: dm log: userspace add luid to distinguish between concurrent log instances Device-mapper userspace logs (like the clustered log) are identified by a universally unique identifier (UUID). This identifier is used to associate requests from the kernel to a specific log in userspace. The UUID must be unique everywhere, since multiple machines may use this identifier when communicating about a particular log, as is the case for cluster logs. Sometimes, device-mapper/LVM may re-use a UUID. This is the case during pvmoves, when moving from one segment of an LV to another, or when resizing a mirror, etc. In these cases, a new log is created with the same UUID and loaded in the "inactive" slot. When a device-mapper "resume" is issued, the "live" table is deactivated and the new "inactive" table becomes "live". (The "inactive" table can also be removed via a device-mapper 'clear' command.) The above two issues were colliding. More than one log was being created with the same UUID, and there was no way to distinguish between them. So, sometimes the wrong log would be swapped out during the exchange. The solution is to create a locally unique identifier, 'luid', to go along with the UUID. This new identifier is used to determine exactly which log is being referenced by the kernel when the log exchange is made. The identifier is not universally safe, but it does not need to be, since create/destroy/suspend/resume operations are bound to a specific machine; and these are the operations that make up the exchange. Signed-off-by: Jonathan Brassow Signed-off-by: Alasdair G Kergon --- drivers/md/dm-log-userspace-base.c | 23 ++++++++++++++--------- drivers/md/dm-log-userspace-transfer.c | 6 ++++-- drivers/md/dm-log-userspace-transfer.h | 2 +- include/linux/dm-log-userspace.h | 13 ++++++++++++- 4 files changed, 31 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c index c49da0a41c8e..6e186b1a062d 100644 --- a/drivers/md/dm-log-userspace-base.c +++ b/drivers/md/dm-log-userspace-base.c @@ -21,6 +21,7 @@ struct log_c { struct dm_target *ti; uint32_t region_size; region_t region_count; + uint64_t luid; char uuid[DM_UUID_LEN]; char *usr_argv_str; @@ -63,7 +64,7 @@ static int userspace_do_request(struct log_c *lc, const char *uuid, * restored. */ retry: - r = dm_consult_userspace(uuid, request_type, data, + r = dm_consult_userspace(uuid, lc->luid, request_type, data, data_size, rdata, rdata_size); if (r != -ESRCH) @@ -74,14 +75,15 @@ retry: set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(2*HZ); DMWARN("Attempting to contact userspace log server..."); - r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str, + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_CTR, + lc->usr_argv_str, strlen(lc->usr_argv_str) + 1, NULL, NULL); if (!r) break; } DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete"); - r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL, + r = dm_consult_userspace(uuid, lc->luid, DM_ULOG_RESUME, NULL, 0, NULL, NULL); if (!r) goto retry; @@ -153,6 +155,9 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, return -ENOMEM; } + /* The ptr value is sufficient for local unique id */ + lc->luid = (uint64_t)lc; + lc->ti = ti; if (strlen(argv[0]) > (DM_UUID_LEN - 1)) { @@ -172,7 +177,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, } /* Send table string */ - r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR, + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_CTR, ctr_str, str_size, NULL, NULL); if (r == -ESRCH) { @@ -182,7 +187,7 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti, /* Since the region size does not change, get it now */ rdata_size = sizeof(rdata); - r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE, + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_GET_REGION_SIZE, NULL, 0, (char *)&rdata, &rdata_size); if (r) { @@ -211,7 +216,7 @@ static void userspace_dtr(struct dm_dirty_log *log) int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR, + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_DTR, NULL, 0, NULL, NULL); @@ -226,7 +231,7 @@ static int userspace_presuspend(struct dm_dirty_log *log) int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND, + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_PRESUSPEND, NULL, 0, NULL, NULL); @@ -238,7 +243,7 @@ static int userspace_postsuspend(struct dm_dirty_log *log) int r; struct log_c *lc = log->context; - r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND, + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_POSTSUSPEND, NULL, 0, NULL, NULL); @@ -251,7 +256,7 @@ static int userspace_resume(struct dm_dirty_log *log) struct log_c *lc = log->context; lc->in_sync_hint = 0; - r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME, + r = dm_consult_userspace(lc->uuid, lc->luid, DM_ULOG_RESUME, NULL, 0, NULL, NULL); diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c index 8ce74d95ae4d..ba0edad2d048 100644 --- a/drivers/md/dm-log-userspace-transfer.c +++ b/drivers/md/dm-log-userspace-transfer.c @@ -147,7 +147,8 @@ static void cn_ulog_callback(void *data) /** * dm_consult_userspace - * @uuid: log's uuid (must be DM_UUID_LEN in size) + * @uuid: log's universal unique identifier (must be DM_UUID_LEN in size) + * @luid: log's local unique identifier * @request_type: found in include/linux/dm-log-userspace.h * @data: data to tx to the server * @data_size: size of data in bytes @@ -163,7 +164,7 @@ static void cn_ulog_callback(void *data) * * Returns: 0 on success, -EXXX on failure **/ -int dm_consult_userspace(const char *uuid, int request_type, +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, char *data, size_t data_size, char *rdata, size_t *rdata_size) { @@ -190,6 +191,7 @@ resend: memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size); memcpy(tfr->uuid, uuid, DM_UUID_LEN); + tfr->luid = luid; tfr->seq = dm_ulog_seq++; /* diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h index c26d8e4e2710..04ee874f9153 100644 --- a/drivers/md/dm-log-userspace-transfer.h +++ b/drivers/md/dm-log-userspace-transfer.h @@ -11,7 +11,7 @@ int dm_ulog_tfr_init(void); void dm_ulog_tfr_exit(void); -int dm_consult_userspace(const char *uuid, int request_type, +int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type, char *data, size_t data_size, char *rdata, size_t *rdata_size); diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h index 642e3017b51f..8a1f972c0fe9 100644 --- a/include/linux/dm-log-userspace.h +++ b/include/linux/dm-log-userspace.h @@ -371,7 +371,18 @@ (DM_ULOG_REQUEST_MASK & (request_type)) struct dm_ulog_request { - char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */ + /* + * The local unique identifier (luid) and the universally unique + * identifier (uuid) are used to tie a request to a specific + * mirror log. A single machine log could probably make due with + * just the 'luid', but a cluster-aware log must use the 'uuid' and + * the 'luid'. The uuid is what is required for node to node + * communication concerning a particular log, but the 'luid' helps + * differentiate between logs that are being swapped and have the + * same 'uuid'. (Think "live" and "inactive" device-mapper tables.) + */ + uint64_t luid; + char uuid[DM_UUID_LEN]; char padding[7]; /* Padding because DM_UUID_LEN = 129 */ int32_t error; /* Used to report back processing errors */ -- cgit v1.2.3 From 4e49627b9bc29a14b393c480e8c979e3bc922ef7 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Sep 2009 11:17:06 -0700 Subject: workqueues: introduce __cancel_delayed_work() cancel_delayed_work() has to use del_timer_sync() to guarantee the timer function is not running after return. But most users doesn't actually need this, and del_timer_sync() has problems: it is not useable from interrupt, and it depends on every lock which could be taken from irq. Introduce __cancel_delayed_work() which calls del_timer() instead. The immediate reason for this patch is http://bugzilla.kernel.org/show_bug.cgi?id=13757 but hopefully this helper makes sense anyway. As for 13757 bug, actually we need requeue_delayed_work(), but its semantics are not yet clear. Merge this patch early to resolves cross-tree interdependencies between input and infiniband. Signed-off-by: Oleg Nesterov Cc: Dmitry Torokhov Cc: Roland Dreier Cc: Stefan Richter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 13e1adf55c4c..6273fa97b527 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -240,6 +240,21 @@ static inline int cancel_delayed_work(struct delayed_work *work) return ret; } +/* + * Like above, but uses del_timer() instead of del_timer_sync(). This means, + * if it returns 0 the timer function may be running and the queueing is in + * progress. + */ +static inline int __cancel_delayed_work(struct delayed_work *work) +{ + int ret; + + ret = del_timer(&work->timer); + if (ret) + work_clear_pending(&work->work); + return ret; +} + extern int cancel_delayed_work_sync(struct delayed_work *work); /* Obsolete. use cancel_delayed_work_sync() */ -- cgit v1.2.3 From a2a8474c3fff88d8dd52d05cb450563fb26fd26c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 5 Sep 2009 11:17:13 -0700 Subject: exec: do not sleep in TASK_TRACED under ->cred_guard_mutex Tom Horsley reports that his debugger hangs when it tries to read /proc/pid_of_tracee/maps, this happens since "mm_for_maps: take ->cred_guard_mutex to fix the race with exec" 04b836cbf19e885f8366bccb2e4b0474346c02d commit in 2.6.31. But the root of the problem lies in the fact that do_execve() path calls tracehook_report_exec() which can stop if the tracer sets PT_TRACE_EXEC. The tracee must not sleep in TASK_TRACED holding this mutex. Even if we remove ->cred_guard_mutex from mm_for_maps() and proc_pid_attr_write(), another task doing PTRACE_ATTACH should not hang until it is killed or the tracee resumes. With this patch do_execve() does not use ->cred_guard_mutex directly and we do not hold it throughout, instead: - introduce prepare_bprm_creds() helper, it locks the mutex and calls prepare_exec_creds() to initialize bprm->cred. - install_exec_creds() drops the mutex after commit_creds(), and thus before tracehook_report_exec()->ptrace_stop(). or, if exec fails, free_bprm() drops this mutex when bprm->cred != NULL which indicates install_exec_creds() was not called. Reported-by: Tom Horsley Signed-off-by: Oleg Nesterov Acked-by: David Howells Cc: Roland McGrath Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat.c | 17 ++++--------- fs/exec.c | 63 +++++++++++++++++++++++++++++-------------------- include/linux/binfmts.h | 1 + 3 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/fs/compat.c b/fs/compat.c index 94502dab972a..6d6f98fe64a0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1567,10 +1561,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/exec.c b/fs/exec.c index fb4f3cdda78c..172ceb6edde4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1015,6 +1015,35 @@ out: EXPORT_SYMBOL(flush_old_exec); +/* + * Prepare credentials and lock ->cred_guard_mutex. + * install_exec_creds() commits the new creds and drops the lock. + * Or, if exec fails before, free_bprm() should release ->cred and + * and unlock. + */ +int prepare_bprm_creds(struct linux_binprm *bprm) +{ + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + return -ERESTARTNOINTR; + + bprm->cred = prepare_exec_creds(); + if (likely(bprm->cred)) + return 0; + + mutex_unlock(¤t->cred_guard_mutex); + return -ENOMEM; +} + +void free_bprm(struct linux_binprm *bprm) +{ + free_arg_pages(bprm); + if (bprm->cred) { + mutex_unlock(¤t->cred_guard_mutex); + abort_creds(bprm->cred); + } + kfree(bprm); +} + /* * install the new credentials for this executable */ @@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - - /* cred_guard_mutex must be held at least to this point to prevent + /* + * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's - * credentials; any time after this it may be unlocked */ - + * credentials; any time after this it may be unlocked. + */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) EXPORT_SYMBOL(search_binary_handler); -void free_bprm(struct linux_binprm *bprm) -{ - free_arg_pages(bprm); - if (bprm->cred) - abort_creds(bprm->cred); - kfree(bprm); -} - /* * sys_execve() executes a new program. */ @@ -1277,20 +1299,15 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1340,7 +1357,6 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1360,10 +1376,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 61ee18c1bdb4..2046b5b8af48 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -117,6 +117,7 @@ extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack); extern int bprm_mm_init(struct linux_binprm *bprm); extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); +extern int prepare_bprm_creds(struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void do_coredump(long signr, int exit_code, struct pt_regs *regs); extern int set_binfmt(struct linux_binfmt *new); -- cgit v1.2.3