From c757249af152c59fd74b85e52e8c090acb33d9c0 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:40 -0700 Subject: [PATCH] per-task-delay-accounting: taskstats interface Create a "taskstats" interface based on generic netlink (NETLINK_GENERIC family), for getting statistics of tasks and thread groups during their lifetime and when they exit. The interface is intended for use by multiple accounting packages though it is being created in the context of delay accounting. This patch creates the interface without populating the fields of the data that is sent to the user in response to a command or upon the exit of a task. Each accounting package interested in using taskstats has to provide an additional patch to add its stats to the common structure. [akpm@osdl.org: cleanups, Kconfig fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/taskstats.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 include/linux/taskstats.h (limited to 'include/linux/taskstats.h') diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h new file mode 100644 index 000000000000..51f62759bea9 --- /dev/null +++ b/include/linux/taskstats.h @@ -0,0 +1,84 @@ +/* taskstats.h - exporting per-task statistics + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef _LINUX_TASKSTATS_H +#define _LINUX_TASKSTATS_H + +/* Format for per-task data returned to userland when + * - a task exits + * - listener requests stats for a task + * + * The struct is versioned. Newer versions should only add fields to + * the bottom of the struct to maintain backward compatibility. + * + * + * To add new fields + * a) bump up TASKSTATS_VERSION + * b) add comment indicating new version number at end of struct + * c) add new fields after version comment; maintain 64-bit alignment + */ + +#define TASKSTATS_VERSION 1 + +struct taskstats { + + /* Version 1 */ + __u64 version; +}; + + +#define TASKSTATS_LISTEN_GROUP 0x1 + +/* + * Commands sent from userspace + * Not versioned. 
New commands should only be inserted at the enum's end + * prior to __TASKSTATS_CMD_MAX + */ + +enum { + TASKSTATS_CMD_UNSPEC = 0, /* Reserved */ + TASKSTATS_CMD_GET, /* user->kernel request/get-response */ + TASKSTATS_CMD_NEW, /* kernel->user event */ + __TASKSTATS_CMD_MAX, +}; + +#define TASKSTATS_CMD_MAX (__TASKSTATS_CMD_MAX - 1) + +enum { + TASKSTATS_TYPE_UNSPEC = 0, /* Reserved */ + TASKSTATS_TYPE_PID, /* Process id */ + TASKSTATS_TYPE_TGID, /* Thread group id */ + TASKSTATS_TYPE_STATS, /* taskstats structure */ + TASKSTATS_TYPE_AGGR_PID, /* contains pid + stats */ + TASKSTATS_TYPE_AGGR_TGID, /* contains tgid + stats */ + __TASKSTATS_TYPE_MAX, +}; + +#define TASKSTATS_TYPE_MAX (__TASKSTATS_TYPE_MAX - 1) + +enum { + TASKSTATS_CMD_ATTR_UNSPEC = 0, + TASKSTATS_CMD_ATTR_PID, + TASKSTATS_CMD_ATTR_TGID, + __TASKSTATS_CMD_ATTR_MAX, +}; + +#define TASKSTATS_CMD_ATTR_MAX (__TASKSTATS_CMD_ATTR_MAX - 1) + +/* NETLINK_GENERIC related info */ + +#define TASKSTATS_GENL_NAME "TASKSTATS" +#define TASKSTATS_GENL_VERSION 0x1 + +#endif /* _LINUX_TASKSTATS_H */ -- cgit v1.2.3 From 6f44993fe1d7b2b097f6ac60cd5835c6f5ca0874 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:41 -0700 Subject: [PATCH] per-task-delay-accounting: delay accounting usage of taskstats interface Usage of taskstats interface by delay accounting. Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Cc: Jes Sorensen Cc: Peter Chubb Cc: Erich Focht Cc: Levent Serinol Cc: Jay Lan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 15 ++++++++++ include/linux/sched.h | 1 + include/linux/taskstats.h | 55 ++++++++++++++++++++++++++++++++++++- include/linux/taskstats_kern.h | 1 + init/Kconfig | 1 + kernel/delayacct.c | 62 +++++++++++++++++++++++++++++++++++++++++- kernel/taskstats.c | 16 +++++++---- 7 files changed, 144 insertions(+), 7 deletions(-) (limited to 'include/linux/taskstats.h') diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 0ecbf9aad8e1..d955078a1441 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -18,6 +18,7 @@ #define _LINUX_DELAYACCT_H #include +#include /* * Per-task flags relevant to delay accounting @@ -35,6 +36,7 @@ extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(void); +extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); static inline void delayacct_set_flag(int flag) { @@ -74,6 +76,16 @@ static inline void delayacct_blkio_end(void) __delayacct_blkio_end(); } +static inline int delayacct_add_tsk(struct taskstats *d, + struct task_struct *tsk) +{ + if (likely(!delayacct_on)) + return -EINVAL; + if (!tsk->delays) + return 0; + return __delayacct_add_tsk(d, tsk); +} + #else static inline void delayacct_set_flag(int flag) {} @@ -89,6 +101,9 @@ static inline void delayacct_blkio_start(void) {} static inline void delayacct_blkio_end(void) {} +static inline int delayacct_add_tsk(struct taskstats *d, + struct task_struct *tsk) +{ return 0; } #endif /* CONFIG_TASK_DELAY_ACCT */ #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index f751062d89a2..3c5610ca0c92 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -990,6 +990,7 @@ struct task_struct { */ struct pipe_inode_info *splice_pipe; #ifdef CONFIG_TASK_DELAY_ACCT + spinlock_t delays_lock; struct task_delay_info *delays; #endif }; diff --git a/include/linux/taskstats.h 
b/include/linux/taskstats.h index 51f62759bea9..c6aeca32348e 100644 --- a/include/linux/taskstats.h +++ b/include/linux/taskstats.h @@ -34,7 +34,60 @@ struct taskstats { /* Version 1 */ - __u64 version; + __u16 version; + __u16 padding[3]; /* Userspace should not interpret the padding + * field which can be replaced by useful + * fields if struct taskstats is extended. + */ + + /* Delay accounting fields start + * + * All values, until comment "Delay accounting fields end" are + * available only if delay accounting is enabled, even though the last + * few fields are not delays + * + * xxx_count is the number of delay values recorded + * xxx_delay_total is the corresponding cumulative delay in nanoseconds + * + * xxx_delay_total wraps around to zero on overflow + * xxx_count incremented regardless of overflow + */ + + /* Delay waiting for cpu, while runnable + * count, delay_total NOT updated atomically + */ + __u64 cpu_count; + __u64 cpu_delay_total; + + /* Following four fields atomically updated using task->delays->lock */ + + /* Delay waiting for synchronous block I/O to complete + * does not account for delays in I/O submission + */ + __u64 blkio_count; + __u64 blkio_delay_total; + + /* Delay waiting for page fault I/O (swap in only) */ + __u64 swapin_count; + __u64 swapin_delay_total; + + /* cpu "wall-clock" running time + * On some architectures, value will adjust for cpu time stolen + * from the kernel in involuntary waits due to virtualization. + * Value is cumulative, in nanoseconds, without a corresponding count + * and wraps around to zero silently on overflow + */ + __u64 cpu_run_real_total; + + /* cpu "virtual" running time + * Uses time intervals seen by the kernel i.e. no adjustment + * for kernel's involuntary waits due to virtualization. 
+ * Value is cumulative, in nanoseconds, without a corresponding count + * and wraps around to zero silently on overflow + */ + __u64 cpu_run_virtual_total; + /* Delay accounting fields end */ + /* version 1 ends here */ }; diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index bd0ecb969c26..fc9da2e26443 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -17,6 +17,7 @@ enum { #ifdef CONFIG_TASKSTATS extern kmem_cache_t *taskstats_cache; +extern struct mutex taskstats_exit_mutex; static inline void taskstats_exit_alloc(struct taskstats **ptidstats, struct taskstats **ptgidstats) diff --git a/init/Kconfig b/init/Kconfig index 56a7093b4e4c..a099fc6526d9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -173,6 +173,7 @@ config TASKSTATS config TASK_DELAY_ACCT bool "Enable per-task delay accounting (EXPERIMENTAL)" + depends on TASKSTATS help Collect information on time spent by a task waiting for system resources like cpu, synchronous block I/O completion and swapping diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 3546b0800f9f..1be274a462ca 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -41,6 +41,10 @@ void delayacct_init(void) void __delayacct_tsk_init(struct task_struct *tsk) { + spin_lock_init(&tsk->delays_lock); + /* No need to acquire tsk->delays_lock for allocation here unless + __delayacct_tsk_init called after tsk is attached to tasklist + */ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); if (tsk->delays) spin_lock_init(&tsk->delays->lock); @@ -48,8 +52,11 @@ void __delayacct_tsk_init(struct task_struct *tsk) void __delayacct_tsk_exit(struct task_struct *tsk) { - kmem_cache_free(delayacct_cache, tsk->delays); + struct task_delay_info *delays = tsk->delays; + spin_lock(&tsk->delays_lock); tsk->delays = NULL; + spin_unlock(&tsk->delays_lock); + kmem_cache_free(delayacct_cache, delays); } /* @@ -104,3 +111,56 @@ void __delayacct_blkio_end(void) &current->delays->blkio_delay, &current->delays->blkio_count); } + +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + s64 tmp; + struct timespec ts; + unsigned long t1,t2,t3; + + spin_lock(&tsk->delays_lock); + + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ + t1 = tsk->sched_info.pcnt; + t2 = tsk->sched_info.run_delay; + t3 = tsk->sched_info.cpu_time; + + d->cpu_count += t1; + + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; + + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + d->cpu_run_virtual_total = + (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ + + spin_lock(&tsk->delays->lock); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 
0 : tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + spin_unlock(&tsk->delays->lock); + +done: + spin_unlock(&tsk->delays_lock); + return 0; +} diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 82ec9137d908..ea9506de3b85 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -18,13 +18,13 @@ #include #include +#include #include #include static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; -static DEFINE_MUTEX(taskstats_exit_mutex); static struct genl_family family = { .id = GENL_ID_GENERATE, @@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk, * goto err; */ -err: + rc = delayacct_add_tsk(stats, tsk); + stats->version = TASKSTATS_VERSION; + + /* Define err: label here if needed */ put_task_struct(tsk); return rc; @@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, * break; */ + rc = delayacct_add_tsk(stats, tsk); + if (rc) + break; + } while_each_thread(first, tsk); read_unlock(&tasklist_lock); + stats->version = TASKSTATS_VERSION; + /* * Accounting subsytems can also add calls here if they don't @@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, if (!family_registered || !tidstats) return; - mutex_lock(&taskstats_exit_mutex); - is_thread_group = !thread_group_empty(tsk); rc = 0; @@ -292,7 +299,6 @@ nla_put_failure: err_skb: nlmsg_free(rep_skb); ret: - mutex_unlock(&taskstats_exit_mutex); return; } -- cgit v1.2.3 From f9fd8914c1acca0d98b69d831b128d5b52f03c51 Mon Sep 17 00:00:00 2001 From: Shailabh Nagar Date: Fri, 14 Jul 2006 00:24:47 -0700 Subject: [PATCH] per-task delay accounting taskstats interface: control exit data through cpumasks On systems with a large number of cpus, with even a modest rate of tasks exiting per cpu, the volume of taskstats data sent on thread exit can overflow a userspace listener's buffers. One approach to avoiding overflow is to allow listeners to get data for a limited and specific set of cpus. By scaling the number of listeners and/or the cpus they monitor, userspace can handle the statistical data overload more gracefully. In this patch, each listener registers to listen to a specific set of cpus by specifying a cpumask. The interest is recorded per-cpu. When a task exits on a cpu, its taskstats data is unicast to each listener interested in that cpu. Thanks to Andrew Morton for pointing out the various scalability and general concerns of previous attempts and for suggesting this design. [akpm@osdl.org: build fix] Signed-off-by: Shailabh Nagar Signed-off-by: Balbir Singh Signed-off-by: Chandra Seetharaman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/taskstats.h | 4 +- include/linux/taskstats_kern.h | 27 +----- kernel/exit.c | 5 +- kernel/taskstats.c | 200 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 198 insertions(+), 38 deletions(-) (limited to 'include/linux/taskstats.h') diff --git a/include/linux/taskstats.h b/include/linux/taskstats.h index c6aeca32348e..f1cb6cddd19d 100644 --- a/include/linux/taskstats.h +++ b/include/linux/taskstats.h @@ -91,8 +91,6 @@ struct taskstats { }; -#define TASKSTATS_LISTEN_GROUP 0x1 - /* * Commands sent from userspace * Not versioned. 
New commands should only be inserted at the enum's end @@ -124,6 +122,8 @@ enum { TASKSTATS_CMD_ATTR_UNSPEC = 0, TASKSTATS_CMD_ATTR_PID, TASKSTATS_CMD_ATTR_TGID, + TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, + TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK, __TASKSTATS_CMD_ATTR_MAX, }; diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index 2b6adec3a2e4..16894b7edcc8 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -11,30 +11,10 @@ #include #include -enum { - TASKSTATS_MSG_UNICAST, /* send data only to requester */ - TASKSTATS_MSG_MULTICAST, /* send data to a group */ -}; - #ifdef CONFIG_TASKSTATS extern kmem_cache_t *taskstats_cache; extern struct mutex taskstats_exit_mutex; -static inline int taskstats_has_listeners(void) -{ - if (!genl_sock) - return 0; - return netlink_has_listeners(genl_sock, TASKSTATS_LISTEN_GROUP); -} - - -static inline void taskstats_exit_alloc(struct taskstats **ptidstats) -{ - *ptidstats = NULL; - if (taskstats_has_listeners()) - *ptidstats = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); -} - static inline void taskstats_exit_free(struct taskstats *tidstats) { if (tidstats) @@ -82,17 +62,18 @@ static inline void taskstats_tgid_free(struct signal_struct *sig) kmem_cache_free(taskstats_cache, stats); } -extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int); +extern void taskstats_exit_alloc(struct taskstats **, unsigned int *); +extern void taskstats_exit_send(struct task_struct *, struct taskstats *, int, unsigned int); extern void taskstats_init_early(void); extern void taskstats_tgid_alloc(struct signal_struct *); #else -static inline void taskstats_exit_alloc(struct taskstats **ptidstats) +static inline void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) {} static inline void taskstats_exit_free(struct taskstats *ptidstats) {} static inline void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int cpu) {} static inline void taskstats_tgid_init(struct signal_struct *sig) {} diff --git a/kernel/exit.c b/kernel/exit.c index 67c1e9a4f812..dba194a8d416 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -847,6 +847,7 @@ fastcall NORET_TYPE void do_exit(long code) struct task_struct *tsk = current; struct taskstats *tidstats; int group_dead; + unsigned int mycpu; profile_task_exit(tsk); @@ -884,7 +885,7 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); - taskstats_exit_alloc(&tidstats); + taskstats_exit_alloc(&tidstats, &mycpu); acct_update_integrals(tsk); if (tsk->mm) { @@ -905,7 +906,7 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); - taskstats_exit_send(tsk, tidstats, group_dead); + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); taskstats_exit_free(tidstats); delayacct_tsk_exit(tsk); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4a0a5022b299..abb59e323544 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -19,9 +19,17 @@ #include #include #include +#include +#include #include #include +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; static int family_registered; kmem_cache_t *taskstats_cache; @@ -37,8 +45,25 @@ static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] 
__read_mostly = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; }; +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, void **replyp, size_t size) @@ -74,25 +99,68 @@ static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, return 0; } -static int send_reply(struct sk_buff *skb, pid_t pid, int event) +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, pid_t pid) { struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); - void *reply; + void *reply = genlmsg_data(genlhdr); int rc; - reply = genlmsg_data(genlhdr); - rc = genlmsg_end(skb, reply); if (rc < 0) { nlmsg_free(skb); return rc; } - if (event == TASKSTATS_MSG_MULTICAST) - return genlmsg_multicast(skb, pid, TASKSTATS_LISTEN_GROUP); return genlmsg_unicast(skb, pid); } +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, ret; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + rc = 0; + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) { + nlmsg_free(skb_cur); + rc = -ENOMEM; + break; + } + } + ret = genlmsg_unicast(skb_cur, s->pid); + if (ret == -ECONNREFUSED) { + list_del(&s->list); + kfree(s); + rc = ret; + } + skb_cur = skb_next; + } + up_write(&listeners->sem); + + return rc; +} + static int fill_pid(pid_t pid, struct task_struct *pidtsk, struct taskstats *stats) { @@ -204,8 +272,73 @@ ret: return; } +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp; + unsigned int cpu; + cpumask_t mask = *maskp; -static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int parse(struct nlattr *na, cpumask_t *mask) +{ + char *data; 
+ int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, *mask); + kfree(data); + return ret; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) { int rc = 0; struct sk_buff *rep_skb; @@ -213,6 +346,19 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) void *reply; size_t size; struct nlattr *na; + cpumask_t mask; + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, REGISTER); + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, DEREGISTER); /* * Size includes space for nested attributes @@ -252,7 +398,7 @@ static int taskstats_send_stats(struct sk_buff *skb, struct genl_info *info) nla_nest_end(rep_skb, na); - return send_reply(rep_skb, info->snd_pid, TASKSTATS_MSG_UNICAST); + return send_reply(rep_skb, info->snd_pid); nla_put_failure: return genlmsg_cancel(rep_skb, reply); @@ -261,9 +407,35 @@ err: return rc; } +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + /* Send pid data out on exit */ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, - int group_dead) + int group_dead, unsigned int mycpu) { int rc; struct sk_buff *rep_skb; @@ -324,7 +496,7 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, nla_nest_end(rep_skb, na); send: - send_reply(rep_skb, 0, TASKSTATS_MSG_MULTICAST); + send_cpu_listeners(rep_skb, mycpu); return; nla_put_failure: @@ -338,16 +510,22 @@ ret: static struct genl_ops taskstats_ops = { .cmd = TASKSTATS_CMD_GET, - .doit = taskstats_send_stats, + .doit = taskstats_user_cmd, .policy = taskstats_cmd_get_policy, }; /* Needed early in initialization */ void __init taskstats_init_early(void) { + unsigned int i; + taskstats_cache = kmem_cache_create("taskstats_cache", sizeof(struct taskstats), 0, SLAB_PANIC, NULL, NULL); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } } static int __init taskstats_init(void) -- cgit v1.2.3
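
The patches above define only the kernel side of the interface. For illustration, here is a minimal, untested userspace sketch (not part of this patch series; the file name taskstats_client.c, the raw-netlink message construction, the "0" cpulist, and the simplified reply parsing are all assumptions of this example, not dictated by the patches). It resolves the dynamically assigned TASKSTATS family id over NETLINK_GENERIC, requests statistics for one pid via TASKSTATS_CMD_ATTR_PID, and registers for per-cpu exit data via TASKSTATS_CMD_ATTR_REGISTER_CPUMASK as added by the last patch. Error handling is heavily abbreviated.

/* taskstats_client.c - illustrative sketch only; constants come from the
 * linux/taskstats.h and linux/genetlink.h headers referenced above. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

#define MAX_MSG 1024

struct msgtemplate {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[MAX_MSG];
};

/* Build a genetlink request carrying a single attribute and send it to the kernel. */
static int send_cmd(int sd, __u16 nlmsg_type, __u8 genl_cmd,
		    __u16 nla_type, const void *nla_data, __u16 nla_len)
{
	struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
	struct msgtemplate msg;
	struct nlattr *na;

	memset(&msg, 0, sizeof(msg));
	msg.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	msg.n.nlmsg_type = nlmsg_type;
	msg.n.nlmsg_flags = NLM_F_REQUEST;
	msg.n.nlmsg_pid = getpid();
	msg.g.cmd = genl_cmd;
	msg.g.version = 0x1;	/* matches TASKSTATS_GENL_VERSION and the ctrl family */

	na = (struct nlattr *)((char *)&msg + NLMSG_ALIGN(msg.n.nlmsg_len));
	na->nla_type = nla_type;
	na->nla_len = nla_len + NLA_HDRLEN;
	memcpy((char *)na + NLA_HDRLEN, nla_data, nla_len);
	msg.n.nlmsg_len += NLMSG_ALIGN(na->nla_len);

	return sendto(sd, &msg, msg.n.nlmsg_len, 0,
		      (struct sockaddr *)&dst, sizeof(dst));
}

int main(int argc, char **argv)
{
	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
	struct msgtemplate ans;
	struct nlattr *na;
	__u16 family_id = 0;
	__u32 pid = (argc > 1) ? atoi(argv[1]) : getpid();
	int sd, len;

	sd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (sd < 0 || bind(sd, (struct sockaddr *)&local, sizeof(local)) < 0)
		return 1;

	/* 1. Resolve the dynamically assigned id of the "TASKSTATS" family. */
	send_cmd(sd, GENL_ID_CTRL, CTRL_CMD_GETFAMILY, CTRL_ATTR_FAMILY_NAME,
		 TASKSTATS_GENL_NAME, strlen(TASKSTATS_GENL_NAME) + 1);
	len = recv(sd, &ans, sizeof(ans), 0);
	if (len < 0 || ans.n.nlmsg_type == NLMSG_ERROR)
		return 1;
	/* Assumes the usual reply layout: the family name attribute first,
	 * then CTRL_ATTR_FAMILY_ID. */
	na = (struct nlattr *)((char *)NLMSG_DATA(&ans.n) + GENL_HDRLEN);
	na = (struct nlattr *)((char *)na + NLA_ALIGN(na->nla_len));
	if (na->nla_type == CTRL_ATTR_FAMILY_ID)
		family_id = *(__u16 *)((char *)na + NLA_HDRLEN);

	/* 2. Request a one-shot taskstats reply for a single pid. */
	send_cmd(sd, family_id, TASKSTATS_CMD_GET, TASKSTATS_CMD_ATTR_PID,
		 &pid, sizeof(pid));
	len = recv(sd, &ans, sizeof(ans), 0);
	printf("got %d bytes; reply nests pid + struct taskstats under "
	       "TASKSTATS_TYPE_AGGR_PID\n", len);

	/* 3. Register for exit data of tasks running on cpu 0 (cpulist string);
	 *    the kernel then unicasts TASKSTATS_CMD_NEW messages to this socket. */
	send_cmd(sd, family_id, TASKSTATS_CMD_GET,
		 TASKSTATS_CMD_ATTR_REGISTER_CPUMASK, "0", 2);

	close(sd);
	return 0;
}

A complete client would walk the nested TASKSTATS_TYPE_AGGR_PID attribute in each reply to extract the TASKSTATS_TYPE_PID and TASKSTATS_TYPE_STATS payloads, and would keep receiving on the socket after registering a cpumask in order to collect exit records.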