diff options
Diffstat (limited to 'kernel')
53 files changed, 3997 insertions, 900 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 58908f9d156a..752bd7d383af 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,6 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o +obj-y += time/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_FUTEX) += futex.o ifeq ($(CONFIG_COMPAT),y) @@ -20,8 +21,8 @@ obj-$(CONFIG_SMP) += cpu.o spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_STACK_UNWIND) += unwind.o obj-$(CONFIG_PM) += power/ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o diff --git a/kernel/acct.c b/kernel/acct.c index b327f4d20104..368c4f03fe0e 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(long, struct file *); +static void do_acct_process(struct file *); /* * This structure is used so that all the data protected by lock @@ -118,7 +118,7 @@ static int check_free_space(struct file *file) spin_unlock(&acct_globals.lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file) if (old_acct) { mnt_unpin(old_acct->f_vfsmnt); spin_unlock(&acct_globals.lock); - do_acct_process(0, old_acct); + do_acct_process(old_acct); filp_close(old_acct, NULL); spin_lock(&acct_globals.lock); } @@ -419,16 +419,15 @@ static u32 encode_float(u64 value) /* * do_acct_process does all actual work. Caller holds the reference to file. */ -static void do_acct_process(long exitcode, struct file *file) +static void do_acct_process(struct file *file) { + struct pacct_struct *pacct = ¤t->signal->pacct; acct_t ac; mm_segment_t fs; - unsigned long vsize; unsigned long flim; u64 elapsed; u64 run_time; struct timespec uptime; - unsigned long jiffies; /* * First check to see if there is enough free_space to continue @@ -469,12 +468,6 @@ static void do_acct_process(long exitcode, struct file *file) #endif do_div(elapsed, AHZ); ac.ac_btime = xtime.tv_sec - elapsed; - jiffies = cputime_to_jiffies(cputime_add(current->utime, - current->signal->utime)); - ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies)); - jiffies = cputime_to_jiffies(cputime_add(current->stime, - current->signal->stime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies)); /* we really need to bite the bullet and change layout */ ac.ac_uid = current->uid; ac.ac_gid = current->gid; @@ -496,37 +489,18 @@ static void do_acct_process(long exitcode, struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - ac.ac_flag = 0; - if (current->flags & PF_FORKNOEXEC) - ac.ac_flag |= AFORK; - if (current->flags & PF_SUPERPRIV) - ac.ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - ac.ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - ac.ac_flag |= AXSIG; - - vsize = 0; - if (current->mm) { - struct vm_area_struct *vma; - down_read(¤t->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(¤t->mm->mmap_sem); - } - vsize = vsize / 1024; - ac.ac_mem = encode_comp_t(vsize); + spin_lock(¤t->sighand->siglock); + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac.ac_flag = pacct->ac_flag; + ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_minflt = encode_comp_t(pacct->ac_minflt); + ac.ac_majflt = encode_comp_t(pacct->ac_majflt); + ac.ac_exitcode = pacct->ac_exitcode; + spin_unlock(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_minflt = encode_comp_t(current->signal->min_flt + - current->min_flt); - ac.ac_majflt = encode_comp_t(current->signal->maj_flt + - current->maj_flt); ac.ac_swaps = encode_comp_t(0); - ac.ac_exitcode = exitcode; /* * Kernel segment override to datasegment and write it @@ -546,12 +520,63 @@ static void do_acct_process(long exitcode, struct file *file) } /** + * acct_init_pacct - initialize a new pacct_struct + */ +void acct_init_pacct(struct pacct_struct *pacct) +{ + memset(pacct, 0, sizeof(struct pacct_struct)); + pacct->ac_utime = pacct->ac_stime = cputime_zero; +} + +/** + * acct_collect - collect accounting information into pacct_struct + * @exitcode: task exit code + * @group_dead: not 0, if this thread is the last one in the process. + */ +void acct_collect(long exitcode, int group_dead) +{ + struct pacct_struct *pacct = ¤t->signal->pacct; + unsigned long vsize = 0; + + if (group_dead && current->mm) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(¤t->mm->mmap_sem); + } + + spin_lock_irq(¤t->sighand->siglock); + if (group_dead) + pacct->ac_mem = vsize / 1024; + if (thread_group_leader(current)) { + pacct->ac_exitcode = exitcode; + if (current->flags & PF_FORKNOEXEC) + pacct->ac_flag |= AFORK; + } + if (current->flags & PF_SUPERPRIV) + pacct->ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + pacct->ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + pacct->ac_flag |= AXSIG; + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_minflt += current->min_flt; + pacct->ac_majflt += current->maj_flt; + spin_unlock_irq(¤t->sighand->siglock); +} + +/** * acct_process - now just a wrapper around do_acct_process * @exitcode: task exit code * * handles process accounting for an exiting task */ -void acct_process(long exitcode) +void acct_process() { struct file *file = NULL; @@ -570,7 +595,7 @@ void acct_process(long exitcode) get_file(file); spin_unlock(&acct_globals.lock); - do_acct_process(exitcode, file); + do_acct_process(file); fput(file); } @@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk) */ void acct_clear_integrals(struct task_struct *tsk) { - if (tsk) { - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; - } + tsk->acct_stimexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; } diff --git a/kernel/audit.c b/kernel/audit.c index df57b493e1cb..7dfac7031bd7 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -56,6 +56,7 @@ #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/selinux.h> +#include <linux/inotify.h> #include "audit.h" @@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ uid_t audit_sig_uid = -1; pid_t audit_sig_pid = -1; +u32 audit_sig_sid = 0; /* Records can be lost in several ways: 0) [suppressed in audit_alloc] @@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* The netlink socket. */ static struct sock *audit_sock; +/* Inotify handle. */ +struct inotify_handle *audit_ih; + +/* Hash for inode-based rules */ +struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + /* The audit_freelist is a list of pre-allocated audit buffers (if more * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of * being placed on the freelist). */ @@ -114,10 +122,8 @@ static struct task_struct *kauditd_task; static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); -/* The netlink socket is only to be read by 1 CPU, which lets us assume - * that list additions and deletions never happen simultaneously in - * auditsc.c */ -DEFINE_MUTEX(audit_netlink_mutex); +/* Serialize requests from userspace. */ +static DEFINE_MUTEX(audit_cmd_mutex); /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting * audit records. Since printk uses a 1024 byte buffer, this buffer @@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) "audit_rate_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_rate_limit = limit; - return old; + return 0; } static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) @@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) "audit_backlog_limit=%d old=%d by auid=%u", limit, old, loginuid); audit_backlog_limit = limit; - return old; + return 0; } static int audit_set_enabled(int state, uid_t loginuid, u32 sid) @@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid) "audit_enabled=%d old=%d by auid=%u", state, old, loginuid); audit_enabled = state; - return old; + return 0; } static int audit_set_failure(int state, uid_t loginuid, u32 sid) @@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid) "audit_failure=%d old=%d by auid=%u", state, old, loginuid); audit_failure = state; - return old; + return 0; } static int kauditd_thread(void *dummy) @@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy) remove_wait_queue(&kauditd_wait, &wait); } } +} + +int audit_send_list(void *_dest) +{ + struct audit_netlink_list *dest = _dest; + int pid = dest->pid; + struct sk_buff *skb; + + /* wait for parent to finish and send an ACK */ + mutex_lock(&audit_cmd_mutex); + mutex_unlock(&audit_cmd_mutex); + + while ((skb = __skb_dequeue(&dest->q)) != NULL) + netlink_unicast(audit_sock, skb, pid, 0); + + kfree(dest); + return 0; } +struct sk_buff *audit_make_reply(int pid, int seq, int type, int done, + int multi, void *payload, int size) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len = NLMSG_SPACE(size); + void *data; + int flags = multi ? NLM_F_MULTI : 0; + int t = done ? NLMSG_DONE : type; + + skb = alloc_skb(len, GFP_KERNEL); + if (!skb) + return NULL; + + nlh = NLMSG_PUT(skb, pid, seq, t, size); + nlh->nlmsg_flags = flags; + data = NLMSG_DATA(nlh); + memcpy(data, payload, size); + return skb; + +nlmsg_failure: /* Used by NLMSG_PUT */ + if (skb) + kfree_skb(skb); + return NULL; +} + /** * audit_send_reply - send an audit reply message via netlink * @pid: process id to send reply to @@ -383,29 +432,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size) { struct sk_buff *skb; - struct nlmsghdr *nlh; - int len = NLMSG_SPACE(size); - void *data; - int flags = multi ? NLM_F_MULTI : 0; - int t = done ? NLMSG_DONE : type; - - skb = alloc_skb(len, GFP_KERNEL); + skb = audit_make_reply(pid, seq, type, done, multi, payload, size); if (!skb) return; - - nlh = NLMSG_PUT(skb, pid, seq, t, size); - nlh->nlmsg_flags = flags; - data = NLMSG_DATA(nlh); - memcpy(data, payload, size); - /* Ignore failure. It'll only happen if the sender goes away, because our timeout is set to infinite. */ netlink_unicast(audit_sock, skb, pid, 0); return; - -nlmsg_failure: /* Used by NLMSG_PUT */ - if (skb) - kfree_skb(skb); } /* @@ -451,7 +484,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) struct audit_buffer *ab; u16 msg_type = nlh->nlmsg_type; uid_t loginuid; /* loginuid of sender */ - struct audit_sig_info sig_data; + struct audit_sig_info *sig_data; + char *ctx; + u32 len; err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); if (err) @@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (status_get->mask & AUDIT_STATUS_PID) { int old = audit_pid; if (sid) { - char *ctx = NULL; - u32 len; - int rc; - if ((rc = selinux_ctxid_to_string( + if ((err = selinux_ctxid_to_string( sid, &ctx, &len))) - return rc; + return err; else audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, @@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_pid = status_get->pid; } if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) - audit_set_rate_limit(status_get->rate_limit, + err = audit_set_rate_limit(status_get->rate_limit, loginuid, sid); if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) - audit_set_backlog_limit(status_get->backlog_limit, + err = audit_set_backlog_limit(status_get->backlog_limit, loginuid, sid); break; case AUDIT_USER: @@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) "user pid=%d uid=%u auid=%u", pid, uid, loginuid); if (sid) { - char *ctx = NULL; - u32 len; if (selinux_ctxid_to_string( sid, &ctx, &len)) { audit_log_format(ab, @@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) loginuid, sid); break; case AUDIT_SIGNAL_INFO: - sig_data.uid = audit_sig_uid; - sig_data.pid = audit_sig_pid; + err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); + if (err) + return err; + sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); + if (!sig_data) { + kfree(ctx); + return -ENOMEM; + } + sig_data->uid = audit_sig_uid; + sig_data->pid = audit_sig_pid; + memcpy(sig_data->ctx, ctx, len); + kfree(ctx); audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, - 0, 0, &sig_data, sizeof(sig_data)); + 0, 0, sig_data, sizeof(*sig_data) + len); + kfree(sig_data); break; default: err = -EINVAL; @@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length) struct sk_buff *skb; unsigned int qlen; - mutex_lock(&audit_netlink_mutex); + mutex_lock(&audit_cmd_mutex); for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { skb = skb_dequeue(&sk->sk_receive_queue); audit_receive_skb(skb); kfree_skb(skb); } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_cmd_mutex); } +#ifdef CONFIG_AUDITSYSCALL +static const struct inotify_operations audit_inotify_ops = { + .handle_event = audit_handle_ievent, + .destroy_watch = audit_free_parent, +}; +#endif /* Initialize audit support at boot time. */ static int __init audit_init(void) { +#ifdef CONFIG_AUDITSYSCALL + int i; +#endif + printk(KERN_INFO "audit: initializing netlink socket (%s)\n", audit_default ? "enabled" : "disabled"); audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, @@ -661,6 +712,16 @@ static int __init audit_init(void) selinux_audit_set_callback(&selinux_audit_rule_update); audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); + +#ifdef CONFIG_AUDITSYSCALL + audit_ih = inotify_init(&audit_inotify_ops); + if (IS_ERR(audit_ih)) + audit_panic("cannot initialize inotify handle"); + + for (i = 0; i < AUDIT_INODE_BUCKETS; i++) + INIT_LIST_HEAD(&audit_inode_hash[i]); +#endif + return 0; } __initcall(audit_init); @@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab) kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); - if (++audit_freelist_count > AUDIT_MAXFREE) + if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); - else + else { + audit_freelist_count++; list_add(&ab->list, &audit_freelist); + } spin_unlock_irqrestore(&audit_freelist_lock, flags); } @@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, skb_put(skb, len << 1); /* new string is twice the old string */ } +/* + * Format a string of no more than slen characters into the audit buffer, + * enclosed in quote marks. + */ +static void audit_log_n_string(struct audit_buffer *ab, size_t slen, + const char *string) +{ + int avail, new_len; + unsigned char *ptr; + struct sk_buff *skb; + + BUG_ON(!ab->skb); + skb = ab->skb; + avail = skb_tailroom(skb); + new_len = slen + 3; /* enclosing quotes + null terminator */ + if (new_len > avail) { + avail = audit_expand(ab, new_len); + if (!avail) + return; + } + ptr = skb->tail; + *ptr++ = '"'; + memcpy(ptr, string, slen); + ptr += slen; + *ptr++ = '"'; + *ptr = 0; + skb_put(skb, slen + 2); /* don't include null terminator */ +} + /** - * audit_log_unstrustedstring - log a string that may contain random characters + * audit_log_n_unstrustedstring - log a string that may contain random characters * @ab: audit_buffer + * @len: lenth of string (not including trailing null) * @string: string to be logged * * This code will escape a string that is passed to it if the string * contains a control character, unprintable character, double quote mark, * or a space. Unescaped strings will start and end with a double quote mark. * Strings that are escaped are printed in hex (2 digits per char). + * + * The caller specifies the number of characters in the string to log, which may + * or may not be the entire string. */ -void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, + const char *string) { const unsigned char *p = string; while (*p) { if (*p == '"' || *p < 0x21 || *p > 0x7f) { - audit_log_hex(ab, string, strlen(string)); - return; + audit_log_hex(ab, string, len); + return string + len + 1; } p++; } - audit_log_format(ab, "\"%s\"", string); + audit_log_n_string(ab, len, string); + return p + 1; +} + +/** + * audit_log_unstrustedstring - log a string that may contain random characters + * @ab: audit_buffer + * @string: string to be logged + * + * Same as audit_log_n_unstrustedstring(), except that strlen is used to + * determine string length. + */ +const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) +{ + return audit_log_n_untrustedstring(ab, strlen(string), string); } /* This is a helper-function to print the escaped d_path */ diff --git a/kernel/audit.h b/kernel/audit.h index 6f733920fd32..8323e4132a33 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -19,9 +19,9 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#include <linux/mutex.h> #include <linux/fs.h> #include <linux/audit.h> +#include <linux/skbuff.h> /* 0 = no checking 1 = put_count checking @@ -53,6 +53,18 @@ enum audit_state { }; /* Rule lists */ +struct audit_parent; + +struct audit_watch { + atomic_t count; /* reference count */ + char *path; /* insertion path */ + dev_t dev; /* associated superblock device */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* associated rules */ +}; + struct audit_field { u32 type; u32 val; @@ -70,6 +82,9 @@ struct audit_krule { u32 buflen; /* for data alloc on list rules */ u32 field_count; struct audit_field *fields; + struct audit_field *inode_f; /* quick access to an inode field */ + struct audit_watch *watch; /* associated watch */ + struct list_head rlist; /* entry in audit_watch.rules list */ }; struct audit_entry { @@ -78,15 +93,53 @@ struct audit_entry { struct audit_krule rule; }; - extern int audit_pid; -extern int audit_comparator(const u32 left, const u32 op, const u32 right); +#define AUDIT_INODE_BUCKETS 32 +extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + +static inline int audit_hash_ino(u32 ino) +{ + return (ino & (AUDIT_INODE_BUCKETS-1)); +} + +extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen); +extern struct sk_buff * audit_make_reply(int pid, int seq, int type, + int done, int multi, + void *payload, int size); extern void audit_send_reply(int pid, int seq, int type, int done, int multi, void *payload, int size); extern void audit_log_lost(const char *message); extern void audit_panic(const char *message); -extern struct mutex audit_netlink_mutex; +struct audit_netlink_list { + int pid; + struct sk_buff_head q; +}; + +int audit_send_list(void *); + +struct inotify_watch; +extern void audit_free_parent(struct inotify_watch *); +extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32, + const char *, struct inode *); extern int selinux_audit_rule_update(void); + +#ifdef CONFIG_AUDITSYSCALL +extern void __audit_signal_info(int sig, struct task_struct *t); +static inline void audit_signal_info(int sig, struct task_struct *t) +{ + if (unlikely(audit_pid && t->tgid == audit_pid)) + __audit_signal_info(sig, t); +} +extern enum audit_state audit_filter_inodes(struct task_struct *, + struct audit_context *); +extern void audit_set_auditable(struct audit_context *); +#else +#define audit_signal_info(s,t) +#define audit_filter_inodes(t,c) AUDIT_DISABLED +#define audit_set_auditable(c) +#endif diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 7c134906d689..4c99d2c586ed 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -22,13 +22,59 @@ #include <linux/kernel.h> #include <linux/audit.h> #include <linux/kthread.h> +#include <linux/mutex.h> +#include <linux/fs.h> +#include <linux/namei.h> #include <linux/netlink.h> +#include <linux/sched.h> +#include <linux/inotify.h> #include <linux/selinux.h> #include "audit.h" -/* There are three lists of rules -- one to search at task creation - * time, one to search at syscall entry time, and another to search at - * syscall exit time. */ +/* + * Locking model: + * + * audit_filter_mutex: + * Synchronizes writes and blocking reads of audit's filterlist + * data. Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * selinux rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. + * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. + */ + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_parent { + struct list_head ilist; /* entry in inotify registration list */ + struct list_head watches; /* associated watches */ + struct inotify_watch wdata; /* inotify watch data */ + unsigned flags; /* status flags */ +}; + +/* + * audit_parent status flags: + * + * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to + * a filesystem event to ensure we're adding audit watches to a valid parent. + * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot + * receive them while we have nameidata, but must be used for IN_MOVE_SELF which + * we can receive while holding nameidata. + */ +#define AUDIT_PARENT_INVALID 0x001 + +/* Audit filter lists, defined in <linux/audit.h> */ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { LIST_HEAD_INIT(audit_filter_list[0]), LIST_HEAD_INIT(audit_filter_list[1]), @@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { #endif }; +static DEFINE_MUTEX(audit_filter_mutex); + +/* Inotify handle */ +extern struct inotify_handle *audit_ih; + +/* Inotify events we care about. */ +#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF + +void audit_free_parent(struct inotify_watch *i_watch) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + WARN_ON(!list_empty(&parent->watches)); + kfree(parent); +} + +static inline void audit_get_watch(struct audit_watch *watch) +{ + atomic_inc(&watch->count); +} + +static void audit_put_watch(struct audit_watch *watch) +{ + if (atomic_dec_and_test(&watch->count)) { + WARN_ON(watch->parent); + WARN_ON(!list_empty(&watch->rules)); + kfree(watch->path); + kfree(watch); + } +} + +static void audit_remove_watch(struct audit_watch *watch) +{ + list_del(&watch->wlist); + put_inotify_watch(&watch->parent->wdata); + watch->parent = NULL; + audit_put_watch(watch); /* match initial get */ +} + static inline void audit_free_rule(struct audit_entry *e) { int i; + + /* some rules don't have associated watches */ + if (e->rule.watch) + audit_put_watch(e->rule.watch); if (e->rule.fields) for (i = 0; i < e->rule.field_count; i++) { struct audit_field *f = &e->rule.fields[i]; @@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head) audit_free_rule(e); } +/* Initialize a parent watch entry. */ +static struct audit_parent *audit_init_parent(struct nameidata *ndp) +{ + struct audit_parent *parent; + s32 wd; + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (unlikely(!parent)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&parent->watches); + parent->flags = 0; + + inotify_init_watch(&parent->wdata); + /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */ + get_inotify_watch(&parent->wdata); + wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode, + AUDIT_IN_WATCH); + if (wd < 0) { + audit_free_parent(&parent->wdata); + return ERR_PTR(wd); + } + + return parent; +} + +/* Initialize a watch entry. */ +static struct audit_watch *audit_init_watch(char *path) +{ + struct audit_watch *watch; + + watch = kzalloc(sizeof(*watch), GFP_KERNEL); + if (unlikely(!watch)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&watch->rules); + atomic_set(&watch->count, 1); + watch->path = path; + watch->dev = (dev_t)-1; + watch->ino = (unsigned long)-1; + + return watch; +} + /* Initialize an audit filterlist entry. */ static inline struct audit_entry *audit_init_entry(u32 field_count) { @@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } +/* Translate an inode field to kernel respresentation. */ +static inline int audit_to_inode(struct audit_krule *krule, + struct audit_field *f) +{ + if (krule->listnr != AUDIT_FILTER_EXIT || + krule->watch || krule->inode_f) + return -EINVAL; + + krule->inode_f = f; + return 0; +} + +/* Translate a watch string to kernel respresentation. */ +static int audit_to_watch(struct audit_krule *krule, char *path, int len, + u32 op) +{ + struct audit_watch *watch; + + if (!audit_ih) + return -EOPNOTSUPP; + + if (path[0] != '/' || path[len-1] == '/' || + krule->listnr != AUDIT_FILTER_EXIT || + op & ~AUDIT_EQUAL || + krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */ + return -EINVAL; + + watch = audit_init_watch(path); + if (unlikely(IS_ERR(watch))) + return PTR_ERR(watch); + + audit_get_watch(watch); + krule->watch = watch; + + return 0; +} + /* Common user-space to kernel rule translation. */ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) { @@ -128,8 +299,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) #endif ; } - if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && - rule->action != AUDIT_ALWAYS) + if (unlikely(rule->action == AUDIT_POSSIBLE)) { + printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + goto exit_err; + } + if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) goto exit_err; if (rule->field_count > AUDIT_MAX_FIELDS) goto exit_err; @@ -158,6 +332,7 @@ exit_err: static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) { struct audit_entry *entry; + struct audit_field *f; int err = 0; int i; @@ -172,14 +347,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); f->val = rule->values[i]; - if (f->type & AUDIT_UNUSED_BITS || - f->type == AUDIT_SE_USER || - f->type == AUDIT_SE_ROLE || - f->type == AUDIT_SE_TYPE || - f->type == AUDIT_SE_SEN || - f->type == AUDIT_SE_CLR) { - err = -EINVAL; + err = -EINVAL; + switch(f->type) { + default: goto exit_free; + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; } entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; @@ -196,6 +394,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) } } + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; + } + } + exit_nofree: return entry; @@ -210,6 +420,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, { int err = 0; struct audit_entry *entry; + struct audit_field *f; void *bufp; size_t remain = datasz - sizeof(struct audit_rule_data); int i; @@ -235,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, f->se_str = NULL; f->se_rule = NULL; switch(f->type) { + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_ARCH: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; case AUDIT_SE_USER: case AUDIT_SE_ROLE: case AUDIT_SE_TYPE: @@ -260,6 +494,37 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, } else f->se_str = str; break; + case AUDIT_WATCH: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = audit_to_watch(&entry->rule, str, f->val, f->op); + if (err) { + kfree(str); + goto exit_free; + } + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; + default: + goto exit_free; + } + } + + f = entry->rule.inode_f; + if (f) { + switch(f->op) { + case AUDIT_NOT_EQUAL: + entry->rule.inode_f = NULL; + case AUDIT_EQUAL: + break; + default: + goto exit_free; } } @@ -291,7 +556,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) rule = kmalloc(sizeof(*rule), GFP_KERNEL); if (unlikely(!rule)) - return ERR_PTR(-ENOMEM); + return NULL; memset(rule, 0, sizeof(*rule)); rule->flags = krule->flags | krule->listnr; @@ -322,7 +587,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); if (unlikely(!data)) - return ERR_PTR(-ENOMEM); + return NULL; memset(data, 0, sizeof(*data)); data->flags = krule->flags | krule->listnr; @@ -343,6 +608,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) data->buflen += data->values[i] = audit_pack_string(&bufp, f->se_str); break; + case AUDIT_WATCH: + data->buflen += data->values[i] = + audit_pack_string(&bufp, krule->watch->path); + break; default: data->values[i] = f->val; } @@ -378,6 +647,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) return 1; break; + case AUDIT_WATCH: + if (strcmp(a->watch->path, b->watch->path)) + return 1; + break; default: if (a->fields[i].val != b->fields[i].val) return 1; @@ -391,6 +664,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) return 0; } +/* Duplicate the given audit watch. The new watch's rules list is initialized + * to an empty list and wlist is undefined. */ +static struct audit_watch *audit_dupe_watch(struct audit_watch *old) +{ + char *path; + struct audit_watch *new; + + path = kstrdup(old->path, GFP_KERNEL); + if (unlikely(!path)) + return ERR_PTR(-ENOMEM); + + new = audit_init_watch(path); + if (unlikely(IS_ERR(new))) { + kfree(path); + goto out; + } + + new->dev = old->dev; + new->ino = old->ino; + get_inotify_watch(&old->parent->wdata); + new->parent = old->parent; + +out: + return new; +} + /* Duplicate selinux field information. The se_rule is opaque, so must be * re-initialized. */ static inline int audit_dupe_selinux_field(struct audit_field *df, @@ -422,8 +721,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df, /* Duplicate an audit rule. This will be a deep copy with the exception * of the watch - that pointer is carried over. The selinux specific fields * will be updated in the copy. The point is to be able to replace the old - * rule with the new rule in the filterlist, then free the old rule. */ -static struct audit_entry *audit_dupe_rule(struct audit_krule *old) + * rule with the new rule in the filterlist, then free the old rule. + * The rlist element is undefined; list manipulations are handled apart from + * the initial copy. */ +static struct audit_entry *audit_dupe_rule(struct audit_krule *old, + struct audit_watch *watch) { u32 fcount = old->field_count; struct audit_entry *entry; @@ -442,6 +744,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) for (i = 0; i < AUDIT_BITMASK_SIZE; i++) new->mask[i] = old->mask[i]; new->buflen = old->buflen; + new->inode_f = old->inode_f; + new->watch = NULL; new->field_count = old->field_count; memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); @@ -463,68 +767,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old) } } + if (watch) { + audit_get_watch(watch); + new->watch = watch; + } + return entry; } -/* Add rule to given filterlist if not a duplicate. Protected by - * audit_netlink_mutex. */ +/* Update inode info in audit rules based on filesystem event. */ +static void audit_update_watch(struct audit_parent *parent, + const char *dname, dev_t dev, + unsigned long ino, unsigned invalidating) +{ + struct audit_watch *owatch, *nwatch, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *oentry, *nentry; + struct audit_buffer *ab; + + mutex_lock(&audit_filter_mutex); + list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { + if (audit_compare_dname_path(dname, owatch->path, NULL)) + continue; + + /* If the update involves invalidating rules, do the inode-based + * filtering now, so we don't omit records. */ + if (invalidating && + audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT) + audit_set_auditable(current->audit_context); + + nwatch = audit_dupe_watch(owatch); + if (unlikely(IS_ERR(nwatch))) { + mutex_unlock(&audit_filter_mutex); + audit_panic("error updating watch, skipping"); + return; + } + nwatch->dev = dev; + nwatch->ino = ino; + + list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) { + + oentry = container_of(r, struct audit_entry, rule); + list_del(&oentry->rule.rlist); + list_del_rcu(&oentry->list); + + nentry = audit_dupe_rule(&oentry->rule, nwatch); + if (unlikely(IS_ERR(nentry))) + audit_panic("error updating watch, removing"); + else { + int h = audit_hash_ino((u32)ino); + list_add(&nentry->rule.rlist, &nwatch->rules); + list_add_rcu(&nentry->list, &audit_inode_hash[h]); + } + + call_rcu(&oentry->rcu, audit_free_rule_rcu); + } + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "audit updated rules specifying watch="); + audit_log_untrustedstring(ab, owatch->path); + audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); + audit_log_end(ab); + + audit_remove_watch(owatch); + goto add_watch_to_parent; /* event applies to a single watch */ + } + mutex_unlock(&audit_filter_mutex); + return; + +add_watch_to_parent: + list_add(&nwatch->wlist, &parent->watches); + mutex_unlock(&audit_filter_mutex); + return; +} + +/* Remove all watches & rules associated with a parent that is going away. */ +static void audit_remove_parent_watches(struct audit_parent *parent) +{ + struct audit_watch *w, *nextw; + struct audit_krule *r, *nextr; + struct audit_entry *e; + + mutex_lock(&audit_filter_mutex); + parent->flags |= AUDIT_PARENT_INVALID; + list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { + list_for_each_entry_safe(r, nextr, &w->rules, rlist) { + e = container_of(r, struct audit_entry, rule); + list_del(&r->rlist); + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, + "audit implicitly removed rule from list=%d\n", + AUDIT_FILTER_EXIT); + } + audit_remove_watch(w); + } + mutex_unlock(&audit_filter_mutex); +} + +/* Unregister inotify watches for parents on in_list. + * Generates an IN_IGNORED event. */ +static void audit_inotify_unregister(struct list_head *in_list) +{ + struct audit_parent *p, *n; + + list_for_each_entry_safe(p, n, in_list, ilist) { + list_del(&p->ilist); + inotify_rm_watch(audit_ih, &p->wdata); + /* the put matching the get in audit_do_del_rule() */ + put_inotify_watch(&p->wdata); + } +} + +/* Find an existing audit rule. + * Caller must hold audit_filter_mutex to prevent stale rule data. */ +static struct audit_entry *audit_find_rule(struct audit_entry *entry, + struct list_head *list) +{ + struct audit_entry *e, *found = NULL; + int h; + + if (entry->rule.watch) { + /* we don't know the inode number, so must walk entire hash */ + for (h = 0; h < AUDIT_INODE_BUCKETS; h++) { + list = &audit_inode_hash[h]; + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + } + goto out; + } + + list_for_each_entry(e, list, list) + if (!audit_compare_rule(&entry->rule, &e->rule)) { + found = e; + goto out; + } + +out: + return found; +} + +/* Get path information necessary for adding watches. */ +static int audit_get_nd(char *path, struct nameidata **ndp, + struct nameidata **ndw) +{ + struct nameidata *ndparent, *ndwatch; + int err; + + ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL); + if (unlikely(!ndparent)) + return -ENOMEM; + + ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL); + if (unlikely(!ndwatch)) { + kfree(ndparent); + return -ENOMEM; + } + + err = path_lookup(path, LOOKUP_PARENT, ndparent); + if (err) { + kfree(ndparent); + kfree(ndwatch); + return err; + } + + err = path_lookup(path, 0, ndwatch); + if (err) { + kfree(ndwatch); + ndwatch = NULL; + } + + *ndp = ndparent; + *ndw = ndwatch; + + return 0; +} + +/* Release resources used for watch path information. */ +static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw) +{ + if (ndp) { + path_release(ndp); + kfree(ndp); + } + if (ndw) { + path_release(ndw); + kfree(ndw); + } +} + +/* Associate the given rule with an existing parent inotify_watch. + * Caller must hold audit_filter_mutex. */ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + get_inotify_watch(&parent->wdata); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. */ +static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp, + struct nameidata *ndw) +{ + struct audit_watch *watch = krule->watch; + struct inotify_watch *i_watch; + struct audit_parent *parent; + int ret = 0; + + /* update watch filter fields */ + if (ndw) { + watch->dev = ndw->dentry->d_inode->i_sb->s_dev; + watch->ino = ndw->dentry->d_inode->i_ino; + } + + /* The audit_filter_mutex must not be held during inotify calls because + * we hold it during inotify event callback processing. If an existing + * inotify watch is found, inotify_find_watch() grabs a reference before + * returning. + */ + mutex_unlock(&audit_filter_mutex); + + if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) { + parent = audit_init_parent(ndp); + if (IS_ERR(parent)) { + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + return PTR_ERR(parent); + } + } else + parent = container_of(i_watch, struct audit_parent, wdata); + + mutex_lock(&audit_filter_mutex); + + /* parent was moved before we took audit_filter_mutex */ + if (parent->flags & AUDIT_PARENT_INVALID) + ret = -ENOENT; + else + audit_add_to_parent(krule, parent); + + /* match get in audit_init_parent or inotify_find_watch */ + put_inotify_watch(&parent->wdata); + return ret; +} + +/* Add rule to given filterlist if not a duplicate. */ static inline int audit_add_rule(struct audit_entry *entry, - struct list_head *list) + struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch = entry->rule.watch; + struct nameidata *ndp, *ndw; + int h, err, putnd_needed = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } - /* Do not use the _rcu iterator here, since this is the only - * addition routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) - return -EEXIST; + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + mutex_unlock(&audit_filter_mutex); + if (e) { + err = -EEXIST; + goto error; + } + + /* Avoid calling path_lookup under audit_filter_mutex. */ + if (watch) { + err = audit_get_nd(watch->path, &ndp, &ndw); + if (err) + goto error; + putnd_needed = 1; + } + + mutex_lock(&audit_filter_mutex); + if (watch) { + /* audit_filter_mutex is dropped and re-taken during this call */ + err = audit_add_watch(&entry->rule, ndp, ndw); + if (err) { + mutex_unlock(&audit_filter_mutex); + goto error; + } + h = audit_hash_ino((u32)watch->ino); + list = &audit_inode_hash[h]; } if (entry->rule.flags & AUDIT_FILTER_PREPEND) { list_add_rcu(&entry->list, list); + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; } else { list_add_tail_rcu(&entry->list, list); } + mutex_unlock(&audit_filter_mutex); - return 0; + if (putnd_needed) + audit_put_nd(ndp, ndw); + + return 0; + +error: + if (putnd_needed) + audit_put_nd(ndp, ndw); + if (watch) + audit_put_watch(watch); /* tmp watch, matches initial get */ + return err; } -/* Remove an existing rule from filterlist. Protected by - * audit_netlink_mutex. */ +/* Remove an existing rule from filterlist. */ static inline int audit_del_rule(struct audit_entry *entry, struct list_head *list) { struct audit_entry *e; + struct audit_field *inode_f = entry->rule.inode_f; + struct audit_watch *watch, *tmp_watch = entry->rule.watch; + LIST_HEAD(inotify_list); + int h, ret = 0; + + if (inode_f) { + h = audit_hash_ino(inode_f->val); + list = &audit_inode_hash[h]; + } - /* Do not use the _rcu iterator here, since this is the only - * deletion routine. */ - list_for_each_entry(e, list, list) { - if (!audit_compare_rule(&entry->rule, &e->rule)) { - list_del_rcu(&e->list); - call_rcu(&e->rcu, audit_free_rule_rcu); - return 0; + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, list); + if (!e) { + mutex_unlock(&audit_filter_mutex); + ret = -ENOENT; + goto out; + } + + watch = e->rule.watch; + if (watch) { + struct audit_parent *parent = watch->parent; + + list_del(&e->rule.rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + /* Put parent on the inotify un-registration + * list. Grab a reference before releasing + * audit_filter_mutex, to be released in + * audit_inotify_unregister(). */ + list_add(&parent->ilist, &inotify_list); + get_inotify_watch(&parent->wdata); + } } } - return -ENOENT; /* No matching rule */ + + list_del_rcu(&e->list); + call_rcu(&e->rcu, audit_free_rule_rcu); + + mutex_unlock(&audit_filter_mutex); + + if (!list_empty(&inotify_list)) + audit_inotify_unregister(&inotify_list); + +out: + if (tmp_watch) + audit_put_watch(tmp_watch); /* match initial get */ + + return ret; } /* List rules using struct audit_rule. Exists for backward * compatibility with userspace. */ -static int audit_list(void *_dest) +static void audit_list(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *entry; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. */ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { list_for_each_entry(entry, &audit_filter_list[i], list) { struct audit_rule *rule; @@ -532,33 +1177,41 @@ static int audit_list(void *_dest) rule = audit_krule_to_rule(&entry->rule); if (unlikely(!rule)) break; - audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); kfree(rule); } } - audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); - - mutex_unlock(&audit_netlink_mutex); - return 0; + for (i = 0; i < AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry(entry, &audit_inode_hash[i], list) { + struct audit_rule *rule; + + rule = audit_krule_to_rule(&entry->rule); + if (unlikely(!rule)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, + rule, sizeof(*rule)); + if (skb) + skb_queue_tail(q, skb); + kfree(rule); + } + } + skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /* List rules using struct audit_rule_data. */ -static int audit_list_rules(void *_dest) +static void audit_list_rules(int pid, int seq, struct sk_buff_head *q) { - int pid, seq; - int *dest = _dest; + struct sk_buff *skb; struct audit_entry *e; int i; - pid = dest[0]; - seq = dest[1]; - kfree(dest); - - mutex_lock(&audit_netlink_mutex); - - /* The *_rcu iterators not needed here because we are - always called with audit_netlink_mutex held. */ + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. */ for (i=0; i<AUDIT_NR_FILTERS; i++) { list_for_each_entry(e, &audit_filter_list[i], list) { struct audit_rule_data *data; @@ -566,15 +1219,30 @@ static int audit_list_rules(void *_dest) data = audit_krule_to_data(&e->rule); if (unlikely(!data)) break; - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, - data, sizeof(*data)); + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); + if (skb) + skb_queue_tail(q, skb); kfree(data); } } - audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + for (i=0; i< AUDIT_INODE_BUCKETS; i++) { + list_for_each_entry(e, &audit_inode_hash[i], list) { + struct audit_rule_data *data; - mutex_unlock(&audit_netlink_mutex); - return 0; + data = audit_krule_to_data(&e->rule); + if (unlikely(!data)) + break; + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); + if (skb) + skb_queue_tail(q, skb); + kfree(data); + } + } + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); } /** @@ -592,7 +1260,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, size_t datasz, uid_t loginuid, u32 sid) { struct task_struct *tsk; - int *dest; + struct audit_netlink_list *dest; int err = 0; struct audit_entry *entry; @@ -605,18 +1273,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, * happen if we're actually running in the context of auditctl * trying to _send_ the stuff */ - dest = kmalloc(2 * sizeof(int), GFP_KERNEL); + dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); if (!dest) return -ENOMEM; - dest[0] = pid; - dest[1] = seq; + dest->pid = pid; + skb_queue_head_init(&dest->q); + mutex_lock(&audit_filter_mutex); if (type == AUDIT_LIST) - tsk = kthread_run(audit_list, dest, "audit_list"); + audit_list(pid, seq, &dest->q); else - tsk = kthread_run(audit_list_rules, dest, - "audit_list_rules"); + audit_list_rules(pid, seq, &dest->q); + mutex_unlock(&audit_filter_mutex); + + tsk = kthread_run(audit_send_list, dest, "audit_send_list"); if (IS_ERR(tsk)) { + skb_queue_purge(&dest->q); kfree(dest); err = PTR_ERR(tsk); } @@ -632,6 +1304,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data, err = audit_add_rule(entry, &audit_filter_list[entry->rule.listnr]); + if (sid) { char *ctx = NULL; u32 len; @@ -712,7 +1385,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right) return 0; } +/* Compare given dentry name with last component in given path, + * return of 0 indicates a match. */ +int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen) +{ + int dlen, plen; + const char *p; + if (!dname || !path) + return 1; + + dlen = strlen(dname); + plen = strlen(path); + if (plen < dlen) + return 1; + + /* disregard trailing slashes */ + p = path + plen - 1; + while ((*p == '/') && (p > path)) + p--; + + /* find last path component */ + p = p - dlen + 1; + if (p < path) + return 1; + else if (p > path) { + if (*--p != '/') + return 1; + else + p++; + } + + /* return length of path's directory component */ + if (dirlen) + *dirlen = p - path; + return strncmp(p, dname, dlen); +} static int audit_filter_user_rules(struct netlink_skb_parms *cb, struct audit_krule *rule, @@ -744,7 +1453,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; @@ -826,32 +1534,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule) int selinux_audit_rule_update(void) { struct audit_entry *entry, *n, *nentry; + struct audit_watch *watch; int i, err = 0; - /* audit_netlink_mutex synchronizes the writers */ - mutex_lock(&audit_netlink_mutex); + /* audit_filter_mutex synchronizes the writers */ + mutex_lock(&audit_filter_mutex); for (i = 0; i < AUDIT_NR_FILTERS; i++) { list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { if (!audit_rule_has_selinux(&entry->rule)) continue; - nentry = audit_dupe_rule(&entry->rule); + watch = entry->rule.watch; + nentry = audit_dupe_rule(&entry->rule, watch); if (unlikely(IS_ERR(nentry))) { /* save the first error encountered for the * return value */ if (!err) err = PTR_ERR(nentry); audit_panic("error updating selinux filters"); + if (watch) + list_del(&entry->rule.rlist); list_del_rcu(&entry->list); } else { + if (watch) { + list_add(&nentry->rule.rlist, + &watch->rules); + list_del(&entry->rule.rlist); + } list_replace_rcu(&entry->list, &nentry->list); } call_rcu(&entry->rcu, audit_free_rule_rcu); } } - mutex_unlock(&audit_netlink_mutex); + mutex_unlock(&audit_filter_mutex); return err; } + +/* Update watch data in audit rules based on inotify events. */ +void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask, + u32 cookie, const char *dname, struct inode *inode) +{ + struct audit_parent *parent; + + parent = container_of(i_watch, struct audit_parent, wdata); + + if (mask & (IN_CREATE|IN_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, + inode->i_ino, 0); + else if (mask & (IN_DELETE|IN_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + /* inotify automatically removes the watch and sends IN_IGNORED */ + else if (mask & (IN_DELETE_SELF|IN_UNMOUNT)) + audit_remove_parent_watches(parent); + /* inotify does not remove the watch, so remove it manually */ + else if(mask & IN_MOVE_SELF) { + audit_remove_parent_watches(parent); + inotify_remove_watch_locked(audit_ih, i_watch); + } else if (mask & IN_IGNORED) + put_inotify_watch(i_watch); +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c03a4ed1b27..9ebd96fda295 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -3,7 +3,7 @@ * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * Copyright 2005 Hewlett-Packard Development Company, L.P. - * Copyright (C) 2005 IBM Corporation + * Copyright (C) 2005, 2006 IBM Corporation * All Rights Reserved. * * This program is free software; you can redistribute it and/or modify @@ -29,6 +29,9 @@ * this file -- see entry.S) is based on a GPL'd patch written by * okir@suse.de and Copyright 2003 SuSE Linux AG. * + * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>, + * 2006. + * * The support of additional filter rules compares (>, <, >=, <=) was * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. * @@ -49,6 +52,7 @@ #include <linux/module.h> #include <linux/mount.h> #include <linux/socket.h> +#include <linux/mqueue.h> #include <linux/audit.h> #include <linux/personality.h> #include <linux/time.h> @@ -59,6 +63,8 @@ #include <linux/list.h> #include <linux/tty.h> #include <linux/selinux.h> +#include <linux/binfmts.h> +#include <linux/syscalls.h> #include "audit.h" @@ -76,6 +82,9 @@ extern int audit_enabled; * path_lookup. */ #define AUDIT_NAMES_RESERVED 7 +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + /* When fs/namei.c:getname() is called, we store the pointer in name and * we don't let putname() free it (instead we free all of the saved * pointers at syscall exit time). @@ -83,8 +92,9 @@ extern int audit_enabled; * Further, in fs/namei.c:path_lookup() we store the inode and device. */ struct audit_names { const char *name; + int name_len; /* number of name's characters to log */ + unsigned name_put; /* call __putname() for this name */ unsigned long ino; - unsigned long pino; dev_t dev; umode_t mode; uid_t uid; @@ -100,6 +110,33 @@ struct audit_aux_data { #define AUDIT_AUX_IPCPERM 0 +struct audit_aux_data_mq_open { + struct audit_aux_data d; + int oflag; + mode_t mode; + struct mq_attr attr; +}; + +struct audit_aux_data_mq_sendrecv { + struct audit_aux_data d; + mqd_t mqdes; + size_t msg_len; + unsigned int msg_prio; + struct timespec abs_timeout; +}; + +struct audit_aux_data_mq_notify { + struct audit_aux_data d; + mqd_t mqdes; + struct sigevent notification; +}; + +struct audit_aux_data_mq_getsetattr { + struct audit_aux_data d; + mqd_t mqdes; + struct mq_attr mqstat; +}; + struct audit_aux_data_ipcctl { struct audit_aux_data d; struct ipc_perm p; @@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl { u32 osid; }; +struct audit_aux_data_execve { + struct audit_aux_data d; + int argc; + int envc; + char mem[0]; +}; + struct audit_aux_data_socketcall { struct audit_aux_data d; int nargs; @@ -148,7 +192,7 @@ struct audit_context { struct audit_aux_data *aux; /* Save things to print about task_struct */ - pid_t pid; + pid_t pid, ppid; uid_t uid, euid, suid, fsuid; gid_t gid, egid, sgid, fsgid; unsigned long personality; @@ -160,12 +204,13 @@ struct audit_context { #endif }; - +/* Determine if any context name data matches a rule's watch data */ /* Compare a task_struct with an audit_rule. Return 1 on match, 0 * otherwise. */ static int audit_filter_rules(struct task_struct *tsk, struct audit_krule *rule, struct audit_context *ctx, + struct audit_names *name, enum audit_state *state) { int i, j, need_sid = 1; @@ -179,6 +224,10 @@ static int audit_filter_rules(struct task_struct *tsk, case AUDIT_PID: result = audit_comparator(tsk->pid, f->op, f->val); break; + case AUDIT_PPID: + if (ctx) + result = audit_comparator(ctx->ppid, f->op, f->val); + break; case AUDIT_UID: result = audit_comparator(tsk->uid, f->op, f->val); break; @@ -224,7 +273,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMAJOR: - if (ctx) { + if (name) + result = audit_comparator(MAJOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -234,7 +286,10 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_DEVMINOR: - if (ctx) { + if (name) + result = audit_comparator(MINOR(name->dev), + f->op, f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { ++result; @@ -244,16 +299,22 @@ static int audit_filter_rules(struct task_struct *tsk, } break; case AUDIT_INODE: - if (ctx) { + if (name) + result = (name->ino == f->val); + else if (ctx) { for (j = 0; j < ctx->name_count; j++) { - if (audit_comparator(ctx->names[j].ino, f->op, f->val) || - audit_comparator(ctx->names[j].pino, f->op, f->val)) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { ++result; break; } } } break; + case AUDIT_WATCH: + if (name && rule->watch->ino != (unsigned long)-1) + result = (name->dev == rule->watch->dev && + name->ino == rule->watch->ino); + break; case AUDIT_LOGINUID: result = 0; if (ctx) @@ -294,7 +355,6 @@ static int audit_filter_rules(struct task_struct *tsk, } switch (rule->action) { case AUDIT_NEVER: *state = AUDIT_DISABLED; break; - case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break; case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; } return 1; @@ -311,7 +371,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk) rcu_read_lock(); list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { - if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { + if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) { rcu_read_unlock(); return state; } @@ -341,8 +401,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, int bit = AUDIT_BIT(ctx->major); list_for_each_entry_rcu(e, list, list) { - if ((e->rule.mask[word] & bit) == bit - && audit_filter_rules(tsk, &e->rule, ctx, &state)) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, NULL, + &state)) { + rcu_read_unlock(); + return state; + } + } + } + rcu_read_unlock(); + return AUDIT_BUILD_CONTEXT; +} + +/* At syscall exit time, this filter is called if any audit_names[] have been + * collected during syscall processing. We only check rules in sublists at hash + * buckets applicable to the inode numbers in audit_names[]. + * Regarding audit_state, same rules apply as for audit_filter_syscall(). + */ +enum audit_state audit_filter_inodes(struct task_struct *tsk, + struct audit_context *ctx) +{ + int i; + struct audit_entry *e; + enum audit_state state; + + if (audit_pid && tsk->tgid == audit_pid) + return AUDIT_DISABLED; + + rcu_read_lock(); + for (i = 0; i < ctx->name_count; i++) { + int word = AUDIT_WORD(ctx->major); + int bit = AUDIT_BIT(ctx->major); + struct audit_names *n = &ctx->names[i]; + int h = audit_hash_ino((u32)n->ino); + struct list_head *list = &audit_inode_hash[h]; + + if (list_empty(list)) + continue; + + list_for_each_entry_rcu(e, list, list) { + if ((e->rule.mask[word] & bit) == bit && + audit_filter_rules(tsk, &e->rule, ctx, n, &state)) { rcu_read_unlock(); return state; } @@ -352,6 +451,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk, return AUDIT_BUILD_CONTEXT; } +void audit_set_auditable(struct audit_context *ctx) +{ + ctx->auditable = 1; +} + static inline struct audit_context *audit_get_context(struct task_struct *tsk, int return_valid, int return_code) @@ -365,12 +469,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk, if (context->in_syscall && !context->auditable) { enum audit_state state; + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); + if (state == AUDIT_RECORD_CONTEXT) { + context->auditable = 1; + goto get_context; + } + + state = audit_filter_inodes(tsk, context); if (state == AUDIT_RECORD_CONTEXT) context->auditable = 1; + } +get_context: context->pid = tsk->pid; + context->ppid = sys_getppid(); /* sic. tsk == current in all cases */ context->uid = tsk->uid; context->gid = tsk->gid; context->euid = tsk->euid; @@ -413,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context) #endif for (i = 0; i < context->name_count; i++) { - if (context->names[i].name) + if (context->names[i].name && context->names[i].name_put) __putname(context->names[i].name); } context->name_count = 0; @@ -606,7 +720,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts tty = "(none)"; audit_log_format(ab, " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" - " pid=%d auid=%u uid=%u gid=%u" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" " euid=%u suid=%u fsuid=%u" " egid=%u sgid=%u fsgid=%u tty=%s", context->argv[0], @@ -614,6 +728,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts context->argv[2], context->argv[3], context->name_count, + context->ppid, context->pid, context->loginuid, context->uid, @@ -630,11 +745,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts continue; /* audit_panic has been called */ switch (aux->type) { + case AUDIT_MQ_OPEN: { + struct audit_aux_data_mq_open *axi = (void *)aux; + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + axi->oflag, axi->mode, axi->attr.mq_flags, + axi->attr.mq_maxmsg, axi->attr.mq_msgsize, + axi->attr.mq_curmsgs); + break; } + + case AUDIT_MQ_SENDRECV: { + struct audit_aux_data_mq_sendrecv *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + axi->mqdes, axi->msg_len, axi->msg_prio, + axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec); + break; } + + case AUDIT_MQ_NOTIFY: { + struct audit_aux_data_mq_notify *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d sigev_signo=%d", + axi->mqdes, + axi->notification.sigev_signo); + break; } + + case AUDIT_MQ_GETSETATTR: { + struct audit_aux_data_mq_getsetattr *axi = (void *)aux; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + axi->mqdes, + axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg, + axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs); + break; } + case AUDIT_IPC: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " qbytes=%lx iuid=%u igid=%u mode=%x", - axi->qbytes, axi->uid, axi->gid, axi->mode); + "ouid=%u ogid=%u mode=%x", + axi->uid, axi->gid, axi->mode); if (axi->osid != 0) { char *ctx = NULL; u32 len; @@ -652,19 +804,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts case AUDIT_IPC_SET_PERM: { struct audit_aux_data_ipcctl *axi = (void *)aux; audit_log_format(ab, - " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", + "qbytes=%lx ouid=%u ogid=%u mode=%x", axi->qbytes, axi->uid, axi->gid, axi->mode); - if (axi->osid != 0) { - char *ctx = NULL; - u32 len; - if (selinux_ctxid_to_string( - axi->osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - axi->osid); - call_panic = 1; - } else - audit_log_format(ab, " obj=%s", ctx); - kfree(ctx); + break; } + + case AUDIT_EXECVE: { + struct audit_aux_data_execve *axi = (void *)aux; + int i; + const char *p; + for (i = 0, p = axi->mem; i < axi->argc; i++) { + audit_log_format(ab, "a%d=", i); + p = audit_log_untrustedstring(ab, p); + audit_log_format(ab, "\n"); } break; } @@ -700,8 +851,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts } } for (i = 0; i < context->name_count; i++) { - unsigned long ino = context->names[i].ino; - unsigned long pino = context->names[i].pino; + struct audit_names *n = &context->names[i]; ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); if (!ab) @@ -709,33 +859,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts audit_log_format(ab, "item=%d", i); - audit_log_format(ab, " name="); - if (context->names[i].name) - audit_log_untrustedstring(ab, context->names[i].name); - else - audit_log_format(ab, "(null)"); - - if (pino != (unsigned long)-1) - audit_log_format(ab, " parent=%lu", pino); - if (ino != (unsigned long)-1) - audit_log_format(ab, " inode=%lu", ino); - if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) - audit_log_format(ab, " dev=%02x:%02x mode=%#o" - " ouid=%u ogid=%u rdev=%02x:%02x", - MAJOR(context->names[i].dev), - MINOR(context->names[i].dev), - context->names[i].mode, - context->names[i].uid, - context->names[i].gid, - MAJOR(context->names[i].rdev), - MINOR(context->names[i].rdev)); - if (context->names[i].osid != 0) { + if (n->name) { + switch(n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, " name=", context->pwd, + context->pwdmnt); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name_len, + n->name); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#o" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { char *ctx = NULL; u32 len; if (selinux_ctxid_to_string( - context->names[i].osid, &ctx, &len)) { - audit_log_format(ab, " osid=%u", - context->names[i].osid); + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); call_panic = 2; } else audit_log_format(ab, " obj=%s", ctx); @@ -908,11 +1072,11 @@ void audit_syscall_exit(int valid, long return_code) * Add a name to the list of audit names for this context. * Called from fs/namei.c:getname(). */ -void audit_getname(const char *name) +void __audit_getname(const char *name) { struct audit_context *context = current->audit_context; - if (!context || IS_ERR(name) || !name) + if (IS_ERR(name) || !name) return; if (!context->in_syscall) { @@ -925,6 +1089,8 @@ void audit_getname(const char *name) } BUG_ON(context->name_count >= AUDIT_NAMES); context->names[context->name_count].name = name; + context->names[context->name_count].name_len = AUDIT_NAME_FULL; + context->names[context->name_count].name_put = 1; context->names[context->name_count].ino = (unsigned long)-1; ++context->name_count; if (!context->pwd) { @@ -991,11 +1157,10 @@ static void audit_inode_context(int idx, const struct inode *inode) * audit_inode - store the inode and device from a lookup * @name: name being audited * @inode: inode being audited - * @flags: lookup flags (as used in path_lookup()) * * Called from fs/namei.c:path_lookup(). */ -void __audit_inode(const char *name, const struct inode *inode, unsigned flags) +void __audit_inode(const char *name, const struct inode *inode) { int idx; struct audit_context *context = current->audit_context; @@ -1021,20 +1186,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags) ++context->ino_count; #endif } + context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; context->names[idx].mode = inode->i_mode; context->names[idx].uid = inode->i_uid; context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) && - (strcmp(name, ".") != 0)) { - context->names[idx].ino = (unsigned long)-1; - context->names[idx].pino = inode->i_ino; - } else { - context->names[idx].ino = inode->i_ino; - context->names[idx].pino = (unsigned long)-1; - } } /** @@ -1056,51 +1214,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode, { int idx; struct audit_context *context = current->audit_context; + const char *found_name = NULL; + int dirlen = 0; if (!context->in_syscall) return; /* determine matching parent */ - if (dname) - for (idx = 0; idx < context->name_count; idx++) - if (context->names[idx].pino == pino) { - const char *n; - const char *name = context->names[idx].name; - int dlen = strlen(dname); - int nlen = name ? strlen(name) : 0; - - if (nlen < dlen) - continue; - - /* disregard trailing slashes */ - n = name + nlen - 1; - while ((*n == '/') && (n > name)) - n--; - - /* find last path component */ - n = n - dlen + 1; - if (n < name) - continue; - else if (n > name) { - if (*--n != '/') - continue; - else - n++; - } - - if (strncmp(n, dname, dlen) == 0) - goto update_context; + if (!dname) + goto update_context; + for (idx = 0; idx < context->name_count; idx++) + if (context->names[idx].ino == pino) { + const char *name = context->names[idx].name; + + if (!name) + continue; + + if (audit_compare_dname_path(dname, name, &dirlen) == 0) { + context->names[idx].name_len = dirlen; + found_name = name; + break; } + } - /* catch-all in case match not found */ +update_context: idx = context->name_count++; - context->names[idx].name = NULL; - context->names[idx].pino = pino; #if AUDIT_DEBUG context->ino_count++; #endif + /* Re-use the name belonging to the slot for a matching parent directory. + * All names for this context are relinquished in audit_free_names() */ + context->names[idx].name = found_name; + context->names[idx].name_len = AUDIT_NAME_FULL; + context->names[idx].name_put = 0; /* don't call __putname() */ -update_context: if (inode) { context->names[idx].ino = inode->i_ino; context->names[idx].dev = inode->i_sb->s_dev; @@ -1109,7 +1256,8 @@ update_context: context->names[idx].gid = inode->i_gid; context->names[idx].rdev = inode->i_rdev; audit_inode_context(idx, inode); - } + } else + context->names[idx].ino = (unsigned long)-1; } /** @@ -1142,18 +1290,23 @@ void auditsc_get_stamp(struct audit_context *ctx, */ int audit_set_loginuid(struct task_struct *task, uid_t loginuid) { - if (task->audit_context) { - struct audit_buffer *ab; - - ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); - if (ab) { - audit_log_format(ab, "login pid=%d uid=%u " - "old auid=%u new auid=%u", - task->pid, task->uid, - task->audit_context->loginuid, loginuid); - audit_log_end(ab); + struct audit_context *context = task->audit_context; + + if (context) { + /* Only log if audit is enabled */ + if (context->in_syscall) { + struct audit_buffer *ab; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); + if (ab) { + audit_log_format(ab, "login pid=%d uid=%u " + "old auid=%u new auid=%u", + task->pid, task->uid, + context->loginuid, loginuid); + audit_log_end(ab); + } } - task->audit_context->loginuid = loginuid; + context->loginuid = loginuid; } return 0; } @@ -1170,16 +1323,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx) } /** - * audit_ipc_obj - record audit data for ipc object - * @ipcp: ipc permissions + * __audit_mq_open - record audit data for a POSIX MQ open + * @oflag: open flag + * @mode: mode bits + * @u_attr: queue attributes * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_obj(struct kern_ipc_perm *ipcp) +int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) { - struct audit_aux_data_ipcctl *ax; + struct audit_aux_data_mq_open *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_attr != NULL) { + if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->attr, 0, sizeof(ax->attr)); + + ax->oflag = oflag; + ax->mode = mode; + + ax->d.type = AUDIT_MQ_OPEN; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedsend - record audit data for a POSIX MQ timed send + * @mqdes: MQ descriptor + * @msg_len: Message length + * @msg_prio: Message priority + * @abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + ax->msg_prio = msg_prio; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive + * @mqdes: MQ descriptor + * @msg_len: Message length + * @msg_prio: Message priority + * @abs_timeout: Message timeout in absolute time + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len, + unsigned int __user *u_msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct audit_aux_data_mq_sendrecv *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_msg_prio != NULL) { + if (get_user(ax->msg_prio, u_msg_prio)) { + kfree(ax); + return -EFAULT; + } + } else + ax->msg_prio = 0; + + if (u_abs_timeout != NULL) { + if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout)); + + ax->mqdes = mqdes; + ax->msg_len = msg_len; + + ax->d.type = AUDIT_MQ_SENDRECV; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_notify - record audit data for a POSIX MQ notify + * @mqdes: MQ descriptor + * @u_notification: Notification event + * + * Returns 0 for success or NULL context or < 0 on error. + */ + +int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) +{ + struct audit_aux_data_mq_notify *ax; + struct audit_context *context = current->audit_context; + + if (!audit_enabled) + return 0; + + if (likely(!context)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + + if (u_notification != NULL) { + if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) { + kfree(ax); + return -EFAULT; + } + } else + memset(&ax->notification, 0, sizeof(ax->notification)); + + ax->mqdes = mqdes; + + ax->d.type = AUDIT_MQ_NOTIFY; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute + * @mqdes: MQ descriptor + * @mqstat: MQ flags + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +{ + struct audit_aux_data_mq_getsetattr *ax; struct audit_context *context = current->audit_context; + if (!audit_enabled) + return 0; + if (likely(!context)) return 0; @@ -1187,6 +1517,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) if (!ax) return -ENOMEM; + ax->mqdes = mqdes; + ax->mqstat = *mqstat; + + ax->d.type = AUDIT_MQ_GETSETATTR; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + +/** + * audit_ipc_obj - record audit data for ipc object + * @ipcp: ipc permissions + * + * Returns 0 for success or NULL context or < 0 on error. + */ +int __audit_ipc_obj(struct kern_ipc_perm *ipcp) +{ + struct audit_aux_data_ipcctl *ax; + struct audit_context *context = current->audit_context; + + ax = kmalloc(sizeof(*ax), GFP_ATOMIC); + if (!ax) + return -ENOMEM; + ax->uid = ipcp->uid; ax->gid = ipcp->gid; ax->mode = ipcp->mode; @@ -1204,17 +1558,15 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp) * @uid: msgq user id * @gid: msgq group id * @mode: msgq mode (permissions) + * @ipcp: in-kernel IPC permissions * * Returns 0 for success or NULL context or < 0 on error. */ -int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) +int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) { struct audit_aux_data_ipcctl *ax; struct audit_context *context = current->audit_context; - if (likely(!context)) - return 0; - ax = kmalloc(sizeof(*ax), GFP_ATOMIC); if (!ax) return -ENOMEM; @@ -1223,7 +1575,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, ax->uid = uid; ax->gid = gid; ax->mode = mode; - selinux_get_ipc_sid(ipcp, &ax->osid); ax->d.type = AUDIT_IPC_SET_PERM; ax->d.next = context->aux; @@ -1231,6 +1582,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, return 0; } +int audit_bprm(struct linux_binprm *bprm) +{ + struct audit_aux_data_execve *ax; + struct audit_context *context = current->audit_context; + unsigned long p, next; + void *to; + + if (likely(!audit_enabled || !context)) + return 0; + + ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, + GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->argc = bprm->argc; + ax->envc = bprm->envc; + for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) { + struct page *page = bprm->page[p / PAGE_SIZE]; + void *kaddr = kmap(page); + next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1); + memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p); + to += next - p; + kunmap(page); + } + + ax->d.type = AUDIT_EXECVE; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + + /** * audit_socketcall - record audit data for sys_socketcall * @nargs: number of args @@ -1325,19 +1709,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt) * If the audit subsystem is being terminated, record the task (pid) * and uid that is doing that. */ -void audit_signal_info(int sig, struct task_struct *t) +void __audit_signal_info(int sig, struct task_struct *t) { extern pid_t audit_sig_pid; extern uid_t audit_sig_uid; - - if (unlikely(audit_pid && t->tgid == audit_pid)) { - if (sig == SIGTERM || sig == SIGHUP) { - struct audit_context *ctx = current->audit_context; - audit_sig_pid = current->pid; - if (ctx) - audit_sig_uid = ctx->loginuid; - else - audit_sig_uid = current->uid; - } + extern u32 audit_sig_sid; + + if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { + struct task_struct *tsk = current; + struct audit_context *ctx = tsk->audit_context; + audit_sig_pid = tsk->pid; + if (ctx) + audit_sig_uid = ctx->loginuid; + else + audit_sig_uid = tsk->uid; + selinux_get_task_sid(tsk, &audit_sig_sid); } } diff --git a/kernel/compat.c b/kernel/compat.c index c1601a84f8d8..126dee9530aa 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -21,6 +21,7 @@ #include <linux/unistd.h> #include <linux/security.h> #include <linux/timex.h> +#include <linux/migrate.h> #include <asm/uaccess.h> @@ -729,17 +730,10 @@ void sigset_from_compat (sigset_t *set, compat_sigset_t *compat) { switch (_NSIG_WORDS) { -#if defined (__COMPAT_ENDIAN_SWAP__) - case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 ); - case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 ); - case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 ); - case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 ); -#else case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); -#endif } } @@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp) return ret; } + +#ifdef CONFIG_NUMA +asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages, + compat_uptr_t __user *pages32, + const int __user *nodes, + int __user *status, + int flags) +{ + const void __user * __user *pages; + int i; + + pages = compat_alloc_user_space(nr_pages * sizeof(void *)); + for (i = 0; i < nr_pages; i++) { + compat_uptr_t p; + + if (get_user(p, pages32 + i) || + put_user(compat_ptr(p), pages + i)) + return -EFAULT; + } + return sys_move_pages(pid, nr_pages, pages, nodes, status, flags); +} +#endif diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4c..03dcd981846a 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -13,10 +13,10 @@ #include <linux/module.h> #include <linux/kthread.h> #include <linux/stop_machine.h> -#include <asm/semaphore.h> +#include <linux/mutex.h> /* This protects CPUs going up and down... */ -static DECLARE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpucontrol); static BLOCKING_NOTIFIER_HEAD(cpu_chain); @@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible) if (lock_cpu_hotplug_owner != current) { if (interruptible) - ret = down_interruptible(&cpucontrol); + ret = mutex_lock_interruptible(&cpucontrol); else - down(&cpucontrol); + mutex_lock(&cpucontrol); } /* @@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void) { if (--lock_cpu_hotplug_depth == 0) { lock_cpu_hotplug_owner = NULL; - up(&cpucontrol); + mutex_unlock(&cpucontrol); } } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ab81fdd4572b..1535af3a912d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -41,6 +41,7 @@ #include <linux/rcupdate.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/security.h> #include <linux/slab.h> #include <linux/smp_lock.h> #include <linux/spinlock.h> @@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data, return 0; } -static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, - int flags, const char *unused_dev_name, - void *data) +static int cpuset_get_sb(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data, struct vfsmount *mnt) { - return get_sb_single(fs_type, flags, data, cpuset_fill_super); + return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt); } static struct file_system_type cpuset_fs_type = { @@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) cpumask_t cpus; nodemask_t from, to; struct mm_struct *mm; + int retval; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) get_task_struct(tsk); } + retval = security_task_setscheduler(tsk, 0, NULL); + if (retval) { + put_task_struct(tsk); + return retval; + } + mutex_lock(&callback_mutex); task_lock(tsk); @@ -2434,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void) */ static int proc_cpuset_show(struct seq_file *m, void *v) { + struct pid *pid; struct task_struct *tsk; char *buf; - int retval = 0; + int retval; + retval = -ENOMEM; buf = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!buf) - return -ENOMEM; + goto out; - tsk = m->private; + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; + + retval = -EINVAL; mutex_lock(&manage_mutex); + retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); if (retval < 0) - goto out; + goto out_unlock; seq_puts(m, buf); seq_putc(m, '\n'); -out: +out_unlock: mutex_unlock(&manage_mutex); + put_task_struct(tsk); +out_free: kfree(buf); +out: return retval; } static int cpuset_open(struct inode *inode, struct file *file) { - struct task_struct *tsk = PROC_I(inode)->task; - return single_open(file, proc_cpuset_show, tsk); + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cpuset_show, pid); } struct file_operations proc_cpuset_operations = { diff --git a/kernel/exit.c b/kernel/exit.c index e95b93282210..304ef637be6c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -36,6 +36,7 @@ #include <linux/compat.h> #include <linux/pipe_fs_i.h> #include <linux/audit.h> /* for audit_free() */ +#include <linux/resource.h> #include <asm/uaccess.h> #include <asm/unistd.h> @@ -45,8 +46,6 @@ extern void sem_exit (void); extern struct task_struct *child_reaper; -int getrusage(struct task_struct *, int, struct rusage __user *); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) @@ -138,12 +137,8 @@ void release_task(struct task_struct * p) { int zap_leader; task_t *leader; - struct dentry *proc_dentry; - repeat: atomic_dec(&p->user->processes); - spin_lock(&p->proc_lock); - proc_dentry = proc_pid_unhash(p); write_lock_irq(&tasklist_lock); ptrace_unlink(p); BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); @@ -172,8 +167,7 @@ repeat: sched_exit(p); write_unlock_irq(&tasklist_lock); - spin_unlock(&p->proc_lock); - proc_pid_flush(proc_dentry); + proc_flush_task(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); @@ -579,7 +573,7 @@ static void exit_mm(struct task_struct * tsk) down_read(&mm->mmap_sem); } atomic_inc(&mm->mm_count); - if (mm != tsk->active_mm) BUG(); + BUG_ON(mm != tsk->active_mm); /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; @@ -881,14 +875,6 @@ fastcall NORET_TYPE void do_exit(long code) tsk->flags |= PF_EXITING; - /* - * Make sure we don't try to process any timer firings - * while we are already exiting. - */ - tsk->it_virt_expires = cputime_zero; - tsk->it_prof_expires = cputime_zero; - tsk->it_sched_expires = 0; - if (unlikely(in_atomic())) printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", current->comm, current->pid, @@ -903,11 +889,11 @@ fastcall NORET_TYPE void do_exit(long code) if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); exit_itimers(tsk->signal); - acct_process(code); } + acct_collect(code, group_dead); if (unlikely(tsk->robust_list)) exit_robust_list(tsk); -#ifdef CONFIG_COMPAT +#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT) if (unlikely(tsk->compat_robust_list)) compat_exit_robust_list(tsk); #endif @@ -915,6 +901,8 @@ fastcall NORET_TYPE void do_exit(long code) audit_free(tsk); exit_mm(tsk); + if (group_dead) + acct_process(); exit_sem(tsk); __exit_files(tsk); __exit_fs(tsk); @@ -1538,8 +1526,7 @@ check_continued: if (options & __WNOTHREAD) break; tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); + BUG_ON(tsk->signal != current->signal); } while (tsk != current); read_unlock(&tasklist_lock); diff --git a/kernel/fork.c b/kernel/fork.c index ac8100e3088a..9b4e54ef0225 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm) */ void mmput(struct mm_struct *mm) { + might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); exit_mmap(mm); @@ -623,6 +625,7 @@ out: /* * Allocate a new files structure and copy contents from the * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { @@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) int open_files, size, i, expand; struct fdtable *old_fdt, *new_fdt; + *errorp = -ENOMEM; newf = alloc_files(); if (!newf) goto out; @@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * break this. */ tsk->files = NULL; - error = -ENOMEM; newf = dup_fd(oldf, &error); if (!newf) goto out; @@ -871,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts tsk->it_prof_expires = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); } + acct_init_pacct(&sig->pacct); return 0; } @@ -989,13 +993,10 @@ static task_t *copy_process(unsigned long clone_flags, if (put_user(p->pid, parent_tidptr)) goto bad_fork_cleanup; - p->proc_dentry = NULL; - INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); - spin_lock_init(&p->proc_lock); clear_tsk_thread_flag(p, TIF_SIGPENDING); init_sigpending(&p->pending); @@ -1155,18 +1156,6 @@ static task_t *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - /* - * Important: if an exit-all has been started then - * do not create this new thread - the whole thread - * group is supposed to exit anyway. - */ - if (current->signal->flags & SIGNAL_GROUP_EXIT) { - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); - retval = -EAGAIN; - goto bad_fork_cleanup_namespace; - } - p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); diff --git a/kernel/futex.c b/kernel/futex.c index 5699c512057b..e1a380c77a5a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1056,11 +1056,11 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, (unsigned long)uaddr2, val2, val3); } -static struct super_block * -futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int futexfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, + struct vfsmount *mnt) { - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); + return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); } static struct file_system_type futex_fs_type = { diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 01fa2ae98a85..55601b3ce60e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = /** * ktime_get_ts - get the monotonic clock in timespec format - * * @ts: pointer to timespec variable * * The function calculates the monotonic clock from the realtime @@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) # ifndef CONFIG_KTIME_SCALAR /** * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable - * * @kt: addend * @nsec: the scalar nsec value to add * @@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) /** * hrtimer_forward - forward the timer expiry - * * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward @@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) if (base->first == &timer->node) base->first = rb_next(&timer->node); rb_erase(&timer->node, &base->active); - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } /* @@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base) /** * hrtimer_start - (re)start an relative timer on the current CPU - * * @timer: the timer to be added * @tim: expiry time * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) @@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start); /** * hrtimer_try_to_cancel - try to deactivate a timer - * * @timer: hrtimer to stop * * Returns: * 0 when the timer was not active * 1 when the timer was active * -1 when the timer is currently excuting the callback function and - * can not be stopped + * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { @@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. - * * @timer: the timer to be cancelled * * Returns: @@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer - * * @timer: the timer to read */ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) @@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void) /** * hrtimer_init - initialize a timer to the given clock - * * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel @@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, memset(timer, 0, sizeof(struct hrtimer)); - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) clock_id = CLOCK_MONOTONIC; timer->base = &bases[clock_id]; - timer->node.rb_parent = HRTIMER_INACTIVE; + rb_set_parent(&timer->node, &timer->node); } EXPORT_SYMBOL_GPL(hrtimer_init); /** * hrtimer_get_res - get the timer resolution for a clock - * * @which_clock: which clock to query * @tp: pointer to timespec variable to store the resolution * @@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { struct hrtimer_base *bases; - bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); + bases = __raw_get_cpu_var(hrtimer_bases); *tp = ktime_to_timespec(bases[which_clock].resolution); return 0; diff --git a/kernel/intermodule.c b/kernel/intermodule.c deleted file mode 100644 index 55b1e5b85db9..000000000000 --- a/kernel/intermodule.c +++ /dev/null @@ -1,184 +0,0 @@ -/* Deprecated, do not use. Moved from module.c to here. --RR */ - -/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */ -#include <linux/module.h> -#include <linux/kmod.h> -#include <linux/spinlock.h> -#include <linux/list.h> -#include <linux/slab.h> - -/* inter_module functions are always available, even when the kernel is - * compiled without modules. Consumers of inter_module_xxx routines - * will always work, even when both are built into the kernel, this - * approach removes lots of #ifdefs in mainline code. - */ - -static struct list_head ime_list = LIST_HEAD_INIT(ime_list); -static DEFINE_SPINLOCK(ime_lock); -static int kmalloc_failed; - -struct inter_module_entry { - struct list_head list; - const char *im_name; - struct module *owner; - const void *userdata; -}; - -/** - * inter_module_register - register a new set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * @owner: module that is registering the data, always use THIS_MODULE - * @userdata: pointer to arbitrary userdata to be registered - * - * Description: Check that the im_name has not already been registered, - * complain if it has. For new data, add it to the inter_module_entry - * list. - */ -void inter_module_register(const char *im_name, struct module *owner, const void *userdata) -{ - struct list_head *tmp; - struct inter_module_entry *ime, *ime_new; - - if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { - /* Overloaded kernel, not fatal */ - printk(KERN_ERR - "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", - im_name); - kmalloc_failed = 1; - return; - } - ime_new->im_name = im_name; - ime_new->owner = owner; - ime_new->userdata = userdata; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - spin_unlock(&ime_lock); - kfree(ime_new); - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name); - BUG(); - } - } - list_add(&(ime_new->list), &ime_list); - spin_unlock(&ime_lock); -} - -/** - * inter_module_unregister - unregister a set of inter module data. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: Check that the im_name has been registered, complain if - * it has not. For existing data, remove it from the - * inter_module_entry list. - */ -void inter_module_unregister(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - list_del(&(ime->list)); - spin_unlock(&ime_lock); - kfree(ime); - return; - } - } - spin_unlock(&ime_lock); - if (kmalloc_failed) { - printk(KERN_ERR - "inter_module_unregister: no entry for '%s', " - "probably caused by previous kmalloc failure\n", - im_name); - return; - } - else { - /* Program logic error, fatal */ - printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name); - BUG(); - } -} - -/** - * inter_module_get - return arbitrary userdata from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, return NULL. - * Try to increment the use count on the owning module, if that fails - * then return NULL. Otherwise return the userdata. - */ -static const void *inter_module_get(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - const void *result = NULL; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (try_module_get(ime->owner)) - result = ime->userdata; - break; - } - } - spin_unlock(&ime_lock); - return(result); -} - -/** - * inter_module_get_request - im get with automatic request_module. - * @im_name: an arbitrary string to identify the data, must be unique - * @modname: module that is expected to register im_name - * - * Description: If inter_module_get fails, do request_module then retry. - */ -const void *inter_module_get_request(const char *im_name, const char *modname) -{ - const void *result = inter_module_get(im_name); - if (!result) { - request_module("%s", modname); - result = inter_module_get(im_name); - } - return(result); -} - -/** - * inter_module_put - release use of data from another module. - * @im_name: an arbitrary string to identify the data, must be unique - * - * Description: If the im_name has not been registered, complain, - * otherwise decrement the use count on the owning module. - */ -void inter_module_put(const char *im_name) -{ - struct list_head *tmp; - struct inter_module_entry *ime; - - spin_lock(&ime_lock); - list_for_each(tmp, &ime_list) { - ime = list_entry(tmp, struct inter_module_entry, list); - if (strcmp(ime->im_name, im_name) == 0) { - if (ime->owner) - module_put(ime->owner); - spin_unlock(&ime_lock); - return; - } - } - spin_unlock(&ime_lock); - printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name); - BUG(); -} - -EXPORT_SYMBOL(inter_module_register); -EXPORT_SYMBOL(inter_module_unregister); -EXPORT_SYMBOL(inter_module_get_request); -EXPORT_SYMBOL(inter_module_put); - -MODULE_LICENSE("GPL"); - diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 51df337b37db..0f6530117105 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -76,10 +76,11 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs) /* * Have got an event to handle: */ -fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, +fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, struct irqaction *action) { - int ret, retval = 0, status = 0; + irqreturn_t ret, retval = IRQ_NONE; + unsigned int status = 0; if (!(action->flags & SA_INTERRUPT)) local_irq_enable(); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index 134f9f2e0e39..a12d00eb5e7c 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -30,7 +30,7 @@ void move_native_irq(int irq) desc->move_irq = 0; - if (likely(cpus_empty(pending_irq_cpumask[irq]))) + if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) return; if (!desc->handler->set_affinity) @@ -49,7 +49,7 @@ void move_native_irq(int irq) * cause some ioapics to mal-function. * Being paranoid i guess! */ - if (unlikely(!cpus_empty(tmp))) { + if (likely(!cpus_empty(tmp))) { if (likely(!(desc->status & IRQ_DISABLED))) desc->handler->disable(irq); diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index d03b5eef8ce0..afacd6f585fa 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -24,6 +24,8 @@ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; #ifdef CONFIG_GENERIC_PENDING_IRQ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); + /* * Save these away for later use. Re-progam when the * interrupt is pending @@ -33,6 +35,7 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) #else void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { + set_balance_irq_affinity(irq, mask_val); irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 7df9abd5ec86..b2fb3c18d06b 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -11,7 +11,7 @@ #include <linux/kallsyms.h> #include <linux/interrupt.h> -static int irqfixup; +static int irqfixup __read_mostly; /* * Recovery handler for misrouted interrupts. @@ -136,9 +136,9 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, struct pt_regs *regs) { - if (action_ret != IRQ_HANDLED) { + if (unlikely(action_ret != IRQ_HANDLED)) { desc->irqs_unhandled++; - if (action_ret != IRQ_NONE) + if (unlikely(action_ret != IRQ_NONE)) report_bad_irq(irq, desc, action_ret); } @@ -152,11 +152,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, } desc->irq_count++; - if (desc->irq_count < 100000) + if (likely(desc->irq_count < 100000)) return; desc->irq_count = 0; - if (desc->irqs_unhandled > 99900) { + if (unlikely(desc->irqs_unhandled > 99900)) { /* * The interrupt is stuck */ @@ -171,7 +171,7 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, desc->irqs_unhandled = 0; } -int noirqdebug; +int noirqdebug __read_mostly; int __init noirqdebug_setup(char *str) { diff --git a/kernel/kexec.c b/kernel/kexec.c index bf39d28e4c0e..58f0f382597c 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; /* * A home grown binary mutex. * Nothing can wait so this mutex is safe to use * in interrupt context :) */ -static int kexec_lock = 0; +static int kexec_lock; asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29aa..64aab081153b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,11 +47,17 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; +static atomic_t kprobe_count; DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static struct notifier_block kprobe_page_fault_nb = { + .notifier_call = kprobe_exceptions_notify, + .priority = 0x7fffffff /* we need to notified first */ +}; + #ifdef __ARCH_WANT_KPROBES_INSN_SLOT /* * kprobe->ainsn.insn points to the copy of the instruction to be @@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) */ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { - struct kprobe *kp; - if (p->break_handler) { - list_for_each_entry_rcu(kp, &old_p->list, list) { - if (kp->break_handler) - return -EEXIST; - } + if (old_p->break_handler) + return -EEXIST; list_add_tail_rcu(&p->list, &old_p->list); + old_p->break_handler = aggr_break_handler; } else list_add_rcu(&p->list, &old_p->list); + if (p->post_handler && !old_p->post_handler) + old_p->post_handler = aggr_post_handler; return 0; } @@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) copy_kprobe(p, ap); ap->addr = p->addr; ap->pre_handler = aggr_pre_handler; - ap->post_handler = aggr_post_handler; ap->fault_handler = aggr_fault_handler; - ap->break_handler = aggr_break_handler; + if (p->post_handler) + ap->post_handler = aggr_post_handler; + if (p->break_handler) + ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); list_add_rcu(&p->list, &ap->list); @@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, old_p = get_kprobe(p->addr); if (old_p) { ret = register_aggr_kprobe(old_p, p); + if (!ret) + atomic_inc(&kprobe_count); goto out; } @@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); + if (atomic_add_return(1, &kprobe_count) == \ + (ARCH_INACTIVE_KPROBE_COUNT + 1)) + register_page_fault_notifier(&kprobe_page_fault_nb); + arch_arm_kprobe(p); out: @@ -536,14 +549,40 @@ valid_p: kfree(old_p); } arch_remove_kprobe(p); + } else { + mutex_lock(&kprobe_mutex); + if (p->break_handler) + old_p->break_handler = NULL; + if (p->post_handler){ + list_for_each_entry_rcu(list_p, &old_p->list, list){ + if (list_p->post_handler){ + cleanup_p = 2; + break; + } + } + if (cleanup_p == 0) + old_p->post_handler = NULL; + } + mutex_unlock(&kprobe_mutex); } + + /* Call unregister_page_fault_notifier() + * if no probes are active + */ + mutex_lock(&kprobe_mutex); + if (atomic_add_return(-1, &kprobe_count) == \ + ARCH_INACTIVE_KPROBE_COUNT) + unregister_page_fault_notifier(&kprobe_page_fault_nb); + mutex_unlock(&kprobe_mutex); + return; } static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, - .priority = 0x7fffffff /* we need to notified first */ + .priority = 0x7fffffff /* we need to be notified first */ }; + int __kprobes register_jprobe(struct jprobe *jp) { /* Todo: Verify probepoint is a function entry point */ @@ -652,6 +691,7 @@ static int __init init_kprobes(void) INIT_HLIST_HEAD(&kprobe_table[i]); INIT_HLIST_HEAD(&kretprobe_inst_table[i]); } + atomic_set(&kprobe_count, 0); err = arch_init_kprobes(); if (!err) diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index f119e098e67b..9e28478a17a5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -14,6 +14,7 @@ #include <linux/sysfs.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/kexec.h> #define KERNEL_ATTR_RO(_name) \ static struct subsys_attribute _name##_attr = __ATTR_RO(_name) @@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s KERNEL_ATTR_RW(uevent_helper); #endif +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page) +{ + return sprintf(page, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); +#endif /* CONFIG_KEXEC */ + decl_subsys(kernel, NULL, NULL); EXPORT_SYMBOL_GPL(kernel_subsys); @@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = { &uevent_seqnum_attr.attr, &uevent_helper_attr.attr, #endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, +#endif NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index c5f3c6613b6d..24be714b04c7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -45,6 +45,13 @@ struct kthread_stop_info static DEFINE_MUTEX(kthread_stop_lock); static struct kthread_stop_info kthread_stop_info; +/** + * kthread_should_stop - should this kthread return now? + * + * When someone calls kthread_stop on your kthread, it will be woken + * and this will return true. You should then return, and your return + * value will be passed through to kthread_stop(). + */ int kthread_should_stop(void) { return (kthread_stop_info.k == current); @@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create) complete(&create->done); } +/** + * kthread_create - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(), kthread_create_on_cpu(). + * + * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn can either call do_exit() directly if it is a + * standalone thread for which noone will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM). + */ struct task_struct *kthread_create(int (*threadfn)(void *data), void *data, const char namefmt[], @@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create); +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @k: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create(). + */ void kthread_bind(struct task_struct *k, unsigned int cpu) { BUG_ON(k->state != TASK_INTERRUPTIBLE); @@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu) } EXPORT_SYMBOL(kthread_bind); +/** + * kthread_stop - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_stop() for @k to return true, wakes it, and + * waits for it to exit. Your threadfn() must not call do_exit() + * itself if you use this function! This can also be called after + * kthread_create() instead of calling wake_up_process(): the thread + * will exit without calling threadfn(). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop(struct task_struct *k) { return kthread_stop_sem(k, NULL); } EXPORT_SYMBOL(kthread_stop); +/** + * kthread_stop_sem - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * @s: semaphore that @k waits on while idle. + * + * Does essentially the same thing as kthread_stop() above, but wakes + * @k by calling up(@s). + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) { int ret; @@ -210,5 +269,5 @@ static __init int helper_init(void) return 0; } -core_initcall(helper_init); +core_initcall(helper_init); diff --git a/kernel/module.c b/kernel/module.c index 690381508d09..10e5b872adf6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -40,6 +40,7 @@ #include <linux/string.h> #include <linux/sched.h> #include <linux/mutex.h> +#include <linux/unwind.h> #include <asm/uaccess.h> #include <asm/semaphore.h> #include <asm/cacheflush.h> @@ -1052,6 +1053,8 @@ static void free_module(struct module *mod) remove_sect_attrs(mod); mod_kobject_remove(mod); + unwind_remove_table(mod->unwind_info, 0); + /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1317,7 +1320,7 @@ int is_exported(const char *name, const struct module *mod) if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) return 1; else - if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) + if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) return 1; else return 0; @@ -1403,7 +1406,7 @@ static struct module *load_module(void __user *umod, unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, exportindex, modindex, obsparmindex, infoindex, gplindex, crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, - gplfuturecrcindex; + gplfuturecrcindex, unwindex = 0; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ @@ -1493,6 +1496,9 @@ static struct module *load_module(void __user *umod, versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); +#ifdef ARCH_UNWIND_SECTION_NAME + unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); +#endif /* Don't keep modinfo section */ sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; @@ -1501,6 +1507,8 @@ static struct module *load_module(void __user *umod, sechdrs[symindex].sh_flags |= SHF_ALLOC; sechdrs[strindex].sh_flags |= SHF_ALLOC; #endif + if (unwindex) + sechdrs[unwindex].sh_flags |= SHF_ALLOC; /* Check module struct version now, before we try to use module. */ if (!check_modstruct_version(sechdrs, versindex, mod)) { @@ -1729,6 +1737,11 @@ static struct module *load_module(void __user *umod, goto arch_cleanup; add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); + /* Size of section 0 is 0, so this works well if no unwind info. */ + mod->unwind_info = unwind_add_table(mod, + (void *)sechdrs[unwindex].sh_addr, + sechdrs[unwindex].sh_size); + /* Get rid of temporary copy */ vfree(hdr); @@ -1827,6 +1840,7 @@ sys_init_module(void __user *umod, mod->state = MODULE_STATE_LIVE; /* Drop initial reference. */ module_put(mod); + unwind_remove_table(mod->unwind_info, 1); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c376950..036b6285b15c 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -153,13 +153,13 @@ next: continue; count++; cursor = curr->next; - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n#%03d: ", count); printk_lock(lock, filter ? 0 : 1); goto next; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("\n"); } @@ -316,7 +316,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, lock held at task exit time!\n", task->comm, task->pid); @@ -325,7 +325,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) printk("exiting task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* @@ -352,7 +352,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) continue; list_del_init(curr); DEBUG_OFF(); - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", current->comm, current->pid, lock, from, to); @@ -362,7 +362,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) printk("freeing task is not even the owner??\n"); return; } - debug_spin_lock_restore(&debug_mutex_lock, flags); + debug_spin_unlock_restore(&debug_mutex_lock, flags); } /* diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb1..a5196c36a5fd 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h @@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name); -#define debug_spin_lock(lock) \ - do { \ - local_irq_disable(); \ - if (debug_mutex_on) \ - spin_lock(lock); \ - } while (0) - -#define debug_spin_unlock(lock) \ - do { \ - if (debug_mutex_on) \ - spin_unlock(lock); \ - local_irq_enable(); \ - preempt_check_resched(); \ - } while (0) - #define debug_spin_lock_save(lock, flags) \ do { \ local_irq_save(flags); \ @@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); spin_lock(lock); \ } while (0) -#define debug_spin_lock_restore(lock, flags) \ +#define debug_spin_unlock_restore(lock, flags) \ do { \ if (debug_mutex_on) \ spin_unlock(lock); \ @@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); preempt_check_resched(); \ } while (0) -#define spin_lock_mutex(lock) \ +#define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ \ DEBUG_WARN_ON(in_interrupt()); \ - debug_spin_lock(&debug_mutex_lock); \ + debug_spin_lock_save(&debug_mutex_lock, flags); \ spin_lock(lock); \ DEBUG_WARN_ON(l->magic != l); \ } while (0) -#define spin_unlock_mutex(lock) \ +#define spin_unlock_mutex(lock, flags) \ do { \ spin_unlock(lock); \ - debug_spin_unlock(&debug_mutex_lock); \ + debug_spin_unlock_restore(&debug_mutex_lock, flags); \ } while (0) #define DEBUG_OFF() \ diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9ed..7043db21bbce 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) struct task_struct *task = current; struct mutex_waiter waiter; unsigned int old_val; + unsigned long flags; debug_mutex_init_waiter(&waiter); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); @@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (unlikely(state == TASK_INTERRUPTIBLE && signal_pending(task))) { mutex_remove_waiter(lock, &waiter, task->thread_info); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); return -EINTR; @@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) __set_task_state(task, state); /* didnt get the lock, go to sleep: */ - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); schedule(); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); } /* got the lock - rejoice! */ @@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); @@ -203,10 +204,11 @@ static fastcall noinline void __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; DEBUG_WARN_ON(lock->owner != current_thread_info()); - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); /* * some architectures leave the lock unlocked in the fastpath failure @@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) debug_mutex_clear_owner(lock); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); } /* @@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) static inline int __mutex_trylock_slowpath(atomic_t *lock_count) { struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; int prev; - spin_lock_mutex(&lock->wait_lock); + spin_lock_mutex(&lock->wait_lock, flags); prev = atomic_xchg(&lock->count, -1); if (likely(prev == 1)) @@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) if (likely(list_empty(&lock->wait_list))) atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock); + spin_unlock_mutex(&lock->wait_lock, flags); return prev == 1; } diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b672..069189947257 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h @@ -9,8 +9,10 @@ * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: */ -#define spin_lock_mutex(lock) spin_lock(lock) -#define spin_unlock_mutex(lock) spin_unlock(lock) +#define spin_lock_mutex(lock, flags) \ + do { spin_lock(lock); (void)(flags); } while (0) +#define spin_unlock_mutex(lock, flags) \ + do { spin_unlock(lock); (void)(flags); } while (0) #define mutex_remove_waiter(lock, waiter, ti) \ __list_del((waiter)->list.prev, (waiter)->list.next) diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 520f6c59948d..d38d9ec3276c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now) struct cpu_timer_list *next; unsigned long i; - if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING)) - return; - head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? p->cpu_timers : p->signal->cpu_timers); head += CPUCLOCK_WHICH(timer->it_clock); @@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk, } t = tsk; do { + if (unlikely(t->flags & PF_EXITING)) + continue; + ticks = cputime_add(cputime_add(t->utime, t->stime), prof_left); if (!cputime_eq(prof_expires, cputime_zero) && @@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk, t->it_sched_expires > sched)) { t->it_sched_expires = sched; } - - do { - t = next_thread(t); - } while (unlikely(t->flags & PF_EXITING)); - } while (t != tsk); + } while ((t = next_thread(t)) != tsk); } } @@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk) #undef UNEXPIRED - BUG_ON(tsk->exit_state); - /* * Double-check with locks held. */ read_lock(&tasklist_lock); - spin_lock(&tsk->sighand->siglock); + if (likely(tsk->signal != NULL)) { + spin_lock(&tsk->sighand->siglock); - /* - * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] - * all the timers that are firing, and put them on the firing list. - */ - check_thread_timers(tsk, &firing); - check_process_timers(tsk, &firing); + /* + * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] + * all the timers that are firing, and put them on the firing list. + */ + check_thread_timers(tsk, &firing); + check_process_timers(tsk, &firing); - /* - * We must release these locks before taking any timer's lock. - * There is a potential race with timer deletion here, as the - * siglock now protects our private firing list. We have set - * the firing flag in each timer, so that a deletion attempt - * that gets the timer lock before we do will give it up and - * spin until we've taken care of that timer below. - */ - spin_unlock(&tsk->sighand->siglock); + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + spin_unlock(&tsk->sighand->siglock); + } read_unlock(&tasklist_lock); /* diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index ce0dfb8f4a4e..fc311a4673a2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -36,6 +36,15 @@ config PM_DEBUG code. This is helpful when debugging and reporting various PM bugs, like suspend support. +config PM_TRACE + bool "Suspend/resume event tracing" + depends on PM && PM_DEBUG && X86_32 + default y + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + config SOFTWARE_SUSPEND bool "Software Suspend" depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 81d4d982f3f0..e13e74067845 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -231,7 +231,7 @@ static int software_resume(void) late_initcall(software_resume); -static char * pm_disk_modes[] = { +static const char * const pm_disk_modes[] = { [PM_DISK_FIRMWARE] = "firmware", [PM_DISK_PLATFORM] = "platform", [PM_DISK_SHUTDOWN] = "shutdown", diff --git a/kernel/power/main.c b/kernel/power/main.c index a6d9ef46009e..6d295c776794 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -15,7 +15,7 @@ #include <linux/errno.h> #include <linux/init.h> #include <linux/pm.h> - +#include <linux/console.h> #include "power.h" @@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } + suspend_console(); if ((error = device_suspend(PMSG_SUSPEND))) { printk(KERN_ERR "Some devices failed to suspend\n"); goto Finish; @@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state) static void suspend_finish(suspend_state_t state) { device_resume(); + resume_console(); thaw_processes(); enable_nonboot_cpus(); if (pm_ops && pm_ops->finish) @@ -143,7 +145,7 @@ static void suspend_finish(suspend_state_t state) -static char *pm_states[PM_SUSPEND_MAX] = { +static const char * const pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", #ifdef CONFIG_SOFTWARE_SUSPEND @@ -260,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf) static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) { suspend_state_t state = PM_SUSPEND_STANDBY; - char ** s; + const char * const *s; char *p; int error; int len; diff --git a/kernel/power/power.h b/kernel/power/power.h index f06f12f21767..57a792982fb9 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -55,7 +55,7 @@ struct snapshot_handle { unsigned int page; unsigned int page_offset; unsigned int prev; - struct pbe *pbe; + struct pbe *pbe, *last_pbe; void *buffer; unsigned int buf_offset; }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 3eeedbb13b78..24c96f354231 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -150,6 +150,10 @@ int restore_highmem(void) } return 0; } +#else +static inline unsigned int count_highmem_pages(void) {return 0;} +static inline int save_highmem(void) {return 0;} +static inline int restore_highmem(void) {return 0;} #endif static int pfn_is_nosave(unsigned long pfn) @@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) } } -/** - * On resume it is necessary to trace and eventually free the unsafe - * pages that have been allocated, because they are needed for I/O - * (on x86-64 we likely will "eat" these pages once again while - * creating the temporary page translation tables) - */ - -struct eaten_page { - struct eaten_page *next; - char padding[PAGE_SIZE - sizeof(void *)]; -}; - -static struct eaten_page *eaten_pages = NULL; - -static void release_eaten_pages(void) -{ - struct eaten_page *p, *q; - - p = eaten_pages; - while (p) { - q = p->next; - /* We don't want swsusp_free() to free this page again */ - ClearPageNosave(virt_to_page(p)); - free_page((unsigned long)p); - p = q; - } - eaten_pages = NULL; -} +static unsigned int unsafe_pages; /** * @safe_needed - on resume, for storing the PBE list and the image, * we can only use memory pages that do not conflict with the pages - * which had been used before suspend. + * used before suspend. * * The unsafe pages are marked with the PG_nosave_free flag - * - * Allocated but unusable (ie eaten) memory pages should be marked - * so that swsusp_free() can release them + * and we count them using unsafe_pages */ static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) { void *res; + res = (void *)get_zeroed_page(gfp_mask); if (safe_needed) - do { + while (res && PageNosaveFree(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + SetPageNosave(virt_to_page(res)); + unsafe_pages++; res = (void *)get_zeroed_page(gfp_mask); - if (res && PageNosaveFree(virt_to_page(res))) { - /* This is for swsusp_free() */ - SetPageNosave(virt_to_page(res)); - ((struct eaten_page *)res)->next = eaten_pages; - eaten_pages = res; - } - } while (res && PageNosaveFree(virt_to_page(res))); - else - res = (void *)get_zeroed_page(gfp_mask); + } if (res) { SetPageNosave(virt_to_page(res)); SetPageNosaveFree(virt_to_page(res)); @@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask) * On each page we set up a list of struct_pbe elements. */ -struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) +static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, + int safe_needed) { unsigned int num; struct pbe *pblist, *pbe; @@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist) return -EFAULT; } + unsafe_pages = 0; + return 0; } @@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf, } /** - * create_image - use metadata contained in the PBE list + * prepare_image - use metadata contained in the PBE list * pointed to by pagedir_nosave to mark the pages that will * be overwritten in the process of restoring the system - * memory state from the image and allocate memory for - * the image avoiding these pages + * memory state from the image ("unsafe" pages) and allocate + * memory for the image + * + * The idea is to allocate the PBE list first and then + * allocate as many pages as it's needed for the image data, + * but not to assign these pages to the PBEs initially. + * Instead, we just mark them as allocated and create a list + * of "safe" which will be used later */ -static int create_image(struct snapshot_handle *handle) +struct safe_page { + struct safe_page *next; + char padding[PAGE_SIZE - sizeof(void *)]; +}; + +static struct safe_page *safe_pages; + +static int prepare_image(struct snapshot_handle *handle) { int error = 0; - struct pbe *p, *pblist; + unsigned int nr_pages = nr_copy_pages; + struct pbe *p, *pblist = NULL; p = pagedir_nosave; error = mark_unsafe_pages(p); if (!error) { - pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); + pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); if (pblist) copy_page_backup_list(pblist, p); free_pagedir(p, 0); if (!pblist) error = -ENOMEM; } - if (!error) - error = alloc_data_pages(pblist, GFP_ATOMIC, 1); + safe_pages = NULL; + if (!error && nr_pages > unsafe_pages) { + nr_pages -= unsafe_pages; + while (nr_pages--) { + struct safe_page *ptr; + + ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); + if (!ptr) { + error = -ENOMEM; + break; + } + if (!PageNosaveFree(virt_to_page(ptr))) { + /* The page is "safe", add it to the list */ + ptr->next = safe_pages; + safe_pages = ptr; + } + /* Mark the page as allocated */ + SetPageNosave(virt_to_page(ptr)); + SetPageNosaveFree(virt_to_page(ptr)); + } + } if (!error) { - release_eaten_pages(); pagedir_nosave = pblist; } else { - pagedir_nosave = NULL; handle->pbe = NULL; - nr_copy_pages = 0; - nr_meta_pages = 0; + swsusp_free(); } return error; } +static void *get_buffer(struct snapshot_handle *handle) +{ + struct pbe *pbe = handle->pbe, *last = handle->last_pbe; + struct page *page = virt_to_page(pbe->orig_address); + + if (PageNosave(page) && PageNosaveFree(page)) { + /* + * We have allocated the "original" page frame and we can + * use it directly to store the read page + */ + pbe->address = 0; + if (last && last->next) + last->next = NULL; + return (void *)pbe->orig_address; + } + /* + * The "original" page frame has not been allocated and we have to + * use a "safe" page frame to store the read page + */ + pbe->address = (unsigned long)safe_pages; + safe_pages = safe_pages->next; + if (last) + last->next = pbe; + handle->last_pbe = pbe; + return (void *)pbe->address; +} + /** * snapshot_write_next - used for writing the system memory snapshot. * @@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) } else if (handle->prev <= nr_meta_pages) { handle->pbe = unpack_orig_addresses(buffer, handle->pbe); if (!handle->pbe) { - error = create_image(handle); + error = prepare_image(handle); if (error) return error; handle->pbe = pagedir_nosave; - handle->buffer = (void *)handle->pbe->address; + handle->last_pbe = NULL; + handle->buffer = get_buffer(handle); } } else { handle->pbe = handle->pbe->next; - handle->buffer = (void *)handle->pbe->address; + handle->buffer = get_buffer(handle); } handle->prev = handle->page; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index c4016cbbd3e0..17f669c83012 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void); int save_highmem(void); int restore_highmem(void); #else -static int save_highmem(void) { return 0; } -static int restore_highmem(void) { return 0; } -static unsigned int count_highmem_pages(void) { return 0; } +static inline int save_highmem(void) { return 0; } +static inline int restore_highmem(void) { return 0; } +static inline unsigned int count_highmem_pages(void) { return 0; } #endif /** @@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap) */ #define SHRINK_BITE 10000 +static inline unsigned long __shrink_memory(long tmp) +{ + if (tmp > SHRINK_BITE) + tmp = SHRINK_BITE; + return shrink_all_memory(tmp); +} int swsusp_shrink_memory(void) { @@ -192,15 +198,17 @@ int swsusp_shrink_memory(void) PAGES_FOR_IO; tmp = size; for_each_zone (zone) - if (!is_highmem(zone)) + if (!is_highmem(zone) && populated_zone(zone)) { tmp -= zone->free_pages; + tmp += zone->lowmem_reserve[ZONE_NORMAL]; + } if (tmp > 0) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(tmp); if (!tmp) return -ENOMEM; pages += tmp; } else if (size > image_size / PAGE_SIZE) { - tmp = shrink_all_memory(SHRINK_BITE); + tmp = __shrink_memory(size - (image_size / PAGE_SIZE)); pages += tmp; } printk("\b%c", p[i++%4]); diff --git a/kernel/printk.c b/kernel/printk.c index c056f3324432..95b7fe17f124 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -24,6 +24,7 @@ #include <linux/console.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/moduleparam.h> #include <linux/interrupt.h> /* For in_interrupt() */ #include <linux/config.h> #include <linux/delay.h> @@ -67,6 +68,7 @@ EXPORT_SYMBOL(oops_in_progress); * driver system. */ static DECLARE_MUTEX(console_sem); +static DECLARE_MUTEX(secondary_console_sem); struct console *console_drivers; /* * This is used for debugging the mess that is the VT code by @@ -76,7 +78,7 @@ struct console *console_drivers; * path in the console code where we end up in places I want * locked without the console sempahore held */ -static int console_locked; +static int console_locked, console_suspended; /* * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars @@ -326,7 +328,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end) struct console *con; for (con = console_drivers; con; con = con->next) { - if ((con->flags & CON_ENABLED) && con->write) + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) con->write(con, &LOG_BUF(start), end - start); } } @@ -436,6 +440,7 @@ static int printk_time = 1; #else static int printk_time = 0; #endif +module_param(printk_time, int, S_IRUGO | S_IWUSR); static int __init printk_time_setup(char *str) { @@ -452,6 +457,18 @@ __attribute__((weak)) unsigned long long printk_clock(void) return sched_clock(); } +/* Check if we have any console registered that can be called early in boot. */ +static int have_callable_console(void) +{ + struct console *con; + + for (con = console_drivers; con; con = con->next) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + /** * printk - print a kernel message * @fmt: format string @@ -565,27 +582,29 @@ asmlinkage int vprintk(const char *fmt, va_list args) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id())) { + if (!down_trylock(&console_sem)) { /* - * Some console drivers may assume that per-cpu resources have - * been allocated. So don't allow them to be called by this - * CPU until it is officially up. We shouldn't be calling into - * random console drivers on a CPU which doesn't exist yet.. + * We own the drivers. We can drop the spinlock and + * let release_console_sem() print the text, maybe ... */ + console_locked = 1; printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); - goto out; - } - if (!down_trylock(&console_sem)) { - console_locked = 1; + /* - * We own the drivers. We can drop the spinlock and let - * release_console_sem() print the text + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. */ - printk_cpu = UINT_MAX; - spin_unlock_irqrestore(&logbuf_lock, flags); - console_may_schedule = 0; - release_console_sem(); + if (cpu_online(smp_processor_id()) || have_callable_console()) { + console_may_schedule = 0; + release_console_sem(); + } else { + /* Release by hand to avoid flushing the buffer. */ + console_locked = 0; + up(&console_sem); + } } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -595,7 +614,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } -out: + preempt_enable(); return printed_len; } @@ -698,6 +717,23 @@ int __init add_preferred_console(char *name, int idx, char *options) } /** + * suspend_console - suspend the console subsystem + * + * This disables printk() while we go into suspend states + */ +void suspend_console(void) +{ + acquire_console_sem(); + console_suspended = 1; +} + +void resume_console(void) +{ + console_suspended = 0; + release_console_sem(); +} + +/** * acquire_console_sem - lock the console system for exclusive use. * * Acquires a semaphore which guarantees that the caller has @@ -708,6 +744,10 @@ int __init add_preferred_console(char *name, int idx, char *options) void acquire_console_sem(void) { BUG_ON(in_interrupt()); + if (console_suspended) { + down(&secondary_console_sem); + return; + } down(&console_sem); console_locked = 1; console_may_schedule = 1; @@ -750,6 +790,10 @@ void release_console_sem(void) unsigned long _con_start, _log_end; unsigned long wake_klogd = 0; + if (console_suspended) { + up(&secondary_console_sem); + return; + } for ( ; ; ) { spin_lock_irqsave(&logbuf_lock, flags); wake_klogd |= log_start - log_end; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e4..335c5b932e14 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) static int may_attach(struct task_struct *task) { - if (!task->mm) - return -EPERM; + /* May we inspect the given task? + * This check is used both for attaching with ptrace + * and for allowing access to sensitive information in /proc. + * + * ptrace_attach denies several cases that /proc allows + * because setting up the necessary parent/child relationship + * or halting the specified task is impossible. + */ + int dumpable = 0; + /* Don't let security modules deny introspection */ + if (task == current) + return 0; if (((current->uid != task->euid) || (current->uid != task->suid) || (current->uid != task->uid) || @@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) return -EPERM; smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + if (task->mm) + dumpable = task->mm->dumpable; + if (!dumpable && !capable(CAP_SYS_PTRACE)) return -EPERM; return security_ptrace(current, task); @@ -176,6 +188,8 @@ repeat: goto repeat; } + if (!task->mm) + goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; @@ -200,7 +214,7 @@ out: return retval; } -void __ptrace_detach(struct task_struct *child, unsigned int data) +static inline void __ptrace_detach(struct task_struct *child, unsigned int data) { child->exit_code = data; /* .. re-parent .. */ @@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); write_lock_irq(&tasklist_lock); + /* protect against de_thread()->release_task() */ if (child->ptrace) __ptrace_detach(child, data); write_unlock_irq(&tasklist_lock); diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 2058f88c7bbb..20e9710fc21c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -612,14 +612,6 @@ void synchronize_rcu(void) wait_for_completion(&rcu.completion); } -/* - * Deprecated, use synchronize_rcu() or synchronize_sched() instead. - */ -void synchronize_kernel(void) -{ - synchronize_rcu(); -} - module_param(blimit, int, 0); module_param(qhimark, int, 0); module_param(qlowmark, int, 0); @@ -627,7 +619,6 @@ module_param(qlowmark, int, 0); module_param(rsinterval, int, 0); #endif EXPORT_SYMBOL_GPL(rcu_batches_completed); -EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ -EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ +EXPORT_SYMBOL_GPL(call_rcu); +EXPORT_SYMBOL_GPL(call_rcu_bh); EXPORT_SYMBOL_GPL(synchronize_rcu); -EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */ diff --git a/kernel/sched.c b/kernel/sched.c index c13f1bd2df7d..a856040c200a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -818,6 +818,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) * the target CPU. */ #ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + static void resched_task(task_t *p) { int cpu; @@ -833,9 +838,9 @@ static void resched_task(task_t *p) if (cpu == smp_processor_id()) return; - /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ + /* NEED_RESCHED must be visible before we test polling */ smp_mb(); - if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) + if (!tsk_is_polling(p)) smp_send_reschedule(cpu); } #else @@ -3886,6 +3891,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) !capable(CAP_SYS_NICE)) goto out_unlock; + retval = security_task_setscheduler(p, 0, NULL); + if (retval) + goto out_unlock; + cpus_allowed = cpuset_cpus_allowed(p); cpus_and(new_mask, new_mask, cpus_allowed); retval = set_cpus_allowed(p, new_mask); @@ -3954,7 +3963,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (!p) goto out_unlock; - retval = 0; + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + cpus_and(*mask, p->cpus_allowed, cpu_online_map); out_unlock: @@ -4046,6 +4058,9 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP + __might_sleep(__FILE__, __LINE__); +#endif /* * The BKS might be reacquired before we have dropped * PREEMPT_ACTIVE, which could trigger a second @@ -4142,7 +4157,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); atomic_inc(&rq->nr_iowait); schedule(); @@ -4153,7 +4168,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); + struct runqueue *rq = &__raw_get_cpu_var(runqueues); long ret; atomic_inc(&rq->nr_iowait); @@ -4237,7 +4252,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) if (retval) goto out_unlock; - jiffies_to_timespec(p->policy & SCHED_FIFO ? + jiffies_to_timespec(p->policy == SCHED_FIFO ? 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; @@ -4746,6 +4761,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!cpu_rq(cpu)->migration_thread) + break; /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread, any_online_cpu(cpu_online_map)); diff --git a/kernel/signal.c b/kernel/signal.c index e5f8aea78ffe..52adf53929f6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -23,12 +23,12 @@ #include <linux/syscalls.h> #include <linux/ptrace.h> #include <linux/signal.h> -#include <linux/audit.h> #include <linux/capability.h> #include <asm/param.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/siginfo.h> +#include "audit.h" /* audit_signal_info() */ /* * SLAB caches for signal bits. @@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) spin_unlock_irqrestore(&sighand->siglock, flags); } +static inline int may_ptrace_stop(void) +{ + if (!likely(current->ptrace & PT_PTRACED)) + return 0; + + if (unlikely(current->parent == current->real_parent && + (current->ptrace & PT_ATTACHED))) + return 0; + + if (unlikely(current->signal == current->parent->signal) && + unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) + return 0; + + /* + * Are we in the middle of do_coredump? + * If so and our tracer is also part of the coredump stopping + * is a deadlock situation, and pointless because our tracer + * is dead so don't allow us to stop. + * If SIGKILL was already sent before the caller unlocked + * ->siglock we must see ->core_waiters != 0. Otherwise it + * is safe to enter schedule(). + */ + if (unlikely(current->mm->core_waiters) && + unlikely(current->mm == current->parent->mm)) + return 0; + + return 1; +} + /* * This must be called with current->sighand->siglock held. * @@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) spin_unlock_irq(¤t->sighand->siglock); try_to_freeze(); read_lock(&tasklist_lock); - if (likely(current->ptrace & PT_PTRACED) && - likely(current->parent != current->real_parent || - !(current->ptrace & PT_ATTACHED)) && - (likely(current->parent->signal != current->signal) || - !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { + if (may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); diff --git a/kernel/softirq.c b/kernel/softirq.c index 336f92d64e2e..9e2f1c6e73d7 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb, break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(ksoftirqd, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(ksoftirqd, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 14c7faf02909..b5c3b94e01ce 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -36,7 +36,7 @@ static struct notifier_block panic_block = { void touch_softlockup_watchdog(void) { - per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; + __raw_get_cpu_var(touch_timestamp) = jiffies; } EXPORT_SYMBOL(touch_softlockup_watchdog); @@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + if (!per_cpu(watchdog_task, hotcpu)) + break; /* Unbind so it can run. Fall thru. */ kthread_bind(per_cpu(watchdog_task, hotcpu), any_online_cpu(cpu_online_map)); diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d731466..2c0aacc37c55 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -4,6 +4,7 @@ #include <linux/cpu.h> #include <linux/err.h> #include <linux/syscalls.h> +#include <linux/kthread.h> #include <asm/atomic.h> #include <asm/semaphore.h> #include <asm/uaccess.h> @@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads; static atomic_t stopmachine_thread_ack; static DECLARE_MUTEX(stopmachine_mutex); -static int stopmachine(void *cpu) +static int stopmachine(void *unused) { int irqs_disabled = 0; int prepared = 0; - set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu)); - /* Ack: we are alive */ smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ atomic_inc(&stopmachine_thread_ack); @@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state) static int stop_machine(void) { - int i, ret = 0; + int ret = 0; + unsigned int i; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; /* One high-prio thread per cpu. We'll do this one. */ @@ -96,11 +96,16 @@ static int stop_machine(void) stopmachine_state = STOPMACHINE_WAIT; for_each_online_cpu(i) { + struct task_struct *tsk; if (i == raw_smp_processor_id()) continue; - ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); - if (ret < 0) + tsk = kthread_create(stopmachine, NULL, "stopmachine"); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); break; + } + kthread_bind(tsk, i); + wake_up_process(tsk); stopmachine_num_threads++; } diff --git a/kernel/sys.c b/kernel/sys.c index 0b6ec0e7936f..2d5179c67cec 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -13,7 +13,6 @@ #include <linux/notifier.h> #include <linux/reboot.h> #include <linux/prctl.h> -#include <linux/init.h> #include <linux/highuid.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -57,6 +56,12 @@ #ifndef GET_FPEXC_CTL # define GET_FPEXC_CTL(a,b) (-EINVAL) #endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif /* * this is where the system-wide overflow UID and GID are defined, for @@ -132,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, unsigned long val, void *v) { int ret = NOTIFY_DONE; - struct notifier_block *nb; + struct notifier_block *nb, *next_nb; nb = rcu_dereference(*nl); while (nb) { + next_nb = rcu_dereference(nb->next); ret = nb->notifier_call(nb, val, v); if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) break; - nb = rcu_dereference(nb->next); + nb = next_nb; } return ret; } @@ -583,7 +589,7 @@ void emergency_restart(void) } EXPORT_SYMBOL_GPL(emergency_restart); -void kernel_restart_prepare(char *cmd) +static void kernel_restart_prepare(char *cmd) { blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; @@ -617,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart); * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. */ -void kernel_kexec(void) +static void kernel_kexec(void) { #ifdef CONFIG_KEXEC struct kimage *image; @@ -631,7 +637,6 @@ void kernel_kexec(void) machine_kexec(image); #endif } -EXPORT_SYMBOL_GPL(kernel_kexec); void kernel_shutdown_prepare(enum system_states state) { @@ -1860,23 +1865,20 @@ out: * fields when reaping, so a sample either gets all the additions of a * given child after it's reaped, or none so this sample is before reaping. * - * tasklist_lock locking optimisation: - * If we are current and single threaded, we do not need to take the tasklist - * lock or the siglock. No one else can take our signal_struct away, - * no one else can reap the children to update signal->c* counters, and - * no one else can race with the signal-> fields. - * If we do not take the tasklist_lock, the signal-> fields could be read - * out of order while another thread was just exiting. So we place a - * read memory barrier when we avoid the lock. On the writer side, - * write memory barrier is implied in __exit_signal as __exit_signal releases - * the siglock spinlock after updating the signal-> fields. - * - * We don't really need the siglock when we access the non c* fields - * of the signal_struct (for RUSAGE_SELF) even in multithreaded - * case, since we take the tasklist lock for read and the non c* signal-> - * fields are updated only in __exit_signal, which is called with - * tasklist_lock taken for write, hence these two threads cannot execute - * concurrently. + * Locking: + * We need to take the siglock for CHILDEREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded + * non-current multithreaded. Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we donot need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. * */ @@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) struct task_struct *t; unsigned long flags; cputime_t utime, stime; - int need_lock = 0; memset((char *) r, 0, sizeof *r); utime = stime = cputime_zero; - if (p != current || !thread_group_empty(p)) - need_lock = 1; - - if (need_lock) { - read_lock(&tasklist_lock); - if (unlikely(!p->signal)) { - read_unlock(&tasklist_lock); - return; - } - } else - /* See locking comments above */ - smp_rmb(); + rcu_read_lock(); + if (!lock_task_sighand(p, &flags)) { + rcu_read_unlock(); + return; + } switch (who) { case RUSAGE_BOTH: case RUSAGE_CHILDREN: - spin_lock_irqsave(&p->sighand->siglock, flags); utime = p->signal->cutime; stime = p->signal->cstime; r->ru_nvcsw = p->signal->cnvcsw; r->ru_nivcsw = p->signal->cnivcsw; r->ru_minflt = p->signal->cmin_flt; r->ru_majflt = p->signal->cmaj_flt; - spin_unlock_irqrestore(&p->sighand->siglock, flags); if (who == RUSAGE_CHILDREN) break; @@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r) BUG(); } - if (need_lock) - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); + rcu_read_unlock(); + cputime_to_timeval(utime, &r->ru_utime); cputime_to_timeval(stime, &r->ru_stime); } @@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, return -EFAULT; return 0; } + case PR_GET_ENDIAN: + error = GET_ENDIAN(current, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(current, arg2); + break; + default: error = -EINVAL; break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 5433195040f1..6991bece67e8 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); cond_syscall(sys_chown16); cond_syscall(sys_fchown16); cond_syscall(sys_getegid16); @@ -132,3 +133,4 @@ cond_syscall(sys_mincore); cond_syscall(sys_madvise); cond_syscall(sys_mremap); cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e82726faeeff..f1a4eb1a655e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp, extern int C_A_D; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; +extern int sysctl_panic_on_oom; extern int max_threads; extern int sysrq_enabled; extern int core_uses_pid; @@ -72,6 +73,7 @@ extern int printk_ratelimit_burst; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; +extern int compat_log; #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) int unknown_nmi_panic; @@ -142,7 +144,6 @@ static struct ctl_table_header root_table_header = static ctl_table kern_table[]; static ctl_table vm_table[]; -static ctl_table proc_table[]; static ctl_table fs_table[]; static ctl_table debug_table[]; static ctl_table dev_table[]; @@ -150,7 +151,7 @@ extern ctl_table random_table[]; #ifdef CONFIG_UNIX98_PTYS extern ctl_table pty_table[]; #endif -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER extern ctl_table inotify_table[]; #endif @@ -202,12 +203,6 @@ static ctl_table root_table[] = { }, #endif { - .ctl_name = CTL_PROC, - .procname = "proc", - .mode = 0555, - .child = proc_table, - }, - { .ctl_name = CTL_FS, .procname = "fs", .mode = 0555, @@ -398,7 +393,7 @@ static ctl_table kern_table[] = { .strategy = &sysctl_string, }, #endif -#ifdef CONFIG_HOTPLUG +#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { .ctl_name = KERN_HOTPLUG, .procname = "hotplug", @@ -683,6 +678,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dointvec, }, #endif +#ifdef CONFIG_COMPAT + { + .ctl_name = KERN_COMPAT_LOG, + .procname = "compat-log", + .data = &compat_log, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; @@ -702,6 +707,14 @@ static ctl_table vm_table[] = { .proc_handler = &proc_dointvec, }, { + .ctl_name = VM_PANIC_ON_OOM, + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = VM_OVERCOMMIT_RATIO, .procname = "overcommit_ratio", .data = &sysctl_overcommit_ratio, @@ -918,10 +931,6 @@ static ctl_table vm_table[] = { { .ctl_name = 0 } }; -static ctl_table proc_table[] = { - { .ctl_name = 0 } -}; - static ctl_table fs_table[] = { { .ctl_name = FS_NRINODE, @@ -1028,7 +1037,7 @@ static ctl_table fs_table[] = { .mode = 0644, .proc_handler = &proc_doulongvec_minmax, }, -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER { .ctl_name = FS_INOTIFY, .procname = "inotify", diff --git a/kernel/time.c b/kernel/time.c index b00ddc71cedb..5bd489747643 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); #else +#ifndef CONFIG_GENERIC_TIME /* * Simulate gettimeofday using do_gettimeofday which only allows a timeval * and therefore only yields usec accuracy @@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) } EXPORT_SYMBOL_GPL(getnstimeofday); #endif +#endif /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 000000000000..e1dfd8e86cce --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1 @@ +obj-y += clocksource.o jiffies.o diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 000000000000..74eca5939bd9 --- /dev/null +++ b/kernel/time/clocksource.c @@ -0,0 +1,349 @@ +/* + * linux/kernel/time/clocksource.c + * + * This file contains the functions which manage clocksource drivers. + * + * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * TODO WishList: + * o Allow clocksource drivers to be unregistered + * o get rid of clocksource_jiffies extern + */ + +#include <linux/clocksource.h> +#include <linux/sysdev.h> +#include <linux/init.h> +#include <linux/module.h> + +/* XXX - Would like a better way for initializing curr_clocksource */ +extern struct clocksource clocksource_jiffies; + +/*[Clocksource internal variables]--------- + * curr_clocksource: + * currently selected clocksource. Initialized to clocksource_jiffies. + * next_clocksource: + * pending next selected clocksource. + * clocksource_list: + * linked list with the registered clocksources + * clocksource_lock: + * protects manipulations to curr_clocksource and next_clocksource + * and the clocksource_list + * override_name: + * Name of the user-specified clocksource. + */ +static struct clocksource *curr_clocksource = &clocksource_jiffies; +static struct clocksource *next_clocksource; +static LIST_HEAD(clocksource_list); +static DEFINE_SPINLOCK(clocksource_lock); +static char override_name[32]; +static int finished_booting; + +/* clocksource_done_booting - Called near the end of bootup + * + * Hack to avoid lots of clocksource churn at boot time + */ +static int __init clocksource_done_booting(void) +{ + finished_booting = 1; + return 0; +} + +late_initcall(clocksource_done_booting); + +/** + * clocksource_get_next - Returns the selected clocksource + * + */ +struct clocksource *clocksource_get_next(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + if (next_clocksource && finished_booting) { + curr_clocksource = next_clocksource; + next_clocksource = NULL; + } + spin_unlock_irqrestore(&clocksource_lock, flags); + + return curr_clocksource; +} + +/** + * select_clocksource - Finds the best registered clocksource. + * + * Private function. Must hold clocksource_lock when called. + * + * Looks through the list of registered clocksources, returning + * the one with the highest rating value. If there is a clocksource + * name that matches the override string, it returns that clocksource. + */ +static struct clocksource *select_clocksource(void) +{ + struct clocksource *best = NULL; + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (!best) + best = src; + + /* check for override: */ + if (strlen(src->name) == strlen(override_name) && + !strcmp(src->name, override_name)) { + best = src; + break; + } + /* pick the highest rating: */ + if (src->rating > best->rating) + best = src; + } + + return best; +} + +/** + * is_registered_source - Checks if clocksource is registered + * @c: pointer to a clocksource + * + * Private helper function. Must hold clocksource_lock when called. + * + * Returns one if the clocksource is already registered, zero otherwise. + */ +static int is_registered_source(struct clocksource *c) +{ + int len = strlen(c->name); + struct list_head *tmp; + + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + if (strlen(src->name) == len && !strcmp(src->name, c->name)) + return 1; + } + + return 0; +} + +/** + * clocksource_register - Used to install new clocksources + * @t: clocksource to be registered + * + * Returns -EBUSY if registration fails, zero otherwise. + */ +int clocksource_register(struct clocksource *c) +{ + int ret = 0; + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + /* check if clocksource is already registered */ + if (is_registered_source(c)) { + printk("register_clocksource: Cannot register %s. " + "Already registered!", c->name); + ret = -EBUSY; + } else { + /* register it */ + list_add(&c->list, &clocksource_list); + /* scan the registered clocksources, and pick the best one */ + next_clocksource = select_clocksource(); + } + spin_unlock_irqrestore(&clocksource_lock, flags); + return ret; +} +EXPORT_SYMBOL(clocksource_register); + +/** + * clocksource_reselect - Rescan list for next clocksource + * + * A quick helper function to be used if a clocksource changes its + * rating. Forces the clocksource list to be re-scanned for the best + * clocksource. + */ +void clocksource_reselect(void) +{ + unsigned long flags; + + spin_lock_irqsave(&clocksource_lock, flags); + next_clocksource = select_clocksource(); + spin_unlock_irqrestore(&clocksource_lock, flags); +} +EXPORT_SYMBOL(clocksource_reselect); + +/** + * sysfs_show_current_clocksources - sysfs interface for current clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing current clocksource. + */ +static ssize_t +sysfs_show_current_clocksources(struct sys_device *dev, char *buf) +{ + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + curr += sprintf(curr, "%s ", curr_clocksource->name); + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selction. + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + const char *buf, size_t count) +{ + size_t ret = count; + /* strings from sysfs write are not 0 terminated! */ + if (count >= sizeof(override_name)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + spin_lock_irq(&clocksource_lock); + + /* copy the name given: */ + memcpy(override_name, buf, count); + override_name[count] = 0; + + /* try to select it: */ + next_clocksource = select_clocksource(); + + spin_unlock_irq(&clocksource_lock); + + return ret; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct sys_device *dev, char *buf) +{ + struct list_head *tmp; + char *curr = buf; + + spin_lock_irq(&clocksource_lock); + list_for_each(tmp, &clocksource_list) { + struct clocksource *src; + + src = list_entry(tmp, struct clocksource, list); + curr += sprintf(curr, "%s ", src->name); + } + spin_unlock_irq(&clocksource_lock); + + curr += sprintf(curr, "\n"); + + return curr - buf; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static SYSDEV_ATTR(available_clocksource, 0600, + sysfs_show_available_clocksources, NULL); + +static struct sysdev_class clocksource_sysclass = { + set_kset_name("clocksource"), +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + + if (!error) + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_current_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + unsigned long flags; + spin_lock_irqsave(&clocksource_lock, flags); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + spin_unlock_irqrestore(&clocksource_lock, flags); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. " + "Use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 000000000000..126bb30c4afe --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,73 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include <linux/clocksource.h> +#include <linux/jiffies.h> +#include <linux/init.h> + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not reccomended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater then 8 overflow 32bits when + * HZ=100. + */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(void) +{ + return (cycle_t) jiffies; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 0, /* lowest rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, + .is_continuous = 0, /* tick based, not free running */ +}; + +static int __init init_jiffies_clocksource(void) +{ + return clocksource_register(&clocksource_jiffies); +} + +module_init(init_jiffies_clocksource); diff --git a/kernel/timer.c b/kernel/timer.c index 9e49deed468c..5bb6b7976eec 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) void fastcall init_timer(struct timer_list *timer) { timer->entry.next = NULL; - timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); + timer->base = __raw_get_cpu_var(tvec_bases); } EXPORT_SYMBOL(init_timer); @@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(tvec_base_t *base, tvec_t *tv, int index) { /* cascade all the timers from tv up one level */ - struct list_head *head, *curr; + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); - head = tv->vec + index; - curr = head->next; /* - * We are removing _all_ timers from the list, so we don't have to - * detach them individually, just clear the list afterwards. + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. */ - while (curr != head) { - struct timer_list *tmp; - - tmp = list_entry(curr, struct timer_list, entry); - BUG_ON(tmp->base != base); - curr = curr->next; - internal_add_timer(base, tmp); + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(timer->base != base); + internal_add_timer(base, timer); } - INIT_LIST_HEAD(head); return index; } @@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base) spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list = LIST_HEAD_INIT(work_list); + struct list_head work_list; struct list_head *head = &work_list; int index = base->timer_jiffies & TVR_MASK; - + /* * Cascade timers: */ @@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base) (!cascade(base, &base->tv3, INDEX(1))) && !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); - ++base->timer_jiffies; - list_splice_init(base->tv1.vec + index, &work_list); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); while (!list_empty(head)) { void (*fn)(unsigned long); unsigned long data; @@ -601,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ long time_precision = 1; /* clock precision (us) */ long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ -static long time_phase; /* phase offset (scaled us) */ long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; /* frequency offset (scaled ppm)*/ static long time_adj; /* tick adjust (scaled 1 / HZ) */ @@ -751,27 +746,14 @@ static long adjtime_adjustment(void) } /* in the NTP reference this is called "hardclock()" */ -static void update_wall_time_one_tick(void) +static void update_ntp_one_tick(void) { - long time_adjust_step, delta_nsec; + long time_adjust_step; time_adjust_step = adjtime_adjustment(); if (time_adjust_step) /* Reduce by this step the amount of time left */ time_adjust -= time_adjust_step; - delta_nsec = tick_nsec + time_adjust_step * 1000; - /* - * Advance the phase, once it gets to one microsecond, then - * advance the tick more. - */ - time_phase += time_adj; - if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { - long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); - time_phase -= ltemp << (SHIFT_SCALE - 10); - delta_nsec += ltemp; - } - xtime.tv_nsec += delta_nsec; - time_interpolator_update(delta_nsec); /* Changes by adjtime() do not take effect till next tick. */ if (time_next_adjust != 0) { @@ -784,36 +766,378 @@ static void update_wall_time_one_tick(void) * Return how long ticks are at the moment, that is, how much time * update_wall_time_one_tick will add to xtime next time we call it * (assuming no calls to do_adjtimex in the meantime). - * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 - * bits to the right of the binary point. + * The return value is in fixed-point nanoseconds shifted by the + * specified number of bits to the right of the binary point. * This function has no side-effects. */ u64 current_tick_length(void) { long delta_nsec; + u64 ret; + /* calculate the finest interval NTP will allow. + * ie: nanosecond value shifted by (SHIFT_SCALE - 10) + */ delta_nsec = tick_nsec + adjtime_adjustment() * 1000; - return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; + ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; + ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); + + return ret; } -/* - * Using a loop looks inefficient, but "ticks" is - * usually just one (we shouldn't be losing ticks, - * we're doing this this way mainly for interrupt - * latency reasons, not because we think we'll - * have lots of lost timer ticks +/* XXX - all of this timekeeping code should be later moved to time.c */ +#include <linux/clocksource.h> +static struct clocksource *clock; /* pointer to current clocksource */ + +#ifdef CONFIG_GENERIC_TIME +/** + * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook + * + * private function, must hold xtime_lock lock when being + * called. Returns the number of nanoseconds since the + * last call to update_wall_time() (adjusted by NTP scaling) + */ +static inline s64 __get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + +/** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. Used by + * do_gettimeofday() and get_realtime_clock_ts(). + */ +static inline void __get_realtime_clock_ts(struct timespec *ts) +{ + unsigned long seq; + s64 nsecs; + + do { + seq = read_seqbegin(&xtime_lock); + + *ts = xtime; + nsecs = __get_nsec_offset(); + + } while (read_seqretry(&xtime_lock, seq)); + + timespec_add_ns(ts, nsecs); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void getnstimeofday(struct timespec *ts) +{ + __get_realtime_clock_ts(ts); +} + +EXPORT_SYMBOL(getnstimeofday); + +/** + * do_gettimeofday - Returns the time of day in a timeval + * @tv: pointer to the timeval to be set + * + * NOTE: Users should be converted to using get_realtime_clock_ts() + */ +void do_gettimeofday(struct timeval *tv) +{ + struct timespec now; + + __get_realtime_clock_ts(&now); + tv->tv_sec = now.tv_sec; + tv->tv_usec = now.tv_nsec/1000; +} + +EXPORT_SYMBOL(do_gettimeofday); +/** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time + * + * Sets the time of day to the new time and update NTP and notify hrtimers + */ +int do_settimeofday(struct timespec *tv) +{ + unsigned long flags; + time_t wtm_sec, sec = tv->tv_sec; + long wtm_nsec, nsec = tv->tv_nsec; + + if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) + return -EINVAL; + + write_seqlock_irqsave(&xtime_lock, flags); + + nsec -= __get_nsec_offset(); + + wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); + wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); + + set_normalized_timespec(&xtime, sec, nsec); + set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); + + ntp_clear(); + + write_sequnlock_irqrestore(&xtime_lock, flags); + + /* signal hrtimers about time change */ + clock_was_set(); + + return 0; +} + +EXPORT_SYMBOL(do_settimeofday); + +/** + * change_clocksource - Swaps clocksources if a new one is available + * + * Accumulates current time interval and initializes new clocksource + */ +static int change_clocksource(void) +{ + struct clocksource *new; + cycle_t now; + u64 nsec; + new = clocksource_get_next(); + if (clock != new) { + now = clocksource_read(new); + nsec = __get_nsec_offset(); + timespec_add_ns(&xtime, nsec); + + clock = new; + clock->cycle_last = now; + printk(KERN_INFO "Time: %s clocksource has been installed.\n", + clock->name); + return 1; + } else if (clock->update_callback) { + return clock->update_callback(); + } + return 0; +} +#else +#define change_clocksource() (0) +#endif + +/** + * timeofday_is_continuous - check to see if timekeeping is free running */ -static void update_wall_time(unsigned long ticks) +int timekeeping_is_continuous(void) { + unsigned long seq; + int ret; + do { - ticks--; - update_wall_time_one_tick(); - if (xtime.tv_nsec >= 1000000000) { - xtime.tv_nsec -= 1000000000; + seq = read_seqbegin(&xtime_lock); + + ret = clock->is_continuous; + + } while (read_seqretry(&xtime_lock, seq)); + + return ret; +} + +/* + * timekeeping_init - Initializes the clocksource and common timekeeping values + */ +void __init timekeeping_init(void) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + clock = clocksource_get_next(); + clocksource_calculate_interval(clock, tick_nsec); + clock->cycle_last = clocksource_read(clock); + ntp_clear(); + write_sequnlock_irqrestore(&xtime_lock, flags); +} + + +/* + * timekeeping_resume - Resumes the generic timekeeping subsystem. + * @dev: unused + * + * This is for the generic clocksource timekeeping. + * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are + * still managed by arch specific suspend/resume code. + */ +static int timekeeping_resume(struct sys_device *dev) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + /* restart the last cycle value */ + clock->cycle_last = clocksource_read(clock); + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +/* sysfs resume/suspend bits for timekeeping */ +static struct sysdev_class timekeeping_sysclass = { + .resume = timekeeping_resume, + set_kset_name("timekeeping"), +}; + +static struct sys_device device_timer = { + .id = 0, + .cls = &timekeeping_sysclass, +}; + +static int __init timekeeping_init_device(void) +{ + int error = sysdev_class_register(&timekeeping_sysclass); + if (!error) + error = sysdev_register(&device_timer); + return error; +} + +device_initcall(timekeeping_init_device); + +/* + * If the error is already larger, we look ahead another tick, + * to compensate for late or lost adjustments. + */ +static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) +{ + int adj; + + /* + * As soon as the machine is synchronized to the external time + * source this should be the common case. + */ + error >>= 2; + if (likely(sign > 0 ? error <= *interval : error >= *interval)) + return sign; + + /* + * An extra look ahead dampens the effect of the current error, + * which can grow quite large with continously late updates, as + * it would dominate the adjustment value and can lead to + * oscillation. + */ + error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); + error -= clock->xtime_interval >> 1; + + adj = 0; + while (1) { + error >>= 1; + if (sign > 0 ? error <= *interval : error >= *interval) + break; + adj++; + } + + /* + * Add the current adjustments to the error and take the offset + * into account, the latter can cause the error to be hardly + * reduced at the next tick. Check the error again if there's + * room for another adjustment, thus further reducing the error + * which otherwise had to be corrected at the next update. + */ + error = (error << 1) - *interval + *offset; + if (sign > 0 ? error > *interval : error < *interval) + adj++; + + *interval <<= adj; + *offset <<= adj; + return sign << adj; +} + +/* + * Adjust the multiplier to reduce the error value, + * this is optimized for the most common adjustments of -1,0,1, + * for other values we can do a bit more work. + */ +static void clocksource_adjust(struct clocksource *clock, s64 offset) +{ + s64 error, interval = clock->cycle_interval; + int adj; + + error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); + if (error > interval) { + adj = clocksource_bigadjust(1, error, &interval, &offset); + } else if (error < -interval) { + interval = -interval; + offset = -offset; + adj = clocksource_bigadjust(-1, error, &interval, &offset); + } else + return; + + clock->mult += adj; + clock->xtime_interval += interval; + clock->xtime_nsec -= offset; + clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); +} + +/* + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +static void update_wall_time(void) +{ + cycle_t offset; + + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + +#ifdef CONFIG_GENERIC_TIME + offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; +#else + offset = clock->cycle_interval; +#endif + + /* normally this loop will run just once, however in the + * case of lost or late ticks, it will accumulate correctly. + */ + while (offset >= clock->cycle_interval) { + /* accumulate one interval */ + clock->xtime_nsec += clock->xtime_interval; + clock->cycle_last += clock->cycle_interval; + offset -= clock->cycle_interval; + + if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { + clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; xtime.tv_sec++; second_overflow(); } - } while (ticks); + + /* interpolator bits */ + time_interpolator_update(clock->xtime_interval + >> clock->shift); + /* increment the NTP state machine */ + update_ntp_one_tick(); + + /* accumulate error between NTP and clock interval */ + clock->error += current_tick_length(); + clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); + } + + /* correct the clock when NTP error is too big */ + clocksource_adjust(clock, offset); + + /* store full nanoseconds into xtime */ + xtime.tv_nsec = clock->xtime_nsec >> clock->shift; + clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; + + /* check to see if there is a new clocksource to use */ + if (change_clocksource()) { + clock->error = 0; + clock->xtime_nsec = 0; + clocksource_calculate_interval(clock, tick_nsec); + } } /* @@ -919,10 +1243,8 @@ static inline void update_times(void) unsigned long ticks; ticks = jiffies - wall_jiffies; - if (ticks) { - wall_jiffies += ticks; - update_wall_time(ticks); - } + wall_jiffies += ticks; + update_wall_time(); calc_load(ticks); } diff --git a/kernel/unwind.c b/kernel/unwind.c new file mode 100644 index 000000000000..f69c804c8e62 --- /dev/null +++ b/kernel/unwind.c @@ -0,0 +1,918 @@ +/* + * Copyright (C) 2002-2006 Novell, Inc. + * Jan Beulich <jbeulich@novell.com> + * This code is released under version 2 of the GNU GPL. + * + * A simple API for unwinding kernel stacks. This is used for + * debugging and error reporting purposes. The kernel doesn't need + * full-blown stack unwinding with all the bells and whistles, so there + * is not much point in implementing the full Dwarf2 unwind API. + */ + +#include <linux/unwind.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/stop_machine.h> +#include <asm/sections.h> +#include <asm/uaccess.h> +#include <asm/unaligned.h> + +extern char __start_unwind[], __end_unwind[]; + +#define MAX_STACK_DEPTH 8 + +#define EXTRA_INFO(f) { \ + BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ + % FIELD_SIZEOF(struct unwind_frame_info, f)) \ + + offsetof(struct unwind_frame_info, f) \ + / FIELD_SIZEOF(struct unwind_frame_info, f), \ + FIELD_SIZEOF(struct unwind_frame_info, f) \ + } +#define PTREGS_INFO(f) EXTRA_INFO(regs.f) + +static const struct { + unsigned offs:BITS_PER_LONG / 2; + unsigned width:BITS_PER_LONG / 2; +} reg_info[] = { + UNW_REGISTER_INFO +}; + +#undef PTREGS_INFO +#undef EXTRA_INFO + +#ifndef REG_INVALID +#define REG_INVALID(r) (reg_info[r].width == 0) +#endif + +#define DW_CFA_nop 0x00 +#define DW_CFA_set_loc 0x01 +#define DW_CFA_advance_loc1 0x02 +#define DW_CFA_advance_loc2 0x03 +#define DW_CFA_advance_loc4 0x04 +#define DW_CFA_offset_extended 0x05 +#define DW_CFA_restore_extended 0x06 +#define DW_CFA_undefined 0x07 +#define DW_CFA_same_value 0x08 +#define DW_CFA_register 0x09 +#define DW_CFA_remember_state 0x0a +#define DW_CFA_restore_state 0x0b +#define DW_CFA_def_cfa 0x0c +#define DW_CFA_def_cfa_register 0x0d +#define DW_CFA_def_cfa_offset 0x0e +#define DW_CFA_def_cfa_expression 0x0f +#define DW_CFA_expression 0x10 +#define DW_CFA_offset_extended_sf 0x11 +#define DW_CFA_def_cfa_sf 0x12 +#define DW_CFA_def_cfa_offset_sf 0x13 +#define DW_CFA_val_offset 0x14 +#define DW_CFA_val_offset_sf 0x15 +#define DW_CFA_val_expression 0x16 +#define DW_CFA_lo_user 0x1c +#define DW_CFA_GNU_window_save 0x2d +#define DW_CFA_GNU_args_size 0x2e +#define DW_CFA_GNU_negative_offset_extended 0x2f +#define DW_CFA_hi_user 0x3f + +#define DW_EH_PE_FORM 0x07 +#define DW_EH_PE_native 0x00 +#define DW_EH_PE_leb128 0x01 +#define DW_EH_PE_data2 0x02 +#define DW_EH_PE_data4 0x03 +#define DW_EH_PE_data8 0x04 +#define DW_EH_PE_signed 0x08 +#define DW_EH_PE_ADJUST 0x70 +#define DW_EH_PE_abs 0x00 +#define DW_EH_PE_pcrel 0x10 +#define DW_EH_PE_textrel 0x20 +#define DW_EH_PE_datarel 0x30 +#define DW_EH_PE_funcrel 0x40 +#define DW_EH_PE_aligned 0x50 +#define DW_EH_PE_indirect 0x80 +#define DW_EH_PE_omit 0xff + +typedef unsigned long uleb128_t; +typedef signed long sleb128_t; + +static struct unwind_table { + struct { + unsigned long pc; + unsigned long range; + } core, init; + const void *address; + unsigned long size; + struct unwind_table *link; + const char *name; +} root_table, *last_table; + +struct unwind_item { + enum item_location { + Nowhere, + Memory, + Register, + Value + } where; + uleb128_t value; +}; + +struct unwind_state { + uleb128_t loc, org; + const u8 *cieStart, *cieEnd; + uleb128_t codeAlign; + sleb128_t dataAlign; + struct cfa { + uleb128_t reg, offs; + } cfa; + struct unwind_item regs[ARRAY_SIZE(reg_info)]; + unsigned stackDepth:8; + unsigned version:8; + const u8 *label; + const u8 *stack[MAX_STACK_DEPTH]; +}; + +static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; + +static struct unwind_table *find_table(unsigned long pc) +{ + struct unwind_table *table; + + for (table = &root_table; table; table = table->link) + if ((pc >= table->core.pc + && pc < table->core.pc + table->core.range) + || (pc >= table->init.pc + && pc < table->init.pc + table->init.range)) + break; + + return table; +} + +static void init_unwind_table(struct unwind_table *table, + const char *name, + const void *core_start, + unsigned long core_size, + const void *init_start, + unsigned long init_size, + const void *table_start, + unsigned long table_size) +{ + table->core.pc = (unsigned long)core_start; + table->core.range = core_size; + table->init.pc = (unsigned long)init_start; + table->init.range = init_size; + table->address = table_start; + table->size = table_size; + table->link = NULL; + table->name = name; +} + +void __init unwind_init(void) +{ + init_unwind_table(&root_table, "kernel", + _text, _end - _text, + NULL, 0, + __start_unwind, __end_unwind - __start_unwind); +} + +#ifdef CONFIG_MODULES + +/* Must be called with module_mutex held. */ +void *unwind_add_table(struct module *module, + const void *table_start, + unsigned long table_size) +{ + struct unwind_table *table; + + if (table_size <= 0) + return NULL; + + table = kmalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return NULL; + + init_unwind_table(table, module->name, + module->module_core, module->core_size, + module->module_init, module->init_size, + table_start, table_size); + + if (last_table) + last_table->link = table; + else + root_table.link = table; + last_table = table; + + return table; +} + +struct unlink_table_info +{ + struct unwind_table *table; + int init_only; +}; + +static int unlink_table(void *arg) +{ + struct unlink_table_info *info = arg; + struct unwind_table *table = info->table, *prev; + + for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) + ; + + if (prev->link) { + if (info->init_only) { + table->init.pc = 0; + table->init.range = 0; + info->table = NULL; + } else { + prev->link = table->link; + if (!prev->link) + last_table = prev; + } + } else + info->table = NULL; + + return 0; +} + +/* Must be called with module_mutex held. */ +void unwind_remove_table(void *handle, int init_only) +{ + struct unwind_table *table = handle; + struct unlink_table_info info; + + if (!table || table == &root_table) + return; + + if (init_only && table == last_table) { + table->init.pc = 0; + table->init.range = 0; + return; + } + + info.table = table; + info.init_only = init_only; + stop_machine_run(unlink_table, &info, NR_CPUS); + + if (info.table) + kfree(table); +} + +#endif /* CONFIG_MODULES */ + +static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) +{ + const u8 *cur = *pcur; + uleb128_t value; + unsigned shift; + + for (shift = 0, value = 0; cur < end; shift += 7) { + if (shift + 7 > 8 * sizeof(value) + && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { + cur = end + 1; + break; + } + value |= (uleb128_t)(*cur & 0x7f) << shift; + if (!(*cur++ & 0x80)) + break; + } + *pcur = cur; + + return value; +} + +static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) +{ + const u8 *cur = *pcur; + sleb128_t value; + unsigned shift; + + for (shift = 0, value = 0; cur < end; shift += 7) { + if (shift + 7 > 8 * sizeof(value) + && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { + cur = end + 1; + break; + } + value |= (sleb128_t)(*cur & 0x7f) << shift; + if (!(*cur & 0x80)) { + value |= -(*cur++ & 0x40) << shift; + break; + } + } + *pcur = cur; + + return value; +} + +static unsigned long read_pointer(const u8 **pLoc, + const void *end, + signed ptrType) +{ + unsigned long value = 0; + union { + const u8 *p8; + const u16 *p16u; + const s16 *p16s; + const u32 *p32u; + const s32 *p32s; + const unsigned long *pul; + } ptr; + + if (ptrType < 0 || ptrType == DW_EH_PE_omit) + return 0; + ptr.p8 = *pLoc; + switch(ptrType & DW_EH_PE_FORM) { + case DW_EH_PE_data2: + if (end < (const void *)(ptr.p16u + 1)) + return 0; + if(ptrType & DW_EH_PE_signed) + value = get_unaligned(ptr.p16s++); + else + value = get_unaligned(ptr.p16u++); + break; + case DW_EH_PE_data4: +#ifdef CONFIG_64BIT + if (end < (const void *)(ptr.p32u + 1)) + return 0; + if(ptrType & DW_EH_PE_signed) + value = get_unaligned(ptr.p32s++); + else + value = get_unaligned(ptr.p32u++); + break; + case DW_EH_PE_data8: + BUILD_BUG_ON(sizeof(u64) != sizeof(value)); +#else + BUILD_BUG_ON(sizeof(u32) != sizeof(value)); +#endif + case DW_EH_PE_native: + if (end < (const void *)(ptr.pul + 1)) + return 0; + value = get_unaligned(ptr.pul++); + break; + case DW_EH_PE_leb128: + BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); + value = ptrType & DW_EH_PE_signed + ? get_sleb128(&ptr.p8, end) + : get_uleb128(&ptr.p8, end); + if ((const void *)ptr.p8 > end) + return 0; + break; + default: + return 0; + } + switch(ptrType & DW_EH_PE_ADJUST) { + case DW_EH_PE_abs: + break; + case DW_EH_PE_pcrel: + value += (unsigned long)*pLoc; + break; + default: + return 0; + } + if ((ptrType & DW_EH_PE_indirect) + && __get_user(value, (unsigned long *)value)) + return 0; + *pLoc = ptr.p8; + + return value; +} + +static signed fde_pointer_type(const u32 *cie) +{ + const u8 *ptr = (const u8 *)(cie + 2); + unsigned version = *ptr; + + if (version != 1) + return -1; /* unsupported */ + if (*++ptr) { + const char *aug; + const u8 *end = (const u8 *)(cie + 1) + *cie; + uleb128_t len; + + /* check if augmentation size is first (and thus present) */ + if (*ptr != 'z') + return -1; + /* check if augmentation string is nul-terminated */ + if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) + return -1; + ++ptr; /* skip terminator */ + get_uleb128(&ptr, end); /* skip code alignment */ + get_sleb128(&ptr, end); /* skip data alignment */ + /* skip return address column */ + version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end); + len = get_uleb128(&ptr, end); /* augmentation length */ + if (ptr + len < ptr || ptr + len > end) + return -1; + end = ptr + len; + while (*++aug) { + if (ptr >= end) + return -1; + switch(*aug) { + case 'L': + ++ptr; + break; + case 'P': { + signed ptrType = *ptr++; + + if (!read_pointer(&ptr, end, ptrType) || ptr > end) + return -1; + } + break; + case 'R': + return *ptr; + default: + return -1; + } + } + } + return DW_EH_PE_native|DW_EH_PE_abs; +} + +static int advance_loc(unsigned long delta, struct unwind_state *state) +{ + state->loc += delta * state->codeAlign; + + return delta > 0; +} + +static void set_rule(uleb128_t reg, + enum item_location where, + uleb128_t value, + struct unwind_state *state) +{ + if (reg < ARRAY_SIZE(state->regs)) { + state->regs[reg].where = where; + state->regs[reg].value = value; + } +} + +static int processCFI(const u8 *start, + const u8 *end, + unsigned long targetLoc, + signed ptrType, + struct unwind_state *state) +{ + union { + const u8 *p8; + const u16 *p16; + const u32 *p32; + } ptr; + int result = 1; + + if (start != state->cieStart) { + state->loc = state->org; + result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); + if (targetLoc == 0 && state->label == NULL) + return result; + } + for (ptr.p8 = start; result && ptr.p8 < end; ) { + switch(*ptr.p8 >> 6) { + uleb128_t value; + + case 0: + switch(*ptr.p8++) { + case DW_CFA_nop: + break; + case DW_CFA_set_loc: + if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) + result = 0; + break; + case DW_CFA_advance_loc1: + result = ptr.p8 < end && advance_loc(*ptr.p8++, state); + break; + case DW_CFA_advance_loc2: + result = ptr.p8 <= end + 2 + && advance_loc(*ptr.p16++, state); + break; + case DW_CFA_advance_loc4: + result = ptr.p8 <= end + 4 + && advance_loc(*ptr.p32++, state); + break; + case DW_CFA_offset_extended: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_val_offset: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Value, get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_offset_extended_sf: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); + break; + case DW_CFA_val_offset_sf: + value = get_uleb128(&ptr.p8, end); + set_rule(value, Value, get_sleb128(&ptr.p8, end), state); + break; + case DW_CFA_restore_extended: + case DW_CFA_undefined: + case DW_CFA_same_value: + set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); + break; + case DW_CFA_register: + value = get_uleb128(&ptr.p8, end); + set_rule(value, + Register, + get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_remember_state: + if (ptr.p8 == state->label) { + state->label = NULL; + return 1; + } + if (state->stackDepth >= MAX_STACK_DEPTH) + return 0; + state->stack[state->stackDepth++] = ptr.p8; + break; + case DW_CFA_restore_state: + if (state->stackDepth) { + const uleb128_t loc = state->loc; + const u8 *label = state->label; + + state->label = state->stack[state->stackDepth - 1]; + memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); + memset(state->regs, 0, sizeof(state->regs)); + state->stackDepth = 0; + result = processCFI(start, end, 0, ptrType, state); + state->loc = loc; + state->label = label; + } else + return 0; + break; + case DW_CFA_def_cfa: + state->cfa.reg = get_uleb128(&ptr.p8, end); + /*nobreak*/ + case DW_CFA_def_cfa_offset: + state->cfa.offs = get_uleb128(&ptr.p8, end); + break; + case DW_CFA_def_cfa_sf: + state->cfa.reg = get_uleb128(&ptr.p8, end); + /*nobreak*/ + case DW_CFA_def_cfa_offset_sf: + state->cfa.offs = get_sleb128(&ptr.p8, end) + * state->dataAlign; + break; + case DW_CFA_def_cfa_register: + state->cfa.reg = get_uleb128(&ptr.p8, end); + break; + /*todo case DW_CFA_def_cfa_expression: */ + /*todo case DW_CFA_expression: */ + /*todo case DW_CFA_val_expression: */ + case DW_CFA_GNU_args_size: + get_uleb128(&ptr.p8, end); + break; + case DW_CFA_GNU_negative_offset_extended: + value = get_uleb128(&ptr.p8, end); + set_rule(value, + Memory, + (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); + break; + case DW_CFA_GNU_window_save: + default: + result = 0; + break; + } + break; + case 1: + result = advance_loc(*ptr.p8++ & 0x3f, state); + break; + case 2: + value = *ptr.p8++ & 0x3f; + set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); + break; + case 3: + set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); + break; + } + if (ptr.p8 > end) + result = 0; + if (result && targetLoc != 0 && targetLoc < state->loc) + return 1; + } + + return result + && ptr.p8 == end + && (targetLoc == 0 + || (/*todo While in theory this should apply, gcc in practice omits + everything past the function prolog, and hence the location + never reaches the end of the function. + targetLoc < state->loc &&*/ state->label == NULL)); +} + +/* Unwind to previous to frame. Returns 0 if successful, negative + * number in case of an error. */ +int unwind(struct unwind_frame_info *frame) +{ +#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) + const u32 *fde = NULL, *cie = NULL; + const u8 *ptr = NULL, *end = NULL; + unsigned long startLoc = 0, endLoc = 0, cfa; + unsigned i; + signed ptrType = -1; + uleb128_t retAddrReg = 0; + struct unwind_table *table; + struct unwind_state state; + + if (UNW_PC(frame) == 0) + return -EINVAL; + if ((table = find_table(UNW_PC(frame))) != NULL + && !(table->size & (sizeof(*fde) - 1))) { + unsigned long tableSize = table->size; + + for (fde = table->address; + tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; + tableSize -= sizeof(*fde) + *fde, + fde += 1 + *fde / sizeof(*fde)) { + if (!*fde || (*fde & (sizeof(*fde) - 1))) + break; + if (!fde[1]) + continue; /* this is a CIE */ + if ((fde[1] & (sizeof(*fde) - 1)) + || fde[1] > (unsigned long)(fde + 1) + - (unsigned long)table->address) + continue; /* this is not a valid FDE */ + cie = fde + 1 - fde[1] / sizeof(*fde); + if (*cie <= sizeof(*cie) + 4 + || *cie >= fde[1] - sizeof(*fde) + || (*cie & (sizeof(*cie) - 1)) + || cie[1] + || (ptrType = fde_pointer_type(cie)) < 0) { + cie = NULL; /* this is not a (valid) CIE */ + continue; + } + ptr = (const u8 *)(fde + 2); + startLoc = read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType); + endLoc = startLoc + + read_pointer(&ptr, + (const u8 *)(fde + 1) + *fde, + ptrType & DW_EH_PE_indirect + ? ptrType + : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); + if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) + break; + cie = NULL; + } + } + if (cie != NULL) { + memset(&state, 0, sizeof(state)); + state.cieEnd = ptr; /* keep here temporarily */ + ptr = (const u8 *)(cie + 2); + end = (const u8 *)(cie + 1) + *cie; + if ((state.version = *ptr) != 1) + cie = NULL; /* unsupported version */ + else if (*++ptr) { + /* check if augmentation size is first (and thus present) */ + if (*ptr == 'z') { + /* check for ignorable (or already handled) + * nul-terminated augmentation string */ + while (++ptr < end && *ptr) + if (strchr("LPR", *ptr) == NULL) + break; + } + if (ptr >= end || *ptr) + cie = NULL; + } + ++ptr; + } + if (cie != NULL) { + /* get code aligment factor */ + state.codeAlign = get_uleb128(&ptr, end); + /* get data aligment factor */ + state.dataAlign = get_sleb128(&ptr, end); + if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) + cie = NULL; + else { + retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); + /* skip augmentation */ + if (((const char *)(cie + 2))[1] == 'z') + ptr += get_uleb128(&ptr, end); + if (ptr > end + || retAddrReg >= ARRAY_SIZE(reg_info) + || REG_INVALID(retAddrReg) + || reg_info[retAddrReg].width != sizeof(unsigned long)) + cie = NULL; + } + } + if (cie != NULL) { + state.cieStart = ptr; + ptr = state.cieEnd; + state.cieEnd = end; + end = (const u8 *)(fde + 1) + *fde; + /* skip augmentation */ + if (((const char *)(cie + 2))[1] == 'z') { + uleb128_t augSize = get_uleb128(&ptr, end); + + if ((ptr += augSize) > end) + fde = NULL; + } + } + if (cie == NULL || fde == NULL) { +#ifdef CONFIG_FRAME_POINTER + unsigned long top, bottom; +#endif + +#ifdef CONFIG_FRAME_POINTER + top = STACK_TOP(frame->task); + bottom = STACK_BOTTOM(frame->task); +# if FRAME_RETADDR_OFFSET < 0 + if (UNW_SP(frame) < top + && UNW_FP(frame) <= UNW_SP(frame) + && bottom < UNW_FP(frame) +# else + if (UNW_SP(frame) > top + && UNW_FP(frame) >= UNW_SP(frame) + && bottom > UNW_FP(frame) +# endif + && !((UNW_SP(frame) | UNW_FP(frame)) + & (sizeof(unsigned long) - 1))) { + unsigned long link; + + if (!__get_user(link, + (unsigned long *)(UNW_FP(frame) + + FRAME_LINK_OFFSET)) +# if FRAME_RETADDR_OFFSET < 0 + && link > bottom && link < UNW_FP(frame) +# else + && link > UNW_FP(frame) && link < bottom +# endif + && !(link & (sizeof(link) - 1)) + && !__get_user(UNW_PC(frame), + (unsigned long *)(UNW_FP(frame) + + FRAME_RETADDR_OFFSET))) { + UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET +# if FRAME_RETADDR_OFFSET < 0 + - +# else + + +# endif + sizeof(UNW_PC(frame)); + UNW_FP(frame) = link; + return 0; + } + } +#endif + return -ENXIO; + } + state.org = startLoc; + memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); + /* process instructions */ + if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) + || state.loc > endLoc + || state.regs[retAddrReg].where == Nowhere + || state.cfa.reg >= ARRAY_SIZE(reg_info) + || reg_info[state.cfa.reg].width != sizeof(unsigned long) + || state.cfa.offs % sizeof(unsigned long)) + return -EIO; + /* update frame */ + cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; + startLoc = min((unsigned long)UNW_SP(frame), cfa); + endLoc = max((unsigned long)UNW_SP(frame), cfa); + if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { + startLoc = min(STACK_LIMIT(cfa), cfa); + endLoc = max(STACK_LIMIT(cfa), cfa); + } +#ifndef CONFIG_64BIT +# define CASES CASE(8); CASE(16); CASE(32) +#else +# define CASES CASE(8); CASE(16); CASE(32); CASE(64) +#endif + for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { + if (REG_INVALID(i)) { + if (state.regs[i].where == Nowhere) + continue; + return -EIO; + } + switch(state.regs[i].where) { + default: + break; + case Register: + if (state.regs[i].value >= ARRAY_SIZE(reg_info) + || REG_INVALID(state.regs[i].value) + || reg_info[i].width > reg_info[state.regs[i].value].width) + return -EIO; + switch(reg_info[state.regs[i].value].width) { +#define CASE(n) \ + case sizeof(u##n): \ + state.regs[i].value = FRAME_REG(state.regs[i].value, \ + const u##n); \ + break + CASES; +#undef CASE + default: + return -EIO; + } + break; + } + } + for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { + if (REG_INVALID(i)) + continue; + switch(state.regs[i].where) { + case Nowhere: + if (reg_info[i].width != sizeof(UNW_SP(frame)) + || &FRAME_REG(i, __typeof__(UNW_SP(frame))) + != &UNW_SP(frame)) + continue; + UNW_SP(frame) = cfa; + break; + case Register: + switch(reg_info[i].width) { +#define CASE(n) case sizeof(u##n): \ + FRAME_REG(i, u##n) = state.regs[i].value; \ + break + CASES; +#undef CASE + default: + return -EIO; + } + break; + case Value: + if (reg_info[i].width != sizeof(unsigned long)) + return -EIO; + FRAME_REG(i, unsigned long) = cfa + state.regs[i].value + * state.dataAlign; + break; + case Memory: { + unsigned long addr = cfa + state.regs[i].value + * state.dataAlign; + + if ((state.regs[i].value * state.dataAlign) + % sizeof(unsigned long) + || addr < startLoc + || addr + sizeof(unsigned long) < addr + || addr + sizeof(unsigned long) > endLoc) + return -EIO; + switch(reg_info[i].width) { +#define CASE(n) case sizeof(u##n): \ + __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ + break + CASES; +#undef CASE + default: + return -EIO; + } + } + break; + } + } + + return 0; +#undef CASES +#undef FRAME_REG +} +EXPORT_SYMBOL(unwind); + +int unwind_init_frame_info(struct unwind_frame_info *info, + struct task_struct *tsk, + /*const*/ struct pt_regs *regs) +{ + info->task = tsk; + arch_unw_init_frame_info(info, regs); + + return 0; +} +EXPORT_SYMBOL(unwind_init_frame_info); + +/* + * Prepare to unwind a blocked task. + */ +int unwind_init_blocked(struct unwind_frame_info *info, + struct task_struct *tsk) +{ + info->task = tsk; + arch_unw_init_blocked(info); + + return 0; +} +EXPORT_SYMBOL(unwind_init_blocked); + +/* + * Prepare to unwind the currently running thread. + */ +int unwind_init_running(struct unwind_frame_info *info, + asmlinkage int (*callback)(struct unwind_frame_info *, + void *arg), + void *arg) +{ + info->task = current; + + return arch_unwind_init_running(info, callback, arg); +} +EXPORT_SYMBOL(unwind_init_running); + +/* + * Unwind until the return pointer is in user-land (or until an error + * occurs). Returns 0 if successful, negative number in case of + * error. + */ +int unwind_to_user(struct unwind_frame_info *info) +{ + while (!arch_unw_user_mode(info)) { + int err = unwind(info); + + if (err < 0) + return err; + } + + return 0; +} +EXPORT_SYMBOL(unwind_to_user); diff --git a/kernel/user.c b/kernel/user.c index 2116642f42c6..6408c0424291 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid) atomic_set(&new->processes, 0); atomic_set(&new->files, 0); atomic_set(&new->sigpending, 0); -#ifdef CONFIG_INOTIFY +#ifdef CONFIG_INOTIFY_USER atomic_set(&new->inotify_watches, 0); atomic_set(&new->inotify_devs, 0); #endif @@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid) new->mq_bytes = 0; new->locked_shm = 0; - if (alloc_uid_keyring(new) < 0) { + if (alloc_uid_keyring(new, current) < 0) { kmem_cache_free(uid_cachep, new); return NULL; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 880fb415a8f6..565cf7a1febd 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu, return ret; } -int schedule_on_each_cpu(void (*func) (void *info), void *info) +/** + * schedule_on_each_cpu - call a function on each online CPU from keventd + * @func: the function to call + * @info: a pointer to pass to func() + * + * Returns zero on success. + * Returns -ve errno on failure. + * + * Appears to be racy against CPU hotplug. + * + * schedule_on_each_cpu() is very slow. + */ +int schedule_on_each_cpu(void (*func)(void *info), void *info) { int cpu; - struct work_struct *work; + struct work_struct *works; - work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); - - if (!work) + works = alloc_percpu(struct work_struct); + if (!works) return -ENOMEM; + for_each_online_cpu(cpu) { - INIT_WORK(work + cpu, func, info); + INIT_WORK(per_cpu_ptr(works, cpu), func, info); __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), - work + cpu); + per_cpu_ptr(works, cpu)); } flush_workqueue(keventd_wq); - kfree(work); + free_percpu(works); return 0; } @@ -531,11 +543,11 @@ int current_is_keventd(void) static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) { struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); - LIST_HEAD(list); + struct list_head list; struct work_struct *work; spin_lock_irq(&cwq->lock); - list_splice_init(&cwq->worklist, &list); + list_replace_init(&cwq->worklist, &list); while (!list_empty(&list)) { printk("Taking work for %s\n", wq->name); @@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_CANCELED: list_for_each_entry(wq, &workqueues, list) { + if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread) + continue; /* Unbind so it can run. */ kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, any_online_cpu(cpu_online_map)); |