diff options
Diffstat (limited to 'arch/x86/kernel/ds.c')
-rw-r--r-- | arch/x86/kernel/ds.c | 1147 |
1 files changed, 644 insertions, 503 deletions
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index a2d1176c38ee..da91701a2348 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -6,14 +6,13 @@ * precise-event based sampling (PEBS). * * It manages: - * - per-thread and per-cpu allocation of BTS and PEBS - * - buffer memory allocation (optional) - * - buffer overflow handling + * - DS and BTS hardware configuration + * - buffer overflow handling (to be done) * - buffer access * - * It assumes: - * - get_task_struct on all parameter tasks - * - current is allowed to trace parameter tasks + * It does not do: + * - security checking (is the caller allowed to trace the task) + * - buffer allocation (memory accounting) * * * Copyright (C) 2007-2008 Intel Corporation. @@ -28,22 +27,69 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/mm.h> +#include <linux/kernel.h> /* * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the size of the DS structure in bytes */ - unsigned char sizeof_ds; - /* the size of one pointer-typed field in the DS structure in bytes; - this covers the first 8 fields related to buffer management. */ + /* the name of the configuration */ + const char *name; + /* the size of one pointer-typed field in the DS structure and + in the BTS and PEBS buffers in bytes; + this covers the first 8 DS fields related to buffer management. */ unsigned char sizeof_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; + /* a series of bit-masks to control various features indexed + * by enum ds_feature */ + unsigned long ctl[dsf_ctl_max]; }; -static struct ds_configuration ds_cfg; +static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) + +#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ +#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ + +#define BTS_CONTROL \ + (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ + ds_cfg.ctl[dsf_bts_overflow]) + + +/* + * A BTS or PEBS tracer. + * + * This holds the configuration of the tracer and serves as a handle + * to identify tracers. + */ +struct ds_tracer { + /* the DS context (partially) owned by this tracer */ + struct ds_context *context; + /* the buffer provided on ds_request() and its size in bytes */ + void *buffer; + size_t size; +}; + +struct bts_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* the trace including the DS configuration */ + struct bts_trace trace; + /* buffer overflow notification function */ + bts_ovfl_callback_t ovfl; +}; + +struct pebs_tracer { + /* the common DS part */ + struct ds_tracer ds; + /* the trace including the DS configuration */ + struct pebs_trace trace; + /* buffer overflow notification function */ + pebs_ovfl_callback_t ovfl; +}; /* * Debug Store (DS) save area configuration (see Intel64 and IA32 @@ -109,32 +155,9 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, /* - * Locking is done only for allocating BTS or PEBS resources and for - * guarding context and buffer memory allocation. - * - * Most functions require the current task to own the ds context part - * they are going to access. All the locking is done when validating - * access to the context. + * Locking is done only for allocating BTS or PEBS resources. */ -static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); - -/* - * Validate that the current task is allowed to access the BTS/PEBS - * buffer of the parameter task. - * - * Returns 0, if access is granted; -Eerrno, otherwise. - */ -static inline int ds_validate_access(struct ds_context *context, - enum ds_qualifier qual) -{ - if (!context) - return -EPERM; - - if (context->owner[qual] == current) - return 0; - - return -EPERM; -} +static DEFINE_SPINLOCK(ds_lock); /* @@ -150,27 +173,32 @@ static inline int ds_validate_access(struct ds_context *context, * >0 number of per-thread tracers * <0 number of per-cpu tracers * - * The below functions to get and put tracers and to check the - * allocation type require the ds_lock to be held by the caller. - * * Tracers essentially gives the number of ds contexts for a certain * type of allocation. */ -static long tracers; +static atomic_t tracers = ATOMIC_INIT(0); static inline void get_tracer(struct task_struct *task) { - tracers += (task ? 1 : -1); + if (task) + atomic_inc(&tracers); + else + atomic_dec(&tracers); } static inline void put_tracer(struct task_struct *task) { - tracers -= (task ? 1 : -1); + if (task) + atomic_dec(&tracers); + else + atomic_inc(&tracers); } static inline int check_tracer(struct task_struct *task) { - return (task ? (tracers >= 0) : (tracers <= 0)); + return task ? + (atomic_read(&tracers) >= 0) : + (atomic_read(&tracers) <= 0); } @@ -183,99 +211,70 @@ static inline int check_tracer(struct task_struct *task) * * Contexts are use-counted. They are allocated on first access and * deallocated when the last user puts the context. - * - * We distinguish between an allocating and a non-allocating get of a - * context: - * - the allocating get is used for requesting BTS/PEBS resources. It - * requires the caller to hold the global ds_lock. - * - the non-allocating get is used for all other cases. A - * non-existing context indicates an error. It acquires and releases - * the ds_lock itself for obtaining the context. - * - * A context and its DS configuration are allocated and deallocated - * together. A context always has a DS configuration of the - * appropriate size. - */ -static DEFINE_PER_CPU(struct ds_context *, system_context); - -#define this_system_context per_cpu(system_context, smp_processor_id()) - -/* - * Returns the pointer to the parameter task's context or to the - * system-wide context, if task is NULL. - * - * Increases the use count of the returned context, if not NULL. */ -static inline struct ds_context *ds_get_context(struct task_struct *task) -{ - struct ds_context *context; - unsigned long irq; +struct ds_context { + /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + unsigned char ds[MAX_SIZEOF_DS]; + /* the owner of the BTS and PEBS configuration, respectively */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + /* use count */ + unsigned long count; + /* a pointer to the context location inside the thread_struct + * or the per_cpu context array */ + struct ds_context **this; + /* a pointer to the task owning this context, or NULL, if the + * context is owned by a cpu */ + struct task_struct *task; +}; - spin_lock_irqsave(&ds_lock, irq); +static DEFINE_PER_CPU(struct ds_context *, system_context_array); - context = (task ? task->thread.ds_ctx : this_system_context); - if (context) - context->count++; +#define system_context per_cpu(system_context_array, smp_processor_id()) - spin_unlock_irqrestore(&ds_lock, irq); - - return context; -} -/* - * Same as ds_get_context, but allocates the context and it's DS - * structure, if necessary; returns NULL; if out of memory. - */ -static inline struct ds_context *ds_alloc_context(struct task_struct *task) +static inline struct ds_context *ds_get_context(struct task_struct *task) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &this_system_context); - struct ds_context *context = *p_context; + (task ? &task->thread.ds_ctx : &system_context); + struct ds_context *context = NULL; + struct ds_context *new_context = NULL; unsigned long irq; - if (!context) { - context = kzalloc(sizeof(*context), GFP_KERNEL); - if (!context) - return NULL; - - context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); - if (!context->ds) { - kfree(context); - return NULL; - } + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + if (!new_context) + return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irqsave(&ds_lock, irq); - if (*p_context) { - kfree(context->ds); - kfree(context); + context = *p_context; + if (!context) { + context = new_context; - context = *p_context; - } else { - *p_context = context; + context->this = p_context; + context->task = task; + context->count = 0; - context->this = p_context; - context->task = task; + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + if (!task || (task == current)) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, - (unsigned long)context->ds); - } - spin_unlock_irqrestore(&ds_lock, irq); + *p_context = context; } context->count++; + spin_unlock_irqrestore(&ds_lock, irq); + + if (context != new_context) + kfree(new_context); + return context; } -/* - * Decreases the use count of the parameter context, if not NULL. - * Deallocates the context, if the use count reaches zero. - */ static inline void ds_put_context(struct ds_context *context) { unsigned long irq; @@ -285,8 +284,10 @@ static inline void ds_put_context(struct ds_context *context) spin_lock_irqsave(&ds_lock, irq); - if (--context->count) - goto out; + if (--context->count) { + spin_unlock_irqrestore(&ds_lock, irq); + return; + } *(context->this) = NULL; @@ -296,135 +297,263 @@ static inline void ds_put_context(struct ds_context *context) if (!context->task || (context->task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); - put_tracer(context->task); + spin_unlock_irqrestore(&ds_lock, irq); - /* free any leftover buffers from tracers that did not - * deallocate them properly. */ - kfree(context->buffer[ds_bts]); - kfree(context->buffer[ds_pebs]); - kfree(context->ds); kfree(context); - out: - spin_unlock_irqrestore(&ds_lock, irq); } /* - * Handle a buffer overflow + * Call the tracer's callback on a buffer overflow. * - * task: the task whose buffers are overflowing; - * NULL for a buffer overflow on the current cpu * context: the ds context * qual: the buffer type */ -static void ds_overflow(struct task_struct *task, struct ds_context *context, - enum ds_qualifier qual) +static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) { - if (!context) - return; - - if (context->callback[qual]) - (*context->callback[qual])(task); - - /* todo: do some more overflow handling */ + switch (qual) { + case ds_bts: + if (context->bts_master && + context->bts_master->ovfl) + context->bts_master->ovfl(context->bts_master); + break; + case ds_pebs: + if (context->pebs_master && + context->pebs_master->ovfl) + context->pebs_master->ovfl(context->pebs_master); + break; + } } /* - * Allocate a non-pageable buffer of the parameter size. - * Checks the memory and the locked memory rlimit. + * Write raw data into the BTS or PEBS buffer. * - * Returns the buffer, if successful; - * NULL, if out of memory or rlimit exceeded. + * The remainder of any partially written record is zeroed out. * - * size: the requested buffer size in bytes - * pages (out): if not NULL, contains the number of pages reserved + * context: the DS context + * qual: the buffer type + * record: the data to write + * size: the size of the data */ -static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) +static int ds_write(struct ds_context *context, enum ds_qualifier qual, + const void *record, size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer; + int bytes_written = 0; - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + if (!record) + return -EINVAL; - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) - return NULL; + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) - return NULL; + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - return NULL; + write_end = min(end, int_th); - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; - if (pages) - *pages = pgsz; + if (write_end <= index) + break; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); - return buffer; + record = (const char *)record + write_size; + size -= write_size; + bytes_written += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(context, qual); + } + + return bytes_written; } -static int ds_request(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl, enum ds_qualifier qual) + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + * + * We use two levels of abstraction: + * - the raw data level defined here + * - an arch-independent level defined in ds.h + */ + +enum bts_field { + bts_from, + bts_to, + bts_flags, + + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, + + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) { - struct ds_context *context; - unsigned long buffer, adj; - const unsigned long alignment = (1 << 3); - unsigned long irq; - int error = 0; + base += (ds_cfg.sizeof_field * field); + return *(unsigned long *)base; +} + +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (ds_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; +} - if (!ds_cfg.sizeof_ds) - return -EOPNOTSUPP; - /* we require some space to do alignment adjustments below */ - if (size < (alignment + ds_cfg.sizeof_rec[qual])) +/* + * The raw BTS data is architecture dependent. + * + * For higher-level users, we give an arch-independent view. + * - ds.h defines struct bts_struct + * - bts_read translates one raw bts record into a bts_struct + * - bts_write translates one bts_struct into the raw format and + * writes it into the top of the parameter tracer's buffer. + * + * return: bytes read/written on success; -Eerrno, otherwise + */ +static int bts_read(struct bts_tracer *tracer, const void *at, + struct bts_struct *out) +{ + if (!tracer) return -EINVAL; - /* buffer overflow notification is not yet implemented */ - if (ovfl) - return -EOPNOTSUPP; + if (at < tracer->trace.ds.begin) + return -EINVAL; + if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) + return -EINVAL; - context = ds_alloc_context(task); - if (!context) - return -ENOMEM; + memset(out, 0, sizeof(*out)); + if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { + out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); + out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); + out->variant.timestamp.pid = bts_get(at, bts_pid); + } else { + out->qualifier = bts_branch; + out->variant.lbr.from = bts_get(at, bts_from); + out->variant.lbr.to = bts_get(at, bts_to); + + if (!out->variant.lbr.from && !out->variant.lbr.to) + out->qualifier = bts_invalid; + } - spin_lock_irqsave(&ds_lock, irq); + return ds_cfg.sizeof_rec[ds_bts]; +} - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; +static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) +{ + unsigned char raw[MAX_SIZEOF_BTS]; - get_tracer(task); + if (!tracer) + return -EINVAL; - error = -EALREADY; - if (context->owner[qual] == current) - goto out_put_tracer; - error = -EPERM; - if (context->owner[qual] != NULL) - goto out_put_tracer; - context->owner[qual] = current; + if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) + return -EOVERFLOW; - spin_unlock_irqrestore(&ds_lock, irq); + switch (in->qualifier) { + case bts_invalid: + bts_set(raw, bts_from, 0); + bts_set(raw, bts_to, 0); + bts_set(raw, bts_flags, 0); + break; + case bts_branch: + bts_set(raw, bts_from, in->variant.lbr.from); + bts_set(raw, bts_to, in->variant.lbr.to); + bts_set(raw, bts_flags, 0); + break; + case bts_task_arrives: + case bts_task_departs: + bts_set(raw, bts_qual, (bts_escape | in->qualifier)); + bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); + bts_set(raw, bts_pid, in->variant.timestamp.pid); + break; + default: + return -EINVAL; + } + return ds_write(tracer->ds.context, ds_bts, raw, + ds_cfg.sizeof_rec[ds_bts]); +} - error = -ENOMEM; - if (!base) { - base = ds_allocate_buffer(size, &context->pages[qual]); - if (!base) - goto out_release; - context->buffer[qual] = base; - } - error = 0; +static void ds_write_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; + + ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); + ds_set(ds, qual, ds_index, (unsigned long)cfg->top); + ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); + ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); +} + +static void ds_read_config(struct ds_context *context, + struct ds_trace *cfg, enum ds_qualifier qual) +{ + unsigned char *ds = context->ds; - context->callback[qual] = ovfl; + cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); + cfg->top = (void *)ds_get(ds, qual, ds_index); + cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); + cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); +} + +static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, + void *base, size_t size, size_t ith, + unsigned int flags) { + unsigned long buffer, adj; /* adjust the buffer address and size to meet alignment * constraints: @@ -436,410 +565,383 @@ static int ds_request(struct task_struct *task, void *base, size_t size, */ buffer = (unsigned long)base; - adj = ALIGN(buffer, alignment) - buffer; + adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; buffer += adj; size -= adj; - size /= ds_cfg.sizeof_rec[qual]; - size *= ds_cfg.sizeof_rec[qual]; - - ds_set(context->ds, qual, ds_buffer_base, buffer); - ds_set(context->ds, qual, ds_index, buffer); - ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + trace->n = size / ds_cfg.sizeof_rec[qual]; + trace->size = ds_cfg.sizeof_rec[qual]; - if (ovfl) { - /* todo: select a suitable interrupt threshold */ - } else - ds_set(context->ds, qual, - ds_interrupt_threshold, buffer + size + 1); + size = (trace->n * trace->size); - /* we keep the context until ds_release */ - return error; - - out_release: - context->owner[qual] = NULL; - ds_put_context(context); - put_tracer(task); - return error; - - out_put_tracer: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - put_tracer(task); - return error; + trace->begin = (void *)buffer; + trace->top = trace->begin; + trace->end = (void *)(buffer + size); + /* The value for 'no threshold' is -1, which will set the + * threshold outside of the buffer, just like we want it. + */ + trace->ith = (void *)(buffer + size - ith); - out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); - ds_put_context(context); - return error; + trace->flags = flags; } -int ds_request_bts(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_bts); -} -int ds_request_pebs(struct task_struct *task, void *base, size_t size, - ds_ovfl_callback_t ovfl) -{ - return ds_request(task, base, size, ovfl, ds_pebs); -} - -static int ds_release(struct task_struct *task, enum ds_qualifier qual) +static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, + enum ds_qualifier qual, struct task_struct *task, + void *base, size_t size, size_t th, unsigned int flags) { struct ds_context *context; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + error = -EINVAL; + if (!base) goto out; - kfree(context->buffer[qual]); - context->buffer[qual] = NULL; - - current->mm->total_vm -= context->pages[qual]; - current->mm->locked_vm -= context->pages[qual]; - context->pages[qual] = 0; - context->owner[qual] = NULL; - - /* - * we put the context twice: - * once for the ds_get_context - * once for the corresponding ds_request - */ - ds_put_context(context); - out: - ds_put_context(context); - return error; -} + /* we require some space to do alignment adjustments below */ + error = -EINVAL; + if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + goto out; -int ds_release_bts(struct task_struct *task) -{ - return ds_release(task, ds_bts); -} + if (th != (size_t)-1) { + th *= ds_cfg.sizeof_rec[qual]; -int ds_release_pebs(struct task_struct *task) -{ - return ds_release(task, ds_pebs); -} + error = -EINVAL; + if (size <= th) + goto out; + } -static int ds_get_index(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, index; - int error; + tracer->buffer = base; + tracer->size = size; + error = -ENOMEM; context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + if (!context) goto out; + tracer->context = context; - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); + ds_init_ds_trace(trace, qual, base, size, th, flags); - error = ((index - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; + error = 0; out: - ds_put_context(context); return error; } -int ds_get_bts_index(struct task_struct *task, size_t *pos) -{ - return ds_get_index(task, pos, ds_bts); -} - -int ds_get_pebs_index(struct task_struct *task, size_t *pos) +struct bts_tracer *ds_request_bts(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { - return ds_get_index(task, pos, ds_pebs); -} - -static int ds_get_end(struct task_struct *task, size_t *pos, - enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, end; + struct bts_tracer *tracer; + unsigned long irq; int error; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) + error = -EOPNOTSUPP; + if (!ds_cfg.ctl[dsf_bts]) goto out; - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; - error = ((end - base) / ds_cfg.sizeof_rec[qual]); - if (pos) - *pos = error; - out: - ds_put_context(context); - return error; -} + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) + goto out; + tracer->ovfl = ovfl; -int ds_get_bts_end(struct task_struct *task, size_t *pos) -{ - return ds_get_end(task, pos, ds_bts); -} + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_bts, task, base, size, th, flags); + if (error < 0) + goto out_tracer; -int ds_get_pebs_end(struct task_struct *task, size_t *pos) -{ - return ds_get_end(task, pos, ds_pebs); -} -static int ds_access(struct task_struct *task, size_t index, - const void **record, enum ds_qualifier qual) -{ - struct ds_context *context; - unsigned long base, idx; - int error; + spin_lock_irqsave(&ds_lock, irq); - if (!record) - return -EINVAL; + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + error = -EPERM; + if (tracer->ds.context->bts_master) + goto out_put_tracer; + tracer->ds.context->bts_master = tracer; - base = ds_get(context->ds, qual, ds_buffer_base); - idx = base + (index * ds_cfg.sizeof_rec[qual]); + spin_unlock_irqrestore(&ds_lock, irq); - error = -EINVAL; - if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) - goto out; - *record = (const void *)idx; - error = ds_cfg.sizeof_rec[qual]; - out: - ds_put_context(context); - return error; -} + tracer->trace.read = bts_read; + tracer->trace.write = bts_write; -int ds_access_bts(struct task_struct *task, size_t index, const void **record) -{ - return ds_access(task, index, record, ds_bts); -} + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_bts(tracer); -int ds_access_pebs(struct task_struct *task, size_t index, const void **record) -{ - return ds_access(task, index, record, ds_pebs); + return tracer; + + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); + out_tracer: + kfree(tracer); + out: + return ERR_PTR(error); } -static int ds_write(struct task_struct *task, const void *record, size_t size, - enum ds_qualifier qual, int force) +struct pebs_tracer *ds_request_pebs(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { - struct ds_context *context; + struct pebs_tracer *tracer; + unsigned long irq; int error; - if (!record) - return -EINVAL; + /* buffer overflow notification is not yet implemented */ + error = -EOPNOTSUPP; + if (ovfl) + goto out; - error = -EPERM; - context = ds_get_context(task); - if (!context) + error = -ENOMEM; + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + if (!tracer) goto out; + tracer->ovfl = ovfl; - if (!force) { - error = ds_validate_access(context, qual); - if (error < 0) - goto out; - } + error = ds_request(&tracer->ds, &tracer->trace.ds, + ds_pebs, task, base, size, th, flags); + if (error < 0) + goto out_tracer; - error = 0; - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; + spin_lock_irqsave(&ds_lock, irq); - /* - * write as much as possible without producing an - * overflow interrupt. - * - * interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + error = -EPERM; + if (!check_tracer(task)) + goto out_unlock; + get_tracer(task); - write_end = min(end, int_th); + error = -EPERM; + if (tracer->ds.context->pebs_master) + goto out_put_tracer; + tracer->ds.context->pebs_master = tracer; - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ - if (write_end <= index) - write_end = end; + spin_unlock_irqrestore(&ds_lock, irq); - if (write_end <= index) - goto out; + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_resume_pebs(tracer); - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); + return tracer; - record = (const char *)record + write_size; - size -= write_size; - error += write_size; + out_put_tracer: + put_tracer(task); + out_unlock: + spin_unlock_irqrestore(&ds_lock, irq); + ds_put_context(tracer->ds.context); + out_tracer: + kfree(tracer); + out: + return ERR_PTR(error); +} - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; +void ds_release_bts(struct bts_tracer *tracer) +{ + if (!tracer) + return; - /* zero out trailing bytes */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; + ds_suspend_bts(tracer); - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); + WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); + tracer->ds.context->bts_master = NULL; - if (index >= int_th) - ds_overflow(task, context, qual); - } + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); - out: - ds_put_context(context); - return error; + kfree(tracer); } -int ds_write_bts(struct task_struct *task, const void *record, size_t size) +void ds_suspend_bts(struct bts_tracer *tracer) { - return ds_write(task, record, size, ds_bts, /* force = */ 0); -} + struct task_struct *task; -int ds_write_pebs(struct task_struct *task, const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 0); -} + if (!tracer) + return; -int ds_unchecked_write_bts(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_bts, /* force = */ 1); -} + task = tracer->ds.context->task; -int ds_unchecked_write_pebs(struct task_struct *task, - const void *record, size_t size) -{ - return ds_write(task, record, size, ds_pebs, /* force = */ 1); + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + + if (task) { + task->thread.debugctlmsr &= ~BTS_CONTROL; + + if (!task->thread.debugctlmsr) + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } } -static int ds_reset_or_clear(struct task_struct *task, - enum ds_qualifier qual, int clear) +void ds_resume_bts(struct bts_tracer *tracer) { - struct ds_context *context; - unsigned long base, end; - int error; + struct task_struct *task; + unsigned long control; - context = ds_get_context(task); - error = ds_validate_access(context, qual); - if (error < 0) - goto out; + if (!tracer) + return; - base = ds_get(context->ds, qual, ds_buffer_base); - end = ds_get(context->ds, qual, ds_absolute_maximum); + task = tracer->ds.context->task; - if (clear) - memset((void *)base, 0, end - base); + control = ds_cfg.ctl[dsf_bts]; + if (!(tracer->trace.ds.flags & BTS_KERNEL)) + control |= ds_cfg.ctl[dsf_bts_kernel]; + if (!(tracer->trace.ds.flags & BTS_USER)) + control |= ds_cfg.ctl[dsf_bts_user]; - ds_set(context->ds, qual, ds_index, base); + if (task) { + task->thread.debugctlmsr |= control; + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + } - error = 0; - out: - ds_put_context(context); - return error; + if (!task || (task == current)) + update_debugctlmsr(get_debugctlmsr() | control); } -int ds_reset_bts(struct task_struct *task) +void ds_release_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); + if (!tracer) + return; + + ds_suspend_pebs(tracer); + + WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); + tracer->ds.context->pebs_master = NULL; + + put_tracer(tracer->ds.context->task); + ds_put_context(tracer->ds.context); + + kfree(tracer); } -int ds_reset_pebs(struct task_struct *task) +void ds_suspend_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); + } -int ds_clear_bts(struct task_struct *task) +void ds_resume_pebs(struct pebs_tracer *tracer) { - return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); + } -int ds_clear_pebs(struct task_struct *task) +const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { - return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); + if (!tracer) + return NULL; + + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + return &tracer->trace; } -int ds_get_pebs_reset(struct task_struct *task, u64 *value) +const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) { - struct ds_context *context; - int error; + if (!tracer) + return NULL; + + ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + tracer->trace.reset_value = + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); - if (!value) + return &tracer->trace; +} + +int ds_reset_bts(struct bts_tracer *tracer) +{ + if (!tracer) return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + tracer->trace.ds.top = tracer->trace.ds.begin; - *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); - error = 0; - out: - ds_put_context(context); - return error; + return 0; } -int ds_set_pebs_reset(struct task_struct *task, u64 value) +int ds_reset_pebs(struct pebs_tracer *tracer) { - struct ds_context *context; - int error; + if (!tracer) + return -EINVAL; - context = ds_get_context(task); - error = ds_validate_access(context, ds_pebs); - if (error < 0) - goto out; + tracer->trace.ds.top = tracer->trace.ds.begin; - *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + ds_set(tracer->ds.context->ds, ds_bts, ds_index, + (unsigned long)tracer->trace.ds.top); - error = 0; - out: - ds_put_context(context); - return error; + return 0; +} + +int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +{ + if (!tracer) + return -EINVAL; + + *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + + return 0; } -static const struct ds_configuration ds_cfg_var = { - .sizeof_ds = sizeof(long) * 12, - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, +static const struct ds_configuration ds_cfg_netburst = { + .name = "netburst", + .ctl[dsf_bts] = (1 << 2) | (1 << 3), + .ctl[dsf_bts_kernel] = (1 << 5), + .ctl[dsf_bts_user] = (1 << 6), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = sizeof(long) * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; -static const struct ds_configuration ds_cfg_64 = { - .sizeof_ds = 8 * 12, - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, +static const struct ds_configuration ds_cfg_pentium_m = { + .name = "pentium m", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, #ifdef __i386__ - .sizeof_rec[ds_pebs] = 8 * 10 + .sizeof_rec[ds_pebs] = sizeof(long) * 10, #else - .sizeof_rec[ds_pebs] = 8 * 18 + .sizeof_rec[ds_pebs] = sizeof(long) * 18, #endif }; +static const struct ds_configuration ds_cfg_core2 = { + .name = "core 2", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 18, +}; -static inline void +static void ds_configure(const struct ds_configuration *cfg) { + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; + + printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + + if (!cpu_has_bts) { + ds_cfg.ctl[dsf_bts] = 0; + printk(KERN_INFO "[ds] bts not available\n"); + } + if (!cpu_has_pebs) + printk(KERN_INFO "[ds] pebs not available\n"); + + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -847,16 +949,15 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { + case 0 ... 0xC: + /* sorry, don't know about them */ + break; case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_pentium_m); break; - case 0xF: /* Core2 */ - case 0x1C: /* Atom */ - ds_configure(&ds_cfg_64); - break; - default: - /* sorry, don't know about them */ + default: /* Core2, Atom, ... */ + ds_configure(&ds_cfg_core2); break; } break; @@ -865,7 +966,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_var); + ds_configure(&ds_cfg_netburst); break; default: /* sorry, don't know about them */ @@ -878,12 +979,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } -void ds_free(struct ds_context *context) +/* + * Change the DS configuration from tracing prev to tracing next. + */ +void ds_switch_to(struct task_struct *prev, struct task_struct *next) +{ + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + + if (prev_ctx) { + update_debugctlmsr(0); + + if (prev_ctx->bts_master && + (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_departs, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = prev->pid + }; + bts_write(prev_ctx->bts_master, &ts); + } + } + + if (next_ctx) { + if (next_ctx->bts_master && + (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { + struct bts_struct ts = { + .qualifier = bts_task_arrives, + .variant.timestamp.jiffies = jiffies_64, + .variant.timestamp.pid = next->pid + }; + bts_write(next_ctx->bts_master, &ts); + } + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); + } + + update_debugctlmsr(next->thread.debugctlmsr); +} + +void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) +{ + clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); + tsk->thread.ds_ctx = NULL; +} + +void ds_exit_thread(struct task_struct *tsk) { - /* This is called when the task owning the parameter context - * is dying. There should not be any user of that context left - * to disturb us, anymore. */ - unsigned long leftovers = context->count; - while (leftovers--) - ds_put_context(context); + WARN_ON(tsk->thread.ds_ctx); } |