From 9961079348d43dddb1f21118c17f054f63bbaa05 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 22 Sep 2009 12:40:33 +1000 Subject: tracing/workqueue: Use %pf in workqueue trace events Using %pf instead of %pF supresses printing of the function offset which will always be 0 in the case of worklet functions. Signed-off-by: Anton Blanchard Cc: KOSAKI Motohiro Cc: Li Zefan Cc: Zhaolei Cc: Lai Jiangshan Cc: Ingo Molnar Cc: Steven Rostedt LKML-Reference: <20090922024033.GB31801@kryten> Signed-off-by: Frederic Weisbecker --- include/trace/events/workqueue.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index fcfd9a1e4b96..e4612dbd7ba6 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -26,7 +26,7 @@ TRACE_EVENT(workqueue_insertion, __entry->func = work->func; ), - TP_printk("thread=%s:%d func=%pF", __entry->thread_comm, + TP_printk("thread=%s:%d func=%pf", __entry->thread_comm, __entry->thread_pid, __entry->func) ); @@ -48,7 +48,7 @@ TRACE_EVENT(workqueue_execution, __entry->func = work->func; ), - TP_printk("thread=%s:%d func=%pF", __entry->thread_comm, + TP_printk("thread=%s:%d func=%pf", __entry->thread_comm, __entry->thread_pid, __entry->func) ); -- cgit v1.2.3 From 55138e0bc29c0751e2152df9ad35deea542f29b3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 29 Sep 2009 13:31:31 -0400 Subject: ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks Work around problems in the writeback code to force out writebacks in larger chunks than just 4mb, which is just too small. This also works around limitations in the ext4 block allocator, which can't allocate more than 2048 blocks at a time. So we need to defeat the round-robin characteristics of the writeback code and try to write out as many blocks in one inode before allowing the writeback code to move on to another inode. We add a a new per-filesystem tunable, max_writeback_mb_bump, which caps this to a default of 128mb per inode. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 105 +++++++++++++++++++++++++++++++++++++++----- fs/ext4/super.c | 3 ++ include/trace/events/ext4.h | 14 ++++-- 4 files changed, 107 insertions(+), 16 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index e227eea23f05..a58438e18d0b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -942,6 +942,7 @@ struct ext4_sb_info { unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; + unsigned int s_max_writeback_mb_bump; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5fb72a98ccbe..20e2d704dc2e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1144,6 +1144,64 @@ static int check_block_validity(struct inode *inode, const char *msg, return 0; } +/* + * Return the number of dirty pages in the given inode starting at + * page frame idx. + */ +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, + unsigned int max_pages) +{ + struct address_space *mapping = inode->i_mapping; + pgoff_t index; + struct pagevec pvec; + pgoff_t num = 0; + int i, nr_pages, done = 0; + + if (max_pages == 0) + return 0; + pagevec_init(&pvec, 0); + while (!done) { + index = idx; + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + (pgoff_t)PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + struct buffer_head *bh, *head; + + lock_page(page); + if (unlikely(page->mapping != mapping) || + !PageDirty(page) || + PageWriteback(page) || + page->index != idx) { + done = 1; + unlock_page(page); + break; + } + head = page_buffers(page); + bh = head; + do { + if (!buffer_delay(bh) && + !buffer_unwritten(bh)) { + done = 1; + break; + } + } while ((bh = bh->b_this_page) != head); + unlock_page(page); + if (done) + break; + idx++; + num++; + if (num >= max_pages) + break; + } + pagevec_release(&pvec); + } + return num; +} + /* * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -2743,8 +2801,10 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + unsigned int max_pages; int range_cyclic, cycled = 1, io_done = 0; - int needed_blocks, ret = 0, nr_to_writebump = 0; + int needed_blocks, ret = 0; + long desired_nr_to_write, nr_to_writebump = 0; loff_t range_start = wbc->range_start; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); @@ -2771,16 +2831,6 @@ static int ext4_da_writepages(struct address_space *mapping, if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) return -EROFS; - /* - * Make sure nr_to_write is >= sbi->s_mb_stream_request - * This make sure small files blocks are allocated in - * single attempt. This ensure that small files - * get less fragmented. - */ - if (wbc->nr_to_write < sbi->s_mb_stream_request) { - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; - wbc->nr_to_write = sbi->s_mb_stream_request; - } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; @@ -2795,6 +2845,36 @@ static int ext4_da_writepages(struct address_space *mapping, } else index = wbc->range_start >> PAGE_CACHE_SHIFT; + /* + * This works around two forms of stupidity. The first is in + * the writeback code, which caps the maximum number of pages + * written to be 1024 pages. This is wrong on multiple + * levels; different architectues have a different page size, + * which changes the maximum amount of data which gets + * written. Secondly, 4 megabytes is way too small. XFS + * forces this value to be 16 megabytes by multiplying + * nr_to_write parameter by four, and then relies on its + * allocator to allocate larger extents to make them + * contiguous. Unfortunately this brings us to the second + * stupidity, which is that ext4's mballoc code only allocates + * at most 2048 blocks. So we force contiguous writes up to + * the number of dirty blocks in the inode, or + * sbi->max_writeback_mb_bump whichever is smaller. + */ + max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT); + if (!range_cyclic && range_whole) + desired_nr_to_write = wbc->nr_to_write * 8; + else + desired_nr_to_write = ext4_num_dirty_pages(inode, index, + max_pages); + if (desired_nr_to_write > max_pages) + desired_nr_to_write = max_pages; + + if (wbc->nr_to_write < desired_nr_to_write) { + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write; + wbc->nr_to_write = desired_nr_to_write; + } + mpd.wbc = wbc; mpd.inode = mapping->host; @@ -2914,7 +2994,8 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - wbc->nr_to_write -= nr_to_writebump; + if (wbc->nr_to_write > nr_to_writebump) + wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index df539ba27779..16817737ba52 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2197,6 +2197,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan); EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); +EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2210,6 +2211,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_order2_req), ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), + ATTR_LIST(max_writeback_mb_bump), NULL, }; @@ -2679,6 +2681,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_stripe = ext4_get_stripe_size(sbi); + sbi->s_max_writeback_mb_bump = 128; /* * set up enough so that it can read an inode diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index c1bd8f1e8b94..7c6bbb7198a3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -236,6 +236,7 @@ TRACE_EVENT(ext4_da_writepages, __field( char, for_kupdate ) __field( char, for_reclaim ) __field( char, range_cyclic ) + __field( pgoff_t, writeback_index ) ), TP_fast_assign( @@ -249,15 +250,17 @@ TRACE_EVENT(ext4_da_writepages, __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; + __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, __entry->for_kupdate, __entry->for_reclaim, - __entry->range_cyclic) + __entry->range_cyclic, + (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_pages, @@ -309,6 +312,7 @@ TRACE_EVENT(ext4_da_writepages_result, __field( char, encountered_congestion ) __field( char, more_io ) __field( char, no_nrwrite_index_update ) + __field( pgoff_t, writeback_index ) ), TP_fast_assign( @@ -320,14 +324,16 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->encountered_congestion = wbc->encountered_congestion; __entry->more_io = wbc->more_io; __entry->no_nrwrite_index_update = wbc->no_nrwrite_index_update; + __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d", + TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld congestion %d more_io %d no_nrwrite_index_update %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->encountered_congestion, __entry->more_io, - __entry->no_nrwrite_index_update) + __entry->no_nrwrite_index_update, + (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_begin, -- cgit v1.2.3 From 296c355cd6443d89fa251885a8d78778fe111dc4 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 Sep 2009 00:32:42 -0400 Subject: ext4: Use tracepoints for mb_history trace file The /proc/fs/ext4//mb_history was maintained manually, and had a number of problems: it required a largish amount of memory to be allocated for each ext4 filesystem, and the s_mb_history_lock introduced a CPU contention problem. By ripping out the mb_history code and replacing it with ftrace tracepoints, and we get more functionality: timestamps, event filtering, the ability to correlate mballoc history with other ext4 tracepoints, etc. Signed-off-by: "Theodore Ts'o" --- Documentation/filesystems/proc.txt | 1 - fs/ext4/ext4.h | 14 +- fs/ext4/mballoc.c | 301 ++----------------------------------- fs/ext4/mballoc.h | 33 ---- fs/ext4/super.c | 18 +-- include/trace/events/ext4.h | 163 ++++++++++++++++++++ 6 files changed, 182 insertions(+), 348 deletions(-) (limited to 'include/trace') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index b5aee7838a00..2c48f945546b 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1113,7 +1113,6 @@ Table 1-12: Files in /proc/fs/ext4/ .............................................................................. File Content mb_groups details of multiblock allocator buddy cache of free blocks - mb_history multiblock allocation history .............................................................................. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b491576e11c3..c508cf7be75c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -65,6 +65,12 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +/* + * Flags used in mballoc's allocation_context flags field. + * + * Also used to show what's going on for debugging purposes when the + * flag field is exported via the traceport interface + */ /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 @@ -971,14 +977,6 @@ struct ext4_sb_info { unsigned long s_mb_last_group; unsigned long s_mb_last_start; - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; - int s_mb_history_cur; - int s_mb_history_max; - int s_mb_history_num; - spinlock_t s_mb_history_lock; - int s_mb_history_filter; - /* stats for buddy allocator */ spinlock_t s_mb_pa_lock; atomic_t s_bal_reqs; /* number of reqs with len > 1 */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 3e2320e66721..bba12824defa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2096,207 +2096,6 @@ out: return err; } -#ifdef EXT4_MB_HISTORY -struct ext4_mb_proc_session { - struct ext4_mb_history *history; - struct super_block *sb; - int start; - int max; -}; - -static void *ext4_mb_history_skip_empty(struct ext4_mb_proc_session *s, - struct ext4_mb_history *hs, - int first) -{ - if (hs == s->history + s->max) - hs = s->history; - if (!first && hs == s->history + s->start) - return NULL; - while (hs->orig.fe_len == 0) { - hs++; - if (hs == s->history + s->max) - hs = s->history; - if (hs == s->history + s->start) - return NULL; - } - return hs; -} - -static void *ext4_mb_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - hs = ext4_mb_history_skip_empty(s, s->history + s->start, 1); - if (!hs) - return NULL; - while (--l && (hs = ext4_mb_history_skip_empty(s, ++hs, 0)) != NULL); - return hs; -} - -static void *ext4_mb_seq_history_next(struct seq_file *seq, void *v, - loff_t *pos) -{ - struct ext4_mb_proc_session *s = seq->private; - struct ext4_mb_history *hs = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return ext4_mb_history_skip_empty(s, s->history + s->start, 1); - else - return ext4_mb_history_skip_empty(s, ++hs, 0); -} - -static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) -{ - char buf[25], buf2[25], buf3[25], *fmt; - struct ext4_mb_history *hs = v; - - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " - "%-5s %-2s %-6s %-5s %-5s %-6s\n", - "pid", "inode", "original", "goal", "result", "found", - "grps", "cr", "flags", "merge", "tail", "broken"); - return 0; - } - - if (hs->op == EXT4_MB_HISTORY_ALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " - "0x%04x %-5s %-5u %-6u\n"; - sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, - hs->goal.fe_start, hs->goal.fe_len, - hs->goal.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, - hs->found, hs->groups, hs->cr, hs->flags, - hs->merged ? "M" : "", hs->tail, - hs->buddy ? 1 << hs->buddy : 0); - } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { - fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len, - hs->result.fe_logical); - sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, - hs->orig.fe_start, hs->orig.fe_len, - hs->orig.fe_logical); - seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); - } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%u/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s discard\n", - hs->pid, hs->ino, buf2); - } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%u/%d/%u", hs->result.fe_group, - hs->result.fe_start, hs->result.fe_len); - seq_printf(seq, "%-5u %-8u %-23s free\n", - hs->pid, hs->ino, buf2); - } - return 0; -} - -static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations ext4_mb_seq_history_ops = { - .start = ext4_mb_seq_history_start, - .next = ext4_mb_seq_history_next, - .stop = ext4_mb_seq_history_stop, - .show = ext4_mb_seq_history_show, -}; - -static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) -{ - struct super_block *sb = PDE(inode)->data; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_mb_proc_session *s; - int rc; - int size; - - if (unlikely(sbi->s_mb_history == NULL)) - return -ENOMEM; - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - s->sb = sb; - size = sizeof(struct ext4_mb_history) * sbi->s_mb_history_max; - s->history = kmalloc(size, GFP_KERNEL); - if (s->history == NULL) { - kfree(s); - return -ENOMEM; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(s->history, sbi->s_mb_history, size); - s->max = sbi->s_mb_history_max; - s->start = sbi->s_mb_history_cur % s->max; - spin_unlock(&sbi->s_mb_history_lock); - - rc = seq_open(file, &ext4_mb_seq_history_ops); - if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; - m->private = s; - } else { - kfree(s->history); - kfree(s); - } - return rc; - -} - -static int ext4_mb_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - kfree(s->history); - kfree(s); - return seq_release(inode, file); -} - -static ssize_t ext4_mb_seq_history_write(struct file *file, - const char __user *buffer, - size_t count, loff_t *ppos) -{ - struct seq_file *seq = (struct seq_file *)file->private_data; - struct ext4_mb_proc_session *s = seq->private; - struct super_block *sb = s->sb; - char str[32]; - int value; - - if (count >= sizeof(str)) { - printk(KERN_ERR "EXT4-fs: %s string too long, max %u bytes\n", - "mb_history", (int)sizeof(str)); - return -EOVERFLOW; - } - - if (copy_from_user(str, buffer, count)) - return -EFAULT; - - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - EXT4_SB(sb)->s_mb_history_filter = value; - - return count; -} - -static const struct file_operations ext4_mb_seq_history_fops = { - .owner = THIS_MODULE, - .open = ext4_mb_seq_history_open, - .read = seq_read, - .write = ext4_mb_seq_history_write, - .llseek = seq_lseek, - .release = ext4_mb_seq_history_release, -}; - static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; @@ -2396,82 +2195,6 @@ static const struct file_operations ext4_mb_seq_groups_fops = { .release = seq_release, }; -static void ext4_mb_history_release(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - - if (sbi->s_proc != NULL) { - remove_proc_entry("mb_groups", sbi->s_proc); - if (sbi->s_mb_history_max) - remove_proc_entry("mb_history", sbi->s_proc); - } - kfree(sbi->s_mb_history); -} - -static void ext4_mb_history_init(struct super_block *sb) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - int i; - - if (sbi->s_proc != NULL) { - if (sbi->s_mb_history_max) - proc_create_data("mb_history", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_history_fops, sb); - proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_groups_fops, sb); - } - - sbi->s_mb_history_cur = 0; - spin_lock_init(&sbi->s_mb_history_lock); - i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; - /* if we can't allocate history, then we simple won't use it */ -} - -static noinline_for_stack void -ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - struct ext4_mb_history h; - - if (sbi->s_mb_history == NULL) - return; - - if (!(ac->ac_op & sbi->s_mb_history_filter)) - return; - - h.op = ac->ac_op; - h.pid = current->pid; - h.ino = ac->ac_inode ? ac->ac_inode->i_ino : 0; - h.orig = ac->ac_o_ex; - h.result = ac->ac_b_ex; - h.flags = ac->ac_flags; - h.found = ac->ac_found; - h.groups = ac->ac_groups_scanned; - h.cr = ac->ac_criteria; - h.tail = ac->ac_tail; - h.buddy = ac->ac_buddy; - h.merged = 0; - if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) { - if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && - ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) - h.merged = 1; - h.goal = ac->ac_g_ex; - h.result = ac->ac_f_ex; - } - - spin_lock(&sbi->s_mb_history_lock); - memcpy(sbi->s_mb_history + sbi->s_mb_history_cur, &h, sizeof(h)); - if (++sbi->s_mb_history_cur >= sbi->s_mb_history_max) - sbi->s_mb_history_cur = 0; - spin_unlock(&sbi->s_mb_history_lock); -} - -#else -#define ext4_mb_history_release(sb) -#define ext4_mb_history_init(sb) -#endif - /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, @@ -2690,7 +2413,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; - sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); @@ -2708,7 +2430,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) spin_lock_init(&lg->lg_prealloc_lock); } - ext4_mb_history_init(sb); + if (sbi->s_proc) + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_groups_fops, sb); if (sbi->s_journal) sbi->s_journal->j_commit_callback = release_blocks_on_commit; @@ -2788,7 +2512,8 @@ int ext4_mb_release(struct super_block *sb) } free_percpu(sbi->s_locality_groups); - ext4_mb_history_release(sb); + if (sbi->s_proc) + remove_proc_entry("mb_groups", sbi->s_proc); return 0; } @@ -3274,7 +2999,10 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) atomic_inc(&sbi->s_bal_breaks); } - ext4_mb_store_history(ac); + if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) + trace_ext4_mballoc_alloc(ac); + else + trace_ext4_mballoc_prealloc(ac); } /* @@ -3774,7 +3502,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (ac) { ac->ac_sb = sb; ac->ac_inode = pa->pa_inode; - ac->ac_op = EXT4_MB_HISTORY_DISCARD; } while (bit < end) { @@ -3794,7 +3521,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = next - bit; ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); + trace_ext4_mballoc_discard(ac); } trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, @@ -3829,9 +3556,6 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - if (ac) - ac->ac_op = EXT4_MB_HISTORY_DISCARD; - trace_ext4_mb_release_group_pa(ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); @@ -3846,7 +3570,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = pa->pa_len; ac->ac_b_ex.fe_logical = 0; - ext4_mb_store_history(ac); + trace_ext4_mballoc_discard(ac); } return 0; @@ -4737,7 +4461,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { - ac->ac_op = EXT4_MB_HISTORY_FREE; ac->ac_inode = inode; ac->ac_sb = sb; } @@ -4804,7 +4527,7 @@ do_more: ac->ac_b_ex.fe_group = block_group; ac->ac_b_ex.fe_start = bit; ac->ac_b_ex.fe_len = count; - ext4_mb_store_history(ac); + trace_ext4_mballoc_free(ac); } err = ext4_mb_load_buddy(sb, block_group, &e4b); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 14f25f253112..0ca811061bc7 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -52,18 +52,8 @@ extern u8 mb_enable_debug; #define mb_debug(n, fmt, a...) #endif -/* - * with EXT4_MB_HISTORY mballoc stores last N allocations in memory - * and you can monitor it in /proc/fs/ext4//mb_history - */ -#define EXT4_MB_HISTORY #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -#define EXT4_MB_HISTORY_FREE 8 /* free */ - -#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ - EXT4_MB_HISTORY_PREALLOC) /* * How long mballoc can look for a best extent (in found extents) @@ -217,22 +207,6 @@ struct ext4_allocation_context { #define AC_STATUS_FOUND 2 #define AC_STATUS_BREAK 3 -struct ext4_mb_history { - struct ext4_free_extent orig; /* orig allocation */ - struct ext4_free_extent goal; /* goal allocation */ - struct ext4_free_extent result; /* result allocation */ - unsigned pid; - unsigned ino; - __u16 found; /* how many extents have been found */ - __u16 groups; /* how many groups have been scanned */ - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; - __u8 cr:3; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; -}; - struct ext4_buddy { struct page *bd_buddy_page; void *bd_buddy; @@ -247,13 +221,6 @@ struct ext4_buddy { #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) -#ifndef EXT4_MB_HISTORY -static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - return; -} -#endif - #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5b206a043a5..12e726a7073f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -50,13 +50,6 @@ #define CREATE_TRACE_POINTS #include -static int default_mb_history_length = 1000; - -module_param_named(default_mb_history_length, default_mb_history_length, - int, 0644); -MODULE_PARM_DESC(default_mb_history_length, - "Default number of entries saved for mb_history"); - struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; @@ -1079,7 +1072,7 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, @@ -1126,7 +1119,6 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, - {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1367,13 +1359,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; - case Opt_mb_history_length: - if (match_int(&args[0], &option)) - return 0; - if (option < 0) - return 0; - sbi->s_mb_history_max = option; - break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2435,7 +2420,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7c6bbb7198a3..b8320256dc5d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -743,6 +743,169 @@ TRACE_EVENT(ext4_alloc_da_blocks, __entry->data_blocks, __entry->meta_blocks) ); +TRACE_EVENT(ext4_mballoc_alloc, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u16, found ) + __field( __u16, groups ) + __field( __u16, buddy ) + __field( __u16, flags ) + __field( __u16, tail ) + __field( __u8, cr ) + __field( __u32, orig_logical ) + __field( int, orig_start ) + __field( __u32, orig_group ) + __field( int, orig_len ) + __field( __u32, goal_logical ) + __field( int, goal_start ) + __field( __u32, goal_group ) + __field( int, goal_len ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->found = ac->ac_found; + __entry->flags = ac->ac_flags; + __entry->groups = ac->ac_groups_scanned; + __entry->buddy = ac->ac_buddy; + __entry->tail = ac->ac_tail; + __entry->cr = ac->ac_criteria; + __entry->orig_logical = ac->ac_o_ex.fe_logical; + __entry->orig_start = ac->ac_o_ex.fe_start; + __entry->orig_group = ac->ac_o_ex.fe_group; + __entry->orig_len = ac->ac_o_ex.fe_len; + __entry->goal_logical = ac->ac_g_ex.fe_logical; + __entry->goal_start = ac->ac_g_ex.fe_start; + __entry->goal_group = ac->ac_g_ex.fe_group; + __entry->goal_len = ac->ac_g_ex.fe_len; + __entry->result_logical = ac->ac_f_ex.fe_logical; + __entry->result_start = ac->ac_f_ex.fe_start; + __entry->result_group = ac->ac_f_ex.fe_group; + __entry->result_len = ac->ac_f_ex.fe_len; + ), + + TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " + "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " + "tail %u broken %u", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->orig_group, __entry->orig_start, + __entry->orig_len, __entry->orig_logical, + __entry->goal_group, __entry->goal_start, + __entry->goal_len, __entry->goal_logical, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical, + __entry->found, __entry->groups, __entry->cr, + __entry->flags, __entry->tail, + __entry->buddy ? 1 << __entry->buddy : 0) +); + +TRACE_EVENT(ext4_mballoc_prealloc, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u32, orig_logical ) + __field( int, orig_start ) + __field( __u32, orig_group ) + __field( int, orig_len ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->orig_logical = ac->ac_o_ex.fe_logical; + __entry->orig_start = ac->ac_o_ex.fe_start; + __entry->orig_group = ac->ac_o_ex.fe_group; + __entry->orig_len = ac->ac_o_ex.fe_len; + __entry->result_logical = ac->ac_b_ex.fe_logical; + __entry->result_start = ac->ac_b_ex.fe_start; + __entry->result_group = ac->ac_b_ex.fe_group; + __entry->result_len = ac->ac_b_ex.fe_len; + ), + + TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->orig_group, __entry->orig_start, + __entry->orig_len, __entry->orig_logical, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical) +); + +TRACE_EVENT(ext4_mballoc_discard, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->result_logical = ac->ac_b_ex.fe_logical; + __entry->result_start = ac->ac_b_ex.fe_start; + __entry->result_group = ac->ac_b_ex.fe_group; + __entry->result_len = ac->ac_b_ex.fe_len; + ), + + TP_printk("dev %s inode %lu extent %u/%d/%u@%u ", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical) +); + +TRACE_EVENT(ext4_mballoc_free, + TP_PROTO(struct ext4_allocation_context *ac), + + TP_ARGS(ac), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( __u32, result_logical ) + __field( int, result_start ) + __field( __u32, result_group ) + __field( int, result_len ) + ), + + TP_fast_assign( + __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->ino = ac->ac_inode->i_ino; + __entry->result_logical = ac->ac_b_ex.fe_logical; + __entry->result_start = ac->ac_b_ex.fe_start; + __entry->result_group = ac->ac_b_ex.fe_group; + __entry->result_len = ac->ac_b_ex.fe_len; + ), + + TP_printk("dev %s inode %lu extent %u/%d/%u@%u ", + jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->result_group, __entry->result_start, + __entry->result_len, __entry->result_logical) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From bf6993276f74d46776f35c45ddef29b981b1d1c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 30 Sep 2009 00:32:06 -0400 Subject: jbd2: Use tracepoints for history file The /proc/fs/jbd2//history was maintained manually; by using tracepoints, we can get all of the existing functionality of the /proc file plus extra capabilities thanks to the ftrace infrastructure. We save memory as a bonus. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 7 ++ fs/jbd2/commit.c | 59 +++++++------- fs/jbd2/journal.c | 187 +++----------------------------------------- include/linux/jbd2.h | 27 ++----- include/trace/events/jbd2.h | 78 ++++++++++++++++++ 5 files changed, 130 insertions(+), 228 deletions(-) (limited to 'include/trace') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 5d70b3e6d49b..ca0f5eb62b20 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -643,6 +643,7 @@ out: int __jbd2_journal_remove_checkpoint(struct journal_head *jh) { + struct transaction_chp_stats_s *stats; transaction_t *transaction; journal_t *journal; int ret = 0; @@ -679,6 +680,12 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) /* OK, that was the last buffer for the transaction: we can now safely remove this transaction from the log */ + stats = &transaction->t_chp_stats; + if (stats->cs_chp_time) + stats->cs_chp_time = jbd2_time_diff(stats->cs_chp_time, + jiffies); + trace_jbd2_checkpoint_stats(journal->j_fs_dev->bd_dev, + transaction->t_tid, stats); __jbd2_journal_drop_transaction(journal, transaction); kfree(transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 26d991ddc1e6..d4cfd6d2779e 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -410,10 +410,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (commit_transaction->t_synchronous_commit) write_op = WRITE_SYNC_PLUG; trace_jbd2_commit_locking(journal, commit_transaction); - stats.u.run.rs_wait = commit_transaction->t_max_wait; - stats.u.run.rs_locked = jiffies; - stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start, - stats.u.run.rs_locked); + stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_locked = jiffies; + stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, + stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); while (commit_transaction->t_updates) { @@ -486,9 +486,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_switch_revoke_table(journal); trace_jbd2_commit_flushing(journal, commit_transaction); - stats.u.run.rs_flushing = jiffies; - stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked, - stats.u.run.rs_flushing); + stats.run.rs_flushing = jiffies; + stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked, + stats.run.rs_flushing); commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; @@ -523,11 +523,11 @@ void jbd2_journal_commit_transaction(journal_t *journal) spin_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); - stats.u.run.rs_logging = jiffies; - stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing, - stats.u.run.rs_logging); - stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; - stats.u.run.rs_blocks_logged = 0; + stats.run.rs_logging = jiffies; + stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, + stats.run.rs_logging); + stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= commit_transaction->t_outstanding_credits); @@ -695,7 +695,7 @@ start_journal_io: submit_bh(write_op, bh); } cond_resched(); - stats.u.run.rs_blocks_logged += bufs; + stats.run.rs_blocks_logged += bufs; /* Force a new descriptor to be generated next time round the loop. */ @@ -988,33 +988,30 @@ restart_loop: J_ASSERT(commit_transaction->t_state == T_COMMIT); commit_transaction->t_start = jiffies; - stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging, - commit_transaction->t_start); + stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging, + commit_transaction->t_start); /* - * File the transaction for history + * File the transaction statistics */ - stats.ts_type = JBD2_STATS_RUN; stats.ts_tid = commit_transaction->t_tid; - stats.u.run.rs_handle_count = commit_transaction->t_handle_count; - spin_lock(&journal->j_history_lock); - memcpy(journal->j_history + journal->j_history_cur, &stats, - sizeof(stats)); - if (++journal->j_history_cur == journal->j_history_max) - journal->j_history_cur = 0; + stats.run.rs_handle_count = commit_transaction->t_handle_count; + trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, + commit_transaction->t_tid, &stats.run); /* * Calculate overall stats */ + spin_lock(&journal->j_history_lock); journal->j_stats.ts_tid++; - journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait; - journal->j_stats.u.run.rs_running += stats.u.run.rs_running; - journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked; - journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing; - journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging; - journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count; - journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks; - journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged; + journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_running += stats.run.rs_running; + journal->j_stats.run.rs_locked += stats.run.rs_locked; + journal->j_stats.run.rs_flushing += stats.run.rs_flushing; + journal->j_stats.run.rs_logging += stats.run.rs_logging; + journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count; + journal->j_stats.run.rs_blocks += stats.run.rs_blocks; + journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); commit_transaction->t_state = T_FINISHED; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 977a8dafb76d..761af77491f5 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -676,153 +676,6 @@ struct jbd2_stats_proc_session { int max; }; -static void *jbd2_history_skip_empty(struct jbd2_stats_proc_session *s, - struct transaction_stats_s *ts, - int first) -{ - if (ts == s->stats + s->max) - ts = s->stats; - if (!first && ts == s->stats + s->start) - return NULL; - while (ts->ts_type == 0) { - ts++; - if (ts == s->stats + s->max) - ts = s->stats; - if (ts == s->stats + s->start) - return NULL; - } - return ts; - -} - -static void *jbd2_seq_history_start(struct seq_file *seq, loff_t *pos) -{ - struct jbd2_stats_proc_session *s = seq->private; - struct transaction_stats_s *ts; - int l = *pos; - - if (l == 0) - return SEQ_START_TOKEN; - ts = jbd2_history_skip_empty(s, s->stats + s->start, 1); - if (!ts) - return NULL; - l--; - while (l) { - ts = jbd2_history_skip_empty(s, ++ts, 0); - if (!ts) - break; - l--; - } - return ts; -} - -static void *jbd2_seq_history_next(struct seq_file *seq, void *v, loff_t *pos) -{ - struct jbd2_stats_proc_session *s = seq->private; - struct transaction_stats_s *ts = v; - - ++*pos; - if (v == SEQ_START_TOKEN) - return jbd2_history_skip_empty(s, s->stats + s->start, 1); - else - return jbd2_history_skip_empty(s, ++ts, 0); -} - -static int jbd2_seq_history_show(struct seq_file *seq, void *v) -{ - struct transaction_stats_s *ts = v; - if (v == SEQ_START_TOKEN) { - seq_printf(seq, "%-4s %-5s %-5s %-5s %-5s %-5s %-5s %-6s %-5s " - "%-5s %-5s %-5s %-5s %-5s\n", "R/C", "tid", - "wait", "run", "lock", "flush", "log", "hndls", - "block", "inlog", "ctime", "write", "drop", - "close"); - return 0; - } - if (ts->ts_type == JBD2_STATS_RUN) - seq_printf(seq, "%-4s %-5lu %-5u %-5u %-5u %-5u %-5u " - "%-6lu %-5lu %-5lu\n", "R", ts->ts_tid, - jiffies_to_msecs(ts->u.run.rs_wait), - jiffies_to_msecs(ts->u.run.rs_running), - jiffies_to_msecs(ts->u.run.rs_locked), - jiffies_to_msecs(ts->u.run.rs_flushing), - jiffies_to_msecs(ts->u.run.rs_logging), - ts->u.run.rs_handle_count, - ts->u.run.rs_blocks, - ts->u.run.rs_blocks_logged); - else if (ts->ts_type == JBD2_STATS_CHECKPOINT) - seq_printf(seq, "%-4s %-5lu %48s %-5u %-5lu %-5lu %-5lu\n", - "C", ts->ts_tid, " ", - jiffies_to_msecs(ts->u.chp.cs_chp_time), - ts->u.chp.cs_written, ts->u.chp.cs_dropped, - ts->u.chp.cs_forced_to_close); - else - J_ASSERT(0); - return 0; -} - -static void jbd2_seq_history_stop(struct seq_file *seq, void *v) -{ -} - -static const struct seq_operations jbd2_seq_history_ops = { - .start = jbd2_seq_history_start, - .next = jbd2_seq_history_next, - .stop = jbd2_seq_history_stop, - .show = jbd2_seq_history_show, -}; - -static int jbd2_seq_history_open(struct inode *inode, struct file *file) -{ - journal_t *journal = PDE(inode)->data; - struct jbd2_stats_proc_session *s; - int rc, size; - - s = kmalloc(sizeof(*s), GFP_KERNEL); - if (s == NULL) - return -ENOMEM; - size = sizeof(struct transaction_stats_s) * journal->j_history_max; - s->stats = kmalloc(size, GFP_KERNEL); - if (s->stats == NULL) { - kfree(s); - return -ENOMEM; - } - spin_lock(&journal->j_history_lock); - memcpy(s->stats, journal->j_history, size); - s->max = journal->j_history_max; - s->start = journal->j_history_cur % s->max; - spin_unlock(&journal->j_history_lock); - - rc = seq_open(file, &jbd2_seq_history_ops); - if (rc == 0) { - struct seq_file *m = file->private_data; - m->private = s; - } else { - kfree(s->stats); - kfree(s); - } - return rc; - -} - -static int jbd2_seq_history_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - struct jbd2_stats_proc_session *s = seq->private; - - kfree(s->stats); - kfree(s); - return seq_release(inode, file); -} - -static struct file_operations jbd2_seq_history_fops = { - .owner = THIS_MODULE, - .open = jbd2_seq_history_open, - .read = seq_read, - .llseek = seq_lseek, - .release = jbd2_seq_history_release, -}; - static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos) { return *pos ? NULL : SEQ_START_TOKEN; @@ -839,29 +692,29 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) return 0; - seq_printf(seq, "%lu transaction, each upto %u blocks\n", + seq_printf(seq, "%lu transaction, each up to %u blocks\n", s->stats->ts_tid, s->journal->j_max_transaction_buffers); if (s->stats->ts_tid == 0) return 0; seq_printf(seq, "average: \n %ums waiting for transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_wait / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); seq_printf(seq, " %ums running transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_running / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); seq_printf(seq, " %ums transaction was being locked\n", - jiffies_to_msecs(s->stats->u.run.rs_locked / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_locked / s->stats->ts_tid)); seq_printf(seq, " %ums flushing data (in ordered mode)\n", - jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", - jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + jiffies_to_msecs(s->stats->run.rs_logging / s->stats->ts_tid)); seq_printf(seq, " %lluus average transaction commit time\n", div_u64(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", - s->stats->u.run.rs_handle_count / s->stats->ts_tid); + s->stats->run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", - s->stats->u.run.rs_blocks / s->stats->ts_tid); + s->stats->run.rs_blocks / s->stats->ts_tid); seq_printf(seq, " %lu logged blocks per transaction\n", - s->stats->u.run.rs_blocks_logged / s->stats->ts_tid); + s->stats->run.rs_blocks_logged / s->stats->ts_tid); return 0; } @@ -931,8 +784,6 @@ static void jbd2_stats_proc_init(journal_t *journal) { journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { - proc_create_data("history", S_IRUGO, journal->j_proc_entry, - &jbd2_seq_history_fops, journal); proc_create_data("info", S_IRUGO, journal->j_proc_entry, &jbd2_seq_info_fops, journal); } @@ -941,27 +792,9 @@ static void jbd2_stats_proc_init(journal_t *journal) static void jbd2_stats_proc_exit(journal_t *journal) { remove_proc_entry("info", journal->j_proc_entry); - remove_proc_entry("history", journal->j_proc_entry); remove_proc_entry(journal->j_devname, proc_jbd2_stats); } -static void journal_init_stats(journal_t *journal) -{ - int size; - - if (!proc_jbd2_stats) - return; - - journal->j_history_max = 100; - size = sizeof(struct transaction_stats_s) * journal->j_history_max; - journal->j_history = kzalloc(size, GFP_KERNEL); - if (!journal->j_history) { - journal->j_history_max = 0; - return; - } - spin_lock_init(&journal->j_history_lock); -} - /* * Management for journal control blocks: functions to create and * destroy journal_t structures, and to initialise and read existing @@ -1006,7 +839,7 @@ static journal_t * journal_init_common (void) goto fail; } - journal_init_stats(journal); + spin_lock_init(&journal->j_history_lock); return journal; fail: diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 52695d3dfd0b..f1011f7f3d41 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -464,9 +464,9 @@ struct handle_s */ struct transaction_chp_stats_s { unsigned long cs_chp_time; - unsigned long cs_forced_to_close; - unsigned long cs_written; - unsigned long cs_dropped; + __u32 cs_forced_to_close; + __u32 cs_written; + __u32 cs_dropped; }; /* The transaction_t type is the guts of the journaling mechanism. It @@ -668,23 +668,16 @@ struct transaction_run_stats_s { unsigned long rs_flushing; unsigned long rs_logging; - unsigned long rs_handle_count; - unsigned long rs_blocks; - unsigned long rs_blocks_logged; + __u32 rs_handle_count; + __u32 rs_blocks; + __u32 rs_blocks_logged; }; struct transaction_stats_s { - int ts_type; unsigned long ts_tid; - union { - struct transaction_run_stats_s run; - struct transaction_chp_stats_s chp; - } u; + struct transaction_run_stats_s run; }; -#define JBD2_STATS_RUN 1 -#define JBD2_STATS_CHECKPOINT 2 - static inline unsigned long jbd2_time_diff(unsigned long start, unsigned long end) { @@ -988,12 +981,6 @@ struct journal_s /* * Journal statistics */ - struct transaction_stats_s *j_history; - int j_history_max; - int j_history_cur; - /* - * Protect the transactions statistics history - */ spinlock_t j_history_lock; struct proc_dir_entry *j_proc_entry; struct transaction_stats_s j_stats; diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index b851f0b4701c..3c60b75adb9e 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -7,6 +7,9 @@ #include #include +struct transaction_chp_stats_s; +struct transaction_run_stats_s; + TRACE_EVENT(jbd2_checkpoint, TP_PROTO(journal_t *journal, int result), @@ -162,6 +165,81 @@ TRACE_EVENT(jbd2_submit_inode_data, jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) ); +TRACE_EVENT(jbd2_run_stats, + TP_PROTO(dev_t dev, unsigned long tid, + struct transaction_run_stats_s *stats), + + TP_ARGS(dev, tid, stats), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, tid ) + __field( unsigned long, wait ) + __field( unsigned long, running ) + __field( unsigned long, locked ) + __field( unsigned long, flushing ) + __field( unsigned long, logging ) + __field( __u32, handle_count ) + __field( __u32, blocks ) + __field( __u32, blocks_logged ) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->tid = tid; + __entry->wait = stats->rs_wait; + __entry->running = stats->rs_running; + __entry->locked = stats->rs_locked; + __entry->flushing = stats->rs_flushing; + __entry->logging = stats->rs_logging; + __entry->handle_count = stats->rs_handle_count; + __entry->blocks = stats->rs_blocks; + __entry->blocks_logged = stats->rs_blocks_logged; + ), + + TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " + "logging %u handle_count %u blocks %u blocks_logged %u", + jbd2_dev_to_name(__entry->dev), __entry->tid, + jiffies_to_msecs(__entry->wait), + jiffies_to_msecs(__entry->running), + jiffies_to_msecs(__entry->locked), + jiffies_to_msecs(__entry->flushing), + jiffies_to_msecs(__entry->logging), + __entry->handle_count, __entry->blocks, + __entry->blocks_logged) +); + +TRACE_EVENT(jbd2_checkpoint_stats, + TP_PROTO(dev_t dev, unsigned long tid, + struct transaction_chp_stats_s *stats), + + TP_ARGS(dev, tid, stats), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, tid ) + __field( unsigned long, chp_time ) + __field( __u32, forced_to_close ) + __field( __u32, written ) + __field( __u32, dropped ) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->tid = tid; + __entry->chp_time = stats->cs_chp_time; + __entry->forced_to_close= stats->cs_forced_to_close; + __entry->written = stats->cs_written; + __entry->dropped = stats->cs_dropped; + ), + + TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " + "written %u dropped %u", + jbd2_dev_to_name(__entry->dev), __entry->tid, + jiffies_to_msecs(__entry->chp_time), + __entry->forced_to_close, __entry->written, __entry->dropped) +); + #endif /* _TRACE_JBD2_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 0ef122494020521309be855bfdeeb41f34bf8c94 Mon Sep 17 00:00:00 2001 From: Josh Stone Date: Wed, 30 Sep 2009 00:51:22 -0400 Subject: ext4: Add a stub for mpage_da_data in the trace header The tracepoint ext4_da_write_pages has a struct mpage_da_data* parameter, but that struct is only defined in fs/ext4/ext4.h. This patch adds a forward declaration for that struct, so this tracepoint header can still be used by tools like SystemTap. This is a continuation of the fix in commit 3661d286. http://sourceware.org/bugzilla/show_bug.cgi?id=10703 Signed-off-by: Josh Stone Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b8320256dc5d..d09550bf3f95 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -11,6 +11,7 @@ struct ext4_allocation_context; struct ext4_allocation_request; struct ext4_prealloc_space; struct ext4_inode_info; +struct mpage_da_data; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) -- cgit v1.2.3 From b0da3f0dada78832c9da03ad2152ae76bd9a2496 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Thu, 1 Oct 2009 21:16:13 +0200 Subject: Add a tracepoint for block request remapping Since 2.6.31 now has request-based device-mapper, it's useful to have a tracepoint for request-remapping as well as bio-remapping. This patch adds a tracepoint for request-remapping, trace_block_rq_remap(). Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Cc: Li Zefan Signed-off-by: Jens Axboe --- block/blk-core.c | 1 + include/linux/blktrace_api.h | 2 +- include/trace/events/block.h | 33 +++++++++++++++++++++++++++++++++ kernel/trace/blktrace.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 69 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/block/blk-core.c b/block/blk-core.c index 34504f309728..ddaaea4fdffc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -34,6 +34,7 @@ #include "blk.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); static int __make_request(struct request_queue *q, struct bio *bio); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 622939a23299..3b73b9992b26 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -212,7 +212,7 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) -# define blk_trace_remove_sysfs(struct device *dev) do { } while (0) +# define blk_trace_remove_sysfs(dev) do { } while (0) static inline int blk_trace_init_sysfs(struct device *dev) { return 0; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d86af94691c2..00405b5f624a 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -488,6 +488,39 @@ TRACE_EVENT(block_remap, (unsigned long long)__entry->old_sector) ); +TRACE_EVENT(block_rq_remap, + + TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, + sector_t from), + + TP_ARGS(q, rq, dev, from), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( dev_t, old_dev ) + __field( sector_t, old_sector ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = disk_devt(rq->rq_disk); + __entry->sector = blk_rq_pos(rq); + __entry->nr_sector = blk_rq_sectors(rq); + __entry->old_dev = dev; + __entry->old_sector = from; + blk_fill_rwbs_rq(__entry->rwbs, rq); + ), + + TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, + MAJOR(__entry->old_dev), MINOR(__entry->old_dev), + (unsigned long long)__entry->old_sector) +); + #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 60b5c5a3d4b4..d9d6206e0b14 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -855,6 +855,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, sizeof(r), &r); } +/** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. + * + **/ +static void blk_add_trace_rq_remap(struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + /** * blk_add_driver_data - Add binary message with driver-specific data * @q: queue the io is for @@ -922,10 +953,13 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_remap(blk_add_trace_remap); WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + WARN_ON(ret); } static void blk_unregister_tracepoints(void) { + unregister_trace_block_rq_remap(blk_add_trace_rq_remap); unregister_trace_block_remap(blk_add_trace_remap); unregister_trace_block_split(blk_add_trace_split); unregister_trace_block_unplug_io(blk_add_trace_unplug_io); -- cgit v1.2.3