From 33659ebbae262228eef4e0fe990f393d1f0ed941 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 7 Aug 2010 18:17:56 +0200 Subject: block: remove wrappers for request type/flags Remove all the trivial wrappers for the cmd_type and cmd_flags fields in struct requests. This allows much easier grepping for different request types instead of unwinding through macros. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/trace/events/block.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d870a918559c..d8ce278515c3 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -25,8 +25,10 @@ DECLARE_EVENT_CLASS(block_rq_with_error, TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; - __entry->sector = blk_pc_request(rq) ? 0 : blk_rq_pos(rq); - __entry->nr_sector = blk_pc_request(rq) ? 0 : blk_rq_sectors(rq); + __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + 0 : blk_rq_pos(rq); + __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + 0 : blk_rq_sectors(rq); __entry->errors = rq->errors; blk_fill_rwbs_rq(__entry->rwbs, rq); @@ -109,9 +111,12 @@ DECLARE_EVENT_CLASS(block_rq, TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; - __entry->sector = blk_pc_request(rq) ? 0 : blk_rq_pos(rq); - __entry->nr_sector = blk_pc_request(rq) ? 0 : blk_rq_sectors(rq); - __entry->bytes = blk_pc_request(rq) ? blk_rq_bytes(rq) : 0; + __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + 0 : blk_rq_pos(rq); + __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + 0 : blk_rq_sectors(rq); + __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? + blk_rq_bytes(rq) : 0; blk_fill_rwbs_rq(__entry->rwbs, rq); blk_dump_cmd(__get_str(cmd), rq); -- cgit v1.2.3 From 455b2864686d3591b3b2f39eb46290c95f76471f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:06 +1000 Subject: writeback: Initial tracing support Trace queue/sched/exec parts of the writeback loop. This provides insight into when and why flusher threads are scheduled to run. e.g a sync invocation leaves traces like: sync-[...]: writeback_queue: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 flush-8:0-[...]: writeback_exec: bdi 8:0: sb_dev 8:1 nr_pages=7712 sync_mode=0 kupdate=0 range_cyclic=0 background=0 This also lays the foundation for adding more writeback tracing to provide deeper insight into the whole writeback path. The original tracing code is from Jens Axboe, though this version is a rewrite as a result of the code being traced changing significantly. Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 38 +++++++++++++---- include/trace/events/writeback.h | 91 ++++++++++++++++++++++++++++++++++++++++ mm/backing-dev.c | 3 ++ 3 files changed, 124 insertions(+), 8 deletions(-) create mode 100644 include/trace/events/writeback.h (limited to 'include/trace') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c8471b3ddccf..73acab4dc2b7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -26,15 +26,9 @@ #include #include #include +#include #include "internal.h" -#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) - -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -50,6 +44,21 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; +/* + * Include the creation of the trace points after defining the + * wb_writeback_work structure so that the definition remains local to this + * file. + */ +#define CREATE_TRACE_POINTS +#include + +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) + +/* + * We don't actually have pdflush, but this one is exported though /proc... + */ +int nr_pdflush_threads; + /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. @@ -65,6 +74,8 @@ int writeback_in_progress(struct backing_dev_info *bdi) static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { + trace_writeback_queue(bdi, work); + spin_lock(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); spin_unlock(&bdi->wb_lock); @@ -74,6 +85,7 @@ static void bdi_queue_work(struct backing_dev_info *bdi, * it gets created and wakes up, we'll run this work. */ if (unlikely(!bdi->wb.task)) { + trace_writeback_nothread(bdi, work); wake_up_process(default_backing_dev_info.wb.task); } else { struct bdi_writeback *wb = &bdi->wb; @@ -95,8 +107,10 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) + if (bdi->wb.task) { + trace_writeback_nowork(bdi); wake_up_process(bdi->wb.task); + } return; } @@ -751,6 +765,8 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) if (force_wait) work->sync_mode = WB_SYNC_ALL; + trace_writeback_exec(bdi, work); + wrote += wb_writeback(wb, work); /* @@ -805,9 +821,13 @@ int bdi_writeback_thread(void *data) smp_mb__after_clear_bit(); wake_up_bit(&bdi->state, BDI_pending); + trace_writeback_thread_start(bdi); + while (!kthread_should_stop()) { pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + if (pages_written) last_active = jiffies; else if (wait_jiffies != -1UL) { @@ -845,6 +865,8 @@ int bdi_writeback_thread(void *data) */ if (!list_empty(&bdi->work_list)) wb_do_writeback(wb, 1); + + trace_writeback_thread_stop(bdi); return 0; } diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h new file mode 100644 index 000000000000..562fcae10d9d --- /dev/null +++ b/include/trace/events/writeback.h @@ -0,0 +1,91 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM writeback + +#if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WRITEBACK_H + +#include +#include + +struct wb_writeback_work; + +DECLARE_EVENT_CLASS(writeback_work_class, + TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), + TP_ARGS(bdi, work), + TP_STRUCT__entry( + __array(char, name, 32) + __field(long, nr_pages) + __field(dev_t, sb_dev) + __field(int, sync_mode) + __field(int, for_kupdate) + __field(int, range_cyclic) + __field(int, for_background) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->nr_pages = work->nr_pages; + __entry->sb_dev = work->sb ? work->sb->s_dev : 0; + __entry->sync_mode = work->sync_mode; + __entry->for_kupdate = work->for_kupdate; + __entry->range_cyclic = work->range_cyclic; + __entry->for_background = work->for_background; + ), + TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " + "kupdate=%d range_cyclic=%d background=%d", + __entry->name, + MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), + __entry->nr_pages, + __entry->sync_mode, + __entry->for_kupdate, + __entry->range_cyclic, + __entry->for_background + ) +); +#define DEFINE_WRITEBACK_WORK_EVENT(name) \ +DEFINE_EVENT(writeback_work_class, name, \ + TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ + TP_ARGS(bdi, work)) +DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); +DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); +DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); + +TRACE_EVENT(writeback_pages_written, + TP_PROTO(long pages_written), + TP_ARGS(pages_written), + TP_STRUCT__entry( + __field(long, pages) + ), + TP_fast_assign( + __entry->pages = pages_written; + ), + TP_printk("%ld", __entry->pages) +); + +DECLARE_EVENT_CLASS(writeback_class, + TP_PROTO(struct backing_dev_info *bdi), + TP_ARGS(bdi), + TP_STRUCT__entry( + __array(char, name, 32) + ), + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + ), + TP_printk("bdi %s", + __entry->name + ) +); +#define DEFINE_WRITEBACK_EVENT(name) \ +DEFINE_EVENT(writeback_class, name, \ + TP_PROTO(struct backing_dev_info *bdi), \ + TP_ARGS(bdi)) + +DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_bdi_register); +DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); +DEFINE_WRITEBACK_EVENT(writeback_thread_start); +DEFINE_WRITEBACK_EVENT(writeback_thread_stop); + +#endif /* _TRACE_WRITEBACK_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/backing-dev.c b/mm/backing-dev.c index bceac647e4d1..ac78a3336181 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -10,6 +10,7 @@ #include #include #include +#include static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); @@ -518,6 +519,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); + trace_writeback_bdi_register(bdi); exit: return ret; } @@ -578,6 +580,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); if (!bdi_cap_flush_forker(bdi)) -- cgit v1.2.3 From 028c2dd184c097809986684f2f0627eea5529fea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:07 +1000 Subject: writeback: Add tracing to balance_dirty_pages Tracing high level background writeback events is good, but it doesn't give the entire picture. Add visibility into write throttling to catch IO dispatched by foreground throttling of processing dirtying lots of pages. Signed-off-by: Dave Chinner Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 5 ++++ include/trace/events/writeback.h | 64 ++++++++++++++++++++++++++++++++++++++++ mm/page-writeback.c | 4 +++ 3 files changed, 73 insertions(+) (limited to 'include/trace') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 73acab4dc2b7..bf10cbf379dd 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -656,10 +656,14 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.more_io = 0; wbc.nr_to_write = MAX_WRITEBACK_PAGES; wbc.pages_skipped = 0; + + trace_wbc_writeback_start(&wbc, wb->bdi); if (work->sb) __writeback_inodes_sb(work->sb, wb, &wbc); else writeback_inodes_wb(wb, &wbc); + trace_wbc_writeback_written(&wbc, wb->bdi); + work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; @@ -687,6 +691,7 @@ static long wb_writeback(struct bdi_writeback *wb, if (!list_empty(&wb->b_more_io)) { inode = list_entry(wb->b_more_io.prev, struct inode, i_list); + trace_wbc_writeback_wait(&wbc, wb->bdi); inode_wait_for_writeback(inode); } spin_unlock(&inode_lock); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 562fcae10d9d..0be26acae064 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -85,6 +85,70 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); DEFINE_WRITEBACK_EVENT(writeback_thread_stop); +DECLARE_EVENT_CLASS(wbc_class, + TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), + TP_ARGS(wbc, bdi), + TP_STRUCT__entry( + __array(char, name, 32) + __field(long, nr_to_write) + __field(long, pages_skipped) + __field(int, sync_mode) + __field(int, nonblocking) + __field(int, encountered_congestion) + __field(int, for_kupdate) + __field(int, for_background) + __field(int, for_reclaim) + __field(int, range_cyclic) + __field(int, more_io) + __field(unsigned long, older_than_this) + __field(long, range_start) + __field(long, range_end) + ), + + TP_fast_assign( + strncpy(__entry->name, dev_name(bdi->dev), 32); + __entry->nr_to_write = wbc->nr_to_write; + __entry->pages_skipped = wbc->pages_skipped; + __entry->sync_mode = wbc->sync_mode; + __entry->for_kupdate = wbc->for_kupdate; + __entry->for_background = wbc->for_background; + __entry->for_reclaim = wbc->for_reclaim; + __entry->range_cyclic = wbc->range_cyclic; + __entry->more_io = wbc->more_io; + __entry->older_than_this = wbc->older_than_this ? + *wbc->older_than_this : 0; + __entry->range_start = (long)wbc->range_start; + __entry->range_end = (long)wbc->range_end; + ), + + TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " + "bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx " + "start=0x%lx end=0x%lx", + __entry->name, + __entry->nr_to_write, + __entry->pages_skipped, + __entry->sync_mode, + __entry->for_kupdate, + __entry->for_background, + __entry->for_reclaim, + __entry->range_cyclic, + __entry->more_io, + __entry->older_than_this, + __entry->range_start, + __entry->range_end) +) + +#define DEFINE_WBC_EVENT(name) \ +DEFINE_EVENT(wbc_class, name, \ + TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ + TP_ARGS(wbc, bdi)) +DEFINE_WBC_EVENT(wbc_writeback_start); +DEFINE_WBC_EVENT(wbc_writeback_written); +DEFINE_WBC_EVENT(wbc_writeback_wait); +DEFINE_WBC_EVENT(wbc_balance_dirty_start); +DEFINE_WBC_EVENT(wbc_balance_dirty_written); +DEFINE_WBC_EVENT(wbc_balance_dirty_wait); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 37498ef61548..d556cd829af6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,6 +34,7 @@ #include #include #include +#include /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited @@ -535,11 +536,13 @@ static void balance_dirty_pages(struct address_space *mapping, * threshold otherwise wait until the disk writes catch * up. */ + trace_wbc_balance_dirty_start(&wbc, bdi); if (bdi_nr_reclaimable > bdi_thresh) { writeback_inodes_wb(&bdi->wb, &wbc); pages_written += write_chunk - wbc.nr_to_write; get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + trace_wbc_balance_dirty_written(&wbc, bdi); } /* @@ -565,6 +568,7 @@ static void balance_dirty_pages(struct address_space *mapping, if (pages_written >= write_chunk) break; /* We've done our duty */ + trace_wbc_balance_dirty_wait(&wbc, bdi); __set_current_state(TASK_INTERRUPTIBLE); io_schedule_timeout(pause); -- cgit v1.2.3 From 9e094383b60066996fbc3b53891324e5d2ec858d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 7 Jul 2010 13:24:08 +1000 Subject: writeback: Add tracing to write_cache_pages Add a trace event to the ->writepage loop in write_cache_pages to give visibility into how the ->writepage call is changing variables within the writeback control structure. Of most interest is how wbc->nr_to_write changes from call to call, especially with filesystems that write multiple pages in ->writepage. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/trace/events/writeback.h | 1 + mm/page-writeback.c | 1 + 2 files changed, 2 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 0be26acae064..bde92e07fa4f 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -148,6 +148,7 @@ DEFINE_WBC_EVENT(wbc_writeback_wait); DEFINE_WBC_EVENT(wbc_balance_dirty_start); DEFINE_WBC_EVENT(wbc_balance_dirty_written); DEFINE_WBC_EVENT(wbc_balance_dirty_wait); +DEFINE_WBC_EVENT(wbc_writepage); #endif /* _TRACE_WRITEBACK_H */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d556cd829af6..3d2111a22236 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -933,6 +933,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; + trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { -- cgit v1.2.3 From 96dccab1d63cb35d3f5e75d2ef275fdbff4d5f3b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 19 Jul 2010 16:49:17 -0700 Subject: writeback.h: needs linux/device.h include/trace/events/writeback.h uses dev_name(), so it needs to include linux/device.h. include/trace/events/writeback.h:12: error: implicit declaration of function 'dev_name' Signed-off-by: Randy Dunlap Signed-off-by: Jens Axboe --- include/trace/events/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/trace') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index bde92e07fa4f..84ab72df953c 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -5,6 +5,7 @@ #define _TRACE_WRITEBACK_H #include +#include #include struct wb_writeback_work; -- cgit v1.2.3 From 603320239fb436f175c8b6bfa43d5023c47a6dc2 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 25 Jul 2010 14:29:24 +0300 Subject: writeback: add new tracepoints Add 2 new trace points to the periodic write-back wake up case, just like we do in the 'bdi_queue_work()' function. Namely, introduce: 1. trace_writeback_wake_thread(bdi) 2. trace_writeback_wake_forker_thread(bdi) The first event is triggered every time we wake up a bdi thread to start periodic background write-out. The second event is triggered only when the bdi thread does not exist and should be created by the forker thread. This patch was suggested by Dave Chinner and Christoph Hellwig. Signed-off-by: Artem Bityutskiy Signed-off-by: Jens Axboe --- include/trace/events/writeback.h | 2 ++ mm/backing-dev.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 84ab72df953c..f345f66ae9d1 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -81,6 +81,8 @@ DEFINE_EVENT(writeback_class, name, \ TP_ARGS(bdi)) DEFINE_WRITEBACK_EVENT(writeback_nowork); +DEFINE_WRITEBACK_EVENT(writeback_wake_thread); +DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); DEFINE_WRITEBACK_EVENT(writeback_thread_start); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 99890831d557..9008c4e207f6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -310,6 +310,7 @@ static void wakeup_timer_fn(unsigned long data) spin_lock_bh(&bdi->wb_lock); if (bdi->wb.task) { + trace_writeback_wake_thread(bdi); wake_up_process(bdi->wb.task); } else { /* @@ -317,6 +318,7 @@ static void wakeup_timer_fn(unsigned long data) * In this case we have to wake-up the forker thread which * should create and run the bdi thread. */ + trace_writeback_wake_forker_thread(bdi); wake_up_process(default_backing_dev_info.wb.task); } spin_unlock_bh(&bdi->wb_lock); -- cgit v1.2.3 From ede1b4290781ae82ccf0f2ecc6dada8d3dd35779 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 18 Aug 2010 15:33:13 -0700 Subject: tracing: Fix timer tracing PowerTOP would like to be able to trace timers. Unfortunately, the current timer tracing is not very useful: the actual timer function is not recorded in the trace at the start of timer execution. Although this is recorded for timer "start" time (when it gets armed), this is not useful; most timers get started early, and a tracer like PowerTOP will never see this event, but will only see the actual running of the timer. This patch just adds the function to the timer tracing; I've verified with PowerTOP that now it can get useful information about timers. Signed-off-by: Arjan van de Ven Cc: xiaoguangrong@cn.fujitsu.com Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: # .35.x, .34.x, .33.x LKML-Reference: <4C6C5FA9.3000405@linux.intel.com> Signed-off-by: Ingo Molnar --- include/trace/events/timer.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index c624126a9c8a..425bcfe56c62 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -81,14 +81,16 @@ TRACE_EVENT(timer_expire_entry, TP_STRUCT__entry( __field( void *, timer ) __field( unsigned long, now ) + __field( void *, function) ), TP_fast_assign( __entry->timer = timer; __entry->now = jiffies; + __entry->function = timer->function; ), - TP_printk("timer=%p now=%lu", __entry->timer, __entry->now) + TP_printk("timer=%p function=%pf now=%lu", __entry->timer, __entry->function,__entry->now) ); /** @@ -200,14 +202,16 @@ TRACE_EVENT(hrtimer_expire_entry, TP_STRUCT__entry( __field( void *, hrtimer ) __field( s64, now ) + __field( void *, function) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = now->tv64; + __entry->function = hrtimer->function; ), - TP_printk("hrtimer=%p now=%llu", __entry->hrtimer, + TP_printk("hrtimer=%p function=%pf now=%llu", __entry->hrtimer, __entry->function, (unsigned long long)ktime_to_ns((ktime_t) { .tv64 = __entry->now })) ); -- cgit v1.2.3 From e36c886a0f9d624377977fa6cae309cfd7f362fa Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 21 Aug 2010 13:07:26 -0700 Subject: workqueue: Add basic tracepoints to track workqueue execution With the introduction of the new unified work queue thread pools, we lost one feature: It's no longer possible to know which worker is causing the CPU to wake out of idle. The result is that PowerTOP now reports a lot of "kworker/a:b" instead of more readable results. This patch adds a pair of tracepoints to the new workqueue code, similar in style to the timer/hrtimer tracepoints. With this pair of tracepoints, the next PowerTOP can correctly report which work item caused the wakeup (and how long it took): Interrupt (43) i915 time 3.51ms wakeups 141 Work ieee80211_iface_work time 0.81ms wakeups 29 Work do_dbs_timer time 0.55ms wakeups 24 Process Xorg time 21.36ms wakeups 4 Timer sched_rt_period_timer time 0.01ms wakeups 1 Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- include/trace/events/workqueue.h | 62 ++++++++++++++++++++++++++++++++++++++++ kernel/workqueue.c | 9 ++++++ 2 files changed, 71 insertions(+) create mode 100644 include/trace/events/workqueue.h (limited to 'include/trace') diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h new file mode 100644 index 000000000000..49682d7e9d60 --- /dev/null +++ b/include/trace/events/workqueue.h @@ -0,0 +1,62 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM workqueue + +#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_WORKQUEUE_H + +#include +#include + +/** + * workqueue_execute_start - called immediately before the workqueue callback + * @work: pointer to struct work_struct + * + * Allows to track workqueue execution. + */ +TRACE_EVENT(workqueue_execute_start, + + TP_PROTO(struct work_struct *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = work->func; + ), + + TP_printk("work struct %p: function %pf", __entry->work, __entry->function) +); + +/** + * workqueue_execute_end - called immediately before the workqueue callback + * @work: pointer to struct work_struct + * + * Allows to track workqueue execution. + */ +TRACE_EVENT(workqueue_execute_end, + + TP_PROTO(struct work_struct *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + ), + + TP_fast_assign( + __entry->work = work; + ), + + TP_printk("work struct %p", __entry->work) +); + + +#endif /* _TRACE_WORKQUEUE_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2994a0e3a61c..8bd600c020e5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -35,6 +35,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + #include "workqueue_sched.h" enum { @@ -1790,7 +1793,13 @@ static void process_one_work(struct worker *worker, struct work_struct *work) work_clear_pending(work); lock_map_acquire(&cwq->wq->lockdep_map); lock_map_acquire(&lockdep_map); + trace_workqueue_execute_start(work); f(work); + /* + * While we must be careful to not use "work" after this, the trace + * point will only record its address. + */ + trace_workqueue_execute_end(work); lock_map_release(&lockdep_map); lock_map_release(&cwq->wq->lockdep_map); -- cgit v1.2.3 From 2bf2160d8805de64308e2e7c3cd97813cb58ed2f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 23 Aug 2010 18:42:48 +0900 Subject: irq: Add tracepoint to softirq_raise Add a tracepoint for tracing when softirq action is raised. This and the existing tracepoints complete softirq's tracepoints: softirq_raise, softirq_entry and softirq_exit. And when this tracepoint is used in combination with the softirq_entry tracepoint we can determine the softirq raise latency. Signed-off-by: Lai Jiangshan Acked-by: Mathieu Desnoyers Acked-by: Neil Horman Cc: David Miller Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C724298.4050509@jp.fujitsu.com> [ factorize softirq events with DECLARE_EVENT_CLASS ] Signed-off-by: Koki Sanagi Signed-off-by: Frederic Weisbecker --- include/linux/interrupt.h | 8 +++++++- include/trace/events/irq.h | 26 ++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index a0384a4d1e6f..531495db1708 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -18,6 +18,7 @@ #include #include #include +#include /* * These correspond to the IORESOURCE_IRQ_* defines in @@ -407,7 +408,12 @@ asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); -#define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0) +static inline void __raise_softirq_irqoff(unsigned int nr) +{ + trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL); + or_softirq_pending(1UL << nr); +} + extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); extern void wakeup_softirqd(void); diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 0e4cfb694fe7..6fa7cbab7d93 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -5,7 +5,9 @@ #define _TRACE_IRQ_H #include -#include + +struct irqaction; +struct softirq_action; #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ @@ -93,7 +95,10 @@ DECLARE_EVENT_CLASS(softirq, ), TP_fast_assign( - __entry->vec = (int)(h - vec); + if (vec) + __entry->vec = (int)(h - vec); + else + __entry->vec = (int)(long)h; ), TP_printk("vec=%d [action=%s]", __entry->vec, @@ -136,6 +141,23 @@ DEFINE_EVENT(softirq, softirq_exit, TP_ARGS(h, vec) ); +/** + * softirq_raise - called immediately when a softirq is raised + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter contains a pointer to the softirq vector number which is + * raised. @vec is NULL and it means @h includes vector number not + * softirq_action. When used in combination with the softirq_entry tracepoint + * we can determine the softirq raise latency. + */ +DEFINE_EVENT(softirq, softirq_raise, + + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + + TP_ARGS(h, vec) +); + #endif /* _TRACE_IRQ_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 3e4b10d7a4d2a78af64f8096dc7cdb3bebd65adb Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Mon, 23 Aug 2010 18:43:51 +0900 Subject: napi: Convert trace_napi_poll to TRACE_EVENT This patch converts trace_napi_poll from DECLARE_EVENT to TRACE_EVENT to improve the usability of napi_poll tracepoint. -0 [001] 241302.750777: napi_poll: napi poll on napi struct f6acc480 for device eth3 -0 [000] 241302.852389: napi_poll: napi poll on napi struct f5d0d70c for device eth1 The original patch is below: http://marc.info/?l=linux-kernel&m=126021713809450&w=2 [ sanagi.koki@jp.fujitsu.com: And add a fix by Steven Rostedt: http://marc.info/?l=linux-kernel&m=126150506519173&w=2 ] Signed-off-by: Neil Horman Acked-by: David S. Miller Acked-by: Neil Horman Cc: Mathieu Desnoyers Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C7242D7.4050009@jp.fujitsu.com> Signed-off-by: Koki Sanagi Signed-off-by: Frederic Weisbecker --- include/trace/events/napi.h | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/napi.h b/include/trace/events/napi.h index 188deca2f3c7..8fe1e93f531d 100644 --- a/include/trace/events/napi.h +++ b/include/trace/events/napi.h @@ -6,10 +6,31 @@ #include #include +#include + +#define NO_DEV "(no_device)" + +TRACE_EVENT(napi_poll, -DECLARE_TRACE(napi_poll, TP_PROTO(struct napi_struct *napi), - TP_ARGS(napi)); + + TP_ARGS(napi), + + TP_STRUCT__entry( + __field( struct napi_struct *, napi) + __string( dev_name, napi->dev ? napi->dev->name : NO_DEV) + ), + + TP_fast_assign( + __entry->napi = napi; + __assign_str(dev_name, napi->dev ? napi->dev->name : NO_DEV); + ), + + TP_printk("napi poll on napi struct %p for device %s", + __entry->napi, __get_str(dev_name)) +); + +#undef NO_DEV #endif /* _TRACE_NAPI_H_ */ -- cgit v1.2.3 From cf66ba58b5cb8b1526e9dd2fb96ff8db048d4d44 Mon Sep 17 00:00:00 2001 From: Koki Sanagi Date: Mon, 23 Aug 2010 18:45:02 +0900 Subject: netdev: Add tracepoints to netdev layer This patch adds tracepoint to dev_queue_xmit, dev_hard_start_xmit, netif_rx and netif_receive_skb. These tracepoints help you to monitor network driver's input/output. -0 [001] 112447.902030: netif_rx: dev=eth1 skbaddr=f3ef0900 len=84 -0 [001] 112447.902039: netif_receive_skb: dev=eth1 skbaddr=f3ef0900 len=84 sshd-6828 [000] 112447.903257: net_dev_queue: dev=eth4 skbaddr=f3fca538 len=226 sshd-6828 [000] 112447.903260: net_dev_xmit: dev=eth4 skbaddr=f3fca538 len=226 rc=0 Signed-off-by: Koki Sanagi Acked-by: David S. Miller Acked-by: Neil Horman Cc: Mathieu Desnoyers Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C72431E.3000901@jp.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/trace/events/net.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/dev.c | 6 ++++ net/core/net-traces.c | 1 + 3 files changed, 89 insertions(+) create mode 100644 include/trace/events/net.h (limited to 'include/trace') diff --git a/include/trace/events/net.h b/include/trace/events/net.h new file mode 100644 index 000000000000..5f247f5ffc56 --- /dev/null +++ b/include/trace/events/net.h @@ -0,0 +1,82 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM net + +#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_NET_H + +#include +#include +#include +#include + +TRACE_EVENT(net_dev_xmit, + + TP_PROTO(struct sk_buff *skb, + int rc), + + TP_ARGS(skb, rc), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( int, rc ) + __string( name, skb->dev->name ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->rc = rc; + __assign_str(name, skb->dev->name); + ), + + TP_printk("dev=%s skbaddr=%p len=%u rc=%d", + __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) +); + +DECLARE_EVENT_CLASS(net_dev_template, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned int, len ) + __string( name, skb->dev->name ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + __entry->len = skb->len; + __assign_str(name, skb->dev->name); + ), + + TP_printk("dev=%s skbaddr=%p len=%u", + __get_str(name), __entry->skbaddr, __entry->len) +) + +DEFINE_EVENT(net_dev_template, net_dev_queue, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); + +DEFINE_EVENT(net_dev_template, netif_receive_skb, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); + +DEFINE_EVENT(net_dev_template, netif_rx, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb) +); +#endif /* _TRACE_NET_H */ + +/* This part must be outside protection */ +#include diff --git a/net/core/dev.c b/net/core/dev.c index 3721fbb9a83c..5a4fbc7405e2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -128,6 +128,7 @@ #include #include #include +#include #include #include "net-sysfs.h" @@ -1978,6 +1979,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, } rc = ops->ndo_start_xmit(skb, dev); + trace_net_dev_xmit(skb, rc); if (rc == NETDEV_TX_OK) txq_trans_update(txq); return rc; @@ -1998,6 +2000,7 @@ gso: skb_dst_drop(nskb); rc = ops->ndo_start_xmit(nskb, dev); + trace_net_dev_xmit(nskb, rc); if (unlikely(rc != NETDEV_TX_OK)) { if (rc & ~NETDEV_TX_MASK) goto out_kfree_gso_skb; @@ -2186,6 +2189,7 @@ int dev_queue_xmit(struct sk_buff *skb) #ifdef CONFIG_NET_CLS_ACT skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS); #endif + trace_net_dev_queue(skb); if (q->enqueue) { rc = __dev_xmit_skb(skb, q, dev, txq); goto out; @@ -2512,6 +2516,7 @@ int netif_rx(struct sk_buff *skb) if (netdev_tstamp_prequeue) net_timestamp_check(skb); + trace_netif_rx(skb); #ifdef CONFIG_RPS { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -2828,6 +2833,7 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); + trace_netif_receive_skb(skb); if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) return NET_RX_SUCCESS; diff --git a/net/core/net-traces.c b/net/core/net-traces.c index afa6380ed88a..7f1bb2aba03b 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -26,6 +26,7 @@ #define CREATE_TRACE_POINTS #include +#include #include EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); -- cgit v1.2.3 From 07dc22e7295f25526f110d704655ff0ea7687420 Mon Sep 17 00:00:00 2001 From: Koki Sanagi Date: Mon, 23 Aug 2010 18:46:12 +0900 Subject: skb: Add tracepoints to freeing skb This patch adds tracepoint to consume_skb and add trace_kfree_skb before __kfree_skb in skb_free_datagram_locked and net_tx_action. Combinating with tracepoint on dev_hard_start_xmit, we can check how long it takes to free transmitted packets. And using it, we can calculate how many packets driver had at that time. It is useful when a drop of transmitted packet is a problem. sshd-6828 [000] 112689.258154: consume_skb: skbaddr=f2d99bb8 Signed-off-by: Koki Sanagi Acked-by: David S. Miller Acked-by: Neil Horman Cc: Mathieu Desnoyers Cc: Kaneshige Kenji Cc: Izumo Taku Cc: Kosaki Motohiro Cc: Lai Jiangshan Cc: Scott Mcmillan Cc: Steven Rostedt Cc: Eric Dumazet LKML-Reference: <4C724364.50903@jp.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/trace/events/skb.h | 17 +++++++++++++++++ net/core/datagram.c | 1 + net/core/dev.c | 2 ++ net/core/skbuff.c | 1 + 4 files changed, 21 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 4b2be6dc76f0..75ce9d500d8e 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -35,6 +35,23 @@ TRACE_EVENT(kfree_skb, __entry->skbaddr, __entry->protocol, __entry->location) ); +TRACE_EVENT(consume_skb, + + TP_PROTO(struct sk_buff *skb), + + TP_ARGS(skb), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + ), + + TP_printk("skbaddr=%p", __entry->skbaddr) +); + TRACE_EVENT(skb_copy_datagram_iovec, TP_PROTO(const struct sk_buff *skb, int len), diff --git a/net/core/datagram.c b/net/core/datagram.c index 251997a95483..282806ba7a57 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -243,6 +243,7 @@ void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb) unlock_sock_fast(sk, slow); /* skb is now orphaned, can be freed outside of locked section */ + trace_kfree_skb(skb, skb_free_datagram_locked); __kfree_skb(skb); } EXPORT_SYMBOL(skb_free_datagram_locked); diff --git a/net/core/dev.c b/net/core/dev.c index 5a4fbc7405e2..2308cce48048 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include #include #include +#include #include #include "net-sysfs.h" @@ -2576,6 +2577,7 @@ static void net_tx_action(struct softirq_action *h) clist = clist->next; WARN_ON(atomic_read(&skb->users)); + trace_kfree_skb(skb, net_tx_action); __kfree_skb(skb); } } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3a2513f0d0c3..12e61e351d0e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -466,6 +466,7 @@ void consume_skb(struct sk_buff *skb) smp_rmb(); else if (likely(!atomic_dec_and_test(&skb->users))) return; + trace_consume_skb(skb); __kfree_skb(skb); } EXPORT_SYMBOL(consume_skb); -- cgit v1.2.3 From 74704ac6ea402d50c9543cb28247e6d9f521f7ae Mon Sep 17 00:00:00 2001 From: Jean Pihet Date: Tue, 7 Sep 2010 09:21:32 +0200 Subject: tracing, perf: Add more power related events This patch adds new generic events for dynamic power management tracing: - clock events class: used for clock enable/disable and for clock rate change, - power_domain events class: used for power domains transitions. The OMAP architecture will be using the new events for PM debugging, however the new events are made generic enough to be used by all platforms. Signed-off-by: Jean Pihet Acked-by: Thomas Renninger Cc: discuss@lesswatts.org Cc: linux-pm@lists.linux-foundation.org Cc: Thomas Renninger Cc: Arjan van de Ven Cc: Peter Zijlstra Cc: Len Brown Cc: Kevin Hilman LKML-Reference: Signed-off-by: Ingo Molnar --- include/trace/events/power.h | 90 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 87 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/power.h b/include/trace/events/power.h index 35a2a6e7bf1e..286784d69b8f 100644 --- a/include/trace/events/power.h +++ b/include/trace/events/power.h @@ -10,12 +10,17 @@ #ifndef _TRACE_POWER_ENUM_ #define _TRACE_POWER_ENUM_ enum { - POWER_NONE = 0, - POWER_CSTATE = 1, - POWER_PSTATE = 2, + POWER_NONE = 0, + POWER_CSTATE = 1, /* C-State */ + POWER_PSTATE = 2, /* Fequency change or DVFS */ + POWER_SSTATE = 3, /* Suspend */ }; #endif +/* + * The power events are used for cpuidle & suspend (power_start, power_end) + * and for cpufreq (power_frequency) + */ DECLARE_EVENT_CLASS(power, TP_PROTO(unsigned int type, unsigned int state, unsigned int cpu_id), @@ -70,6 +75,85 @@ TRACE_EVENT(power_end, ); +/* + * The clock events are used for clock enable/disable and for + * clock rate change + */ +DECLARE_EVENT_CLASS(clock, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id), + + TP_STRUCT__entry( + __string( name, name ) + __field( u64, state ) + __field( u64, cpu_id ) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->state = state; + __entry->cpu_id = cpu_id; + ), + + TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), + (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) +); + +DEFINE_EVENT(clock, clock_enable, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +DEFINE_EVENT(clock, clock_disable, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +DEFINE_EVENT(clock, clock_set_rate, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + +/* + * The power domain events are used for power domains transitions + */ +DECLARE_EVENT_CLASS(power_domain, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id), + + TP_STRUCT__entry( + __string( name, name ) + __field( u64, state ) + __field( u64, cpu_id ) + ), + + TP_fast_assign( + __assign_str(name, name); + __entry->state = state; + __entry->cpu_id = cpu_id; +), + + TP_printk("%s state=%lu cpu_id=%lu", __get_str(name), + (unsigned long)__entry->state, (unsigned long)__entry->cpu_id) +); + +DEFINE_EVENT(power_domain, power_domain_target, + + TP_PROTO(const char *name, unsigned int state, unsigned int cpu_id), + + TP_ARGS(name, state, cpu_id) +); + #endif /* _TRACE_POWER_H */ /* This part must be outside protection */ -- cgit v1.2.3 From a8027073eb127cd207070891374b5c54c2ce3d23 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Sep 2010 15:13:34 -0400 Subject: tracing/sched: Add sched_pi_setprio tracepoint Add a tracepoint that shows the priority of a task being boosted via priority inheritance. Cc: Gregory Haskins Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/events/sched.h | 29 +++++++++++++++++++++++++++++ kernel/sched.c | 1 + 2 files changed, 30 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9208c92aeab5..f6334782a593 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime, (unsigned long long)__entry->vruntime) ); +/* + * Tracepoint for showing priority inheritance modifying a tasks + * priority. + */ +TRACE_EVENT(sched_pi_setprio, + + TP_PROTO(struct task_struct *tsk, int newprio), + + TP_ARGS(tsk, newprio), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, oldprio ) + __field( int, newprio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->oldprio = tsk->prio; + __entry->newprio = newprio; + ), + + TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", + __entry->comm, __entry->pid, + __entry->oldprio, __entry->newprio) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched.c b/kernel/sched.c index 9ca8ad05950b..4ad473814350 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4355,6 +4355,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) rq = task_rq_lock(p, &flags); + trace_sched_pi_setprio(p, prio); oldprio = p->prio; prev_class = p->sched_class; on_rq = p->se.on_rq; -- cgit v1.2.3 From 97bd234701b2b39a0e749c1fe0e44f1d14c94292 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:41:14 +0200 Subject: workqueue: prepare for more tracepoints Define workqueue_work event class and use it for workqueue_execute_end trace point. Also, move trace/events/workqueue.h include downwards such that all struct definitions are visible to it. This is to prepare for more tracepoints and doesn't cause any functional change. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- include/trace/events/workqueue.h | 32 +++++++++++++++++++------------- kernel/workqueue.c | 6 +++--- 2 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 49682d7e9d60..ec9d7244eb9f 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -7,13 +7,7 @@ #include #include -/** - * workqueue_execute_start - called immediately before the workqueue callback - * @work: pointer to struct work_struct - * - * Allows to track workqueue execution. - */ -TRACE_EVENT(workqueue_execute_start, +DECLARE_EVENT_CLASS(workqueue_work, TP_PROTO(struct work_struct *work), @@ -21,24 +15,22 @@ TRACE_EVENT(workqueue_execute_start, TP_STRUCT__entry( __field( void *, work ) - __field( void *, function) ), TP_fast_assign( __entry->work = work; - __entry->function = work->func; ), - TP_printk("work struct %p: function %pf", __entry->work, __entry->function) + TP_printk("work struct %p", __entry->work) ); /** - * workqueue_execute_end - called immediately before the workqueue callback + * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct * * Allows to track workqueue execution. */ -TRACE_EVENT(workqueue_execute_end, +TRACE_EVENT(workqueue_execute_start, TP_PROTO(struct work_struct *work), @@ -46,15 +38,29 @@ TRACE_EVENT(workqueue_execute_end, TP_STRUCT__entry( __field( void *, work ) + __field( void *, function) ), TP_fast_assign( __entry->work = work; + __entry->function = work->func; ), - TP_printk("work struct %p", __entry->work) + TP_printk("work struct %p: function %pf", __entry->work, __entry->function) ); +/** + * workqueue_execute_end - called immediately before the workqueue callback + * @work: pointer to struct work_struct + * + * Allows to track workqueue execution. + */ +DEFINE_EVENT(workqueue_work, workqueue_execute_end, + + TP_PROTO(struct work_struct *work), + + TP_ARGS(work) +); #endif /* _TRACE_WORKQUEUE_H */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 19e4bc15ee99..026f778e879b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -42,9 +42,6 @@ #include #include -#define CREATE_TRACE_POINTS -#include - #include "workqueue_sched.h" enum { @@ -257,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq); EXPORT_SYMBOL_GPL(system_nrt_wq); EXPORT_SYMBOL_GPL(system_unbound_wq); +#define CREATE_TRACE_POINTS +#include + #define for_each_busy_worker(worker, i, pos, gcwq) \ for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) -- cgit v1.2.3 From cdadf0097cdca06c497ffaeb5982e028c6e4ed38 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Oct 2010 10:49:55 +0200 Subject: workqueue: add queue_work and activate_work trace points These two tracepoints allow tracking when and how a work is queued and activated. This patch is based on Frederic's patch to add queue_work trace point. Signed-off-by: Tejun Heo Cc: Frederic Weisbecker --- include/trace/events/workqueue.h | 53 ++++++++++++++++++++++++++++++++++++++++ kernel/workqueue.c | 3 +++ 2 files changed, 56 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index ec9d7244eb9f..7d497291c85d 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -24,6 +24,59 @@ DECLARE_EVENT_CLASS(workqueue_work, TP_printk("work struct %p", __entry->work) ); +/** + * workqueue_queue_work - called when a work gets queued + * @req_cpu: the requested cpu + * @cwq: pointer to struct cpu_workqueue_struct + * @work: pointer to struct work_struct + * + * This event occurs when a work is queued immediately or once a + * delayed work is actually queued on a workqueue (ie: once the delay + * has been reached). + */ +TRACE_EVENT(workqueue_queue_work, + + TP_PROTO(unsigned int req_cpu, struct cpu_workqueue_struct *cwq, + struct work_struct *work), + + TP_ARGS(req_cpu, cwq, work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + __field( void *, workqueue) + __field( unsigned int, req_cpu ) + __field( unsigned int, cpu ) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = work->func; + __entry->workqueue = cwq->wq; + __entry->req_cpu = req_cpu; + __entry->cpu = cwq->gcwq->cpu; + ), + + TP_printk("work struct=%p function=%pf workqueue=%p req_cpu=%u cpu=%u", + __entry->work, __entry->function, __entry->workqueue, + __entry->req_cpu, __entry->cpu) +); + +/** + * workqueue_activate_work - called when a work gets activated + * @work: pointer to struct work_struct + * + * This event occurs when a queued work is put on the active queue, + * which happens immediately after queueing unless @max_active limit + * is reached. + */ +DEFINE_EVENT(workqueue_work, workqueue_activate_work, + + TP_PROTO(struct work_struct *work), + + TP_ARGS(work) +); + /** * workqueue_execute_start - called immediately before the workqueue callback * @work: pointer to struct work_struct diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 026f778e879b..cb2ccfbed0c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -997,6 +997,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, /* gcwq determined, get cwq and queue */ cwq = get_cwq(gcwq->cpu, wq); + trace_workqueue_queue_work(cpu, cwq, work); BUG_ON(!list_empty(&work->entry)); @@ -1004,6 +1005,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, work_flags = work_color_to_flags(cwq->work_color); if (likely(cwq->nr_active < cwq->max_active)) { + trace_workqueue_activate_work(work); cwq->nr_active++; worklist = gcwq_determine_ins_pos(gcwq, cwq); } else { @@ -1679,6 +1681,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq) struct work_struct, entry); struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq); + trace_workqueue_activate_work(work); move_linked_works(work, pos, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); cwq->nr_active++; -- cgit v1.2.3 From f4bc6bb2d562703eafc895c37e7be20906de139d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Oct 2010 15:00:13 +0200 Subject: tracing: Cleanup the convoluted softirq tracepoints With the addition of trace_softirq_raise() the softirq tracepoint got even more convoluted. Why the tracepoints take two pointers to assign an integer is beyond my comprehension. But adding an extra case which treats the first pointer as an unsigned long when the second pointer is NULL including the back and forth type casting is just horrible. Convert the softirq tracepoints to take a single unsigned int argument for the softirq vector number and fix the call sites. Signed-off-by: Thomas Gleixner LKML-Reference: Acked-by: Peter Zijlstra Acked-by: mathieu.desnoyers@efficios.com Cc: Frederic Weisbecker Cc: Steven Rostedt --- include/linux/interrupt.h | 2 +- include/trace/events/irq.h | 54 +++++++++++++++++----------------------------- kernel/softirq.c | 16 ++++++++------ 3 files changed, 30 insertions(+), 42 deletions(-) (limited to 'include/trace') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 531495db1708..0ac194946fec 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -410,7 +410,7 @@ extern void open_softirq(int nr, void (*action)(struct softirq_action *)); extern void softirq_init(void); static inline void __raise_softirq_irqoff(unsigned int nr) { - trace_softirq_raise((struct softirq_action *)(unsigned long)nr, NULL); + trace_softirq_raise(nr); or_softirq_pending(1UL << nr); } diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 6fa7cbab7d93..1c09820df585 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -86,76 +86,62 @@ TRACE_EVENT(irq_handler_exit, DECLARE_EVENT_CLASS(softirq, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec), + TP_ARGS(vec_nr), TP_STRUCT__entry( - __field( int, vec ) + __field( unsigned int, vec ) ), TP_fast_assign( - if (vec) - __entry->vec = (int)(h - vec); - else - __entry->vec = (int)(long)h; + __entry->vec = vec_nr; ), - TP_printk("vec=%d [action=%s]", __entry->vec, + TP_printk("vec=%u [action=%s]", __entry->vec, show_softirq_name(__entry->vec)) ); /** * softirq_entry - called immediately before the softirq handler - * @h: pointer to struct softirq_action - * @vec: pointer to first struct softirq_action in softirq_vec array + * @vec_nr: softirq vector number * - * The @h parameter, contains a pointer to the struct softirq_action - * which has a pointer to the action handler that is called. By subtracting - * the @vec pointer from the @h pointer, we can determine the softirq - * number. Also, when used in combination with the softirq_exit tracepoint - * we can determine the softirq latency. + * When used in combination with the softirq_exit tracepoint + * we can determine the softirq handler runtine. */ DEFINE_EVENT(softirq, softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec) + TP_ARGS(vec_nr) ); /** * softirq_exit - called immediately after the softirq handler returns - * @h: pointer to struct softirq_action - * @vec: pointer to first struct softirq_action in softirq_vec array + * @vec_nr: softirq vector number * - * The @h parameter contains a pointer to the struct softirq_action - * that has handled the softirq. By subtracting the @vec pointer from - * the @h pointer, we can determine the softirq number. Also, when used in - * combination with the softirq_entry tracepoint we can determine the softirq - * latency. + * When used in combination with the softirq_entry tracepoint + * we can determine the softirq handler runtine. */ DEFINE_EVENT(softirq, softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec) + TP_ARGS(vec_nr) ); /** * softirq_raise - called immediately when a softirq is raised - * @h: pointer to struct softirq_action - * @vec: pointer to first struct softirq_action in softirq_vec array + * @vec_nr: softirq vector number * - * The @h parameter contains a pointer to the softirq vector number which is - * raised. @vec is NULL and it means @h includes vector number not - * softirq_action. When used in combination with the softirq_entry tracepoint - * we can determine the softirq raise latency. + * When used in combination with the softirq_entry tracepoint + * we can determine the softirq raise to run latency. */ DEFINE_EVENT(softirq, softirq_raise, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_PROTO(unsigned int vec_nr), - TP_ARGS(h, vec) + TP_ARGS(vec_nr) ); #endif /* _TRACE_IRQ_H */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 07b4f1b1a73a..b3cb1dc15795 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -212,18 +212,20 @@ restart: do { if (pending & 1) { + unsigned int vec_nr = h - softirq_vec; int prev_count = preempt_count(); - kstat_incr_softirqs_this_cpu(h - softirq_vec); - trace_softirq_entry(h, softirq_vec); + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); h->action(h); - trace_softirq_exit(h, softirq_vec); + trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { - printk(KERN_ERR "huh, entered softirq %td %s %p" + printk(KERN_ERR "huh, entered softirq %u %s %p" "with preempt_count %08x," - " exited with %08x?\n", h - softirq_vec, - softirq_to_name[h - softirq_vec], - h->action, prev_count, preempt_count()); + " exited with %08x?\n", vec_nr, + softirq_to_name[vec_nr], h->action, + prev_count, preempt_count()); preempt_count() = prev_count; } -- cgit v1.2.3 From 1b430beee5e388605dfb092b214ef0320f752cf6 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 26 Oct 2010 14:21:26 -0700 Subject: writeback: remove nonblocking/encountered_congestion references This removes more dead code that was somehow missed by commit 0d99519efef (writeback: remove unused nonblocking and congestion checks). There are no behavior change except for the removal of two entries from one of the ext4 tracing interface. The nonblocking checks in ->writepages are no longer used because the flusher now prefer to block on get_request_wait() than to skip inodes on IO congestion. The latter will lead to more seeky IO. The nonblocking checks in ->writepage are no longer used because it's redundant with the WB_SYNC_NONE check. We no long set ->nonblocking in VM page out and page migration, because a) it's effectively redundant with WB_SYNC_NONE in current code b) it's old semantic of "Don't get stuck on request queues" is mis-behavior: that would skip some dirty inodes on congestion and page out others, which is unfair in terms of LRU age. Inspired by Christoph Hellwig. Thanks! Signed-off-by: Wu Fengguang Cc: Theodore Ts'o Cc: David Howells Cc: Sage Weil Cc: Steve French Cc: Chris Mason Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/write.c | 19 +------------------ fs/buffer.c | 2 +- fs/ceph/addr.c | 9 --------- fs/cifs/file.c | 10 ---------- fs/gfs2/meta_io.c | 2 +- fs/nfs/write.c | 4 +--- fs/reiserfs/inode.c | 2 +- fs/xfs/linux-2.6/xfs_aops.c | 3 +-- include/trace/events/ext4.h | 8 +++++--- include/trace/events/writeback.h | 2 -- mm/migrate.c | 1 - mm/vmscan.c | 1 - 12 files changed, 11 insertions(+), 52 deletions(-) (limited to 'include/trace') diff --git a/fs/afs/write.c b/fs/afs/write.c index 722743b152d8..15690bb1d3b5 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -438,7 +438,6 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - struct backing_dev_info *bdi = page->mapping->backing_dev_info; struct afs_writeback *wb; int ret; @@ -455,8 +454,6 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) } wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) - wbc->encountered_congestion = 1; _leave(" = 0"); return 0; @@ -469,7 +466,6 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, pgoff_t index, pgoff_t end, pgoff_t *_next) { - struct backing_dev_info *bdi = mapping->backing_dev_info; struct afs_writeback *wb; struct page *page; int ret, n; @@ -529,11 +525,6 @@ static int afs_writepages_region(struct address_space *mapping, wbc->nr_to_write -= ret; - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - break; - } - cond_resched(); } while (index < end && wbc->nr_to_write > 0); @@ -548,24 +539,16 @@ static int afs_writepages_region(struct address_space *mapping, int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; pgoff_t start, end, next; int ret; _enter(""); - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - _leave(" = 0 [congest]"); - return 0; - } - if (wbc->range_cyclic) { start = mapping->writeback_index; end = -1; ret = afs_writepages_region(mapping, wbc, start, end, &next); - if (start > 0 && wbc->nr_to_write > 0 && ret == 0 && - !(wbc->nonblocking && wbc->encountered_congestion)) + if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); mapping->writeback_index = next; diff --git a/fs/buffer.c b/fs/buffer.c index 7f0b9b083f77..ec21a92e08b4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1706,7 +1706,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, * and kswapd activity, but those code paths have their own * higher-level throttling. */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 51bcc5ce3230..e9c874abc9e1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -591,7 +591,6 @@ static int ceph_writepages_start(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct backing_dev_info *bdi = mapping->backing_dev_info; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc; pgoff_t index, start, end; @@ -633,13 +632,6 @@ static int ceph_writepages_start(struct address_space *mapping, pagevec_init(&pvec, 0); - /* ?? */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - dout(" writepages congested\n"); - wbc->encountered_congestion = 1; - goto out_final; - } - /* where to start/end? */ if (wbc->range_cyclic) { start = mapping->writeback_index; /* Start from prev offset */ @@ -885,7 +877,6 @@ out: rc = 0; /* vfs expects us to return 0 */ ceph_put_snap_context(snapc); dout("writepages done, rc = %d\n", rc); -out_final: return rc; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 8c81e7b14d53..45af003865d2 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1303,7 +1303,6 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) static int cifs_writepages(struct address_space *mapping, struct writeback_control *wbc) { - struct backing_dev_info *bdi = mapping->backing_dev_info; unsigned int bytes_to_write; unsigned int bytes_written; struct cifs_sb_info *cifs_sb; @@ -1326,15 +1325,6 @@ static int cifs_writepages(struct address_space *mapping, int scanned = 0; int xid, long_op; - /* - * BB: Is this meaningful for a non-block-device file system? - * If it is, we should test it again after we do I/O - */ - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } - cifs_sb = CIFS_SB(mapping->host->i_sb); /* diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f3b071f921aa..939739c7b3f9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -55,7 +55,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb * activity, but those code paths have their own higher-level * throttling. */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else if (!trylock_buffer(bh)) { redirty_page_for_writepage(wbc, page); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 605e292501f4..4c14c17a5276 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -290,9 +290,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page->index); - ret = nfs_page_async_flush(pgio, page, - wbc->sync_mode == WB_SYNC_NONE || - wbc->nonblocking != 0); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index caa758377d66..c1f93896cb53 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2438,7 +2438,7 @@ static int reiserfs_write_full_page(struct page *page, /* from this point on, we know the buffer is mapped to a * real block and not a direct item */ - if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + if (wbc->sync_mode != WB_SYNC_NONE) { lock_buffer(bh); } else { if (!trylock_buffer(bh)) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index b552f816de15..c9af48fffcd7 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1139,8 +1139,7 @@ xfs_vm_writepage( type = IO_DELAY; flags = BMAPI_ALLOCATE; - if (wbc->sync_mode == WB_SYNC_NONE && - wbc->nonblocking) + if (wbc->sync_mode == WB_SYNC_NONE) flags |= BMAPI_TRYLOCK; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 01e9e0076a92..6bcb00645de4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -242,18 +242,20 @@ TRACE_EVENT(ext4_da_writepages, __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; - __entry->nonblocking = wbc->nonblocking; __entry->for_kupdate = wbc->for_kupdate; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", + TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld " + "range_start %llu range_end %llu " + "for_kupdate %d for_reclaim %d " + "range_cyclic %d writeback_index %lu", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, - __entry->range_end, __entry->nonblocking, + __entry->range_end, __entry->for_kupdate, __entry->for_reclaim, __entry->range_cyclic, (unsigned long) __entry->writeback_index) diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index f345f66ae9d1..0bb01ab2e984 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -96,8 +96,6 @@ DECLARE_EVENT_CLASS(wbc_class, __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) - __field(int, nonblocking) - __field(int, encountered_congestion) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) diff --git a/mm/migrate.c b/mm/migrate.c index f8c9bccf2520..d917ac3207f5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -497,7 +497,6 @@ static int writeout(struct address_space *mapping, struct page *page) .nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; diff --git a/mm/vmscan.c b/mm/vmscan.c index b94c9464f262..6cbc1aac23ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -376,7 +376,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; -- cgit v1.2.3 From e11da5b4fdf01d71d73c21cb92b00595b917d7fd Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:40 -0700 Subject: tracing, vmscan: add trace events for LRU list shrinking There have been numerous reports of stalls that pointed at the problem being somewhere in the VM. There are multiple roots to the problems which means dealing with any of the root problems in isolation is tricky to justify on their own and they would still need integration testing. This patch series puts together two different patch sets which in combination should tackle some of the root causes of latency problems being reported. Patch 1 adds a tracepoint for shrink_inactive_list. For this series, the most important results is being able to calculate the scanning/reclaim ratio as a measure of the amount of work being done by page reclaim. Patch 2 accounts for time spent in congestion_wait. Patches 3-6 were originally developed by Kosaki Motohiro but reworked for this series. It has been noted that lumpy reclaim is far too aggressive and trashes the system somewhat. As SLUB uses high-order allocations, a large cost incurred by lumpy reclaim will be noticeable. It was also reported during transparent hugepage support testing that lumpy reclaim was trashing the system and these patches should mitigate that problem without disabling lumpy reclaim. Patch 7 adds wait_iff_congested() and replaces some callers of congestion_wait(). wait_iff_congested() only sleeps if there is a BDI that is currently congested. Patch 8 notes that any BDI being congested is not necessarily a problem because there could be multiple BDIs of varying speeds and numberous zones. It attempts to track when a zone being reclaimed contains many pages backed by a congested BDI and if so, reclaimers wait on the congestion queue. I ran a number of tests with monitoring on X86, X86-64 and PPC64. Each machine had 3G of RAM and the CPUs were X86: Intel P4 2-core X86-64: AMD Phenom 4-core PPC64: PPC970MP Each used a single disk and the onboard IO controller. Dirty ratio was left at 20. I'm just going to report for X86-64 and PPC64 in a vague attempt to keep this report short. Four kernels were tested each based on v2.6.36-rc4 traceonly-v2r2: Patches 1 and 2 to instrument vmscan reclaims and congestion_wait lowlumpy-v2r3: Patches 1-6 to test if lumpy reclaim is better waitcongest-v2r3: Patches 1-7 to only wait on congestion waitwriteback-v2r4: Patches 1-8 to detect when a zone is congested nocongest-v1r5: Patches 1-3 for testing wait_iff_congestion nodirect-v1r5: Patches 1-10 to disable filesystem writeback for better IO The tests run were as follows kernbench compile-based benchmark. Smoke test performance sysbench OLTP read-only benchmark. Will be re-run in the future as read-write micro-mapped-file-stream This is a micro-benchmark from Johannes Weiner that accesses a large sparse-file through mmap(). It was configured to run in only single-CPU mode but can be indicative of how well page reclaim identifies suitable pages. stress-highalloc Tries to allocate huge pages under heavy load. kernbench, iozone and sysbench did not report any performance regression on any machine. sysbench did pressure the system lightly and there was reclaim activity but there were no difference of major interest between the kernels. X86-64 micro-mapped-file-stream traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 pgalloc_dma 1639.00 ( 0.00%) 667.00 (-145.73%) 1167.00 ( -40.45%) 578.00 (-183.56%) pgalloc_dma32 2842410.00 ( 0.00%) 2842626.00 ( 0.01%) 2843043.00 ( 0.02%) 2843014.00 ( 0.02%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 729.00 ( 0.00%) 85.00 (-757.65%) 609.00 ( -19.70%) 125.00 (-483.20%) pgsteal_dma32 2338721.00 ( 0.00%) 2447354.00 ( 4.44%) 2429536.00 ( 3.74%) 2436772.00 ( 4.02%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 1469.00 ( 0.00%) 532.00 (-176.13%) 1078.00 ( -36.27%) 220.00 (-567.73%) pgscan_kswapd_dma32 4597713.00 ( 0.00%) 4503597.00 ( -2.09%) 4295673.00 ( -7.03%) 3891686.00 ( -18.14%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 71.00 ( 0.00%) 134.00 ( 47.01%) 243.00 ( 70.78%) 352.00 ( 79.83%) pgscan_direct_dma32 305820.00 ( 0.00%) 280204.00 ( -9.14%) 600518.00 ( 49.07%) 957485.00 ( 68.06%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 16296.00 ( 0.00%) 21254.00 ( 23.33%) 18447.00 ( 11.66%) 20067.00 ( 18.79%) allocstall 443.00 ( 0.00%) 273.00 ( -62.27%) 513.00 ( 13.65%) 1568.00 ( 71.75%) These are based on the raw figures taken from /proc/vmstat. It's a rough measure of reclaim activity. Note that allocstall counts are higher because we are entering direct reclaim more often as a result of not sleeping in congestion. In itself, it's not necessarily a bad thing. It's easier to get a view of what happened from the vmscan tracepoint report. FTrace Reclaim Statistics: vmscan traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3 waitwriteback-v2r4 Direct reclaims 443 273 513 1568 Direct reclaim pages scanned 305968 280402 600825 957933 Direct reclaim pages reclaimed 43503 19005 30327 117191 Direct reclaim write file async I/O 0 0 0 0 Direct reclaim write anon async I/O 0 3 4 12 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 187649 132338 191695 267701 Kswapd wakeups 3 1 4 1 Kswapd pages scanned 4599269 4454162 4296815 3891906 Kswapd pages reclaimed 2295947 2428434 2399818 2319706 Kswapd reclaim write file async I/O 1 0 1 1 Kswapd reclaim write anon async I/O 59 187 41 222 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4.34 2.52 6.63 2.96 Time kswapd awake (seconds) 11.15 10.25 11.01 10.19 Total pages scanned 4905237 4734564 4897640 4849839 Total pages reclaimed 2339450 2447439 2430145 2436897 %age total pages scanned/reclaimed 47.69% 51.69% 49.62% 50.25% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 29.23% 19.02% 38.48% 20.25% Percentage Time kswapd Awake 78.58% 78.85% 76.83% 79.86% What is interesting here for nocongest in particular is that while direct reclaim scans more pages, the overall number of pages scanned remains the same and the ratio of pages scanned to pages reclaimed is more or less the same. In other words, while we are sleeping less, reclaim is not doing more work and as direct reclaim and kswapd is awake for less time, it would appear to be doing less work. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 87 196 64 0 Direct time congest waited 4604ms 4732ms 5420ms 0ms Direct full congest waited 72 145 53 0 Direct number conditional waited 0 0 324 1315 Direct time conditional waited 0ms 0ms 0ms 0ms Direct full conditional waited 0 0 0 0 KSwapd number congest waited 20 10 15 7 KSwapd time congest waited 1264ms 536ms 884ms 284ms KSwapd full congest waited 10 4 6 2 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 8 seconds asleep in direct reclaim and no time at all asleep with the patches. MMTests Statistics: duration User/Sys Time Running Test (seconds) 10.51 10.73 10.6 11.66 Total Elapsed Time (seconds) 14.19 13.00 14.33 12.76 Overall, the tests completed faster. It is interesting to note that backing off further when a zone is congested and not just a BDI was more efficient overall. PPC64 micro-mapped-file-stream pgalloc_dma 3024660.00 ( 0.00%) 3027185.00 ( 0.08%) 3025845.00 ( 0.04%) 3026281.00 ( 0.05%) pgalloc_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgsteal_dma 2508073.00 ( 0.00%) 2565351.00 ( 2.23%) 2463577.00 ( -1.81%) 2532263.00 ( 0.96%) pgsteal_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_kswapd_dma 4601307.00 ( 0.00%) 4128076.00 ( -11.46%) 3912317.00 ( -17.61%) 3377165.00 ( -36.25%) pgscan_kswapd_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pgscan_direct_dma 629825.00 ( 0.00%) 971622.00 ( 35.18%) 1063938.00 ( 40.80%) 1711935.00 ( 63.21%) pgscan_direct_normal 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) 0.00 ( 0.00%) pageoutrun 27776.00 ( 0.00%) 20458.00 ( -35.77%) 18763.00 ( -48.04%) 18157.00 ( -52.98%) allocstall 977.00 ( 0.00%) 2751.00 ( 64.49%) 2098.00 ( 53.43%) 5136.00 ( 80.98%) Similar trends to x86-64. allocstalls are up but it's not necessarily bad. FTrace Reclaim Statistics: vmscan Direct reclaims 977 2709 2098 5136 Direct reclaim pages scanned 629825 963814 1063938 1711935 Direct reclaim pages reclaimed 75550 242538 150904 387647 Direct reclaim write file async I/O 0 0 0 2 Direct reclaim write anon async I/O 0 10 0 4 Direct reclaim write file sync I/O 0 0 0 0 Direct reclaim write anon sync I/O 0 0 0 0 Wake kswapd requests 392119 1201712 571935 571921 Kswapd wakeups 3 2 3 3 Kswapd pages scanned 4601307 4128076 3912317 3377165 Kswapd pages reclaimed 2432523 2318797 2312673 2144616 Kswapd reclaim write file async I/O 20 1 1 1 Kswapd reclaim write anon async I/O 57 132 11 121 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 6.19 7.30 13.04 10.88 Time kswapd awake (seconds) 21.73 26.51 25.55 23.90 Total pages scanned 5231132 5091890 4976255 5089100 Total pages reclaimed 2508073 2561335 2463577 2532263 %age total pages scanned/reclaimed 47.95% 50.30% 49.51% 49.76% %age total pages scanned/written 0.00% 0.00% 0.00% 0.00% %age file pages scanned/written 0.00% 0.00% 0.00% 0.00% Percentage Time Spent Direct Reclaim 18.89% 20.65% 32.65% 27.65% Percentage Time kswapd Awake 72.39% 80.68% 78.21% 77.40% Again, a similar trend that the congestion_wait changes mean that direct reclaim scans more pages but the overall number of pages scanned while slightly reduced, are very similar. The ratio of scanning/reclaimed remains roughly similar. The downside is that kswapd and direct reclaim was awake longer and for a larger percentage of the overall workload. It's possible there were big differences in the amount of time spent reclaiming slab pages between the different kernels which is plausible considering that the micro tests runs after fsmark and sysbench. Trace Reclaim Statistics: congestion_wait Direct number congest waited 845 1312 104 0 Direct time congest waited 19416ms 26560ms 7544ms 0ms Direct full congest waited 745 1105 72 0 Direct number conditional waited 0 0 1322 2935 Direct time conditional waited 0ms 0ms 12ms 312ms Direct full conditional waited 0 0 0 3 KSwapd number congest waited 39 102 75 63 KSwapd time congest waited 2484ms 6760ms 5756ms 3716ms KSwapd full congest waited 20 48 46 25 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 The vanilla kernel spent 20 seconds asleep in direct reclaim and only 312ms asleep with the patches. The time kswapd spent congest waited was also reduced by a large factor. MMTests Statistics: duration ser/Sys Time Running Test (seconds) 26.58 28.05 26.9 28.47 Total Elapsed Time (seconds) 30.02 32.86 32.67 30.88 With all patches applies, the completion times are very similar. X86-64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 82.00 ( 0.00%) 84.00 ( 2.00%) 85.00 ( 3.00%) 85.00 ( 3.00%) Pass 2 90.00 ( 0.00%) 87.00 (-3.00%) 88.00 (-2.00%) 89.00 (-1.00%) At Rest 92.00 ( 0.00%) 90.00 (-2.00%) 90.00 (-2.00%) 91.00 (-1.00%) Success figures across the board are broadly similar. traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 1045 944 886 887 Direct reclaim pages scanned 135091 119604 109382 101019 Direct reclaim pages reclaimed 88599 47535 47863 46671 Direct reclaim write file async I/O 494 283 465 280 Direct reclaim write anon async I/O 29357 13710 16656 13462 Direct reclaim write file sync I/O 154 2 2 3 Direct reclaim write anon sync I/O 14594 571 509 561 Wake kswapd requests 7491 933 872 892 Kswapd wakeups 814 778 731 780 Kswapd pages scanned 7290822 15341158 11916436 13703442 Kswapd pages reclaimed 3587336 3142496 3094392 3187151 Kswapd reclaim write file async I/O 91975 32317 28022 29628 Kswapd reclaim write anon async I/O 1992022 789307 829745 849769 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 4588.93 2467.16 2495.41 2547.07 Time kswapd awake (seconds) 2497.66 1020.16 1098.06 1176.82 Total pages scanned 7425913 15460762 12025818 13804461 Total pages reclaimed 3675935 3190031 3142255 3233822 %age total pages scanned/reclaimed 49.50% 20.63% 26.13% 23.43% %age total pages scanned/written 28.66% 5.41% 7.28% 6.47% %age file pages scanned/written 1.25% 0.21% 0.24% 0.22% Percentage Time Spent Direct Reclaim 57.33% 42.15% 42.41% 42.99% Percentage Time kswapd Awake 43.56% 27.87% 29.76% 31.25% Scanned/reclaimed ratios again look good with big improvements in efficiency. The Scanned/written ratios also look much improved. With a better scanned/written ration, there is an expectation that IO would be more efficient and indeed, the time spent in direct reclaim is much reduced by the full series and kswapd spends a little less time awake. Overall, indications here are that allocations were happening much faster and this can be seen with a graph of the latency figures as the allocations were taking place http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-hydra-mean.ps FTrace Reclaim Statistics: congestion_wait Direct number congest waited 1333 204 169 4 Direct time congest waited 78896ms 8288ms 7260ms 200ms Direct full congest waited 756 92 69 2 Direct number conditional waited 0 0 26 186 Direct time conditional waited 0ms 0ms 0ms 2504ms Direct full conditional waited 0 0 0 25 KSwapd number congest waited 4 395 227 282 KSwapd time congest waited 384ms 25136ms 10508ms 18380ms KSwapd full congest waited 3 232 98 176 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 KSwapd full conditional waited 318 0 312 9 Overall, the time spent speeping is reduced. kswapd is still hitting congestion_wait() but that is because there are callers remaining where it wasn't clear in advance if they should be changed to wait_iff_congested() or not. Overall the sleep imes are reduced though - from 79ish seconds to about 19. MMTests Statistics: duration User/Sys Time Running Test (seconds) 3415.43 3386.65 3388.39 3377.5 Total Elapsed Time (seconds) 5733.48 3660.33 3689.41 3765.39 With the full series, the time to complete the tests are reduced by 30% PPC64 STRESS-HIGHALLOC traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Pass 1 17.00 ( 0.00%) 34.00 (17.00%) 38.00 (21.00%) 43.00 (26.00%) Pass 2 25.00 ( 0.00%) 37.00 (12.00%) 42.00 (17.00%) 46.00 (21.00%) At Rest 49.00 ( 0.00%) 43.00 (-6.00%) 45.00 (-4.00%) 51.00 ( 2.00%) Success rates there are *way* up particularly considering that the 16MB huge pages on PPC64 mean that it's always much harder to allocate them. FTrace Reclaim Statistics: vmscan stress-highalloc stress-highalloc stress-highalloc stress-highalloc traceonly-v2r2 lowlumpy-v2r3 waitcongest-v2r3waitwriteback-v2r4 Direct reclaims 499 505 564 509 Direct reclaim pages scanned 223478 41898 51818 45605 Direct reclaim pages reclaimed 137730 21148 27161 23455 Direct reclaim write file async I/O 399 136 162 136 Direct reclaim write anon async I/O 46977 2865 4686 3998 Direct reclaim write file sync I/O 29 0 1 3 Direct reclaim write anon sync I/O 31023 159 237 239 Wake kswapd requests 420 351 360 326 Kswapd wakeups 185 294 249 277 Kswapd pages scanned 15703488 16392500 17821724 17598737 Kswapd pages reclaimed 5808466 2908858 3139386 3145435 Kswapd reclaim write file async I/O 159938 18400 18717 13473 Kswapd reclaim write anon async I/O 3467554 228957 322799 234278 Kswapd reclaim write file sync I/O 0 0 0 0 Kswapd reclaim write anon sync I/O 0 0 0 0 Time stalled direct reclaim (seconds) 9665.35 1707.81 2374.32 1871.23 Time kswapd awake (seconds) 9401.21 1367.86 1951.75 1328.88 Total pages scanned 15926966 16434398 17873542 17644342 Total pages reclaimed 5946196 2930006 3166547 3168890 %age total pages scanned/reclaimed 37.33% 17.83% 17.72% 17.96% %age total pages scanned/written 23.27% 1.52% 1.94% 1.43% %age file pages scanned/written 1.01% 0.11% 0.11% 0.08% Percentage Time Spent Direct Reclaim 44.55% 35.10% 41.42% 36.91% Percentage Time kswapd Awake 86.71% 43.58% 52.67% 41.14% While the scanning rates are slightly up, the scanned/reclaimed and scanned/written figures are much improved. The time spent in direct reclaim and with kswapd are massively reduced, mostly by the lowlumpy patches. FTrace Reclaim Statistics: congestion_wait Direct number congest waited 725 303 126 3 Direct time congest waited 45524ms 9180ms 5936ms 300ms Direct full congest waited 487 190 52 3 Direct number conditional waited 0 0 200 301 Direct time conditional waited 0ms 0ms 0ms 1904ms Direct full conditional waited 0 0 0 19 KSwapd number congest waited 0 2 23 4 KSwapd time congest waited 0ms 200ms 420ms 404ms KSwapd full congest waited 0 2 2 4 KSwapd number conditional waited 0 0 0 0 KSwapd time conditional waited 0ms 0ms 0ms 0ms KSwapd full conditional waited 0 0 0 0 Not as dramatic a story here but the time spent asleep is reduced and we can still see what wait_iff_congested is going to sleep when necessary. MMTests Statistics: duration User/Sys Time Running Test (seconds) 12028.09 3157.17 3357.79 3199.16 Total Elapsed Time (seconds) 10842.07 3138.72 3705.54 3229.85 The time to complete this test goes way down. With the full series, we are allocating over twice the number of huge pages in 30% of the time and there is a corresponding impact on the allocation latency graph available at. http://www.csn.ul.ie/~mel/postings/vmscanreduce-20101509/highalloc-interlatency-powyah-mean.ps This patch: Add a trace event for shrink_inactive_list() and updates the sample postprocessing script appropriately. It can be used to determine how many pages were reclaimed and for non-lumpy reclaim where exactly the pages were reclaimed from. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../trace/postprocess/trace-vmscan-postprocess.pl | 39 ++++++++++++++------ include/trace/events/vmscan.h | 42 ++++++++++++++++++++++ mm/vmscan.c | 6 ++++ 3 files changed, 77 insertions(+), 10 deletions(-) (limited to 'include/trace') diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 1b55146d1c8d..b3e73ddb1567 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -46,7 +46,7 @@ use constant HIGH_KSWAPD_LATENCY => 20; use constant HIGH_KSWAPD_REWAKEUP => 21; use constant HIGH_NR_SCANNED => 22; use constant HIGH_NR_TAKEN => 23; -use constant HIGH_NR_RECLAIM => 24; +use constant HIGH_NR_RECLAIMED => 24; use constant HIGH_NR_CONTIG_DIRTY => 25; my %perprocesspid; @@ -58,11 +58,13 @@ my $opt_read_procstat; my $total_wakeup_kswapd; my ($total_direct_reclaim, $total_direct_nr_scanned); my ($total_direct_latency, $total_kswapd_latency); +my ($total_direct_nr_reclaimed); my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async); my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async); my ($total_kswapd_nr_scanned, $total_kswapd_wake); my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async); my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async); +my ($total_kswapd_nr_reclaimed); # Catch sigint and exit on request my $sigint_report = 0; @@ -104,7 +106,7 @@ my $regex_kswapd_wake_default = 'nid=([0-9]*) order=([0-9]*)'; my $regex_kswapd_sleep_default = 'nid=([0-9]*)'; my $regex_wakeup_kswapd_default = 'nid=([0-9]*) zid=([0-9]*) order=([0-9]*)'; my $regex_lru_isolate_default = 'isolate_mode=([0-9]*) order=([0-9]*) nr_requested=([0-9]*) nr_scanned=([0-9]*) nr_taken=([0-9]*) contig_taken=([0-9]*) contig_dirty=([0-9]*) contig_failed=([0-9]*)'; -my $regex_lru_shrink_inactive_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*)'; +my $regex_lru_shrink_inactive_default = 'nid=([0-9]*) zid=([0-9]*) nr_scanned=([0-9]*) nr_reclaimed=([0-9]*) priority=([0-9]*) flags=([A-Z_|]*)'; my $regex_lru_shrink_active_default = 'lru=([A-Z_]*) nr_scanned=([0-9]*) nr_rotated=([0-9]*) priority=([0-9]*)'; my $regex_writepage_default = 'page=([0-9a-f]*) pfn=([0-9]*) flags=([A-Z_|]*)'; @@ -203,8 +205,8 @@ $regex_lru_shrink_inactive = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_inactive", $regex_lru_shrink_inactive_default, "nid", "zid", - "lru", - "nr_scanned", "nr_reclaimed", "priority"); + "nr_scanned", "nr_reclaimed", "priority", + "flags"); $regex_lru_shrink_active = generate_traceevent_regex( "vmscan/mm_vmscan_lru_shrink_active", $regex_lru_shrink_active_default, @@ -375,6 +377,16 @@ EVENT_PROCESS: my $nr_contig_dirty = $7; $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; $perprocesspid{$process_pid}->{HIGH_NR_CONTIG_DIRTY} += $nr_contig_dirty; + } elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") { + $details = $5; + if ($details !~ /$regex_lru_shrink_inactive/o) { + print "WARNING: Failed to parse mm_vmscan_lru_shrink_inactive as expected\n"; + print " $details\n"; + print " $regex_lru_shrink_inactive/o\n"; + next; + } + my $nr_reclaimed = $4; + $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED} += $nr_reclaimed; } elsif ($tracepoint eq "mm_vmscan_writepage") { $details = $5; if ($details !~ /$regex_writepage/o) { @@ -464,8 +476,8 @@ sub dump_stats { # Print out process activity printf("\n"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Time"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Sync-IO", "ASync-IO", "Stalled"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s\n", "Process", "Direct", "Wokeup", "Pages", "Pages", "Pages", "Pages", "Time"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s %8s %8s\n", "details", "Rclms", "Kswapd", "Scanned", "Rclmed", "Sync-IO", "ASync-IO", "Stalled"); foreach $process_pid (keys %stats) { if (!$stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}) { @@ -475,6 +487,7 @@ sub dump_stats { $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}; $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_direct_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -489,11 +502,12 @@ sub dump_stats { $index++; } - printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8.3f", + printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8u %8u %8.3f", $process_pid, $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}, $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}, $this_reclaim_delay / 1000); @@ -529,8 +543,8 @@ sub dump_stats { # Print out kswapd activity printf("\n"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages"); - printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Sync-IO", "ASync-IO"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Kswapd", "Kswapd", "Order", "Pages", "Pages", "Pages", "Pages"); + printf("%-" . $max_strlen . "s %8s %10s %8s %8s %8s %8s\n", "Instance", "Wakeups", "Re-wakeup", "Scanned", "Rclmed", "Sync-IO", "ASync-IO"); foreach $process_pid (keys %stats) { if (!$stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}) { @@ -539,16 +553,18 @@ sub dump_stats { $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}; $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_kswapd_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; $total_kswapd_writepage_anon_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}; - printf("%-" . $max_strlen . "s %8d %10d %8u %8i %8u", + printf("%-" . $max_strlen . "s %8d %10d %8u %8u %8i %8u", $process_pid, $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}, $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}); @@ -579,6 +595,7 @@ sub dump_stats { print "\nSummary\n"; print "Direct reclaims: $total_direct_reclaim\n"; print "Direct reclaim pages scanned: $total_direct_nr_scanned\n"; + print "Direct reclaim pages reclaimed: $total_direct_nr_reclaimed\n"; print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n"; print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n"; print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n"; @@ -588,6 +605,7 @@ sub dump_stats { print "\n"; print "Kswapd wakeups: $total_kswapd_wake\n"; print "Kswapd pages scanned: $total_kswapd_nr_scanned\n"; + print "Kswapd pages reclaimed: $total_kswapd_nr_reclaimed\n"; print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n"; print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n"; print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n"; @@ -612,6 +630,7 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}; $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED}; + $perprocess{$process}->{HIGH_NR_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 370aa5a87322..ecf952192a93 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -10,6 +10,7 @@ #define RECLAIM_WB_ANON 0x0001u #define RECLAIM_WB_FILE 0x0002u +#define RECLAIM_WB_MIXED 0x0010u #define RECLAIM_WB_SYNC 0x0004u #define RECLAIM_WB_ASYNC 0x0008u @@ -17,6 +18,7 @@ (flags) ? __print_flags(flags, "|", \ {RECLAIM_WB_ANON, "RECLAIM_WB_ANON"}, \ {RECLAIM_WB_FILE, "RECLAIM_WB_FILE"}, \ + {RECLAIM_WB_MIXED, "RECLAIM_WB_MIXED"}, \ {RECLAIM_WB_SYNC, "RECLAIM_WB_SYNC"}, \ {RECLAIM_WB_ASYNC, "RECLAIM_WB_ASYNC"} \ ) : "RECLAIM_WB_NONE" @@ -26,6 +28,12 @@ (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) +#define trace_shrink_flags(file, sync) ( \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ + (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + ) + TRACE_EVENT(mm_vmscan_kswapd_sleep, TP_PROTO(int nid), @@ -269,6 +277,40 @@ TRACE_EVENT(mm_vmscan_writepage, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_lru_shrink_inactive, + + TP_PROTO(int nid, int zid, + unsigned long nr_scanned, unsigned long nr_reclaimed, + int priority, int reclaim_flags), + + TP_ARGS(nid, zid, nr_scanned, nr_reclaimed, priority, reclaim_flags), + + TP_STRUCT__entry( + __field(int, nid) + __field(int, zid) + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_reclaimed) + __field(int, priority) + __field(int, reclaim_flags) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->zid = zid; + __entry->nr_scanned = nr_scanned; + __entry->nr_reclaimed = nr_reclaimed; + __entry->priority = priority; + __entry->reclaim_flags = reclaim_flags; + ), + + TP_printk("nid=%d zid=%d nr_scanned=%ld nr_reclaimed=%ld priority=%d flags=%s", + __entry->nid, __entry->zid, + __entry->nr_scanned, __entry->nr_reclaimed, + __entry->priority, + show_reclaim_flags(__entry->reclaim_flags)) +); + + #endif /* _TRACE_VMSCAN_H */ /* This part must be outside protection */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 28b0521e4bfd..4a6dccb57586 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1358,6 +1358,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + priority, + trace_shrink_flags(file, sc->lumpy_reclaim_mode)); return nr_reclaimed; } -- cgit v1.2.3 From 52bb9198668968506f9d12bf35d7f5d3f094921e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:41 -0700 Subject: writeback: account for time spent congestion_waited There is strong evidence to indicate a lot of time is being spent in congestion_wait(), some of it unnecessarily. This patch adds a tracepoint for congestion_wait to record when congestion_wait() was called, how long the timeout was for and how long it actually slept. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Johannes Weiner Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/writeback.h | 28 ++++++++++++++++++++++++++++ mm/backing-dev.c | 5 +++++ 2 files changed, 33 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 0bb01ab2e984..d2b2654606ec 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -151,6 +151,34 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written); DEFINE_WBC_EVENT(wbc_balance_dirty_wait); DEFINE_WBC_EVENT(wbc_writepage); +DECLARE_EVENT_CLASS(writeback_congest_waited_template, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed), + + TP_STRUCT__entry( + __field( unsigned int, usec_timeout ) + __field( unsigned int, usec_delayed ) + ), + + TP_fast_assign( + __entry->usec_timeout = usec_timeout; + __entry->usec_delayed = usec_delayed; + ), + + TP_printk("usec_timeout=%u usec_delayed=%u", + __entry->usec_timeout, + __entry->usec_delayed) +); + +DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 65d420499a61..55627306abe0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -764,12 +764,17 @@ EXPORT_SYMBOL(set_bdi_congested); long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); -- cgit v1.2.3 From 7d3579e8e61937cbba268ea9b218d006b6d64221 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 26 Oct 2010 14:21:42 -0700 Subject: vmscan: narrow the scenarios in whcih lumpy reclaim uses synchrounous reclaim shrink_page_list() can decide to give up reclaiming a page under a number of conditions such as 1. trylock_page() failure 2. page is unevictable 3. zone reclaim and page is mapped 4. PageWriteback() is true 5. page is swapbacked and swap is full 6. add_to_swap() failure 7. page is dirty and gfpmask don't have GFP_IO, GFP_FS 8. page is pinned 9. IO queue is congested 10. pageout() start IO, but not finished With lumpy reclaim, failures result in entering synchronous lumpy reclaim but this can be unnecessary. In cases (2), (3), (5), (6), (7) and (8), there is no point retrying. This patch causes lumpy reclaim to abort when it is known it will fail. Case (9) is more interesting. current behavior is, 1. start shrink_page_list(async) 2. found queue_congested() 3. skip pageout write 4. still start shrink_page_list(sync) 5. wait on a lot of pages 6. again, found queue_congested() 7. give up pageout write again So, it's useless time wasting. However, just skipping page reclaim is also notgood as x86 allocating a huge page needs 512 pages for example. It can have more dirty pages than queue congestion threshold (~=128). After this patch, pageout() behaves as follows; - If order > PAGE_ALLOC_COSTLY_ORDER Ignore queue congestion always. - If order <= PAGE_ALLOC_COSTLY_ORDER skip write page and disable lumpy reclaim. Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +-- mm/vmscan.c | 120 ++++++++++++++++++++++++++---------------- 2 files changed, 78 insertions(+), 48 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index ecf952192a93..c255fcc587bf 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_MIXED : \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == PAGEOUT_IO_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index 399d54e8a82c..d9fc2dce93af 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -51,6 +51,12 @@ #define CREATE_TRACE_POINTS #include +enum lumpy_mode { + LUMPY_MODE_NONE, + LUMPY_MODE_ASYNC, + LUMPY_MODE_SYNC, +}; + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -82,7 +88,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - bool lumpy_reclaim_mode; + enum lumpy_mode lumpy_reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -265,6 +271,36 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, + bool sync) +{ + enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + + /* + * Some reclaim have alredy been failed. No worth to try synchronous + * lumpy reclaim. + */ + if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = mode; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = mode; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + +static void disable_lumpy_reclaim_mode(struct scan_control *sc) +{ + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} + static inline int is_page_cache_freeable(struct page *page) { /* @@ -275,7 +311,8 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -283,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) return 1; if (bdi == current->backing_dev_info) return 1; + + /* lumpy reclaim for hugepage often need a lot of write */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return 1; return 0; } @@ -307,12 +348,6 @@ static void handle_write_error(struct address_space *mapping, unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -330,7 +365,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -366,8 +401,10 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) { + disable_lumpy_reclaim_mode(sc); return PAGE_KEEP; + } if (clear_page_dirty_for_io(page)) { int res; @@ -393,7 +430,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * direct reclaiming a large contiguous area and the * first attempt to free a range of pages fails. */ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + if (PageWriteback(page) && + sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -401,7 +439,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sync_writeback)); + trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -579,7 +617,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) return PAGEREF_RECLAIM; /* @@ -643,8 +681,7 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -693,10 +730,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + may_enter_fs) wait_on_page_writeback(page); - else - goto keep_locked; + else { + unlock_page(page); + goto keep_lumpy; + } } references = page_check_references(page, sc); @@ -750,14 +790,17 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep_lumpy; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -840,6 +883,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); + disable_lumpy_reclaim_mode(sc); continue; activate_locked: @@ -852,6 +896,8 @@ activate_locked: keep_locked: unlock_page(page); keep: + disable_lumpy_reclaim_mode(sc); +keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } @@ -1252,7 +1298,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (!sc->lumpy_reclaim_mode) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1297,15 +1343,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - + set_lumpy_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1317,8 +1363,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode ? - ISOLATE_BOTH : ISOLATE_INACTIVE, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, zone, sc->mem_cgroup, 0, file); /* @@ -1336,7 +1382,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + nr_reclaimed = shrink_page_list(&page_list, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { @@ -1347,7 +1393,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, nr_active = clear_active_flags(&page_list, NULL); count_vm_events(PGDEACTIVATE, nr_active); - nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC); + set_lumpy_reclaim_mode(priority, sc, true); + nr_reclaimed += shrink_page_list(&page_list, sc); } local_irq_disable(); @@ -1739,21 +1786,6 @@ out: } } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc) -{ - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = 1; - else - sc->lumpy_reclaim_mode = 0; -} - /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1768,8 +1800,6 @@ static void shrink_zone(int priority, struct zone *zone, get_scan_count(zone, sc, nr, priority); - set_lumpy_reclaim_mode(priority, sc); - while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { -- cgit v1.2.3 From 0e093d99763eb4cea09f8ca4f1d01f34e121d10b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 26 Oct 2010 14:21:45 -0700 Subject: writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone If congestion_wait() is called with no BDI congested, the caller will sleep for the full timeout and this may be an unnecessary sleep. This patch adds a wait_iff_congested() that checks congestion and only sleeps if a BDI is congested else, it calls cond_resched() to ensure the caller is not hogging the CPU longer than its quota but otherwise will not sleep. This is aimed at reducing some of the major desktop stalls reported during IO. For example, while kswapd is operating, it calls congestion_wait() but it could just have been reclaiming clean page cache pages with no congestion. Without this patch, it would sleep for a full timeout but after this patch, it'll just call schedule() if it has been on the CPU too long. Similar logic applies to direct reclaimers that are not making enough progress. Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- include/linux/mmzone.h | 8 ++++++ include/trace/events/writeback.h | 7 +++++ mm/backing-dev.c | 61 ++++++++++++++++++++++++++++++++++++++-- mm/page_alloc.c | 4 +-- mm/vmscan.c | 42 ++++++++++++++++++++++----- 6 files changed, 112 insertions(+), 12 deletions(-) (limited to 'include/trace') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 35b00746c712..f1b402a50679 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -285,7 +285,7 @@ enum { void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); - +long wait_iff_congested(struct zone *zone, int sync, long timeout); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c3c17fb675ee..39c24ebe9cfd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -423,6 +423,9 @@ struct zone { typedef enum { ZONE_RECLAIM_LOCKED, /* prevents concurrent reclaim */ ZONE_OOM_LOCKED, /* zone is in OOM killer zonelist */ + ZONE_CONGESTED, /* zone has many dirty pages backed by + * a congested BDI + */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag) clear_bit(flag, &zone->flags); } +static inline int zone_is_reclaim_congested(const struct zone *zone) +{ + return test_bit(ZONE_CONGESTED, &zone->flags); +} + static inline int zone_is_reclaim_locked(const struct zone *zone) { return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index d2b2654606ec..89a2b2db4375 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_ARGS(usec_timeout, usec_delayed) ); +DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, + + TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), + + TP_ARGS(usec_timeout, usec_delayed) +); + #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 55627306abe0..5ad3c106606b 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); @@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout) } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a683f819439..b13bc5e5bd7d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -2095,7 +2095,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(BLK_RW_ASYNC, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 130ad0239f52..30fd658bb289 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) { - disable_lumpy_reclaim_mode(sc); + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; - } if (clear_page_dirty_for_io(page)) { int res; @@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; cond_resched(); @@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { + nr_dirty++; + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: + nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -902,6 +907,15 @@ keep_lumpy: VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + /* + * Tag a zone as congested if all the dirty pages encountered were + * backed by a congested BDI. In this case, reclaimers should just + * back off and wait for congestion to clear because further reclaim + * will encounter the same problem + */ + if (nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + free_page_list(&free_pages); list_splice(&ret_pages, page_list); @@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, sc); + nr_reclaimed = shrink_page_list(&page_list, zone, sc); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_lumpy_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, sc); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); } local_irq_disable(); @@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (!sc->hibernation_mode && sc->nr_scanned && - priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + priority < DEF_PRIORITY - 2) { + struct zone *preferred_zone; + + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), + NULL, &preferred_zone); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); + } } out: @@ -2282,6 +2301,15 @@ loop_again: if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; + } else { + /* + * If a zone reaches its high watermark, + * consider it to be no longer congested. It's + * possible there are dirty pages backed by + * congested BDIs but as pressure is relieved, + * spectulatively avoid congestion waits + */ + zone_clear_flag(zone, ZONE_CONGESTED); } } -- cgit v1.2.3 From b853fd364810a241050778124842a8c415c72a69 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Wed, 27 Oct 2010 21:27:12 -0400 Subject: ext4: avoid null dereference in trace_ext4_mballoc_discard ac->inode is set to null in function ext4_mb_release_group_pa(), and then trace_ext4_mballoc_discard(ac) is called, the kernel will panic. BUG: unable to handle kernel NULL pointer dereference at 000000a4 IP: [] ftrace_raw_event_ext4__mballoc+0x54/0xc0 [ext4] *pdpt = 0000000000abd001 *pde = 0000000000000000 Oops: 0000 [#1] SMP Pid: 550, comm: flush-8:16 Not tainted 2.6.36-rc1 #1 SE7320EP2/Altos G530 EIP: 0060:[] EFLAGS: 00010206 CPU: 1 EIP is at ftrace_raw_event_ext4__mballoc+0x54/0xc0 [ext4] EAX: f32ac840 EBX: f3f1cf88 ECX: f32ac840 EDX: 00000000 ESI: f32ac83c EDI: f880b9d8 EBP: 00000000 ESP: f4b77ae4 DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 Process flush-8:16 (pid: 550, ti=f4b76000 task=f613e540 task.ti=f4b76000) Call Trace: [] ? ext4_mb_release_group_pa+0x121/0x150 [ext4] [] ? ext4_mb_discard_group_preallocations+0x336/0x400 [ext4] [] ? ext4_mb_new_blocks+0x3d1/0x4f0 [ext4] [] ? __make_request+0x10b/0x440 [] ? ext4_ext_map_blocks+0x1334/0x1980 [ext4] [] ? rb_reserve_next_event+0xaa/0x3b0 [] ? ext4_map_blocks+0xd6/0x1d0 [ext4] [] ? mpage_da_map_blocks+0xc7/0x8a0 [ext4] [] ? find_get_pages_tag+0x38/0x110 [] ? __pagevec_release+0x15/0x20 [] ? ext4_da_writepages+0x2b5/0x5d0 [ext4] [] ? __writepage+0x0/0x30 [] ? do_writepages+0x14/0x30 [] ? writeback_single_inode+0xa0/0x240 [] ? writeback_sb_inodes+0xc1/0x180 [] ? writeback_inodes_wb+0x88/0x140 [] ? wb_writeback+0x20b/0x320 [] ? lock_timer_base+0x27/0x50 [] ? wb_do_writeback+0x150/0x190 [] ? bdi_writeback_thread+0x88/0x1f0 [] ? complete+0x40/0x60 [] ? bdi_writeback_thread+0x0/0x1f0 [] ? kthread+0x74/0x80 [] ? kthread+0x0/0x80 [] ? kernel_thread_helper+0x6/0x10 Signed-off-by: Wen Congyang Acked-by: Steven Rostedt Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 01e9e0076a92..6a1fcff95f7c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -796,8 +796,9 @@ DECLARE_EVENT_CLASS(ext4__mballoc, ), TP_fast_assign( - __entry->dev = ac->ac_inode->i_sb->s_dev; - __entry->ino = ac->ac_inode->i_ino; + __entry->dev = ac->ac_sb->s_dev; + __entry->ino = ac->ac_inode ? + ac->ac_inode->i_ino : 0; __entry->result_logical = ac->ac_b_ex.fe_logical; __entry->result_start = ac->ac_b_ex.fe_start; __entry->result_group = ac->ac_b_ex.fe_group; -- cgit v1.2.3 From 4d5476164a052e80d4ef430e368e76dbde96801f Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:07 -0400 Subject: ext4: fix oops in trace_ext4_mb_release_group_pa Our QA reported an oops in the ext4_mb_release_group_pa tracing, and Josef Bacik pointed out that it was because we may have a non-null but uninitialized ac_inode in the allocation context. I can reproduce it when running xfstests with ext4 tracepoints on, on a CONFIG_SLAB_DEBUG kernel. We call trace_ext4_mb_release_group_pa from 2 places, ext4_mb_discard_group_preallocations and ext4_mb_discard_lg_preallocations In both cases we allocate an ac as a container just for tracing (!) and never fill in the ac_inode. There's no reason to be assigning, testing, or printing it as far as I can see, so just remove it from the tracepoint. Signed-off-by: Eric Sandeen Reviewed-by: Josef Bacik Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6a1fcff95f7c..fbdfa3a6bbd8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -432,7 +432,6 @@ TRACE_EVENT(ext4_mb_release_group_pa, TP_STRUCT__entry( __field( dev_t, dev ) - __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) @@ -440,8 +439,6 @@ TRACE_EVENT(ext4_mb_release_group_pa, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->ino = (ac && ac->ac_inode) ? - ac->ac_inode->i_ino : 0; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), -- cgit v1.2.3 From 3e1e5f501632460184a98237d5460c521510535e Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 27 Oct 2010 21:30:07 -0400 Subject: ext4: don't use ext4_allocation_contexts for tracing Many tracepoints were populating an ext4_allocation_context to pass in, but this requires a slab allocation even when tracepoints are off. In fact, 4 of 5 of these allocations were only for tracing. In addition, we were only using a small fraction of the 144 bytes of this structure for this purpose. We can do away with all these alloc/frees of the ac and simply pass in the bits we care about, instead. I tested this by turning on tracing and running through xfstests on x86_64. I did not actually do anything with the trace output, however. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 81 +++++++-------------------------------------- include/trace/events/ext4.h | 51 ++++++++++++++++------------ 2 files changed, 41 insertions(+), 91 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ccdfec6acb75..3f62df50f483 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3591,8 +3591,7 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) */ static noinline_for_stack int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3610,11 +3609,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = pa->pa_inode; - } - while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) @@ -3625,16 +3619,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, (unsigned) next - bit, (unsigned) group); free += next - bit; - if (ac) { - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = next - bit; - ac->ac_b_ex.fe_logical = 0; - trace_ext4_mballoc_discard(ac); - } - - trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit, - next - bit); + trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); + trace_ext4_mb_release_inode_pa(sb, pa->pa_inode, pa, + grp_blk_start + bit, next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3657,29 +3644,19 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, static noinline_for_stack int ext4_mb_release_group_pa(struct ext4_buddy *e4b, - struct ext4_prealloc_space *pa, - struct ext4_allocation_context *ac) + struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(sb, ac, pa); + trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); - - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = NULL; - ac->ac_b_ex.fe_group = group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = pa->pa_len; - ac->ac_b_ex.fe_logical = 0; - trace_ext4_mballoc_discard(ac); - } + trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); return 0; } @@ -3700,7 +3677,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; struct list_head list; struct ext4_buddy e4b; int err; @@ -3729,9 +3705,6 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, needed = EXT4_BLOCKS_PER_GROUP(sb) + 1; INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3786,9 +3759,9 @@ repeat: spin_unlock(pa->pa_obj_lock); if (pa->pa_type == MB_GROUP_PA) - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); else - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); @@ -3796,8 +3769,6 @@ repeat: out: ext4_unlock_group(sb, group); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); return free; @@ -3818,7 +3789,6 @@ void ext4_discard_preallocations(struct inode *inode) struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; @@ -3834,11 +3804,6 @@ void ext4_discard_preallocations(struct inode *inode) INIT_LIST_HEAD(&list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_sb = sb; - ac->ac_inode = inode; - } repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -3908,7 +3873,7 @@ repeat: ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -3917,8 +3882,6 @@ repeat: list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4116,14 +4079,10 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_buddy e4b; struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; - struct ext4_allocation_context *ac; mb_debug(1, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) - ac->ac_sb = sb; spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], @@ -4175,15 +4134,13 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_group_pa(&e4b, pa, ac); + ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4547,7 +4504,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, { struct buffer_head *bitmap_bh = NULL; struct super_block *sb = inode->i_sb; - struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; unsigned long freed = 0; unsigned int overflow; @@ -4602,12 +4558,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, if (!ext4_should_writeback_data(inode)) flags |= EXT4_FREE_BLOCKS_METADATA; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { - ac->ac_inode = inode; - ac->ac_sb = sb; - } - do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -4665,12 +4615,7 @@ do_more: BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - if (ac) { - ac->ac_b_ex.fe_group = block_group; - ac->ac_b_ex.fe_start = bit; - ac->ac_b_ex.fe_len = count; - trace_ext4_mballoc_free(ac); - } + trace_ext4_mballoc_free(sb, inode, block_group, bit, count); err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) @@ -4741,7 +4686,5 @@ error_return: dquot_free_block(inode, freed); brelse(bitmap_bh); ext4_std_error(sb, err); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index fbdfa3a6bbd8..b5f4938d612d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -396,11 +396,11 @@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, TRACE_EVENT(ext4_mb_release_inode_pa, TP_PROTO(struct super_block *sb, - struct ext4_allocation_context *ac, + struct inode *inode, struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), - TP_ARGS(sb, ac, pa, block, count), + TP_ARGS(sb, inode, pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) @@ -412,8 +412,7 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->ino = (ac && ac->ac_inode) ? - ac->ac_inode->i_ino : 0; + __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; ), @@ -425,10 +424,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TRACE_EVENT(ext4_mb_release_group_pa, TP_PROTO(struct super_block *sb, - struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), - TP_ARGS(sb, ac, pa), + TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -779,47 +777,56 @@ TRACE_EVENT(ext4_mballoc_prealloc, ); DECLARE_EVENT_CLASS(ext4__mballoc, - TP_PROTO(struct ext4_allocation_context *ac), + TP_PROTO(struct super_block *sb, + struct inode *inode, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), - TP_ARGS(ac), + TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( - __entry->dev = ac->ac_sb->s_dev; - __entry->ino = ac->ac_inode ? - ac->ac_inode->i_ino : 0; - __entry->result_logical = ac->ac_b_ex.fe_logical; - __entry->result_start = ac->ac_b_ex.fe_start; - __entry->result_group = ac->ac_b_ex.fe_group; - __entry->result_len = ac->ac_b_ex.fe_len; + __entry->dev = sb->s_dev; + __entry->ino = inode ? inode->i_ino : 0; + __entry->result_start = start; + __entry->result_group = group; + __entry->result_len = len; ), - TP_printk("dev %s inode %lu extent %u/%d/%u@%u ", + TP_printk("dev %s inode %lu extent %u/%d/%u ", jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, - __entry->result_len, __entry->result_logical) + __entry->result_len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, - TP_PROTO(struct ext4_allocation_context *ac), + TP_PROTO(struct super_block *sb, + struct inode *inode, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), - TP_ARGS(ac) + TP_ARGS(sb, inode, group, start, len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, - TP_PROTO(struct ext4_allocation_context *ac), + TP_PROTO(struct super_block *sb, + struct inode *inode, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t len), - TP_ARGS(ac) + TP_ARGS(sb, inode, group, start, len) ); TRACE_EVENT(ext4_forget, -- cgit v1.2.3 From a269029d0e2192046be4c07ed78a45022469ee4c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 27 Oct 2010 22:08:50 -0400 Subject: ext4,jbd2: convert tracepoints to use major/minor numbers Unfortunately perf can't deal with anything other than direct structure accesses in the TP_printk() section. It will drop dead when it sees jbd2_dev_to_name() in the "print fmt" section of the tracepoint. Addresses-Google-Bug: 3138508 Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 329 +++++++++++++++++++++++++++----------------- include/trace/events/jbd2.h | 78 ++++++----- 2 files changed, 251 insertions(+), 156 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b5f4938d612d..8f59db107bbb 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -21,7 +21,8 @@ TRACE_EVENT(ext4_free_inode, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( uid_t, uid ) @@ -30,7 +31,8 @@ TRACE_EVENT(ext4_free_inode, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->uid = inode->i_uid; @@ -38,9 +40,10 @@ TRACE_EVENT(ext4_free_inode, __entry->blocks = inode->i_blocks; ), - TP_printk("dev %s ino %lu mode 0%o uid %u gid %u blocks %llu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, __entry->uid, __entry->gid, + TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, + __entry->uid, __entry->gid, (unsigned long long) __entry->blocks) ); @@ -50,20 +53,22 @@ TRACE_EVENT(ext4_request_inode, TP_ARGS(dir, mode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, dir ) __field( umode_t, mode ) ), TP_fast_assign( - __entry->dev = dir->i_sb->s_dev; + __entry->dev_major = MAJOR(dir->i_sb->s_dev); + __entry->dev_minor = MINOR(dir->i_sb->s_dev); __entry->dir = dir->i_ino; __entry->mode = mode; ), - TP_printk("dev %s dir %lu mode 0%o", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->dir, - __entry->mode) + TP_printk("dev %d,%d dir %lu mode 0%o", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, @@ -72,21 +77,24 @@ TRACE_EVENT(ext4_allocate_inode, TP_ARGS(inode, dir, mode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( ino_t, dir ) __field( umode_t, mode ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), - TP_printk("dev %s ino %lu dir %lu mode 0%o", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); @@ -98,7 +106,8 @@ DECLARE_EVENT_CLASS(ext4__write_begin, TP_ARGS(inode, pos, len, flags), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) @@ -106,15 +115,17 @@ DECLARE_EVENT_CLASS(ext4__write_begin, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->flags = flags; ), - TP_printk("dev %s ino %lu pos %llu len %u flags %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu pos %llu len %u flags %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->flags) ); @@ -141,7 +152,8 @@ DECLARE_EVENT_CLASS(ext4__write_end, TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) @@ -149,16 +161,18 @@ DECLARE_EVENT_CLASS(ext4__write_end, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), - TP_printk("dev %s ino %lu pos %llu len %u copied %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->pos, __entry->len, __entry->copied) + TP_printk("dev %d,%d ino %lu pos %llu len %u copied %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->pos, + __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_ordered_write_end, @@ -199,21 +213,23 @@ TRACE_EVENT(ext4_writepage, TP_ARGS(inode, page), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( pgoff_t, index ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->index = page->index; ), - TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->index) + TP_printk("dev %d,%d ino %lu page_index %lu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->index) ); TRACE_EVENT(ext4_da_writepages, @@ -222,7 +238,8 @@ TRACE_EVENT(ext4_da_writepages, TP_ARGS(inode, wbc), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( long, nr_to_write ) __field( long, pages_skipped ) @@ -236,7 +253,8 @@ TRACE_EVENT(ext4_da_writepages, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; @@ -249,8 +267,8 @@ TRACE_EVENT(ext4_da_writepages, __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", - jbd2_dev_to_name(__entry->dev), + TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld range_start %llu range_end %llu nonblocking %d for_kupdate %d for_reclaim %d range_cyclic %d writeback_index %lu", + __entry->dev_major, __entry->dev_minor, (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->nonblocking, @@ -265,7 +283,8 @@ TRACE_EVENT(ext4_da_write_pages, TP_ARGS(inode, mpd), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, b_blocknr ) __field( __u32, b_size ) @@ -276,7 +295,8 @@ TRACE_EVENT(ext4_da_write_pages, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->b_blocknr = mpd->b_blocknr; __entry->b_size = mpd->b_size; @@ -286,8 +306,9 @@ TRACE_EVENT(ext4_da_write_pages, __entry->pages_written = mpd->pages_written; ), - TP_printk("dev %s ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu b_blocknr %llu b_size %u b_state 0x%04x first_page %lu io_done %d pages_written %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->b_blocknr, __entry->b_size, __entry->b_state, __entry->first_page, __entry->io_done, __entry->pages_written) @@ -300,7 +321,8 @@ TRACE_EVENT(ext4_da_writepages_result, TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) @@ -310,7 +332,8 @@ TRACE_EVENT(ext4_da_writepages_result, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; @@ -319,8 +342,8 @@ TRACE_EVENT(ext4_da_writepages_result, __entry->writeback_index = inode->i_mapping->writeback_index; ), - TP_printk("dev %s ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu", - jbd2_dev_to_name(__entry->dev), + TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld more_io %d writeback_index %lu", + __entry->dev_major, __entry->dev_minor, (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->more_io, @@ -334,20 +357,23 @@ TRACE_EVENT(ext4_discard_blocks, TP_ARGS(sb, blk, count), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->blk = blk; __entry->count = count; ), - TP_printk("dev %s blk %llu count %llu", - jbd2_dev_to_name(__entry->dev), __entry->blk, __entry->count) + TP_printk("dev %d,%d blk %llu count %llu", + __entry->dev_major, __entry->dev_minor, + __entry->blk, __entry->count) ); DECLARE_EVENT_CLASS(ext4__mb_new_pa, @@ -357,7 +383,8 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_ARGS(ac, pa), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) @@ -366,16 +393,18 @@ DECLARE_EVENT_CLASS(ext4__mb_new_pa, ), TP_fast_assign( - __entry->dev = ac->ac_sb->s_dev; + __entry->dev_major = MAJOR(ac->ac_sb->s_dev); + __entry->dev_minor = MINOR(ac->ac_sb->s_dev); __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; __entry->pa_lstart = pa->pa_lstart; ), - TP_printk("dev %s ino %lu pstart %llu len %u lstart %llu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) + TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->pa_pstart, + __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, @@ -403,7 +432,8 @@ TRACE_EVENT(ext4_mb_release_inode_pa, TP_ARGS(sb, inode, pa, block, count), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, block ) __field( __u32, count ) @@ -411,15 +441,16 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; ), - TP_printk("dev %s ino %lu block %llu count %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->block, __entry->count) + TP_printk("dev %d,%d ino %lu block %llu count %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, @@ -429,20 +460,23 @@ TRACE_EVENT(ext4_mb_release_group_pa, TP_ARGS(sb, pa), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), - TP_printk("dev %s pstart %llu len %u", - jbd2_dev_to_name(__entry->dev), __entry->pa_pstart, __entry->pa_len) + TP_printk("dev %d,%d pstart %llu len %u", + __entry->dev_major, __entry->dev_minor, + __entry->pa_pstart, __entry->pa_len) ); TRACE_EVENT(ext4_discard_preallocations, @@ -451,18 +485,21 @@ TRACE_EVENT(ext4_discard_preallocations, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; ), - TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) + TP_printk("dev %d,%d ino %lu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mb_discard_preallocations, @@ -471,18 +508,20 @@ TRACE_EVENT(ext4_mb_discard_preallocations, TP_ARGS(sb, needed), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( int, needed ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->needed = needed; ), - TP_printk("dev %s needed %d", - jbd2_dev_to_name(__entry->dev), __entry->needed) + TP_printk("dev %d,%d needed %d", + __entry->dev_major, __entry->dev_minor, __entry->needed) ); TRACE_EVENT(ext4_request_blocks, @@ -491,7 +530,8 @@ TRACE_EVENT(ext4_request_blocks, TP_ARGS(ar), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( unsigned int, flags ) __field( unsigned int, len ) @@ -504,7 +544,8 @@ TRACE_EVENT(ext4_request_blocks, ), TP_fast_assign( - __entry->dev = ar->inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev); __entry->ino = ar->inode->i_ino; __entry->flags = ar->flags; __entry->len = ar->len; @@ -516,8 +557,9 @@ TRACE_EVENT(ext4_request_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %s ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu flags %u len %u lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->flags, __entry->len, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, @@ -533,7 +575,8 @@ TRACE_EVENT(ext4_allocate_blocks, TP_ARGS(ar, block), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned int, flags ) @@ -547,7 +590,8 @@ TRACE_EVENT(ext4_allocate_blocks, ), TP_fast_assign( - __entry->dev = ar->inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ar->inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ar->inode->i_sb->s_dev); __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->flags = ar->flags; @@ -560,9 +604,10 @@ TRACE_EVENT(ext4_allocate_blocks, __entry->pright = ar->pright; ), - TP_printk("dev %s ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->flags, __entry->len, __entry->block, + TP_printk("dev %d,%d ino %lu flags %u len %u block %llu lblk %llu goal %llu lleft %llu lright %llu pleft %llu pright %llu ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->flags, + __entry->len, __entry->block, (unsigned long long) __entry->logical, (unsigned long long) __entry->goal, (unsigned long long) __entry->lleft, @@ -578,7 +623,8 @@ TRACE_EVENT(ext4_free_blocks, TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, block ) @@ -587,7 +633,8 @@ TRACE_EVENT(ext4_free_blocks, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->block = block; @@ -595,8 +642,9 @@ TRACE_EVENT(ext4_free_blocks, __entry->flags = flags; ), - TP_printk("dev %s ino %lu mode 0%o block %llu count %lu flags %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, __entry->block, __entry->count, __entry->flags) ); @@ -607,7 +655,8 @@ TRACE_EVENT(ext4_sync_file, TP_ARGS(file, datasync), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) @@ -616,14 +665,16 @@ TRACE_EVENT(ext4_sync_file, TP_fast_assign( struct dentry *dentry = file->f_path.dentry; - __entry->dev = dentry->d_inode->i_sb->s_dev; + __entry->dev_major = MAJOR(dentry->d_inode->i_sb->s_dev); + __entry->dev_minor = MINOR(dentry->d_inode->i_sb->s_dev); __entry->ino = dentry->d_inode->i_ino; __entry->datasync = datasync; __entry->parent = dentry->d_parent->d_inode->i_ino; ), - TP_printk("dev %s ino %ld parent %ld datasync %d ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %ld parent %ld datasync %d ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) ); @@ -633,18 +684,20 @@ TRACE_EVENT(ext4_sync_fs, TP_ARGS(sb, wait), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( int, wait ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->wait = wait; ), - TP_printk("dev %s wait %d", jbd2_dev_to_name(__entry->dev), - __entry->wait) + TP_printk("dev %d,%d wait %d", __entry->dev_major, + __entry->dev_minor, __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, @@ -653,21 +706,24 @@ TRACE_EVENT(ext4_alloc_da_blocks, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( unsigned int, data_blocks ) __field( unsigned int, meta_blocks ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; ), - TP_printk("dev %s ino %lu data_blocks %u meta_blocks %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu data_blocks %u meta_blocks %u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->data_blocks, __entry->meta_blocks) ); @@ -677,7 +733,8 @@ TRACE_EVENT(ext4_mballoc_alloc, TP_ARGS(ac), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u16, found ) __field( __u16, groups ) @@ -700,7 +757,8 @@ TRACE_EVENT(ext4_mballoc_alloc, ), TP_fast_assign( - __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ac->ac_inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ac->ac_inode->i_sb->s_dev); __entry->ino = ac->ac_inode->i_ino; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; @@ -722,10 +780,11 @@ TRACE_EVENT(ext4_mballoc_alloc, __entry->result_len = ac->ac_f_ex.fe_len; ), - TP_printk("dev %s inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " + TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %u flags 0x%04x " "tail %u broken %u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, @@ -743,7 +802,8 @@ TRACE_EVENT(ext4_mballoc_prealloc, TP_ARGS(ac), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) @@ -756,7 +816,8 @@ TRACE_EVENT(ext4_mballoc_prealloc, ), TP_fast_assign( - __entry->dev = ac->ac_inode->i_sb->s_dev; + __entry->dev_major = MAJOR(ac->ac_inode->i_sb->s_dev); + __entry->dev_minor = MINOR(ac->ac_inode->i_sb->s_dev); __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; @@ -768,8 +829,9 @@ TRACE_EVENT(ext4_mballoc_prealloc, __entry->result_len = ac->ac_b_ex.fe_len; ), - TP_printk("dev %s inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, @@ -786,7 +848,8 @@ DECLARE_EVENT_CLASS(ext4__mballoc, TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) @@ -794,15 +857,17 @@ DECLARE_EVENT_CLASS(ext4__mballoc, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->ino = inode ? inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; __entry->result_len = len; ), - TP_printk("dev %s inode %lu extent %u/%d/%u ", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d inode %lu extent %u/%d/%u ", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); @@ -835,7 +900,8 @@ TRACE_EVENT(ext4_forget, TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( int, is_metadata ) @@ -843,16 +909,18 @@ TRACE_EVENT(ext4_forget, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->is_metadata = is_metadata; __entry->block = block; ), - TP_printk("dev %s ino %lu mode 0%o is_metadata %d block %llu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, __entry->is_metadata, __entry->block) + TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, + __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, @@ -861,7 +929,8 @@ TRACE_EVENT(ext4_da_update_reserve_space, TP_ARGS(inode, used_blocks), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -872,7 +941,8 @@ TRACE_EVENT(ext4_da_update_reserve_space, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -882,9 +952,10 @@ TRACE_EVENT(ext4_da_update_reserve_space, __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; ), - TP_printk("dev %s ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, - __entry->mode, (unsigned long long) __entry->i_blocks, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, + (unsigned long long) __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) ); @@ -895,7 +966,8 @@ TRACE_EVENT(ext4_da_reserve_space, TP_ARGS(inode, md_needed), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -905,7 +977,8 @@ TRACE_EVENT(ext4_da_reserve_space, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -914,8 +987,9 @@ TRACE_EVENT(ext4_da_reserve_space, __entry->reserved_meta_blocks = EXT4_I(inode)->i_reserved_meta_blocks; ), - TP_printk("dev %s ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu md_needed %d reserved_data_blocks %d reserved_meta_blocks %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, (unsigned long long) __entry->i_blocks, __entry->md_needed, __entry->reserved_data_blocks, __entry->reserved_meta_blocks) @@ -927,7 +1001,8 @@ TRACE_EVENT(ext4_da_release_space, TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) __field( umode_t, mode ) __field( __u64, i_blocks ) @@ -938,7 +1013,8 @@ TRACE_EVENT(ext4_da_release_space, ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; __entry->mode = inode->i_mode; __entry->i_blocks = inode->i_blocks; @@ -948,8 +1024,9 @@ TRACE_EVENT(ext4_da_release_space, __entry->allocated_meta_blocks = EXT4_I(inode)->i_allocated_meta_blocks; ), - TP_printk("dev %s ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino, + TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d reserved_data_blocks %d reserved_meta_blocks %d allocated_meta_blocks %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->mode, (unsigned long long) __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks, __entry->reserved_meta_blocks, __entry->allocated_meta_blocks) @@ -961,18 +1038,20 @@ DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_ARGS(sb, group), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( __u32, group ) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev_major = MAJOR(sb->s_dev); + __entry->dev_minor = MINOR(sb->s_dev); __entry->group = group; ), - TP_printk("dev %s group %u", - jbd2_dev_to_name(__entry->dev), __entry->group) + TP_printk("dev %d,%d group %u", + __entry->dev_major, __entry->dev_minor, __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index bf16545cc977..7447ea9305b5 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -17,17 +17,19 @@ TRACE_EVENT(jbd2_checkpoint, TP_ARGS(journal, result), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( int, result ) ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->result = result; ), - TP_printk("dev %s result %d", - jbd2_dev_to_name(__entry->dev), __entry->result) + TP_printk("dev %d,%d result %d", + __entry->dev_major, __entry->dev_minor, __entry->result) ); DECLARE_EVENT_CLASS(jbd2_commit, @@ -37,20 +39,22 @@ DECLARE_EVENT_CLASS(jbd2_commit, TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( char, sync_commit ) __field( int, transaction ) ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; ), - TP_printk("dev %s transaction %d sync %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit) + TP_printk("dev %d,%d transaction %d sync %d", + __entry->dev_major, __entry->dev_minor, + __entry->transaction, __entry->sync_commit) ); DEFINE_EVENT(jbd2_commit, jbd2_start_commit, @@ -87,22 +91,24 @@ TRACE_EVENT(jbd2_end_commit, TP_ARGS(journal, commit_transaction), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( char, sync_commit ) __field( int, transaction ) __field( int, head ) ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->sync_commit = commit_transaction->t_synchronous_commit; __entry->transaction = commit_transaction->t_tid; __entry->head = journal->j_tail_sequence; ), - TP_printk("dev %s transaction %d sync %d head %d", - jbd2_dev_to_name(__entry->dev), __entry->transaction, - __entry->sync_commit, __entry->head) + TP_printk("dev %d,%d transaction %d sync %d head %d", + __entry->dev_major, __entry->dev_minor, + __entry->transaction, __entry->sync_commit, __entry->head) ); TRACE_EVENT(jbd2_submit_inode_data, @@ -111,17 +117,20 @@ TRACE_EVENT(jbd2_submit_inode_data, TP_ARGS(inode), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( ino_t, ino ) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); __entry->ino = inode->i_ino; ), - TP_printk("dev %s ino %lu", - jbd2_dev_to_name(__entry->dev), (unsigned long) __entry->ino) + TP_printk("dev %d,%d ino %lu", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino) ); TRACE_EVENT(jbd2_run_stats, @@ -131,7 +140,8 @@ TRACE_EVENT(jbd2_run_stats, TP_ARGS(dev, tid, stats), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( unsigned long, tid ) __field( unsigned long, wait ) __field( unsigned long, running ) @@ -144,7 +154,8 @@ TRACE_EVENT(jbd2_run_stats, ), TP_fast_assign( - __entry->dev = dev; + __entry->dev_major = MAJOR(dev); + __entry->dev_minor = MINOR(dev); __entry->tid = tid; __entry->wait = stats->rs_wait; __entry->running = stats->rs_running; @@ -156,9 +167,9 @@ TRACE_EVENT(jbd2_run_stats, __entry->blocks_logged = stats->rs_blocks_logged; ), - TP_printk("dev %s tid %lu wait %u running %u locked %u flushing %u " + TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u " "logging %u handle_count %u blocks %u blocks_logged %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + __entry->dev_major, __entry->dev_minor, __entry->tid, jiffies_to_msecs(__entry->wait), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), @@ -175,7 +186,8 @@ TRACE_EVENT(jbd2_checkpoint_stats, TP_ARGS(dev, tid, stats), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( unsigned long, tid ) __field( unsigned long, chp_time ) __field( __u32, forced_to_close ) @@ -184,7 +196,8 @@ TRACE_EVENT(jbd2_checkpoint_stats, ), TP_fast_assign( - __entry->dev = dev; + __entry->dev_major = MAJOR(dev); + __entry->dev_minor = MINOR(dev); __entry->tid = tid; __entry->chp_time = stats->cs_chp_time; __entry->forced_to_close= stats->cs_forced_to_close; @@ -192,9 +205,9 @@ TRACE_EVENT(jbd2_checkpoint_stats, __entry->dropped = stats->cs_dropped; ), - TP_printk("dev %s tid %lu chp_time %u forced_to_close %u " + TP_printk("dev %d,%d tid %lu chp_time %u forced_to_close %u " "written %u dropped %u", - jbd2_dev_to_name(__entry->dev), __entry->tid, + __entry->dev_major, __entry->dev_minor, __entry->tid, jiffies_to_msecs(__entry->chp_time), __entry->forced_to_close, __entry->written, __entry->dropped) ); @@ -207,7 +220,8 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, TP_ARGS(journal, first_tid, block_nr, freed), TP_STRUCT__entry( - __field( dev_t, dev ) + __field( int, dev_major ) + __field( int, dev_minor ) __field( tid_t, tail_sequence ) __field( tid_t, first_tid ) __field(unsigned long, block_nr ) @@ -215,16 +229,18 @@ TRACE_EVENT(jbd2_cleanup_journal_tail, ), TP_fast_assign( - __entry->dev = journal->j_fs_dev->bd_dev; + __entry->dev_major = MAJOR(journal->j_fs_dev->bd_dev); + __entry->dev_minor = MINOR(journal->j_fs_dev->bd_dev); __entry->tail_sequence = journal->j_tail_sequence; __entry->first_tid = first_tid; __entry->block_nr = block_nr; __entry->freed = freed; ), - TP_printk("dev %s from %u to %u offset %lu freed %lu", - jbd2_dev_to_name(__entry->dev), __entry->tail_sequence, - __entry->first_tid, __entry->block_nr, __entry->freed) + TP_printk("dev %d,%d from %u to %u offset %lu freed %lu", + __entry->dev_major, __entry->dev_minor, + __entry->tail_sequence, __entry->first_tid, + __entry->block_nr, __entry->freed) ); #endif /* _TRACE_JBD2_H */ -- cgit v1.2.3 From 7ff9c073dd4d7200399076554f7ab9b876f196f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Nov 2010 13:51:33 -0500 Subject: ext4: Add new ext4 inode tracepoints Add ext4_evict_inode, ext4_drop_inode, ext4_mark_inode_dirty, and ext4_begin_ordered_truncate() Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++ fs/ext4/super.c | 10 +++++ include/trace/events/ext4.h | 97 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 110 insertions(+) (limited to 'include/trace') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 191616470466..846e1e9db434 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -53,6 +53,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { + trace_ext4_begin_ordered_truncate(inode, new_size); return jbd2_journal_begin_ordered_truncate( EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode, @@ -178,6 +179,7 @@ void ext4_evict_inode(struct inode *inode) handle_t *handle; int err; + trace_ext4_evict_inode(inode); if (inode->i_nlink) { truncate_inode_pages(&inode->i_data, 0); goto no_delete; @@ -5649,6 +5651,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) int err, ret; might_sleep(); + trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (ext4_handle_valid(handle) && EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ee91e29ddf95..61182fe6254e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -833,6 +833,14 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } +static int ext4_drop_inode(struct inode *inode) +{ + int drop = generic_drop_inode(inode); + + trace_ext4_drop_inode(inode, drop); + return drop; +} + static void ext4_destroy_inode(struct inode *inode) { ext4_ioend_wait(inode); @@ -1175,6 +1183,7 @@ static const struct super_operations ext4_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .put_super = ext4_put_super, .sync_fs = ext4_sync_fs, @@ -1196,6 +1205,7 @@ static const struct super_operations ext4_nojournal_sops = { .destroy_inode = ext4_destroy_inode, .write_inode = ext4_write_inode, .dirty_inode = ext4_dirty_inode, + .drop_inode = ext4_drop_inode, .evict_inode = ext4_evict_inode, .write_super = ext4_write_super, .put_super = ext4_put_super, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 289010d3270b..e5e345fb2a5c 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -98,6 +98,103 @@ TRACE_EVENT(ext4_allocate_inode, (unsigned long) __entry->dir, __entry->mode) ); +TRACE_EVENT(ext4_evict_inode, + TP_PROTO(struct inode *inode), + + TP_ARGS(inode), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( int, nlink ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->nlink = inode->i_nlink; + ), + + TP_printk("dev %d,%d ino %lu nlink %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->nlink) +); + +TRACE_EVENT(ext4_drop_inode, + TP_PROTO(struct inode *inode, int drop), + + TP_ARGS(inode, drop), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( int, drop ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->drop = drop; + ), + + TP_printk("dev %d,%d ino %lu drop %d", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, __entry->drop) +); + +TRACE_EVENT(ext4_mark_inode_dirty, + TP_PROTO(struct inode *inode, unsigned long IP), + + TP_ARGS(inode, IP), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->ip = IP; + ), + + TP_printk("dev %d,%d ino %lu caller %pF", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, (void *)__entry->ip) +); + +TRACE_EVENT(ext4_begin_ordered_truncate, + TP_PROTO(struct inode *inode, loff_t new_size), + + TP_ARGS(inode, new_size), + + TP_STRUCT__entry( + __field( int, dev_major ) + __field( int, dev_minor ) + __field( ino_t, ino ) + __field( loff_t, new_size ) + ), + + TP_fast_assign( + __entry->dev_major = MAJOR(inode->i_sb->s_dev); + __entry->dev_minor = MINOR(inode->i_sb->s_dev); + __entry->ino = inode->i_ino; + __entry->new_size = new_size; + ), + + TP_printk("dev %d,%d ino %lu new_size %lld", + __entry->dev_major, __entry->dev_minor, + (unsigned long) __entry->ino, + (long long) __entry->new_size) +); + DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, -- cgit v1.2.3