From 33c7a2bc48a81fa714572f8ce29f29bc17e6faf0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:55:59 +1100
Subject: xfs: xfs_syncd_stop must die

xfs_syncd_start and xfs_syncd_stop tie a bunch of unrelated
functionailty together that actually have different start and stop
requirements. Kill these functions and open code the start/stop
methods for each of the background functions.

Subsequent patches will move the start/stop functions around to the
correct places to avoid races and shutdown issues.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_super.c | 25 ++++++++++++++++++-------
 fs/xfs/xfs_sync.c  | 30 ++++--------------------------
 fs/xfs/xfs_sync.h  |  6 ++++--
 3 files changed, 26 insertions(+), 35 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd7f975..37d1bbce047d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1008,7 +1008,11 @@ xfs_fs_put_super(
 	xfs_filestream_unmount(mp);
 	cancel_delayed_work_sync(&mp->m_sync_work);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1384,9 +1388,11 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	error = xfs_syncd_init(mp);
-	if (error)
-		goto out_filestream_unmount;
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+	xfs_syncd_queue_sync(mp);
 
 	error = xfs_mountfs(mp);
 	if (error)
@@ -1409,8 +1415,10 @@ xfs_fs_fill_super(
 
 	return 0;
  out_syncd_stop:
-	xfs_syncd_stop(mp);
- out_filestream_unmount:
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
+
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1429,7 +1437,10 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-	xfs_syncd_stop(mp);
+
+	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
+	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 9500caf15acf..7502f0621fb9 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -370,7 +370,7 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-static void
+void
 xfs_syncd_queue_sync(
 	struct xfs_mount        *mp)
 {
@@ -383,7 +383,7 @@ xfs_syncd_queue_sync(
  * disk quotas.  We might need to cover the log to indicate that the
  * filesystem is idle and not frozen.
  */
-STATIC void
+void
 xfs_sync_worker(
 	struct work_struct *work)
 {
@@ -445,7 +445,7 @@ xfs_syncd_queue_reclaim(
  * goes low. It scans as quickly as possible avoiding locked inodes or those
  * already being flushed, and once done schedules a future pass.
  */
-STATIC void
+void
 xfs_reclaim_worker(
 	struct work_struct *work)
 {
@@ -478,7 +478,7 @@ xfs_flush_inodes(
 	flush_work(&mp->m_flush_work);
 }
 
-STATIC void
+void
 xfs_flush_worker(
 	struct work_struct *work)
 {
@@ -489,28 +489,6 @@ xfs_flush_worker(
 	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
 }
 
-int
-xfs_syncd_init(
-	struct xfs_mount	*mp)
-{
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_syncd_queue_sync(mp);
-
-	return 0;
-}
-
-void
-xfs_syncd_stop(
-	struct xfs_mount	*mp)
-{
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 941202e7ac6e..3f59e5bed66b 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,8 +26,10 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
+void xfs_syncd_queue_sync(struct xfs_mount *mp);
+void xfs_sync_worker(struct work_struct *work);
+void xfs_flush_worker(struct work_struct *work);
+void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
-- 
cgit v1.2.3


From 7e18530bef6a18a5479690ae7e8256319ecf1300 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:00 +1100
Subject: xfs: rationalise xfs_mount_wq users

Instead of starting and stopping background work on the xfs_mount_wq
all at the same time, separate them to where they really are needed
to start and stop.

The xfs_sync_worker, only needs to be started after all the mount
processing has completed successfully, while it needs to be stopped
before the log is unmounted.

The xfs_reclaim_worker is started on demand, and can be
stopped before the unmount process does it's own inode reclaim pass.

The xfs_flush_inodes work is run on demand, and so we really only
need to ensure that it has stopped running before we start
processing an unmount, freeze or remount,ro.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_mount.c |  6 ++++--
 fs/xfs/xfs_super.c | 34 ++++++++++++++--------------------
 fs/xfs/xfs_sync.c  | 21 +++++----------------
 3 files changed, 23 insertions(+), 38 deletions(-)

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2bd3a0e6376..d9a31c6a0c53 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1450,9 +1450,11 @@ xfs_unmountfs(
 
 	/*
 	 * And reclaim all inodes.  At this point there should be no dirty
-	 * inode, and none should be pinned or locked, but use synchronous
-	 * reclaim just to be sure.
+	 * inodes and none should be pinned or locked, but use synchronous
+	 * reclaim just to be sure. We can stop background inode reclaim
+	 * here as well if it is still running.
 	 */
+	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 37d1bbce047d..9805cac81fc9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1005,14 +1005,12 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	xfs_filestream_unmount(mp);
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	xfs_unmountfs(mp);
-
 	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
 	cancel_work_sync(&mp->m_flush_work);
 
+	xfs_filestream_unmount(mp);
+	xfs_unmountfs(mp);
+
 	xfs_freesb(mp);
 	xfs_icsb_destroy_counters(mp);
 	xfs_destroy_mount_workqueues(mp);
@@ -1325,6 +1323,9 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
+	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
@@ -1388,15 +1389,9 @@ xfs_fs_fill_super(
 	sb->s_time_gran = 1;
 	set_posix_acl_flag(sb);
 
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-	xfs_syncd_queue_sync(mp);
-
 	error = xfs_mountfs(mp);
 	if (error)
-		goto out_syncd_stop;
+		goto out_filestream_unmount;
 
 	root = igrab(VFS_I(mp->m_rootip));
 	if (!root) {
@@ -1413,12 +1408,15 @@ xfs_fs_fill_super(
 		goto out_unmount;
 	}
 
+	/*
+	 * The filesystem is successfully mounted, so we can start background
+	 * sync work now.
+	 */
+	xfs_syncd_queue_sync(mp);
+
 	return 0;
- out_syncd_stop:
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
 
+ out_filestream_unmount:
 	xfs_filestream_unmount(mp);
  out_free_sb:
 	xfs_freesb(mp);
@@ -1437,10 +1435,6 @@ out_destroy_workqueues:
  out_unmount:
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
-
-	cancel_delayed_work_sync(&mp->m_sync_work);
-	cancel_delayed_work_sync(&mp->m_reclaim_work);
-	cancel_work_sync(&mp->m_flush_work);
 	goto out_free_sb;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 7502f0621fb9..a68761696ab5 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -379,9 +379,9 @@ xfs_syncd_queue_sync(
 }
 
 /*
- * Every sync period we need to unpin all items, reclaim inodes and sync
- * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle and not frozen.
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle and not frozen.
  */
 void
 xfs_sync_worker(
@@ -391,17 +391,7 @@ xfs_sync_worker(
 					struct xfs_mount, m_sync_work);
 	int		error;
 
-	/*
-	 * We shouldn't write/force the log if we are in the mount/unmount
-	 * process or on a read only filesystem. The workqueue still needs to be
-	 * active in both cases, however, because it is used for inode reclaim
-	 * during these times.  Use the MS_ACTIVE flag to avoid doing anything
-	 * during mount.  Doing work during unmount is avoided by calling
-	 * cancel_delayed_work_sync on this work queue before tearing down
-	 * the ail and the log in xfs_log_unmount.
-	 */
-	if (!(mp->m_super->s_flags & MS_ACTIVE) &&
-	    !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
 		/* dgc: errors ignored here */
 		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
 		    xfs_log_need_covered(mp))
@@ -409,8 +399,7 @@ xfs_sync_worker(
 		else
 			xfs_log_force(mp, 0);
 
-		/* start pushing all the metadata that is currently
-		 * dirty */
+		/* start pushing all the metadata that is currently dirty */
 		xfs_ail_push_all(mp->m_ail);
 	}
 
-- 
cgit v1.2.3


From 7f7bebefba152c5bdfe961cd2e97e8695a32998c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:01 +1100
Subject: xfs: don't run the sync work if the filesystem is read-only

If the filesystem is mounted or remounted read-only, stop the sync
worker that tries to flush or cover the log if the filesystem is
dirty. It's read-only, so it isn't dirty. Restart it on a remount,rw
as necessary. This avoids the need for RO checks in the work.

Similarly, stop the sync work when the filesystem is frozen, and
start it again when the filesysetm is thawed. This avoids the need
for special freeze checks in the work.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_super.c |  2 ++
 fs/xfs/xfs_sync.c  | 29 ++++++++++++++++-------------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9805cac81fc9..20fa955d80d1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1200,6 +1200,7 @@ xfs_fs_remount(
 		 * value if it is non-zero, otherwise go with the default.
 		 */
 		xfs_restore_resvblks(mp);
+		xfs_syncd_queue_sync(mp);
 	}
 
 	/* rw -> ro */
@@ -1245,6 +1246,7 @@ xfs_fs_unfreeze(
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_restore_resvblks(mp);
+	xfs_syncd_queue_sync(mp);
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index a68761696ab5..e898d1807044 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -323,6 +323,9 @@ xfs_quiesce_data(
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
  * wait for any remaining transactions to drain out before proceeding.
+ *
+ * Note: this stops background sync work - the callers must ensure it is started
+ * again when appropriate.
  */
 void
 xfs_quiesce_attr(
@@ -341,6 +344,9 @@ xfs_quiesce_attr(
 	/* flush all pending changes from the AIL */
 	xfs_ail_push_all_sync(mp->m_ail);
 
+	/* stop background sync work */
+	cancel_delayed_work_sync(&mp->m_sync_work);
+
 	/*
 	 * Just warn here till VFS can correctly support
 	 * read-only remount without racing.
@@ -379,9 +385,8 @@ xfs_syncd_queue_sync(
 }
 
 /*
- * Every sync period we need to unpin all items in the AIL and push them to
- * disk. If there is nothing dirty, then we might need to cover the log to
- * indicate that the filesystem is idle and not frozen.
+ * Every sync period we need to push dirty metadata and try to cover the log
+ * to indicate the filesystem is idle and not frozen.
  */
 void
 xfs_sync_worker(
@@ -391,17 +396,15 @@ xfs_sync_worker(
 					struct xfs_mount, m_sync_work);
 	int		error;
 
-	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-		/* dgc: errors ignored here */
-		if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
-		    xfs_log_need_covered(mp))
-			error = xfs_fs_log_dummy(mp);
-		else
-			xfs_log_force(mp, 0);
+	/* dgc: errors ignored here */
+	if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
+	    xfs_log_need_covered(mp))
+		error = xfs_fs_log_dummy(mp);
+	else
+		xfs_log_force(mp, 0);
 
-		/* start pushing all the metadata that is currently dirty */
-		xfs_ail_push_all(mp->m_ail);
-	}
+	/* start pushing all the metadata that is currently dirty */
+	xfs_ail_push_all(mp->m_ail);
 
 	/* queue us up again */
 	xfs_syncd_queue_sync(mp);
-- 
cgit v1.2.3


From f661f1e0bf5002bdcc8b5810ad0a184a1841537f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:02 +1100
Subject: xfs: sync work is now only periodic log work

The only thing the periodic sync work does now is flush the AIL and
idle the log. These are really functions of the log code, so move
the work to xfs_log.c and rename it appropriately.

The only wart that this leaves behind is the xfssyncd_centisecs
sysctl, otherwise the xfssyncd is dead. Clean up any comments that
related to xfssyncd to reflect it's passing.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c      | 61 ++++++++++++++++++++++++++++++++++++++++++---------
 fs/xfs/xfs_log.h      |  3 +++
 fs/xfs/xfs_log_priv.h |  1 +
 fs/xfs/xfs_mount.h    |  1 -
 fs/xfs/xfs_super.c    | 16 ++++----------
 fs/xfs/xfs_sync.c     | 39 +++-----------------------------
 fs/xfs/xfs_sync.h     |  2 --
 7 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f9370d0e7..efea12bfbd6b 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,7 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
@@ -679,25 +680,29 @@ out:
 }
 
 /*
- * Finish the recovery of the file system.  This is separate from
- * the xfs_log_mount() call, because it depends on the code in
- * xfs_mountfs() to read in the root and real-time bitmap inodes
- * between calling xfs_log_mount() and here.
+ * Finish the recovery of the file system.  This is separate from the
+ * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
+ * in the root and real-time bitmap inodes between calling xfs_log_mount() and
+ * here.
  *
- * mp		- ubiquitous xfs mount point structure
+ * If we finish recovery successfully, start the background log work. If we are
+ * not doing recovery, then we have a RO filesystem and we don't need to start
+ * it.
  */
 int
 xfs_log_mount_finish(xfs_mount_t *mp)
 {
-	int	error;
+	int	error = 0;
 
-	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
 		error = xlog_recover_finish(mp->m_log);
-	else {
-		error = 0;
+		if (!error)
+			xfs_log_work_queue(mp);
+	} else {
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 	}
 
+
 	return error;
 }
 
@@ -858,7 +863,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 void
 xfs_log_unmount(xfs_mount_t *mp)
 {
-	cancel_delayed_work_sync(&mp->m_sync_work);
+	cancel_delayed_work_sync(&mp->m_log->l_work);
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
 }
@@ -1161,6 +1166,40 @@ done:
 }	/* xlog_get_iclog_buffer_size */
 
 
+void
+xfs_log_work_queue(
+	struct xfs_mount        *mp)
+{
+	queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work,
+				msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+	struct work_struct	*work)
+{
+	struct xlog		*log = container_of(to_delayed_work(work),
+						struct xlog, l_work);
+	struct xfs_mount	*mp = log->l_mp;
+
+	/* dgc: errors ignored - not fatal and nowhere to report them */
+	if (xfs_log_need_covered(mp))
+		xfs_fs_log_dummy(mp);
+	else
+		xfs_log_force(mp, 0);
+
+	/* start pushing all the metadata that is currently dirty */
+	xfs_ail_push_all(mp->m_ail);
+
+	/* queue us up again */
+	xfs_log_work_queue(mp);
+}
+
 /*
  * This routine initializes some of the log structure for a given mount point.
  * Its primary purpose is to fill in enough, so recovery can occur.  However,
@@ -1195,6 +1234,7 @@ xlog_alloc_log(
 	log->l_logBBsize   = num_bblks;
 	log->l_covered_state = XLOG_STATE_COVER_IDLE;
 	log->l_flags	   |= XLOG_ACTIVE_RECOVERY;
+	INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
 
 	log->l_prev_block  = -1;
 	/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -3700,3 +3740,4 @@ xlog_iclogs_empty(
 	} while (iclog != log->l_iclog);
 	return 1;
 }
+
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e2..26ed7de352d7 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,8 @@ int	xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
 				xfs_lsn_t *commit_lsn, int flags);
 bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
+void	xfs_log_work_queue(struct xfs_mount *mp);
+void	xfs_log_worker(struct work_struct *work);
+
 #endif
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d76a42..9a4e0e5ec322 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -495,6 +495,7 @@ struct xlog {
 	struct xfs_buf		*l_xbuf;        /* extra buffer for log
 						 * wrapping */
 	struct xfs_buftarg	*l_targ;        /* buftarg of log */
+	struct delayed_work	l_work;		/* background flush work */
 	uint			l_flags;
 	uint			l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
 	struct list_head	*l_buf_cancel_table;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e534dc..26e46aeaa3f1 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -197,7 +197,6 @@ typedef struct xfs_mount {
 	struct mutex		m_icsb_mutex;	/* balancer sync lock */
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
-	struct delayed_work	m_sync_work;	/* background sync work */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 20fa955d80d1..37c39a155a58 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1005,7 +1005,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_delayed_work_sync(&mp->m_sync_work);
 	cancel_work_sync(&mp->m_flush_work);
 
 	xfs_filestream_unmount(mp);
@@ -1040,10 +1039,10 @@ xfs_fs_sync_fs(
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
-		 * We schedule xfssyncd now (now that the disk is
+		 * We schedule log work now (now that the disk is
 		 * active) instead of later (when it might not be).
 		 */
-		flush_delayed_work(&mp->m_sync_work);
+		flush_delayed_work(&mp->m_log->l_work);
 	}
 
 	return 0;
@@ -1200,7 +1199,7 @@ xfs_fs_remount(
 		 * value if it is non-zero, otherwise go with the default.
 		 */
 		xfs_restore_resvblks(mp);
-		xfs_syncd_queue_sync(mp);
+		xfs_log_work_queue(mp);
 	}
 
 	/* rw -> ro */
@@ -1246,7 +1245,7 @@ xfs_fs_unfreeze(
 	struct xfs_mount	*mp = XFS_M(sb);
 
 	xfs_restore_resvblks(mp);
-	xfs_syncd_queue_sync(mp);
+	xfs_log_work_queue(mp);
 	return 0;
 }
 
@@ -1326,7 +1325,6 @@ xfs_fs_fill_super(
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-	INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
@@ -1410,12 +1408,6 @@ xfs_fs_fill_super(
 		goto out_unmount;
 	}
 
-	/*
-	 * The filesystem is successfully mounted, so we can start background
-	 * sync work now.
-	 */
-	xfs_syncd_queue_sync(mp);
-
 	return 0;
 
  out_filestream_unmount:
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index e898d1807044..2174555aebb2 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -19,6 +19,7 @@
 #include "xfs_fs.h"
 #include "xfs_types.h"
 #include "xfs_log.h"
+#include "xfs_log_priv.h"
 #include "xfs_inum.h"
 #include "xfs_trans.h"
 #include "xfs_trans_priv.h"
@@ -344,8 +345,8 @@ xfs_quiesce_attr(
 	/* flush all pending changes from the AIL */
 	xfs_ail_push_all_sync(mp->m_ail);
 
-	/* stop background sync work */
-	cancel_delayed_work_sync(&mp->m_sync_work);
+	/* stop background log work */
+	cancel_delayed_work_sync(&mp->m_log->l_work);
 
 	/*
 	 * Just warn here till VFS can correctly support
@@ -376,40 +377,6 @@ xfs_quiesce_attr(
 	xfs_buf_unlock(mp->m_sb_bp);
 }
 
-void
-xfs_syncd_queue_sync(
-	struct xfs_mount        *mp)
-{
-	queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
-				msecs_to_jiffies(xfs_syncd_centisecs * 10));
-}
-
-/*
- * Every sync period we need to push dirty metadata and try to cover the log
- * to indicate the filesystem is idle and not frozen.
- */
-void
-xfs_sync_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_sync_work);
-	int		error;
-
-	/* dgc: errors ignored here */
-	if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
-	    xfs_log_need_covered(mp))
-		error = xfs_fs_log_dummy(mp);
-	else
-		xfs_log_force(mp, 0);
-
-	/* start pushing all the metadata that is currently dirty */
-	xfs_ail_push_all(mp->m_ail);
-
-	/* queue us up again */
-	xfs_syncd_queue_sync(mp);
-}
-
 /*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 3f59e5bed66b..8d58fab72a10 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,8 +26,6 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-void xfs_syncd_queue_sync(struct xfs_mount *mp);
-void xfs_sync_worker(struct work_struct *work);
 void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
-- 
cgit v1.2.3


From cf2931db2d189ce0583be7ae880d7e3f8c15f623 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:03 +1100
Subject: xfs: Bring some sanity to log unmounting

When unmounting the filesystem, there are lots of operations that
need to be done in a specific order, and they are spread across
across a couple of functions. We have to drain the AIL before we
write the unmount record, and we have to shut down the background
log work before we do either of them.

But this is all split haphazardly across xfs_unmountfs() and
xfs_log_unmount(). Move all the AIL flushing and log manipulations
to xfs_log_unmount() so that the responisbilities of each function
is clear and the operations they perform obvious.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c   | 29 ++++++++++++++++++++++++++---
 fs/xfs/xfs_mount.c | 24 ------------------------
 2 files changed, 26 insertions(+), 27 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index efea12bfbd6b..e788f39721e3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -855,15 +855,38 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }	/* xfs_log_unmount_write */
 
 /*
- * Deallocate log structures for unmount/relocation.
+ * Shut down and release the AIL and Log.
  *
- * We need to stop the aild from running before we destroy
- * and deallocate the log as the aild references the log.
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record, tear down the AIL and finally free the log.
  */
 void
 xfs_log_unmount(xfs_mount_t *mp)
 {
 	cancel_delayed_work_sync(&mp->m_log->l_work);
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/*
+	 * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+	 * will push it, xfs_wait_buftarg() will not wait for it. Further,
+	 * xfs_buf_iowait() cannot be used because it was pushed with the
+	 * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+	 * the IO to complete.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+	xfs_wait_buftarg(mp->m_ddev_targp);
+	xfs_buf_lock(mp->m_sb_bp);
+	xfs_buf_unlock(mp->m_sb_bp);
+
+	xfs_log_unmount_write(mp);
+
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
 }
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d9a31c6a0c53..c195ec85c725 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1459,13 +1459,6 @@ xfs_unmountfs(
 
 	xfs_qm_unmount(mp);
 
-	/*
-	 * Flush out the log synchronously so that we know for sure
-	 * that nothing is pinned.  This is important because bflush()
-	 * will skip pinned buffers.
-	 */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
 	/*
 	 * Unreserve any blocks we have so that when we unmount we don't account
 	 * the reserved free space as used. This is really only necessary for
@@ -1491,23 +1484,6 @@ xfs_unmountfs(
 		xfs_warn(mp, "Unable to update superblock counters. "
 				"Freespace may not be correct on next mount.");
 
-	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-	xfs_wait_buftarg(mp->m_ddev_targp);
-
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
-
-	xfs_log_unmount_write(mp);
 	xfs_log_unmount(mp);
 	xfs_uuid_unmount(mp);
 
-- 
cgit v1.2.3


From 9aa05000f2b7cab4be582afba64af10b2d74727e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:04 +1100
Subject: xfs: xfs_sync_data is redundant.

We don't do any data writeback from XFS any more - the VFS is
completely responsible for that, including for freeze. We can
replace the remaining caller with a VFS level function that
achieves the same thing, but without conflicting with current
writeback work.

This means we can remove the flush_work and xfs_flush_inodes() - the
VFS functionality completely replaces the internal flush queue for
doing this writeback work in a separate context to avoid stack
overruns.

This does have one complication - it cannot be called with page
locks held.  Hence move the flushing of delalloc space when ENOSPC
occurs back up into xfs_file_aio_buffered_write when we don't hold
any locks that will stall writeback.

Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to
trigger delalloc conversion fast enough to prevent spurious ENOSPC
whent here are hundreds of writers, thousands of small files and GBs
of free RAM.  Hence we need to use sync_sb_inodes() to block callers
while we wait for writeback like the previous xfs_flush_inodes
implementation did.

That means we have to hold the s_umount lock here, but because this
call can nest inside i_mutex (the parent directory in the create
case, held by the VFS), we have to use down_read_trylock() to avoid
potential deadlocks. In practice, this trylock will succeed on
almost every attempt as unmount/remount type operations are
exceedingly rare.

Note: we always need to pass a count of zero to
generic_file_buffered_write() as the previously written byte count.
We only do this by accident before this patch by the virtue of ret
always being zero when there are no errors. Make this explicit
rather than needing to specifically zero ret in the ENOSPC retry
case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_file.c     | 13 +++++----
 fs/xfs/xfs_iomap.c    | 23 +++++----------
 fs/xfs/xfs_mount.h    |  1 -
 fs/xfs/xfs_super.c    | 21 ++++++++++++--
 fs/xfs/xfs_super.h    |  1 +
 fs/xfs/xfs_sync.c     | 78 ---------------------------------------------------
 fs/xfs/xfs_sync.h     |  3 --
 fs/xfs/xfs_vnodeops.c |  2 +-
 8 files changed, 34 insertions(+), 108 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..daf4066c24b2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -728,16 +728,17 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);
+
 	/*
-	 * if we just got an ENOSPC, flush the inode now we aren't holding any
-	 * page locks and retry *once*
+	 * If we just got an ENOSPC, try to write back all dirty inodes to
+	 * convert delalloc space to free up some of the excess reserved
+	 * metadata space.
 	 */
 	if (ret == -ENOSPC && !enospc) {
 		enospc = 1;
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (!ret)
-			goto write_retry;
+		xfs_flush_inodes(ip->i_mount);
+		goto write_retry;
 	}
 
 	current->backing_dev_info = NULL;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6ad935..f858b903678e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -373,7 +373,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-	int		prealloc, flushed = 0;
+	int		prealloc;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,26 +434,17 @@ retry:
 	}
 
 	/*
-	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
-	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
-	 * some of the excess reserved metadata space. For both cases, retry
+	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
 	 * without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		trace_xfs_delalloc_enospc(ip, offset, count);
-		if (flushed)
-			return XFS_ERROR(error ? error : ENOSPC);
-
-		if (error == ENOSPC) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inodes(ip);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
+		if (prealloc) {
+			prealloc = 0;
+			error = 0;
+			goto retry;
 		}
-
-		flushed = 1;
-		error = 0;
-		prealloc = 0;
-		goto retry;
+		return XFS_ERROR(error ? error : ENOSPC);
 	}
 
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 26e46aeaa3f1..a54b5aa498d4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -198,7 +198,6 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 37c39a155a58..9468c6878463 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -882,6 +882,24 @@ xfs_destroy_mount_workqueues(
 	destroy_workqueue(mp->m_unwritten_workqueue);
 }
 
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		sync_inodes_sb(sb);
+		up_read(&sb->s_umount);
+	}
+}
+
 /* Catch misguided souls that try to use this interface on XFS */
 STATIC struct inode *
 xfs_fs_alloc_inode(
@@ -1005,8 +1023,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_work_sync(&mp->m_flush_work);
-
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
 
@@ -1324,7 +1340,6 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
+extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
 extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 2174555aebb2..6a2ada379166 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -216,51 +216,6 @@ xfs_inode_ag_iterator(
 	return XFS_ERROR(last_error);
 }
 
-STATIC int
-xfs_sync_inode_data(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			flags)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct address_space *mapping = inode->i_mapping;
-	int			error = 0;
-
-	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
-
-	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
-		if (flags & SYNC_TRYLOCK)
-			return 0;
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XBF_ASYNC, FI_NONE);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
-	struct xfs_mount	*mp,
-	int			flags)
-{
-	int			error;
-
-	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
-	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-	if (error)
-		return XFS_ERROR(error);
-
-	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-	return 0;
-}
-
 STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp)
@@ -415,39 +370,6 @@ xfs_reclaim_worker(
 	xfs_syncd_queue_reclaim(mp);
 }
 
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	queue_work(xfs_syncd_wq, &mp->m_flush_work);
-	flush_work(&mp->m_flush_work);
-}
-
-void
-xfs_flush_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(work,
-					struct xfs_mount, m_flush_work);
-
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 8d58fab72a10..0018e846f0dc 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,14 +26,11 @@ struct xfs_perag;
 
 extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
 
-void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inodes(struct xfs_inode *ip);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..14928564f106 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -777,7 +777,7 @@ xfs_create(
 			XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
-		xfs_flush_inodes(dp);
+		xfs_flush_inodes(mp);
 		error = xfs_trans_reserve(tp, resblks, log_res, 0,
 				XFS_TRANS_PERM_LOG_RES, log_count);
 	}
-- 
cgit v1.2.3


From 5889608df35783590251cfd440fa5d48f1855179 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:05 +1100
Subject: xfs: syncd workqueue is no more

With the syncd functions moved to the log and/or removed, the syncd
workqueue is the only remaining bit left. It is used by the log
covering/ail pushing work, as well as by the inode reclaim work.

Given how cheap workqueues are these days, give the log and inode
reclaim work their own work queues and kill the syncd work queue.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c   |  2 +-
 fs/xfs/xfs_mount.h |  2 ++
 fs/xfs/xfs_super.c | 38 ++++++++++++++++++--------------------
 fs/xfs/xfs_sync.c  | 20 +++++++++-----------
 fs/xfs/xfs_sync.h  |  2 --
 5 files changed, 30 insertions(+), 34 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e788f39721e3..b6ce4d4b6def 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1193,7 +1193,7 @@ void
 xfs_log_work_queue(
 	struct xfs_mount        *mp)
 {
-	queue_delayed_work(xfs_syncd_wq, &mp->m_log->l_work,
+	queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
 				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a54b5aa498d4..7c417b6b99ee 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -207,6 +207,8 @@ typedef struct xfs_mount {
 	struct workqueue_struct	*m_data_workqueue;
 	struct workqueue_struct	*m_unwritten_workqueue;
 	struct workqueue_struct	*m_cil_workqueue;
+	struct workqueue_struct	*m_reclaim_workqueue;
+	struct workqueue_struct	*m_log_workqueue;
 } xfs_mount_t;
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 9468c6878463..27d5a92e1210 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -863,8 +863,23 @@ xfs_init_mount_workqueues(
 			WQ_MEM_RECLAIM, 0, mp->m_fsname);
 	if (!mp->m_cil_workqueue)
 		goto out_destroy_unwritten;
+
+	mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+			WQ_NON_REENTRANT, 0, mp->m_fsname);
+	if (!mp->m_reclaim_workqueue)
+		goto out_destroy_cil;
+
+	mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
+			WQ_NON_REENTRANT, 0, mp->m_fsname);
+	if (!mp->m_log_workqueue)
+		goto out_destroy_reclaim;
+
 	return 0;
 
+out_destroy_reclaim:
+	destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_cil:
+	destroy_workqueue(mp->m_cil_workqueue);
 out_destroy_unwritten:
 	destroy_workqueue(mp->m_unwritten_workqueue);
 out_destroy_data_iodone_queue:
@@ -877,6 +892,8 @@ STATIC void
 xfs_destroy_mount_workqueues(
 	struct xfs_mount	*mp)
 {
+	destroy_workqueue(mp->m_log_workqueue);
+	destroy_workqueue(mp->m_reclaim_workqueue);
 	destroy_workqueue(mp->m_cil_workqueue);
 	destroy_workqueue(mp->m_data_workqueue);
 	destroy_workqueue(mp->m_unwritten_workqueue);
@@ -1391,10 +1408,6 @@ xfs_fs_fill_super(
 	/*
 	 * we must configure the block size in the superblock before we run the
 	 * full mount process as the mount process can lookup and cache inodes.
-	 * For the same reason we must also initialise the syncd and register
-	 * the inode cache shrinker so that inodes can be reclaimed during
-	 * operations like a quotacheck that iterate all inodes in the
-	 * filesystem.
 	 */
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1638,16 +1651,6 @@ xfs_destroy_zones(void)
 STATIC int __init
 xfs_init_workqueues(void)
 {
-	/*
-	 * We never want to the same work item to run twice, reclaiming inodes
-	 * or idling the log is not going to get any faster by multiple CPUs
-	 * competing for ressources.  Use the default large max_active value
-	 * so that even lots of filesystems can perform these task in parallel.
-	 */
-	xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
-	if (!xfs_syncd_wq)
-		return -ENOMEM;
-
 	/*
 	 * The allocation workqueue can be used in memory reclaim situations
 	 * (writepage path), and parallelism is only limited by the number of
@@ -1656,20 +1659,15 @@ xfs_init_workqueues(void)
 	 */
 	xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
 	if (!xfs_alloc_wq)
-		goto out_destroy_syncd;
+		return -ENOMEM;
 
 	return 0;
-
-out_destroy_syncd:
-	destroy_workqueue(xfs_syncd_wq);
-	return -ENOMEM;
 }
 
 STATIC void
 xfs_destroy_workqueues(void)
 {
 	destroy_workqueue(xfs_alloc_wq);
-	destroy_workqueue(xfs_syncd_wq);
 }
 
 STATIC int __init
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 6a2ada379166..15be21f074fd 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -40,8 +40,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
-struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
-
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
@@ -335,18 +333,18 @@ xfs_quiesce_attr(
 /*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * on the xfs periodic sync default of 30s. Perhaps this should have it's own
  * tunable, but that can be done if this method proves to be ineffective or too
  * aggressive.
  */
 static void
-xfs_syncd_queue_reclaim(
+xfs_reclaim_work_queue(
 	struct xfs_mount        *mp)
 {
 
 	rcu_read_lock();
 	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-		queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
 			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
 	}
 	rcu_read_unlock();
@@ -367,7 +365,7 @@ xfs_reclaim_worker(
 					struct xfs_mount, m_reclaim_work);
 
 	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_syncd_queue_reclaim(mp);
+	xfs_reclaim_work_queue(mp);
 }
 
 void
@@ -388,7 +386,7 @@ __xfs_inode_set_reclaim_tag(
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
 		/* schedule periodic background inode reclaim */
-		xfs_syncd_queue_reclaim(ip->i_mount);
+		xfs_reclaim_work_queue(ip->i_mount);
 
 		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
 							-1, _RET_IP_);
@@ -646,9 +644,9 @@ out:
 	/*
 	 * We could return EAGAIN here to make reclaim rescan the inode tree in
 	 * a short while. However, this just burns CPU time scanning the tree
-	 * waiting for IO to complete and xfssyncd never goes back to the idle
-	 * state. Instead, return 0 to let the next scheduled background reclaim
-	 * attempt to reclaim the inode again.
+	 * waiting for IO to complete and the reclaim work never goes back to
+	 * the idle state. Instead, return 0 to let the next scheduled
+	 * background reclaim attempt to reclaim the inode again.
 	 */
 	return 0;
 }
@@ -804,7 +802,7 @@ xfs_reclaim_inodes_nr(
 	int			nr_to_scan)
 {
 	/* kick background reclaimer and push the AIL */
-	xfs_syncd_queue_reclaim(mp);
+	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
 
 	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 0018e846f0dc..0beabea99e73 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -24,8 +24,6 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
-extern struct workqueue_struct	*xfs_syncd_wq;	/* sync workqueue */
-
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
-- 
cgit v1.2.3


From 34061f5c420561dd42addd252811a1fa4b0ac69b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:06 +1100
Subject: xfs: xfs_sync_fsdata is redundant

Why do we need to write the superblock to disk once we've written
all the data?  We don't actually - the reasons for doing this are
lost in the mists of time, and go back to the way Irix used to drive
VFS flushing.

On linux, this code is only called from two contexts: remount and
.sync_fs. In the remount case, the call is followed by a metadata
sync, which unpins and writes the superblock.  In the sync_fs case,
we only need to force the log to disk to ensure that the superblock
is correctly on disk, so we don't actually need to write it. Hence
the functionality is either redundant or superfluous and thus can be
removed.

Seeing as xfs_quiesce_data is essentially now just a log force,
remove it as well and fold the code back into the two callers.
Neither of them need the log covering check, either, as that is
redundant for the remount case, and unnecessary for the .sync_fs
case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_super.c | 19 +++++-----------
 fs/xfs/xfs_sync.c  | 67 +++++++-----------------------------------------------
 2 files changed, 14 insertions(+), 72 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 27d5a92e1210..b5e445a13f7b 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1057,7 +1057,6 @@ xfs_fs_sync_fs(
 	int			wait)
 {
 	struct xfs_mount	*mp = XFS_M(sb);
-	int			error;
 
 	/*
 	 * Doing anything during the async pass would be counterproductive.
@@ -1065,10 +1064,7 @@ xfs_fs_sync_fs(
 	if (!wait)
 		return 0;
 
-	error = xfs_quiesce_data(mp);
-	if (error)
-		return -error;
-
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	if (laptop_mode) {
 		/*
 		 * The disk must be active because we're syncing.
@@ -1238,15 +1234,12 @@ xfs_fs_remount(
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
 		/*
-		 * After we have synced the data but before we sync the
-		 * metadata, we need to free up the reserve block pool so that
-		 * the used block count in the superblock on disk is correct at
-		 * the end of the remount. Stash the current reserve pool size
-		 * so that if we get remounted rw, we can return it to the same
-		 * size.
+		 * Before we sync the metadata, we need to free up the reserve
+		 * block pool so that the used block count in the superblock on
+		 * disk is correct at the end of the remount. Stash the current
+		 * reserve pool size so that if we get remounted rw, we can
+		 * return it to the same size.
 		 */
-
-		xfs_quiesce_data(mp);
 		xfs_save_resvblks(mp);
 		xfs_quiesce_attr(mp);
 		mp->m_flags |= XFS_MOUNT_RDONLY;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 15be21f074fd..581eb59a85b5 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -214,70 +214,16 @@ xfs_inode_ag_iterator(
 	return XFS_ERROR(last_error);
 }
 
-STATIC int
-xfs_sync_fsdata(
-	struct xfs_mount	*mp)
-{
-	struct xfs_buf		*bp;
-	int			error;
-
-	/*
-	 * If the buffer is pinned then push on the log so we won't get stuck
-	 * waiting in the write for someone, maybe ourselves, to flush the log.
-	 *
-	 * Even though we just pushed the log above, we did not have the
-	 * superblock buffer locked at that point so it can become pinned in
-	 * between there and here.
-	 */
-	bp = xfs_getsb(mp, 0);
-	if (xfs_buf_ispinned(bp))
-		xfs_log_force(mp, 0);
-	error = xfs_bwrite(bp);
-	xfs_buf_relse(bp);
-	return error;
-}
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem, we have
- * two phases to execute. This first phase is syncing the data before we
- * quiesce the filesystem, and the second is flushing all the inodes out after
- * we've waited for all the transactions created by the first phase to
- * complete. The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- */
-/*
- * First stage of freeze - no writers will make progress now we are here,
- * so we flush delwri and delalloc buffers here, then wait for all I/O to
- * complete.  Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother emptying the AIL
- * because it'll just get dirty again.
- */
-int
-xfs_quiesce_data(
-	struct xfs_mount	*mp)
-{
-	int			error, error2 = 0;
-
-	/* force out the log */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/* write superblock and hoover up shutdown errors */
-	error = xfs_sync_fsdata(mp);
-
-	/* mark the log as covered if needed */
-	if (xfs_log_need_covered(mp))
-		error2 = xfs_fs_log_dummy(mp);
-
-	return error ? error : error2;
-}
-
 /*
  * Second stage of a quiesce. The data is already synced, now we have to take
  * care of the metadata. New transactions are already blocked, so we need to
  * wait for any remaining transactions to drain out before proceeding.
  *
+ * The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ *
  * Note: this stops background sync work - the callers must ensure it is started
  * again when appropriate.
  */
@@ -291,6 +237,9 @@ xfs_quiesce_attr(
 	while (atomic_read(&mp->m_active_trans) > 0)
 		delay(100);
 
+	/* force the log to unpin objects from the now complete transactions */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
 	/* reclaim inodes to do any IO before the freeze completes */
 	xfs_reclaim_inodes(mp, 0);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
-- 
cgit v1.2.3


From c7eea6f7adca4501d2c2db7f0f7c9dc88efac95e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:07 +1100
Subject: xfs: move xfs_quiesce_attr() into xfs_super.c

Both callers of xfs_quiesce_attr() are in xfs_super.c, and there's
nothing really sync-specific about this functionality so it doesn't
really matter where it lives. Move it to benext to it's callers, so
all the remount/sync_fs code is in the one place.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_super.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_sync.c  | 65 ----------------------------------------------------
 fs/xfs/xfs_sync.h  |  3 ---
 3 files changed, 67 insertions(+), 68 deletions(-)

diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index b5e445a13f7b..3bafe66227fb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1148,6 +1148,73 @@ xfs_restore_resvblks(struct xfs_mount *mp)
 	xfs_reserve_blocks(mp, &resblks, NULL);
 }
 
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to their location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk (this is the main
+ * difference between a sync and a quiesce).
+ *
+ * This shoul deffectively mimic the code in xfs_unmountfs() and
+ * xfs_log_umount() but without tearing down any structures.
+ * XXX: bug fixes needed!
+ *
+ * Note: this stops background log work - the callers must ensure it is started
+ * again when appropriate.
+ */
+void
+xfs_quiesce_attr(
+	struct xfs_mount	*mp)
+{
+	int	error = 0;
+
+	/* wait for all modifications to complete */
+	while (atomic_read(&mp->m_active_trans) > 0)
+		delay(100);
+
+	/* force the log to unpin objects from the now complete transactions */
+	xfs_log_force(mp, XFS_LOG_SYNC);
+
+	/* reclaim inodes to do any IO before the freeze completes */
+	xfs_reclaim_inodes(mp, 0);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+	/* flush all pending changes from the AIL */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/* stop background log work */
+	cancel_delayed_work_sync(&mp->m_log->l_work);
+
+	/*
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
+	 */
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+	/* Push the superblock and write an unmount record */
+	error = xfs_log_sbcount(mp);
+	if (error)
+		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+				"Frozen image may not be consistent.");
+	xfs_log_unmount_write(mp);
+
+	/*
+	 * At this point we might have modified the superblock again and thus
+	 * added an item to the AIL, thus flush it again.
+	 */
+	xfs_ail_push_all_sync(mp->m_ail);
+
+	/*
+	 * The superblock buffer is uncached and xfsaild_push() will lock and
+	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
+	 * here but a lock on the superblock buffer will block until iodone()
+	 * has completed.
+	 */
+	xfs_buf_lock(mp->m_sb_bp);
+	xfs_buf_unlock(mp->m_sb_bp);
+}
+
 STATIC int
 xfs_fs_remount(
 	struct super_block	*sb,
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 581eb59a85b5..7b630288bab5 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -214,71 +214,6 @@ xfs_inode_ag_iterator(
 	return XFS_ERROR(last_error);
 }
 
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceeding.
- *
- * The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- *
- * Note: this stops background sync work - the callers must ensure it is started
- * again when appropriate.
- */
-void
-xfs_quiesce_attr(
-	struct xfs_mount	*mp)
-{
-	int	error = 0;
-
-	/* wait for all modifications to complete */
-	while (atomic_read(&mp->m_active_trans) > 0)
-		delay(100);
-
-	/* force the log to unpin objects from the now complete transactions */
-	xfs_log_force(mp, XFS_LOG_SYNC);
-
-	/* reclaim inodes to do any IO before the freeze completes */
-	xfs_reclaim_inodes(mp, 0);
-	xfs_reclaim_inodes(mp, SYNC_WAIT);
-
-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
-	/* Push the superblock and write an unmount record */
-	error = xfs_log_sbcount(mp);
-	if (error)
-		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
-				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);
-
-	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
-	 */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
-}
-
 /*
  * Queue a new inode reclaim pass if there are reclaimable inodes and there
  * isn't a reclaim pass already in progress. By default it runs every 5s based
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 0beabea99e73..0ba9c89c316e 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,9 +26,6 @@ struct xfs_perag;
 
 void xfs_reclaim_worker(struct work_struct *work);
 
-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-- 
cgit v1.2.3


From c75921a72a7c4bb73a5e09a697a672722e5543f1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:08 +1100
Subject: xfs: xfs_quiesce_attr() should quiesce the log like unmount

xfs_quiesce_attr() is supposed to leave the log empty with an
unmount record written. Right now it does not wait for the AIL to be
emptied before writing the unmount record, not does it wait for
metadata IO completion, either. Fix it to use the same method and
code as xfs_log_unmount().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c   | 25 ++++++++++++++++++-------
 fs/xfs/xfs_log.h   |  1 +
 fs/xfs/xfs_super.c | 41 ++++++++---------------------------------
 3 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b6ce4d4b6def..d2d59692739f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -855,20 +855,17 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }	/* xfs_log_unmount_write */
 
 /*
- * Shut down and release the AIL and Log.
- *
- * During unmount, we need to ensure we flush all the dirty metadata objects
- * from the AIL so that the log is empty before we write the unmount record to
- * the log.
+ * Empty the log for unmount/freeze.
  *
  * To do this, we first need to shut down the background log work so it is not
  * trying to cover the log as we clean up. We then need to unpin all objects in
  * the log so we can then flush them out. Once they have completed their IO and
  * run the callbacks removing themselves from the AIL, we can write the unmount
- * record, tear down the AIL and finally free the log.
+ * record.
  */
 void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_quiesce(
+	struct xfs_mount	*mp)
 {
 	cancel_delayed_work_sync(&mp->m_log->l_work);
 	xfs_log_force(mp, XFS_LOG_SYNC);
@@ -886,6 +883,20 @@ xfs_log_unmount(xfs_mount_t *mp)
 	xfs_buf_unlock(mp->m_sb_bp);
 
 	xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
+ *
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
+ */
+void
+xfs_log_unmount(
+	struct xfs_mount	*mp)
+{
+	xfs_log_quiesce(mp);
 
 	xfs_trans_ail_destroy(mp);
 	xlog_dealloc_log(mp->m_log);
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 26ed7de352d7..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -183,6 +183,7 @@ bool	xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
 void	xfs_log_work_queue(struct xfs_mount *mp);
 void	xfs_log_worker(struct work_struct *work);
+void	xfs_log_quiesce(struct xfs_mount *mp);
 
 #endif
 #endif	/* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3bafe66227fb..fdedf2cabae3 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1153,15 +1153,11 @@ xfs_restore_resvblks(struct xfs_mount *mp)
  *
  * This ensures that the metadata is written to their location on disk rather
  * than just existing in transactions in the log. This means after a quiesce
- * there is no log replay required to write the inodes to disk (this is the main
- * difference between a sync and a quiesce).
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
  *
- * This shoul deffectively mimic the code in xfs_unmountfs() and
- * xfs_log_umount() but without tearing down any structures.
- * XXX: bug fixes needed!
- *
- * Note: this stops background log work - the callers must ensure it is started
- * again when appropriate.
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
  */
 void
 xfs_quiesce_attr(
@@ -1180,39 +1176,18 @@ xfs_quiesce_attr(
 	xfs_reclaim_inodes(mp, 0);
 	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
-	/* flush all pending changes from the AIL */
-	xfs_ail_push_all_sync(mp->m_ail);
-
-	/* stop background log work */
-	cancel_delayed_work_sync(&mp->m_log->l_work);
-
-	/*
-	 * Just warn here till VFS can correctly support
-	 * read-only remount without racing.
-	 */
-	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
 	/* Push the superblock and write an unmount record */
 	error = xfs_log_sbcount(mp);
 	if (error)
 		xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
 				"Frozen image may not be consistent.");
-	xfs_log_unmount_write(mp);
-
 	/*
-	 * At this point we might have modified the superblock again and thus
-	 * added an item to the AIL, thus flush it again.
+	 * Just warn here till VFS can correctly support
+	 * read-only remount without racing.
 	 */
-	xfs_ail_push_all_sync(mp->m_ail);
+	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
 
-	/*
-	 * The superblock buffer is uncached and xfsaild_push() will lock and
-	 * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-	 * here but a lock on the superblock buffer will block until iodone()
-	 * has completed.
-	 */
-	xfs_buf_lock(mp->m_sb_bp);
-	xfs_buf_unlock(mp->m_sb_bp);
+	xfs_log_quiesce(mp);
 }
 
 STATIC int
-- 
cgit v1.2.3


From 6d8b79cfca39399ef9115fb65dde85993455c9a3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:09 +1100
Subject: xfs: rename xfs_sync.[ch] to xfs_icache.[ch]

xfs_sync.c now only contains inode reclaim functions and inode cache
iteration functions. It is not related to sync operations anymore.
Rename to xfs_icache.c to reflect it's contents and prepare for
consolidation with the other inode cache file that exists
(xfs_iget.c).

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/Makefile          |   2 +-
 fs/xfs/xfs_icache.c      | 715 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.h      |  43 +++
 fs/xfs/xfs_iget.c        |   1 +
 fs/xfs/xfs_mount.c       |   1 +
 fs/xfs/xfs_mount.h       |   2 -
 fs/xfs/xfs_qm_syscalls.c |   1 +
 fs/xfs/xfs_super.c       |   2 +-
 fs/xfs/xfs_sync.c        | 714 ----------------------------------------------
 fs/xfs/xfs_sync.h        |  43 ---
 10 files changed, 763 insertions(+), 761 deletions(-)
 create mode 100644 fs/xfs/xfs_icache.c
 create mode 100644 fs/xfs/xfs_icache.h
 delete mode 100644 fs/xfs/xfs_sync.c
 delete mode 100644 fs/xfs/xfs_sync.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..442f256dbcac 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -39,6 +39,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_fsops.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
+				   xfs_icache.o \
 				   xfs_iget.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
@@ -47,7 +48,6 @@ xfs-y				+= xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mru_cache.o \
 				   xfs_super.o \
-				   xfs_sync.o \
 				   xfs_xattr.o \
 				   xfs_rename.o \
 				   xfs_utils.o \
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644
index 000000000000..eba216f11d5e
--- /dev/null
+++ b/fs/xfs/xfs_icache.c
@@ -0,0 +1,715 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_icache.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH	32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+	struct xfs_inode	*ip)
+{
+	struct inode		*inode = VFS_I(ip);
+
+	ASSERT(rcu_read_lock_held());
+
+	/*
+	 * check for stale RCU freed inode
+	 *
+	 * If the inode has been reallocated, it doesn't matter if it's not in
+	 * the AG we are walking - we are walking for writeback, so if it
+	 * passes all the "valid inode" checks and is dirty, then we'll write
+	 * it back anyway.  If it has been reallocated and still being
+	 * initialised, the XFS_INEW check below will catch it.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!ip->i_ino)
+		goto out_unlock_noent;
+
+	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+		goto out_unlock_noent;
+	spin_unlock(&ip->i_flags_lock);
+
+	/* nothing to sync during shutdown */
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+		return EFSCORRUPTED;
+
+	/* If we can't grab the inode, it must on it's way to reclaim. */
+	if (!igrab(inode))
+		return ENOENT;
+
+	if (is_bad_inode(inode)) {
+		IRELE(ip);
+		return ENOENT;
+	}
+
+	/* inode is valid */
+	return 0;
+
+out_unlock_noent:
+	spin_unlock(&ip->i_flags_lock);
+	return ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags)
+{
+	uint32_t		first_index;
+	int			last_error = 0;
+	int			skipped;
+	int			done;
+	int			nr_found;
+
+restart:
+	done = 0;
+	skipped = 0;
+	first_index = 0;
+	nr_found = 0;
+	do {
+		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+		int		error = 0;
+		int		i;
+
+		rcu_read_lock();
+		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH);
+		if (!nr_found) {
+			rcu_read_unlock();
+			break;
+		}
+
+		/*
+		 * Grab the inodes before we drop the lock. if we found
+		 * nothing, nr == 0 and the loop will be skipped.
+		 */
+		for (i = 0; i < nr_found; i++) {
+			struct xfs_inode *ip = batch[i];
+
+			if (done || xfs_inode_ag_walk_grab(ip))
+				batch[i] = NULL;
+
+			/*
+			 * Update the index for the next lookup. Catch
+			 * overflows into the next AG range which can occur if
+			 * we have inodes in the last block of the AG and we
+			 * are currently pointing to the last inode.
+			 *
+			 * Because we may see inodes that are from the wrong AG
+			 * due to RCU freeing and reallocation, only update the
+			 * index if it lies in this AG. It was a race that lead
+			 * us to see this inode, so another lookup from the
+			 * same index will not find it again.
+			 */
+			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+				continue;
+			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+				done = 1;
+		}
+
+		/* unlock now we've grabbed the inodes. */
+		rcu_read_unlock();
+
+		for (i = 0; i < nr_found; i++) {
+			if (!batch[i])
+				continue;
+			error = execute(batch[i], pag, flags);
+			IRELE(batch[i]);
+			if (error == EAGAIN) {
+				skipped++;
+				continue;
+			}
+			if (error && last_error != EFSCORRUPTED)
+				last_error = error;
+		}
+
+		/* bail out if the filesystem is corrupted.  */
+		if (error == EFSCORRUPTED)
+			break;
+
+		cond_resched();
+
+	} while (nr_found && !done);
+
+	if (skipped) {
+		delay(1);
+		goto restart;
+	}
+	return last_error;
+}
+
+int
+xfs_inode_ag_iterator(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags),
+	int			flags)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	ag = 0;
+	while ((pag = xfs_perag_get(mp, ag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags);
+		xfs_perag_put(pag);
+		if (error) {
+			last_error = error;
+			if (error == EFSCORRUPTED)
+				break;
+		}
+	}
+	return XFS_ERROR(last_error);
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+	struct xfs_mount        *mp)
+{
+
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+					struct xfs_mount, m_reclaim_work);
+
+	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+	xfs_reclaim_work_queue(mp);
+}
+
+void
+__xfs_inode_set_reclaim_tag(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip)
+{
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_RECLAIM_TAG);
+
+	if (!pag->pag_ici_reclaimable) {
+		/* propagate the reclaim tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		/* schedule periodic background inode reclaim */
+		xfs_reclaim_work_queue(ip->i_mount);
+
+		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+	pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+	__xfs_inode_set_reclaim_tag(pag, ip);
+	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+	spin_unlock(&ip->i_flags_lock);
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	pag->pag_ici_reclaimable--;
+	if (!pag->pag_ici_reclaimable) {
+		/* clear the reclaim tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+	xfs_mount_t	*mp,
+	xfs_perag_t	*pag,
+	xfs_inode_t	*ip)
+{
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+	struct xfs_inode	*ip,
+	int			flags)
+{
+	ASSERT(rcu_read_lock_held());
+
+	/* quick check for stale RCU freed inode */
+	if (!ip->i_ino)
+		return 1;
+
+	/*
+	 * If we are asked for non-blocking operation, do unlocked checks to
+	 * see if the inode already is being flushed or in reclaim to avoid
+	 * lock traffic.
+	 */
+	if ((flags & SYNC_TRYLOCK) &&
+	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
+		return 1;
+
+	/*
+	 * The radix tree lock here protects a thread in xfs_iget from racing
+	 * with us starting reclaim on the inode.  Once we have the
+	 * XFS_IRECLAIM flag set it will not touch us.
+	 *
+	 * Due to RCU lookup, we may find inodes that have been freed and only
+	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+	 * aren't candidates for reclaim at all, so we must check the
+	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+		/* not a reclaim candidate. */
+		spin_unlock(&ip->i_flags_lock);
+		return 1;
+	}
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	return 0;
+}
+
+/*
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
+ *
+ *	inode state	     iflush ret		required action
+ *      ---------------      ----------         ---------------
+ *	bad			-		reclaim
+ *	shutdown		EIO		unpin and reclaim
+ *	clean, unpinned		0		reclaim
+ *	stale, unpinned		0		reclaim
+ *	clean, pinned(*)	0		requeue
+ *	stale, pinned		EAGAIN		requeue
+ *	dirty, async		-		requeue
+ *	dirty, sync		0		reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean.
+ *
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting.  For background relaim, we only
+ * bother to reclaim clean inodes anyway.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *	bad		=> reclaim
+ *	shutdown	=> unpin and reclaim
+ *	pinned, async	=> requeue
+ *	pinned, sync	=> unpin
+ *	stale		=> reclaim
+ *	clean		=> reclaim
+ *	dirty, async	=> requeue
+ *	dirty, sync	=> flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			sync_mode)
+{
+	struct xfs_buf		*bp = NULL;
+	int			error;
+
+restart:
+	error = 0;
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_iflock_nowait(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out;
+		xfs_iflock(ip);
+	}
+
+	if (is_bad_inode(VFS_I(ip)))
+		goto reclaim;
+	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+		xfs_iunpin_wait(ip);
+		xfs_iflush_abort(ip, false);
+		goto reclaim;
+	}
+	if (xfs_ipincount(ip)) {
+		if (!(sync_mode & SYNC_WAIT))
+			goto out_ifunlock;
+		xfs_iunpin_wait(ip);
+	}
+	if (xfs_iflags_test(ip, XFS_ISTALE))
+		goto reclaim;
+	if (xfs_inode_clean(ip))
+		goto reclaim;
+
+	/*
+	 * Never flush out dirty data during non-blocking reclaim, as it would
+	 * just contend with AIL pushing trying to do the same job.
+	 */
+	if (!(sync_mode & SYNC_WAIT))
+		goto out_ifunlock;
+
+	/*
+	 * Now we have an inode that needs flushing.
+	 *
+	 * Note that xfs_iflush will never block on the inode buffer lock, as
+	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
+	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
+	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+	 * result in an ABBA deadlock with xfs_ifree_cluster().
+	 *
+	 * As xfs_ifree_cluser() must gather all inodes that are active in the
+	 * cache to mark them stale, if we hit this case we don't actually want
+	 * to do IO here - we want the inode marked stale so we can simply
+	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
+	 * inode, back off and try again.  Hopefully the next pass through will
+	 * see the stale flag set on the inode.
+	 */
+	error = xfs_iflush(ip, &bp);
+	if (error == EAGAIN) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		/* backoff longer than in xfs_ifree_cluster */
+		delay(2);
+		goto restart;
+	}
+
+	if (!error) {
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+	}
+
+	xfs_iflock(ip);
+reclaim:
+	xfs_ifunlock(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	XFS_STATS_INC(xs_ig_reclaims);
+	/*
+	 * Remove the inode from the per-AG radix tree.
+	 *
+	 * Because radix_tree_delete won't complain even if the item was never
+	 * added to the tree assert that it's been there before to catch
+	 * problems with the inode life time early on.
+	 */
+	spin_lock(&pag->pag_ici_lock);
+	if (!radix_tree_delete(&pag->pag_ici_root,
+				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+		ASSERT(0);
+	__xfs_inode_clear_reclaim(pag, ip);
+	spin_unlock(&pag->pag_ici_lock);
+
+	/*
+	 * Here we do an (almost) spurious inode lock in order to coordinate
+	 * with inode cache radix tree lookups.  This is because the lookup
+	 * can reference the inodes in the cache without taking references.
+	 *
+	 * We make that OK here by ensuring that we wait until the inode is
+	 * unlocked after the lookup before we go ahead and free it.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_qm_dqdetach(ip);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	xfs_inode_free(ip);
+	return error;
+
+out_ifunlock:
+	xfs_ifunlock(ip);
+out:
+	xfs_iflags_clear(ip, XFS_IRECLAIM);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	/*
+	 * We could return EAGAIN here to make reclaim rescan the inode tree in
+	 * a short while. However, this just burns CPU time scanning the tree
+	 * waiting for IO to complete and the reclaim work never goes back to
+	 * the idle state. Instead, return 0 to let the next scheduled
+	 * background reclaim attempt to reclaim the inode again.
+	 */
+	return 0;
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+	struct xfs_mount	*mp,
+	int			flags,
+	int			*nr_to_scan)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+	int			trylock = flags & SYNC_TRYLOCK;
+	int			skipped;
+
+restart:
+	ag = 0;
+	skipped = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		unsigned long	first_index = 0;
+		int		done = 0;
+		int		nr_found = 0;
+
+		ag = pag->pag_agno + 1;
+
+		if (trylock) {
+			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+				skipped++;
+				xfs_perag_put(pag);
+				continue;
+			}
+			first_index = pag->pag_ici_reclaim_cursor;
+		} else
+			mutex_lock(&pag->pag_ici_reclaim_lock);
+
+		do {
+			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+			int	i;
+
+			rcu_read_lock();
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **)batch, first_index,
+					XFS_LOOKUP_BATCH,
+					XFS_ICI_RECLAIM_TAG);
+			if (!nr_found) {
+				done = 1;
+				rcu_read_unlock();
+				break;
+			}
+
+			/*
+			 * Grab the inodes before we drop the lock. if we found
+			 * nothing, nr == 0 and the loop will be skipped.
+			 */
+			for (i = 0; i < nr_found; i++) {
+				struct xfs_inode *ip = batch[i];
+
+				if (done || xfs_reclaim_inode_grab(ip, flags))
+					batch[i] = NULL;
+
+				/*
+				 * Update the index for the next lookup. Catch
+				 * overflows into the next AG range which can
+				 * occur if we have inodes in the last block of
+				 * the AG and we are currently pointing to the
+				 * last inode.
+				 *
+				 * Because we may see inodes that are from the
+				 * wrong AG due to RCU freeing and
+				 * reallocation, only update the index if it
+				 * lies in this AG. It was a race that lead us
+				 * to see this inode, so another lookup from
+				 * the same index will not find it again.
+				 */
+				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+								pag->pag_agno)
+					continue;
+				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+					done = 1;
+			}
+
+			/* unlock now we've grabbed the inodes. */
+			rcu_read_unlock();
+
+			for (i = 0; i < nr_found; i++) {
+				if (!batch[i])
+					continue;
+				error = xfs_reclaim_inode(batch[i], pag, flags);
+				if (error && last_error != EFSCORRUPTED)
+					last_error = error;
+			}
+
+			*nr_to_scan -= XFS_LOOKUP_BATCH;
+
+			cond_resched();
+
+		} while (nr_found && !done && *nr_to_scan > 0);
+
+		if (trylock && !done)
+			pag->pag_ici_reclaim_cursor = first_index;
+		else
+			pag->pag_ici_reclaim_cursor = 0;
+		mutex_unlock(&pag->pag_ici_reclaim_lock);
+		xfs_perag_put(pag);
+	}
+
+	/*
+	 * if we skipped any AG, and we still have scan count remaining, do
+	 * another pass this time using blocking reclaim semantics (i.e
+	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
+	 * ensure that when we get more reclaimers than AGs we block rather
+	 * than spin trying to execute reclaim.
+	 */
+	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+		trylock = 0;
+		goto restart;
+	}
+	return XFS_ERROR(last_error);
+}
+
+int
+xfs_reclaim_inodes(
+	xfs_mount_t	*mp,
+	int		mode)
+{
+	int		nr_to_scan = INT_MAX;
+
+	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+void
+xfs_reclaim_inodes_nr(
+	struct xfs_mount	*mp,
+	int			nr_to_scan)
+{
+	/* kick background reclaimer and push the AIL */
+	xfs_reclaim_work_queue(mp);
+	xfs_ail_push_all(mp->m_ail);
+
+	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		ag = 0;
+	int			reclaimable = 0;
+
+	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+		ag = pag->pag_agno + 1;
+		reclaimable += pag->pag_ici_reclaimable;
+		xfs_perag_put(pag);
+	}
+	return reclaimable;
+}
+
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
new file mode 100644
index 000000000000..0ba9c89c316e
--- /dev/null
+++ b/fs/xfs/xfs_icache.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
+#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
+
+void xfs_reclaim_worker(struct work_struct *work);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+int xfs_reclaim_inodes_count(struct xfs_mount *mp);
+void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+				struct xfs_inode *ip);
+
+int xfs_sync_inode_grab(struct xfs_inode *ip);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+	int flags);
+
+#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 784a803383ec..069c5ceb9459 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,6 +38,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_bmap.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 /*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c195ec85c725..6f1c997704cd 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 #ifdef HAVE_PERCPU_SB
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7c417b6b99ee..a631ca3b9065 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
 
 #else /* __KERNEL__ */
 
-#include "xfs_sync.h"
-
 struct xlog;
 struct xfs_inode;
 struct xfs_mru_cache;
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b186110..7a9071f8855f 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int	xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int	xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fdedf2cabae3..3d9ea947e9f8 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
-#include "xfs_sync.h"
+#include "xfs_icache.h"
 #include "xfs_trace.h"
 
 #include <linux/namei.h>
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
deleted file mode 100644
index 7b630288bab5..000000000000
--- a/fs/xfs/xfs_sync.c
+++ /dev/null
@@ -1,714 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_log_priv.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dinode.h"
-#include "xfs_error.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_inode_item.h"
-#include "xfs_quota.h"
-#include "xfs_trace.h"
-#include "xfs_fsops.h"
-
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-
-/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- */
-#define XFS_LOOKUP_BATCH	32
-
-STATIC int
-xfs_inode_ag_walk_grab(
-	struct xfs_inode	*ip)
-{
-	struct inode		*inode = VFS_I(ip);
-
-	ASSERT(rcu_read_lock_held());
-
-	/*
-	 * check for stale RCU freed inode
-	 *
-	 * If the inode has been reallocated, it doesn't matter if it's not in
-	 * the AG we are walking - we are walking for writeback, so if it
-	 * passes all the "valid inode" checks and is dirty, then we'll write
-	 * it back anyway.  If it has been reallocated and still being
-	 * initialised, the XFS_INEW check below will catch it.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (!ip->i_ino)
-		goto out_unlock_noent;
-
-	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-		goto out_unlock_noent;
-	spin_unlock(&ip->i_flags_lock);
-
-	/* nothing to sync during shutdown */
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return EFSCORRUPTED;
-
-	/* If we can't grab the inode, it must on it's way to reclaim. */
-	if (!igrab(inode))
-		return ENOENT;
-
-	if (is_bad_inode(inode)) {
-		IRELE(ip);
-		return ENOENT;
-	}
-
-	/* inode is valid */
-	return 0;
-
-out_unlock_noent:
-	spin_unlock(&ip->i_flags_lock);
-	return ENOENT;
-}
-
-STATIC int
-xfs_inode_ag_walk(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
-{
-	uint32_t		first_index;
-	int			last_error = 0;
-	int			skipped;
-	int			done;
-	int			nr_found;
-
-restart:
-	done = 0;
-	skipped = 0;
-	first_index = 0;
-	nr_found = 0;
-	do {
-		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-		int		error = 0;
-		int		i;
-
-		rcu_read_lock();
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-					(void **)batch, first_index,
-					XFS_LOOKUP_BATCH);
-		if (!nr_found) {
-			rcu_read_unlock();
-			break;
-		}
-
-		/*
-		 * Grab the inodes before we drop the lock. if we found
-		 * nothing, nr == 0 and the loop will be skipped.
-		 */
-		for (i = 0; i < nr_found; i++) {
-			struct xfs_inode *ip = batch[i];
-
-			if (done || xfs_inode_ag_walk_grab(ip))
-				batch[i] = NULL;
-
-			/*
-			 * Update the index for the next lookup. Catch
-			 * overflows into the next AG range which can occur if
-			 * we have inodes in the last block of the AG and we
-			 * are currently pointing to the last inode.
-			 *
-			 * Because we may see inodes that are from the wrong AG
-			 * due to RCU freeing and reallocation, only update the
-			 * index if it lies in this AG. It was a race that lead
-			 * us to see this inode, so another lookup from the
-			 * same index will not find it again.
-			 */
-			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
-				continue;
-			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-				done = 1;
-		}
-
-		/* unlock now we've grabbed the inodes. */
-		rcu_read_unlock();
-
-		for (i = 0; i < nr_found; i++) {
-			if (!batch[i])
-				continue;
-			error = execute(batch[i], pag, flags);
-			IRELE(batch[i]);
-			if (error == EAGAIN) {
-				skipped++;
-				continue;
-			}
-			if (error && last_error != EFSCORRUPTED)
-				last_error = error;
-		}
-
-		/* bail out if the filesystem is corrupted.  */
-		if (error == EFSCORRUPTED)
-			break;
-
-		cond_resched();
-
-	} while (nr_found && !done);
-
-	if (skipped) {
-		delay(1);
-		goto restart;
-	}
-	return last_error;
-}
-
-int
-xfs_inode_ag_iterator(
-	struct xfs_mount	*mp,
-	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
-{
-	struct xfs_perag	*pag;
-	int			error = 0;
-	int			last_error = 0;
-	xfs_agnumber_t		ag;
-
-	ag = 0;
-	while ((pag = xfs_perag_get(mp, ag))) {
-		ag = pag->pag_agno + 1;
-		error = xfs_inode_ag_walk(mp, pag, execute, flags);
-		xfs_perag_put(pag);
-		if (error) {
-			last_error = error;
-			if (error == EFSCORRUPTED)
-				break;
-		}
-	}
-	return XFS_ERROR(last_error);
-}
-
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs periodic sync default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_reclaim_work_queue(
-	struct xfs_mount        *mp)
-{
-
-	rcu_read_lock();
-	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
-			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-	}
-	rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-void
-xfs_reclaim_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_reclaim_work);
-
-	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-	xfs_reclaim_work_queue(mp);
-}
-
-void
-__xfs_inode_set_reclaim_tag(
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip)
-{
-	radix_tree_tag_set(&pag->pag_ici_root,
-			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-			   XFS_ICI_RECLAIM_TAG);
-
-	if (!pag->pag_ici_reclaimable) {
-		/* propagate the reclaim tag up into the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
-
-		/* schedule periodic background inode reclaim */
-		xfs_reclaim_work_queue(ip->i_mount);
-
-		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-	pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-	xfs_inode_t	*ip)
-{
-	struct xfs_mount *mp = ip->i_mount;
-	struct xfs_perag *pag;
-
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-	spin_lock(&pag->pag_ici_lock);
-	spin_lock(&ip->i_flags_lock);
-	__xfs_inode_set_reclaim_tag(pag, ip);
-	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-	spin_unlock(&ip->i_flags_lock);
-	spin_unlock(&pag->pag_ici_lock);
-	xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
-{
-	pag->pag_ici_reclaimable--;
-	if (!pag->pag_ici_reclaimable) {
-		/* clear the reclaim tag from the perag radix tree */
-		spin_lock(&ip->i_mount->m_perag_lock);
-		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				XFS_ICI_RECLAIM_TAG);
-		spin_unlock(&ip->i_mount->m_perag_lock);
-		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-							-1, _RET_IP_);
-	}
-}
-
-void
-__xfs_inode_clear_reclaim_tag(
-	xfs_mount_t	*mp,
-	xfs_perag_t	*pag,
-	xfs_inode_t	*ip)
-{
-	radix_tree_tag_clear(&pag->pag_ici_root,
-			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-	__xfs_inode_clear_reclaim(pag, ip);
-}
-
-/*
- * Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
- */
-STATIC int
-xfs_reclaim_inode_grab(
-	struct xfs_inode	*ip,
-	int			flags)
-{
-	ASSERT(rcu_read_lock_held());
-
-	/* quick check for stale RCU freed inode */
-	if (!ip->i_ino)
-		return 1;
-
-	/*
-	 * If we are asked for non-blocking operation, do unlocked checks to
-	 * see if the inode already is being flushed or in reclaim to avoid
-	 * lock traffic.
-	 */
-	if ((flags & SYNC_TRYLOCK) &&
-	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
-		return 1;
-
-	/*
-	 * The radix tree lock here protects a thread in xfs_iget from racing
-	 * with us starting reclaim on the inode.  Once we have the
-	 * XFS_IRECLAIM flag set it will not touch us.
-	 *
-	 * Due to RCU lookup, we may find inodes that have been freed and only
-	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
-	 * aren't candidates for reclaim at all, so we must check the
-	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
-	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
-		/* not a reclaim candidate. */
-		spin_unlock(&ip->i_flags_lock);
-		return 1;
-	}
-	__xfs_iflags_set(ip, XFS_IRECLAIM);
-	spin_unlock(&ip->i_flags_lock);
-	return 0;
-}
-
-/*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
- *
- *	inode state	     iflush ret		required action
- *      ---------------      ----------         ---------------
- *	bad			-		reclaim
- *	shutdown		EIO		unpin and reclaim
- *	clean, unpinned		0		reclaim
- *	stale, unpinned		0		reclaim
- *	clean, pinned(*)	0		requeue
- *	stale, pinned		EAGAIN		requeue
- *	dirty, async		-		requeue
- *	dirty, sync		0		reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting.  For background relaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- *	bad		=> reclaim
- *	shutdown	=> unpin and reclaim
- *	pinned, async	=> requeue
- *	pinned, sync	=> unpin
- *	stale		=> reclaim
- *	clean		=> reclaim
- *	dirty, async	=> requeue
- *	dirty, sync	=> flush, wait and reclaim
- */
-STATIC int
-xfs_reclaim_inode(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			sync_mode)
-{
-	struct xfs_buf		*bp = NULL;
-	int			error;
-
-restart:
-	error = 0;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	if (!xfs_iflock_nowait(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out;
-		xfs_iflock(ip);
-	}
-
-	if (is_bad_inode(VFS_I(ip)))
-		goto reclaim;
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		xfs_iunpin_wait(ip);
-		xfs_iflush_abort(ip, false);
-		goto reclaim;
-	}
-	if (xfs_ipincount(ip)) {
-		if (!(sync_mode & SYNC_WAIT))
-			goto out_ifunlock;
-		xfs_iunpin_wait(ip);
-	}
-	if (xfs_iflags_test(ip, XFS_ISTALE))
-		goto reclaim;
-	if (xfs_inode_clean(ip))
-		goto reclaim;
-
-	/*
-	 * Never flush out dirty data during non-blocking reclaim, as it would
-	 * just contend with AIL pushing trying to do the same job.
-	 */
-	if (!(sync_mode & SYNC_WAIT))
-		goto out_ifunlock;
-
-	/*
-	 * Now we have an inode that needs flushing.
-	 *
-	 * Note that xfs_iflush will never block on the inode buffer lock, as
-	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
-	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
-	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
-	 * result in an ABBA deadlock with xfs_ifree_cluster().
-	 *
-	 * As xfs_ifree_cluser() must gather all inodes that are active in the
-	 * cache to mark them stale, if we hit this case we don't actually want
-	 * to do IO here - we want the inode marked stale so we can simply
-	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
-	 * inode, back off and try again.  Hopefully the next pass through will
-	 * see the stale flag set on the inode.
-	 */
-	error = xfs_iflush(ip, &bp);
-	if (error == EAGAIN) {
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		/* backoff longer than in xfs_ifree_cluster */
-		delay(2);
-		goto restart;
-	}
-
-	if (!error) {
-		error = xfs_bwrite(bp);
-		xfs_buf_relse(bp);
-	}
-
-	xfs_iflock(ip);
-reclaim:
-	xfs_ifunlock(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	XFS_STATS_INC(xs_ig_reclaims);
-	/*
-	 * Remove the inode from the per-AG radix tree.
-	 *
-	 * Because radix_tree_delete won't complain even if the item was never
-	 * added to the tree assert that it's been there before to catch
-	 * problems with the inode life time early on.
-	 */
-	spin_lock(&pag->pag_ici_lock);
-	if (!radix_tree_delete(&pag->pag_ici_root,
-				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
-		ASSERT(0);
-	__xfs_inode_clear_reclaim(pag, ip);
-	spin_unlock(&pag->pag_ici_lock);
-
-	/*
-	 * Here we do an (almost) spurious inode lock in order to coordinate
-	 * with inode cache radix tree lookups.  This is because the lookup
-	 * can reference the inodes in the cache without taking references.
-	 *
-	 * We make that OK here by ensuring that we wait until the inode is
-	 * unlocked after the lookup before we go ahead and free it.
-	 */
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_qm_dqdetach(ip);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	xfs_inode_free(ip);
-	return error;
-
-out_ifunlock:
-	xfs_ifunlock(ip);
-out:
-	xfs_iflags_clear(ip, XFS_IRECLAIM);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	/*
-	 * We could return EAGAIN here to make reclaim rescan the inode tree in
-	 * a short while. However, this just burns CPU time scanning the tree
-	 * waiting for IO to complete and the reclaim work never goes back to
-	 * the idle state. Instead, return 0 to let the next scheduled
-	 * background reclaim attempt to reclaim the inode again.
-	 */
-	return 0;
-}
-
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shut down during filesystem unmount reclaim walk leak all the
- * unreclaimed inodes.
- */
-int
-xfs_reclaim_inodes_ag(
-	struct xfs_mount	*mp,
-	int			flags,
-	int			*nr_to_scan)
-{
-	struct xfs_perag	*pag;
-	int			error = 0;
-	int			last_error = 0;
-	xfs_agnumber_t		ag;
-	int			trylock = flags & SYNC_TRYLOCK;
-	int			skipped;
-
-restart:
-	ag = 0;
-	skipped = 0;
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		unsigned long	first_index = 0;
-		int		done = 0;
-		int		nr_found = 0;
-
-		ag = pag->pag_agno + 1;
-
-		if (trylock) {
-			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-				skipped++;
-				xfs_perag_put(pag);
-				continue;
-			}
-			first_index = pag->pag_ici_reclaim_cursor;
-		} else
-			mutex_lock(&pag->pag_ici_reclaim_lock);
-
-		do {
-			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-			int	i;
-
-			rcu_read_lock();
-			nr_found = radix_tree_gang_lookup_tag(
-					&pag->pag_ici_root,
-					(void **)batch, first_index,
-					XFS_LOOKUP_BATCH,
-					XFS_ICI_RECLAIM_TAG);
-			if (!nr_found) {
-				done = 1;
-				rcu_read_unlock();
-				break;
-			}
-
-			/*
-			 * Grab the inodes before we drop the lock. if we found
-			 * nothing, nr == 0 and the loop will be skipped.
-			 */
-			for (i = 0; i < nr_found; i++) {
-				struct xfs_inode *ip = batch[i];
-
-				if (done || xfs_reclaim_inode_grab(ip, flags))
-					batch[i] = NULL;
-
-				/*
-				 * Update the index for the next lookup. Catch
-				 * overflows into the next AG range which can
-				 * occur if we have inodes in the last block of
-				 * the AG and we are currently pointing to the
-				 * last inode.
-				 *
-				 * Because we may see inodes that are from the
-				 * wrong AG due to RCU freeing and
-				 * reallocation, only update the index if it
-				 * lies in this AG. It was a race that lead us
-				 * to see this inode, so another lookup from
-				 * the same index will not find it again.
-				 */
-				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
-								pag->pag_agno)
-					continue;
-				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-					done = 1;
-			}
-
-			/* unlock now we've grabbed the inodes. */
-			rcu_read_unlock();
-
-			for (i = 0; i < nr_found; i++) {
-				if (!batch[i])
-					continue;
-				error = xfs_reclaim_inode(batch[i], pag, flags);
-				if (error && last_error != EFSCORRUPTED)
-					last_error = error;
-			}
-
-			*nr_to_scan -= XFS_LOOKUP_BATCH;
-
-			cond_resched();
-
-		} while (nr_found && !done && *nr_to_scan > 0);
-
-		if (trylock && !done)
-			pag->pag_ici_reclaim_cursor = first_index;
-		else
-			pag->pag_ici_reclaim_cursor = 0;
-		mutex_unlock(&pag->pag_ici_reclaim_lock);
-		xfs_perag_put(pag);
-	}
-
-	/*
-	 * if we skipped any AG, and we still have scan count remaining, do
-	 * another pass this time using blocking reclaim semantics (i.e
-	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
-	 * ensure that when we get more reclaimers than AGs we block rather
-	 * than spin trying to execute reclaim.
-	 */
-	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
-		trylock = 0;
-		goto restart;
-	}
-	return XFS_ERROR(last_error);
-}
-
-int
-xfs_reclaim_inodes(
-	xfs_mount_t	*mp,
-	int		mode)
-{
-	int		nr_to_scan = INT_MAX;
-
-	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
-}
-
-/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
- */
-void
-xfs_reclaim_inodes_nr(
-	struct xfs_mount	*mp,
-	int			nr_to_scan)
-{
-	/* kick background reclaimer and push the AIL */
-	xfs_reclaim_work_queue(mp);
-	xfs_ail_push_all(mp->m_ail);
-
-	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
-}
-
-/*
- * Return the number of reclaimable inodes in the filesystem for
- * the shrinker to determine how much to reclaim.
- */
-int
-xfs_reclaim_inodes_count(
-	struct xfs_mount	*mp)
-{
-	struct xfs_perag	*pag;
-	xfs_agnumber_t		ag = 0;
-	int			reclaimable = 0;
-
-	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-		ag = pag->pag_agno + 1;
-		reclaimable += pag->pag_ici_reclaimable;
-		xfs_perag_put(pag);
-	}
-	return reclaimable;
-}
-
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
deleted file mode 100644
index 0ba9c89c316e..000000000000
--- a/fs/xfs/xfs_sync.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef XFS_SYNC_H
-#define XFS_SYNC_H 1
-
-struct xfs_mount;
-struct xfs_perag;
-
-#define SYNC_WAIT		0x0001	/* wait for i/o to complete */
-#define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
-
-void xfs_reclaim_worker(struct work_struct *work);
-
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
-int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-				struct xfs_inode *ip);
-
-int xfs_sync_inode_grab(struct xfs_inode *ip);
-int xfs_inode_ag_iterator(struct xfs_mount *mp,
-	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags);
-
-#endif
-- 
cgit v1.2.3


From fa96acadf1eb712fca6d59922ad93787c87e44ec Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:10 +1100
Subject: xfs: move inode locking functions to xfs_inode.c

xfs_ilock() and friends really aren't related to the inode cache in
any way, so move them to xfs_inode.c with all the other inode
related functionality.

While doing this move, move the xfs_ilock() tracepoints to *before*
the lock is taken so that when a hang on a lock occurs we have
events to indicate which process and what inode we were trying to
lock when the hang occurred. This is much better than the current
silence we get on a hang...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_iget.c  | 251 -----------------------------------------------------
 fs/xfs/xfs_inode.c | 250 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 250 insertions(+), 251 deletions(-)

diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 069c5ceb9459..ea9a5fa49a48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -453,254 +453,3 @@ out_error_or_again:
 	return error;
 }
 
-/*
- * This is a wrapper routine around the xfs_ilock() routine
- * used to centralize some grungy code.  It is used in places
- * that wish to lock the inode solely for reading the extents.
- * The reason these places can't just call xfs_ilock(SHARED)
- * is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format.  If the inode
- * is in b-tree format, then we need to lock the inode exclusively
- * until the extents are read in.  Locking it exclusively all
- * the time would limit our parallelism unnecessarily, though.
- * What we do instead is check to see if the extents have been
- * read in yet, and only lock the inode exclusively if they
- * have not.
- *
- * The function returns a value which should be given to the
- * corresponding xfs_iunlock_map_shared().  This value is
- * the mode in which the lock was actually taken.
- */
-uint
-xfs_ilock_map_shared(
-	xfs_inode_t	*ip)
-{
-	uint	lock_mode;
-
-	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
-	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
-		lock_mode = XFS_ILOCK_EXCL;
-	} else {
-		lock_mode = XFS_ILOCK_SHARED;
-	}
-
-	xfs_ilock(ip, lock_mode);
-
-	return lock_mode;
-}
-
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
-	xfs_inode_t	*ip,
-	unsigned int	lock_mode)
-{
-	xfs_iunlock(ip, lock_mode);
-}
-
-/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock.  This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked.  It can be:
- *		XFS_IOLOCK_SHARED,
- *		XFS_IOLOCK_EXCL,
- *		XFS_ILOCK_SHARED,
- *		XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
- */
-void
-xfs_ilock(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-	else if (lock_flags & XFS_ILOCK_SHARED)
-		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-
-	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * This is just like xfs_ilock(), except that the caller
- * is guaranteed not to sleep.  It returns 1 if it gets
- * the requested locks and 0 otherwise.  If the IO lock is
- * obtained but the inode lock cannot be, then the IO lock
- * is dropped before returning.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be locked.  See the comment for xfs_ilock() for a list
- *	 of valid values.
- */
-int
-xfs_ilock_nowait(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_iolock))
-			goto out;
-	} else if (lock_flags & XFS_IOLOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_iolock))
-			goto out;
-	}
-	if (lock_flags & XFS_ILOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_lock))
-			goto out_undo_iolock;
-	} else if (lock_flags & XFS_ILOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_lock))
-			goto out_undo_iolock;
-	}
-	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
-	return 1;
-
- out_undo_iolock:
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
- out:
-	return 0;
-}
-
-/*
- * xfs_iunlock() is used to drop the inode locks acquired with
- * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
- * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
- * that we know which locks to drop.
- *
- * ip -- the inode being unlocked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       to be unlocked.  See the comment for xfs_ilock() for a list
- *	 of valid values for this parameter.
- *
- */
-void
-xfs_iunlock(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	/*
-	 * You can't set both SHARED and EXCL for the same lock,
-	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-	ASSERT(lock_flags != 0);
-
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrunlock_excl(&ip->i_iolock);
-	else if (lock_flags & XFS_IOLOCK_SHARED)
-		mrunlock_shared(&ip->i_iolock);
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrunlock_excl(&ip->i_lock);
-	else if (lock_flags & XFS_ILOCK_SHARED)
-		mrunlock_shared(&ip->i_lock);
-
-	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * give up write locks.  the i/o lock cannot be held nested
- * if it is being demoted.
- */
-void
-xfs_ilock_demote(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
-
-	if (lock_flags & XFS_ILOCK_EXCL)
-		mrdemote(&ip->i_lock);
-	if (lock_flags & XFS_IOLOCK_EXCL)
-		mrdemote(&ip->i_iolock);
-
-	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
-}
-
-#ifdef DEBUG
-int
-xfs_isilocked(
-	xfs_inode_t		*ip,
-	uint			lock_flags)
-{
-	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
-		if (!(lock_flags & XFS_ILOCK_SHARED))
-			return !!ip->i_lock.mr_writer;
-		return rwsem_is_locked(&ip->i_lock.mr_lock);
-	}
-
-	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-		if (!(lock_flags & XFS_IOLOCK_SHARED))
-			return !!ip->i_iolock.mr_writer;
-		return rwsem_is_locked(&ip->i_iolock.mr_lock);
-	}
-
-	ASSERT(0);
-	return 0;
-}
-#endif
-
-void
-__xfs_iflock(
-	struct xfs_inode	*ip)
-{
-	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
-	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
-	do {
-		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
-		if (xfs_isiflocked(ip))
-			io_schedule();
-	} while (!xfs_iflock_nowait(ip));
-
-	finish_wait(wq, &wait.wait);
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258fcfa2..ba404e4b9f0c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -74,6 +74,256 @@ xfs_get_extsz_hint(
 	return 0;
 }
 
+/*
+ * This is a wrapper routine around the xfs_ilock() routine used to centralize
+ * some grungy code.  It is used in places that wish to lock the inode solely
+ * for reading the extents.  The reason these places can't just call
+ * xfs_ilock(SHARED) is that the inode lock also guards to bringing in of the
+ * extents from disk for a file in b-tree format.  If the inode is in b-tree
+ * format, then we need to lock the inode exclusively until the extents are read
+ * in.  Locking it exclusively all the time would limit our parallelism
+ * unnecessarily, though.  What we do instead is check to see if the extents
+ * have been read in yet, and only lock the inode exclusively if they have not.
+ *
+ * The function returns a value which should be given to the corresponding
+ * xfs_iunlock_map_shared().  This value is the mode in which the lock was
+ * actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+	xfs_inode_t	*ip)
+{
+	uint	lock_mode;
+
+	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+		lock_mode = XFS_ILOCK_EXCL;
+	} else {
+		lock_mode = XFS_ILOCK_SHARED;
+	}
+
+	xfs_ilock(ip, lock_mode);
+
+	return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+	xfs_inode_t	*ip,
+	unsigned int	lock_mode)
+{
+	xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock.  This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *       to be locked.  It can be:
+ *		XFS_IOLOCK_SHARED,
+ *		XFS_IOLOCK_EXCL,
+ *		XFS_ILOCK_SHARED,
+ *		XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.  It returns 1 if it gets
+ * the requested locks and 0 otherwise.  If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be locked.  See the comment for xfs_ilock() for a list
+ *	 of valid values.
+ */
+int
+xfs_ilock_nowait(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_iolock))
+			goto out;
+	} else if (lock_flags & XFS_IOLOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_iolock))
+			goto out;
+	}
+	if (lock_flags & XFS_ILOCK_EXCL) {
+		if (!mrtryupdate(&ip->i_lock))
+			goto out_undo_iolock;
+	} else if (lock_flags & XFS_ILOCK_SHARED) {
+		if (!mrtryaccess(&ip->i_lock))
+			goto out_undo_iolock;
+	}
+	return 1;
+
+ out_undo_iolock:
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+ out:
+	return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       to be unlocked.  See the comment for xfs_ilock() for a list
+ *	 of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	/*
+	 * You can't set both SHARED and EXCL for the same lock,
+	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+	ASSERT(lock_flags != 0);
+
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrunlock_excl(&ip->i_iolock);
+	else if (lock_flags & XFS_IOLOCK_SHARED)
+		mrunlock_shared(&ip->i_iolock);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrunlock_excl(&ip->i_lock);
+	else if (lock_flags & XFS_ILOCK_SHARED)
+		mrunlock_shared(&ip->i_lock);
+
+	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks.  the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+	if (lock_flags & XFS_ILOCK_EXCL)
+		mrdemote(&ip->i_lock);
+	if (lock_flags & XFS_IOLOCK_EXCL)
+		mrdemote(&ip->i_iolock);
+
+	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+	xfs_inode_t		*ip,
+	uint			lock_flags)
+{
+	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+		if (!(lock_flags & XFS_ILOCK_SHARED))
+			return !!ip->i_lock.mr_writer;
+		return rwsem_is_locked(&ip->i_lock.mr_lock);
+	}
+
+	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+		if (!(lock_flags & XFS_IOLOCK_SHARED))
+			return !!ip->i_iolock.mr_writer;
+		return rwsem_is_locked(&ip->i_iolock.mr_lock);
+	}
+
+	ASSERT(0);
+	return 0;
+}
+#endif
+
+void
+__xfs_iflock(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+	do {
+		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (xfs_isiflocked(ip))
+			io_schedule();
+	} while (!xfs_iflock_nowait(ip));
+
+	finish_wait(wq, &wait.wait);
+}
+
 #ifdef DEBUG
 /*
  * Make sure that the extents in the given memory buffer
-- 
cgit v1.2.3


From 33479e0542df066fb0b47df18780e93bfe6e0dc5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:11 +1100
Subject: xfs: remove xfs_iget.c

The inode cache functions remaining in xfs_iget.c can be moved to xfs_icache.c
along with the other inode cache functions. This removes all functionality from
xfs_iget.c, so the file can simply be removed.

This move results in various functions now only having the scope of a single
file (e.g. xfs_inode_free()), so clean up all the definitions and exported
prototypes in xfs_icache.[ch] and xfs_inode.h appropriately.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/Makefile          |   1 -
 fs/xfs/xfs_export.c      |   1 +
 fs/xfs/xfs_icache.c      | 421 ++++++++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_icache.h      |   6 +-
 fs/xfs/xfs_iget.c        | 455 -----------------------------------------------
 fs/xfs/xfs_inode.c       |   1 +
 fs/xfs/xfs_inode.h       |  10 +-
 fs/xfs/xfs_itable.c      |   1 +
 fs/xfs/xfs_log_recover.c |   1 +
 fs/xfs/xfs_qm.c          |   1 +
 fs/xfs/xfs_rtalloc.c     |   1 +
 fs/xfs/xfs_vnodeops.c    |   1 +
 12 files changed, 430 insertions(+), 470 deletions(-)
 delete mode 100644 fs/xfs/xfs_iget.c

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 442f256dbcac..e65357bb3dc6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -40,7 +40,6 @@ xfs-y				+= xfs_aops.o \
 				   xfs_fs_subr.o \
 				   xfs_globals.o \
 				   xfs_icache.o \
-				   xfs_iget.o \
 				   xfs_ioctl.o \
 				   xfs_iomap.o \
 				   xfs_iops.o \
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8c6d1d70278c..a83611849cee 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index eba216f11d5e..9c8703b5cd72 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -41,6 +41,421 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+				struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+	struct xfs_mount	*mp,
+	xfs_ino_t		ino)
+{
+	struct xfs_inode	*ip;
+
+	/*
+	 * if this didn't occur in transactions, we could use
+	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+	 * code up to do this anyway.
+	 */
+	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+	if (!ip)
+		return NULL;
+	if (inode_init_always(mp->m_super, VFS_I(ip))) {
+		kmem_zone_free(xfs_inode_zone, ip);
+		return NULL;
+	}
+
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+	ASSERT(ip->i_ino == 0);
+
+	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+	/* initialise the xfs inode */
+	ip->i_ino = ino;
+	ip->i_mount = mp;
+	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+	ip->i_afp = NULL;
+	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+	ip->i_flags = 0;
+	ip->i_delayed_blks = 0;
+	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+	return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+	struct rcu_head		*head)
+{
+	struct inode		*inode = container_of(head, struct inode, i_rcu);
+	struct xfs_inode	*ip = XFS_I(inode);
+
+	kmem_zone_free(xfs_inode_zone, ip);
+}
+
+STATIC void
+xfs_inode_free(
+	struct xfs_inode	*ip)
+{
+	switch (ip->i_d.di_mode & S_IFMT) {
+	case S_IFREG:
+	case S_IFDIR:
+	case S_IFLNK:
+		xfs_idestroy_fork(ip, XFS_DATA_FORK);
+		break;
+	}
+
+	if (ip->i_afp)
+		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+	if (ip->i_itemp) {
+		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+		xfs_inode_item_destroy(ip);
+		ip->i_itemp = NULL;
+	}
+
+	/* asserts to verify all state is correct here */
+	ASSERT(atomic_read(&ip->i_pincount) == 0);
+	ASSERT(!spin_is_locked(&ip->i_flags_lock));
+	ASSERT(!xfs_isiflocked(ip));
+
+	/*
+	 * Because we use RCU freeing we need to ensure the inode always
+	 * appears to be reclaimed with an invalid inode number when in the
+	 * free state. The ip->i_flags_lock provides the barrier against lookup
+	 * races.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	ip->i_flags = XFS_IRECLAIM;
+	ip->i_ino = 0;
+	spin_unlock(&ip->i_flags_lock);
+
+	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found it the cache
+ */
+static int
+xfs_iget_cache_hit(
+	struct xfs_perag	*pag,
+	struct xfs_inode	*ip,
+	xfs_ino_t		ino,
+	int			flags,
+	int			lock_flags) __releases(RCU)
+{
+	struct inode		*inode = VFS_I(ip);
+	struct xfs_mount	*mp = ip->i_mount;
+	int			error;
+
+	/*
+	 * check for re-use of an inode within an RCU grace period due to the
+	 * radix tree nodes not being updated yet. We monitor for this by
+	 * setting the inode number to zero before freeing the inode structure.
+	 * If the inode has been reallocated and set up, then the inode number
+	 * will not match, so check for that, too.
+	 */
+	spin_lock(&ip->i_flags_lock);
+	if (ip->i_ino != ino) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+
+	/*
+	 * If we are racing with another cache hit that is currently
+	 * instantiating this inode or currently recycling it out of
+	 * reclaimabe state, wait for the initialisation to complete
+	 * before continuing.
+	 *
+	 * XXX(hch): eventually we should do something equivalent to
+	 *	     wait_on_inode to wait for these flags to be cleared
+	 *	     instead of polling for it.
+	 */
+	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+		trace_xfs_iget_skip(ip);
+		XFS_STATS_INC(xs_ig_frecycle);
+		error = EAGAIN;
+		goto out_error;
+	}
+
+	/*
+	 * If lookup is racing with unlink return an error immediately.
+	 */
+	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_error;
+	}
+
+	/*
+	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+	 * Need to carefully get it back into useable state.
+	 */
+	if (ip->i_flags & XFS_IRECLAIMABLE) {
+		trace_xfs_iget_reclaim(ip);
+
+		/*
+		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+		 * from stomping over us while we recycle the inode.  We can't
+		 * clear the radix tree reclaimable tag yet as it requires
+		 * pag_ici_lock to be held exclusive.
+		 */
+		ip->i_flags |= XFS_IRECLAIM;
+
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+
+		error = -inode_init_always(mp->m_super, inode);
+		if (error) {
+			/*
+			 * Re-initializing the inode failed, and we are in deep
+			 * trouble.  Try to re-add it to the reclaim list.
+			 */
+			rcu_read_lock();
+			spin_lock(&ip->i_flags_lock);
+
+			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+			trace_xfs_iget_reclaim_fail(ip);
+			goto out_error;
+		}
+
+		spin_lock(&pag->pag_ici_lock);
+		spin_lock(&ip->i_flags_lock);
+
+		/*
+		 * Clear the per-lifetime state in the inode as we are now
+		 * effectively a new inode and need to return to the initial
+		 * state before reuse occurs.
+		 */
+		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+		ip->i_flags |= XFS_INEW;
+		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+		inode->i_state = I_NEW;
+
+		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+		spin_unlock(&ip->i_flags_lock);
+		spin_unlock(&pag->pag_ici_lock);
+	} else {
+		/* If the VFS inode is being torn down, pause and try again. */
+		if (!igrab(inode)) {
+			trace_xfs_iget_skip(ip);
+			error = EAGAIN;
+			goto out_error;
+		}
+
+		/* We've got a live one. */
+		spin_unlock(&ip->i_flags_lock);
+		rcu_read_unlock();
+		trace_xfs_iget_hit(ip);
+	}
+
+	if (lock_flags != 0)
+		xfs_ilock(ip, lock_flags);
+
+	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+	XFS_STATS_INC(xs_ig_found);
+
+	return 0;
+
+out_error:
+	spin_unlock(&ip->i_flags_lock);
+	rcu_read_unlock();
+	return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
+	xfs_trans_t		*tp,
+	xfs_ino_t		ino,
+	struct xfs_inode	**ipp,
+	int			flags,
+	int			lock_flags)
+{
+	struct xfs_inode	*ip;
+	int			error;
+	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
+	int			iflags;
+
+	ip = xfs_inode_alloc(mp, ino);
+	if (!ip)
+		return ENOMEM;
+
+	error = xfs_iread(mp, tp, ip, flags);
+	if (error)
+		goto out_destroy;
+
+	trace_xfs_iget_miss(ip);
+
+	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+		error = ENOENT;
+		goto out_destroy;
+	}
+
+	/*
+	 * Preload the radix tree so we can insert safely under the
+	 * write spinlock. Note that we cannot sleep inside the preload
+	 * region. Since we can be called from transaction context, don't
+	 * recurse into the file system.
+	 */
+	if (radix_tree_preload(GFP_NOFS)) {
+		error = EAGAIN;
+		goto out_destroy;
+	}
+
+	/*
+	 * Because the inode hasn't been added to the radix-tree yet it can't
+	 * be found by another thread, so we can do the non-sleeping lock here.
+	 */
+	if (lock_flags) {
+		if (!xfs_ilock_nowait(ip, lock_flags))
+			BUG();
+	}
+
+	/*
+	 * These values must be set before inserting the inode into the radix
+	 * tree as the moment it is inserted a concurrent lookup (allowed by the
+	 * RCU locking mechanism) can find it and that lookup must see that this
+	 * is an inode currently under construction (i.e. that XFS_INEW is set).
+	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+	 * memory barrier that ensures this detection works correctly at lookup
+	 * time.
+	 */
+	iflags = XFS_INEW;
+	if (flags & XFS_IGET_DONTCACHE)
+		iflags |= XFS_IDONTCACHE;
+	ip->i_udquot = ip->i_gdquot = NULL;
+	xfs_iflags_set(ip, iflags);
+
+	/* insert the new inode */
+	spin_lock(&pag->pag_ici_lock);
+	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+	if (unlikely(error)) {
+		WARN_ON(error != -EEXIST);
+		XFS_STATS_INC(xs_ig_dup);
+		error = EAGAIN;
+		goto out_preload_end;
+	}
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+
+	*ipp = ip;
+	return 0;
+
+out_preload_end:
+	spin_unlock(&pag->pag_ici_lock);
+	radix_tree_preload_end();
+	if (lock_flags)
+		xfs_iunlock(ip, lock_flags);
+out_destroy:
+	__destroy_inode(VFS_I(ip));
+	xfs_inode_free(ip);
+	return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *		 for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+	xfs_mount_t	*mp,
+	xfs_trans_t	*tp,
+	xfs_ino_t	ino,
+	uint		flags,
+	uint		lock_flags,
+	xfs_inode_t	**ipp)
+{
+	xfs_inode_t	*ip;
+	int		error;
+	xfs_perag_t	*pag;
+	xfs_agino_t	agino;
+
+	/*
+	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+	 * doesn't get freed while it's being referenced during a
+	 * radix tree traversal here.  It assumes this function
+	 * aqcuires only the ILOCK (and therefore it has no need to
+	 * involve the IOLOCK in this synchronization).
+	 */
+	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+	/* reject inode numbers outside existing AGs */
+	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+		return EINVAL;
+
+	/* get the perag structure and ensure that it's inode capable */
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+	agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+	error = 0;
+	rcu_read_lock();
+	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+	if (ip) {
+		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	} else {
+		rcu_read_unlock();
+		XFS_STATS_INC(xs_ig_missed);
+
+		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+							flags, lock_flags);
+		if (error)
+			goto out_error_or_again;
+	}
+	xfs_perag_put(pag);
+
+	*ipp = ip;
+
+	/*
+	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
+	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
+	 */
+	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+		xfs_setup_inode(ip);
+	return 0;
+
+out_error_or_again:
+	if (error == EAGAIN) {
+		delay(1);
+		goto again;
+	}
+	xfs_perag_put(pag);
+	return error;
+}
+
 /*
  * The inode lookup is done in batches to keep the amount of lock traffic and
  * radix tree lookups to a minimum. The batch size is a trade off between
@@ -253,7 +668,7 @@ xfs_reclaim_worker(
 	xfs_reclaim_work_queue(mp);
 }
 
-void
+static void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
@@ -319,7 +734,7 @@ __xfs_inode_clear_reclaim(
 	}
 }
 
-void
+STATIC void
 __xfs_inode_clear_reclaim_tag(
 	xfs_mount_t	*mp,
 	xfs_perag_t	*pag,
@@ -542,7 +957,7 @@ out:
  * then a shut down during filesystem unmount reclaim walk leak all the
  * unreclaimed inodes.
  */
-int
+STATIC int
 xfs_reclaim_inodes_ag(
 	struct xfs_mount	*mp,
 	int			flags,
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 0ba9c89c316e..222e22f16b4a 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,6 +24,9 @@ struct xfs_perag;
 #define SYNC_WAIT		0x0001	/* wait for i/o to complete */
 #define SYNC_TRYLOCK		0x0002  /* only try to lock inodes */
 
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+	     uint flags, uint lock_flags, xfs_inode_t **ipp);
+
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
@@ -31,9 +34,6 @@ int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-				struct xfs_inode *ip);
 
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index ea9a5fa49a48..000000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,455 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_priv.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-#include "xfs_icache.h"
-
-
-/*
- * Allocate and initialise an xfs_inode.
- */
-STATIC struct xfs_inode *
-xfs_inode_alloc(
-	struct xfs_mount	*mp,
-	xfs_ino_t		ino)
-{
-	struct xfs_inode	*ip;
-
-	/*
-	 * if this didn't occur in transactions, we could use
-	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
-	 * code up to do this anyway.
-	 */
-	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
-	if (!ip)
-		return NULL;
-	if (inode_init_always(mp->m_super, VFS_I(ip))) {
-		kmem_zone_free(xfs_inode_zone, ip);
-		return NULL;
-	}
-
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(!xfs_isiflocked(ip));
-	ASSERT(ip->i_ino == 0);
-
-	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-	/* initialise the xfs inode */
-	ip->i_ino = ino;
-	ip->i_mount = mp;
-	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
-	ip->i_afp = NULL;
-	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
-	ip->i_flags = 0;
-	ip->i_delayed_blks = 0;
-	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-
-	return ip;
-}
-
-STATIC void
-xfs_inode_free_callback(
-	struct rcu_head		*head)
-{
-	struct inode		*inode = container_of(head, struct inode, i_rcu);
-	struct xfs_inode	*ip = XFS_I(inode);
-
-	kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-	struct xfs_inode	*ip)
-{
-	switch (ip->i_d.di_mode & S_IFMT) {
-	case S_IFREG:
-	case S_IFDIR:
-	case S_IFLNK:
-		xfs_idestroy_fork(ip, XFS_DATA_FORK);
-		break;
-	}
-
-	if (ip->i_afp)
-		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-	if (ip->i_itemp) {
-		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
-		xfs_inode_item_destroy(ip);
-		ip->i_itemp = NULL;
-	}
-
-	/* asserts to verify all state is correct here */
-	ASSERT(atomic_read(&ip->i_pincount) == 0);
-	ASSERT(!spin_is_locked(&ip->i_flags_lock));
-	ASSERT(!xfs_isiflocked(ip));
-
-	/*
-	 * Because we use RCU freeing we need to ensure the inode always
-	 * appears to be reclaimed with an invalid inode number when in the
-	 * free state. The ip->i_flags_lock provides the barrier against lookup
-	 * races.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	ip->i_flags = XFS_IRECLAIM;
-	ip->i_ino = 0;
-	spin_unlock(&ip->i_flags_lock);
-
-	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
-}
-
-/*
- * Check the validity of the inode we just found it the cache
- */
-static int
-xfs_iget_cache_hit(
-	struct xfs_perag	*pag,
-	struct xfs_inode	*ip,
-	xfs_ino_t		ino,
-	int			flags,
-	int			lock_flags) __releases(RCU)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct xfs_mount	*mp = ip->i_mount;
-	int			error;
-
-	/*
-	 * check for re-use of an inode within an RCU grace period due to the
-	 * radix tree nodes not being updated yet. We monitor for this by
-	 * setting the inode number to zero before freeing the inode structure.
-	 * If the inode has been reallocated and set up, then the inode number
-	 * will not match, so check for that, too.
-	 */
-	spin_lock(&ip->i_flags_lock);
-	if (ip->i_ino != ino) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
-		goto out_error;
-	}
-
-
-	/*
-	 * If we are racing with another cache hit that is currently
-	 * instantiating this inode or currently recycling it out of
-	 * reclaimabe state, wait for the initialisation to complete
-	 * before continuing.
-	 *
-	 * XXX(hch): eventually we should do something equivalent to
-	 *	     wait_on_inode to wait for these flags to be cleared
-	 *	     instead of polling for it.
-	 */
-	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
-		trace_xfs_iget_skip(ip);
-		XFS_STATS_INC(xs_ig_frecycle);
-		error = EAGAIN;
-		goto out_error;
-	}
-
-	/*
-	 * If lookup is racing with unlink return an error immediately.
-	 */
-	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		goto out_error;
-	}
-
-	/*
-	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
-	 * Need to carefully get it back into useable state.
-	 */
-	if (ip->i_flags & XFS_IRECLAIMABLE) {
-		trace_xfs_iget_reclaim(ip);
-
-		/*
-		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
-		 * from stomping over us while we recycle the inode.  We can't
-		 * clear the radix tree reclaimable tag yet as it requires
-		 * pag_ici_lock to be held exclusive.
-		 */
-		ip->i_flags |= XFS_IRECLAIM;
-
-		spin_unlock(&ip->i_flags_lock);
-		rcu_read_unlock();
-
-		error = -inode_init_always(mp->m_super, inode);
-		if (error) {
-			/*
-			 * Re-initializing the inode failed, and we are in deep
-			 * trouble.  Try to re-add it to the reclaim list.
-			 */
-			rcu_read_lock();
-			spin_lock(&ip->i_flags_lock);
-
-			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
-			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
-			trace_xfs_iget_reclaim_fail(ip);
-			goto out_error;
-		}
-
-		spin_lock(&pag->pag_ici_lock);
-		spin_lock(&ip->i_flags_lock);
-
-		/*
-		 * Clear the per-lifetime state in the inode as we are now
-		 * effectively a new inode and need to return to the initial
-		 * state before reuse occurs.
-		 */
-		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
-		ip->i_flags |= XFS_INEW;
-		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
-		inode->i_state = I_NEW;
-
-		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-		spin_unlock(&ip->i_flags_lock);
-		spin_unlock(&pag->pag_ici_lock);
-	} else {
-		/* If the VFS inode is being torn down, pause and try again. */
-		if (!igrab(inode)) {
-			trace_xfs_iget_skip(ip);
-			error = EAGAIN;
-			goto out_error;
-		}
-
-		/* We've got a live one. */
-		spin_unlock(&ip->i_flags_lock);
-		rcu_read_unlock();
-		trace_xfs_iget_hit(ip);
-	}
-
-	if (lock_flags != 0)
-		xfs_ilock(ip, lock_flags);
-
-	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
-	XFS_STATS_INC(xs_ig_found);
-
-	return 0;
-
-out_error:
-	spin_unlock(&ip->i_flags_lock);
-	rcu_read_unlock();
-	return error;
-}
-
-
-static int
-xfs_iget_cache_miss(
-	struct xfs_mount	*mp,
-	struct xfs_perag	*pag,
-	xfs_trans_t		*tp,
-	xfs_ino_t		ino,
-	struct xfs_inode	**ipp,
-	int			flags,
-	int			lock_flags)
-{
-	struct xfs_inode	*ip;
-	int			error;
-	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
-	int			iflags;
-
-	ip = xfs_inode_alloc(mp, ino);
-	if (!ip)
-		return ENOMEM;
-
-	error = xfs_iread(mp, tp, ip, flags);
-	if (error)
-		goto out_destroy;
-
-	trace_xfs_iget_miss(ip);
-
-	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-		error = ENOENT;
-		goto out_destroy;
-	}
-
-	/*
-	 * Preload the radix tree so we can insert safely under the
-	 * write spinlock. Note that we cannot sleep inside the preload
-	 * region. Since we can be called from transaction context, don't
-	 * recurse into the file system.
-	 */
-	if (radix_tree_preload(GFP_NOFS)) {
-		error = EAGAIN;
-		goto out_destroy;
-	}
-
-	/*
-	 * Because the inode hasn't been added to the radix-tree yet it can't
-	 * be found by another thread, so we can do the non-sleeping lock here.
-	 */
-	if (lock_flags) {
-		if (!xfs_ilock_nowait(ip, lock_flags))
-			BUG();
-	}
-
-	/*
-	 * These values must be set before inserting the inode into the radix
-	 * tree as the moment it is inserted a concurrent lookup (allowed by the
-	 * RCU locking mechanism) can find it and that lookup must see that this
-	 * is an inode currently under construction (i.e. that XFS_INEW is set).
-	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
-	 * memory barrier that ensures this detection works correctly at lookup
-	 * time.
-	 */
-	iflags = XFS_INEW;
-	if (flags & XFS_IGET_DONTCACHE)
-		iflags |= XFS_IDONTCACHE;
-	ip->i_udquot = ip->i_gdquot = NULL;
-	xfs_iflags_set(ip, iflags);
-
-	/* insert the new inode */
-	spin_lock(&pag->pag_ici_lock);
-	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
-	if (unlikely(error)) {
-		WARN_ON(error != -EEXIST);
-		XFS_STATS_INC(xs_ig_dup);
-		error = EAGAIN;
-		goto out_preload_end;
-	}
-	spin_unlock(&pag->pag_ici_lock);
-	radix_tree_preload_end();
-
-	*ipp = ip;
-	return 0;
-
-out_preload_end:
-	spin_unlock(&pag->pag_ici_lock);
-	radix_tree_preload_end();
-	if (lock_flags)
-		xfs_iunlock(ip, lock_flags);
-out_destroy:
-	__destroy_inode(VFS_I(ip));
-	xfs_inode_free(ip);
-	return error;
-}
-
-/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *		 for xfs_ilock() for a list of valid values.
- */
-int
-xfs_iget(
-	xfs_mount_t	*mp,
-	xfs_trans_t	*tp,
-	xfs_ino_t	ino,
-	uint		flags,
-	uint		lock_flags,
-	xfs_inode_t	**ipp)
-{
-	xfs_inode_t	*ip;
-	int		error;
-	xfs_perag_t	*pag;
-	xfs_agino_t	agino;
-
-	/*
-	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
-	 * doesn't get freed while it's being referenced during a
-	 * radix tree traversal here.  It assumes this function
-	 * aqcuires only the ILOCK (and therefore it has no need to
-	 * involve the IOLOCK in this synchronization).
-	 */
-	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
-
-	/* reject inode numbers outside existing AGs */
-	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-		return EINVAL;
-
-	/* get the perag structure and ensure that it's inode capable */
-	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
-	agino = XFS_INO_TO_AGINO(mp, ino);
-
-again:
-	error = 0;
-	rcu_read_lock();
-	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
-
-	if (ip) {
-		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
-		if (error)
-			goto out_error_or_again;
-	} else {
-		rcu_read_unlock();
-		XFS_STATS_INC(xs_ig_missed);
-
-		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
-							flags, lock_flags);
-		if (error)
-			goto out_error_or_again;
-	}
-	xfs_perag_put(pag);
-
-	*ipp = ip;
-
-	/*
-	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
-	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
-	 */
-	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
-		xfs_setup_inode(ip);
-	return 0;
-
-out_error_or_again:
-	if (error == EAGAIN) {
-		delay(1);
-		goto again;
-	}
-	xfs_perag_put(pag);
-	return error;
-}
-
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index ba404e4b9f0c..bba8f37525b3 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f906e79..1fc2065e010b 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
 	(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
 	 ((pip)->i_d.di_mode & S_ISGID))
 
+
 /*
- * xfs_iget.c prototypes.
+ * xfs_inode.c prototypes.
  */
-int		xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-			 uint, uint, xfs_inode_t **);
 void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void		xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void		xfs_ilock_demote(xfs_inode_t *, uint);
 int		xfs_isilocked(xfs_inode_t *, uint);
 uint		xfs_ilock_map_shared(xfs_inode_t *);
 void		xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void		xfs_inode_free(struct xfs_inode *ip);
-
-/*
- * xfs_inode.c prototypes.
- */
 int		xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
 			   xfs_nlink_t, xfs_dev_t, prid_t, int,
 			   struct xfs_buf **, xfs_inode_t **);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a66e302..3998fd2a7949 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xfs_internal_inum(
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace352bf..651c98859b04 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -42,6 +42,7 @@
 #include "xfs_quota.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xlog_find_zeroed(
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0cfc0d..48c750b0e830 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4ba4b54..a69e0b4750a9 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
 #include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_buf.h"
+#include "xfs_icache.h"
 
 
 /*
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 14928564f106..2ee1f49da0aa 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The maximum pathlen is 1024 bytes. Since the minimum file system
-- 
cgit v1.2.3


From d35e88faa3b0fc2cea35c3b2dca358b5cd09b45f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 8 Oct 2012 21:56:12 +1100
Subject: xfs: only update the last_sync_lsn when a transaction completes

The log write code stamps each iclog with the current tail LSN in
the iclog header so that recovery knows where to find the tail of
thelog once it has found the head. Normally this is taken from the
first item on the AIL - the log item that corresponds to the oldest
active item in the log.

The problem is that when the AIL is empty, the tail lsn is dervied
from the the l_last_sync_lsn, which is the LSN of the last iclog to
be written to the log. In most cases this doesn't happen, because
the AIL is rarely empty on an active filesystem. However, when it
does, it opens up an interesting case when the transaction being
committed to the iclog spans multiple iclogs.

That is, the first iclog is stamped with the l_last_sync_lsn, and IO
is issued. Then the next iclog is setup, the changes copied into the
iclog (takes some time), and then the l_last_sync_lsn is stamped
into the header and IO is issued. This is still the same
transaction, so the tail lsn of both iclogs must be the same for log
recovery to find the entire transaction to be able to replay it.

The problem arises in that the iclog buffer IO completion updates
the l_last_sync_lsn with it's own LSN. Therefore, If the first iclog
completes it's IO before the second iclog is filled and has the tail
lsn stamped in it, it will stamp the LSN of the first iclog into
it's tail lsn field. If the system fails at this point, log recovery
will not see a complete transaction, so the transaction will no be
replayed.

The fix is simple - the l_last_sync_lsn is updated when a iclog
buffer IO completes, and this is incorrect. The l_last_sync_lsn
shoul dbe updated when a transaction is completed by a iclog buffer
IO. That is, only iclog buffers that have transaction commit
callbacks attached to them should update the l_last_sync_lsn. This
means that the last_sync_lsn will only move forward when a commit
record it written, not in the middle of a large transaction that is
rolling through multiple iclog buffers.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index d2d59692739f..46b6986e39b0 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -2461,14 +2461,27 @@ xlog_state_do_callback(
 
 
 				/*
-				 * update the last_sync_lsn before we drop the
+				 * Completion of a iclog IO does not imply that
+				 * a transaction has completed, as transactions
+				 * can be large enough to span many iclogs. We
+				 * cannot change the tail of the log half way
+				 * through a transaction as this may be the only
+				 * transaction in the log and moving th etail to
+				 * point to the middle of it will prevent
+				 * recovery from finding the start of the
+				 * transaction. Hence we should only update the
+				 * last_sync_lsn if this iclog contains
+				 * transaction completion callbacks on it.
+				 *
+				 * We have to do this before we drop the
 				 * icloglock to ensure we are the only one that
 				 * can update it.
 				 */
 				ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
 					be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
-				atomic64_set(&log->l_last_sync_lsn,
-					be64_to_cpu(iclog->ic_header.h_lsn));
+				if (iclog->ic_callback)
+					atomic64_set(&log->l_last_sync_lsn,
+						be64_to_cpu(iclog->ic_header.h_lsn));
 
 			} else
 				ioerrors++;
-- 
cgit v1.2.3


From a00416844b8f4b0106344bdfd90fe45a854b1d05 Mon Sep 17 00:00:00 2001
From: Mark Tinguely <tinguely@sgi.com>
Date: Thu, 20 Sep 2012 13:16:45 -0500
Subject: xfs: zero allocation_args on the kernel stack

Zero the kernel stack space that makes up the xfs_alloc_arg structures.

Signed-off-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c  | 1 +
 fs/xfs/xfs_bmap.c   | 3 +++
 fs/xfs/xfs_ialloc.c | 1 +
 3 files changed, 5 insertions(+)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4f33c32affe3..0287f3b1b503 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1866,6 +1866,7 @@ xfs_alloc_fix_freelist(
 	/*
 	 * Initialize the args structure.
 	 */
+	memset(&targs, 0, sizeof(targs));
 	targs.tp = tp;
 	targs.mp = mp;
 	targs.agbp = agbp;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 848ffa77707b..e1545ec2f7d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2437,6 +2437,7 @@ xfs_bmap_btalloc(
 	 * Normal allocation, done through xfs_alloc_vextent.
 	 */
 	tryagain = isaligned = 0;
+	memset(&args, 0, sizeof(args));
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->blkno;
@@ -3082,6 +3083,7 @@ xfs_bmap_extents_to_btree(
 	 * Convert to a btree with two levels, one record in root.
 	 */
 	XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+	memset(&args, 0, sizeof(args));
 	args.tp = tp;
 	args.mp = mp;
 	args.firstblock = *firstblock;
@@ -3237,6 +3239,7 @@ xfs_bmap_local_to_extents(
 		xfs_buf_t	*bp;	/* buffer for extent block */
 		xfs_bmbt_rec_host_t *ep;/* extent record pointer */
 
+		memset(&args, 0, sizeof(args));
 		args.tp = tp;
 		args.mp = ip->i_mount;
 		args.firstblock = *firstblock;
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 445bf1aef31c..c5c4ef4f2bdb 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -250,6 +250,7 @@ xfs_ialloc_ag_alloc(
 					/* boundary */
 	struct xfs_perag *pag;
 
+	memset(&args, 0, sizeof(args));
 	args.tp = tp;
 	args.mp = tp->t_mountp;
 
-- 
cgit v1.2.3


From 2455881c0b52f87be539c4c7deab1afff4d8a560 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 5 Oct 2012 11:06:58 +1000
Subject: xfs: introduce XFS_BMAPI_STACK_SWITCH

Certain allocation paths through xfs_bmapi_write() are in situations
where we have limited stack available. These are almost always in
the buffered IO writeback path when convertion delayed allocation
extents to real extents.

The current stack switch occurs for userdata allocations, which
means we also do stack switches for preallocation, direct IO and
unwritten extent conversion, even those these call chains have never
been implicated in a stack overrun.

Hence, let's target just the single stack overun offended for stack
switches. To do that, introduce a XFS_BMAPI_STACK_SWITCH flag that
the caller can pass xfs_bmapi_write() to indicate it should switch
stacks if it needs to do allocation.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c | 2 +-
 fs/xfs/xfs_alloc.h | 1 +
 fs/xfs/xfs_bmap.c  | 4 ++++
 fs/xfs/xfs_bmap.h  | 5 ++++-
 fs/xfs/xfs_iomap.c | 4 +++-
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 0287f3b1b503..43f791bcd8b1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2447,7 +2447,7 @@ xfs_alloc_vextent(
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 
-	if (!args->userdata)
+	if (!args->stack_switch)
 		return __xfs_alloc_vextent(args);
 
 
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 93be4a667ca1..ef7d4885dc2d 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -123,6 +123,7 @@ typedef struct xfs_alloc_arg {
 	struct completion *done;
 	struct work_struct work;
 	int		result;
+	char		stack_switch;
 } xfs_alloc_arg_t;
 
 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index e1545ec2f7d2..91259554df8b 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2441,6 +2441,7 @@ xfs_bmap_btalloc(
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->blkno;
+	args.stack_switch = ap->stack_switch;
 
 	/* Trim the allocation back to the maximum an AG can fit. */
 	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
@@ -4675,6 +4676,9 @@ xfs_bmapi_allocate(
 			return error;
 	}
 
+	if (flags & XFS_BMAPI_STACK_SWITCH)
+		bma->stack_switch = 1;
+
 	error = xfs_bmap_alloc(bma);
 	if (error)
 		return error;
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 803b56d7ce16..b68c598034c1 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,6 +77,7 @@ typedef	struct xfs_bmap_free
  * from written to unwritten, otherwise convert from unwritten to written.
  */
 #define XFS_BMAPI_CONVERT	0x040
+#define XFS_BMAPI_STACK_SWITCH	0x080
 
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
@@ -85,7 +86,8 @@ typedef	struct xfs_bmap_free
 	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
 	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
-	{ XFS_BMAPI_CONVERT,	"CONVERT" }
+	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
+	{ XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
 
 
 static inline int xfs_bmapi_aflag(int w)
@@ -133,6 +135,7 @@ typedef struct xfs_bmalloca {
 	char			userdata;/* set if is user data */
 	char			aeof;	/* allocated space at eof */
 	char			conv;	/* overwriting unwritten extents */
+	char			stack_switch;
 } xfs_bmalloca_t;
 
 /*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f858b903678e..a066cf1766ab 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -575,7 +575,9 @@ xfs_iomap_write_allocate(
 			 * pointer that the caller gave to us.
 			 */
 			error = xfs_bmapi_write(tp, ip, map_start_fsb,
-						count_fsb, 0, &first_block, 1,
+						count_fsb,
+						XFS_BMAPI_STACK_SWITCH,
+						&first_block, 1,
 						imap, &nimaps, &free_list);
 			if (error)
 				goto trans_cancel;
-- 
cgit v1.2.3


From e04426b9202bccd4cfcbc70b2fa2aeca1c86d8f5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 5 Oct 2012 11:06:59 +1000
Subject: xfs: move allocation stack switch up to xfs_bmapi_allocate

Switching stacks are xfs_alloc_vextent can cause deadlocks when we
run out of worker threads on the allocation workqueue. This can
occur because xfs_bmap_btalloc can make multiple calls to
xfs_alloc_vextent() and even if xfs_alloc_vextent() fails it can
return with the AGF locked in the current allocation transaction.

If we then need to make another allocation, and all the allocation
worker contexts are exhausted because the are blocked waiting for
the AGF lock, holder of the AGF cannot get it's xfs-alloc_vextent
work completed to release the AGF.  Hence allocation effectively
deadlocks.

To avoid this, move the stack switch one layer up to
xfs_bmapi_allocate() so that all of the allocation attempts in a
single switched stack transaction occur in a single worker context.
This avoids the problem of an allocation being blocked waiting for
a worker thread whilst holding the AGF.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c | 42 +-------------------------------------
 fs/xfs/xfs_alloc.h |  4 ----
 fs/xfs/xfs_bmap.c  | 60 ++++++++++++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_bmap.h  |  4 ++++
 4 files changed, 54 insertions(+), 56 deletions(-)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 43f791bcd8b1..335206a9c698 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2208,7 +2208,7 @@ xfs_alloc_read_agf(
  * group or loop over the allocation groups to find the result.
  */
 int				/* error */
-__xfs_alloc_vextent(
+xfs_alloc_vextent(
 	xfs_alloc_arg_t	*args)	/* allocation argument structure */
 {
 	xfs_agblock_t	agsize;	/* allocation group size */
@@ -2418,46 +2418,6 @@ error0:
 	return error;
 }
 
-static void
-xfs_alloc_vextent_worker(
-	struct work_struct	*work)
-{
-	struct xfs_alloc_arg	*args = container_of(work,
-						struct xfs_alloc_arg, work);
-	unsigned long		pflags;
-
-	/* we are in a transaction context here */
-	current_set_flags_nested(&pflags, PF_FSTRANS);
-
-	args->result = __xfs_alloc_vextent(args);
-	complete(args->done);
-
-	current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Data allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Metadata
- * requests, OTOH, are generally from low stack usage paths, so avoid the
- * context switch overhead here.
- */
-int
-xfs_alloc_vextent(
-	struct xfs_alloc_arg	*args)
-{
-	DECLARE_COMPLETION_ONSTACK(done);
-
-	if (!args->stack_switch)
-		return __xfs_alloc_vextent(args);
-
-
-	args->done = &done;
-	INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
-	queue_work(xfs_alloc_wq, &args->work);
-	wait_for_completion(&done);
-	return args->result;
-}
-
 /*
  * Free an extent.
  * Just break up the extent address and hand off to xfs_free_ag_extent
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index ef7d4885dc2d..feacb061bab7 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -120,10 +120,6 @@ typedef struct xfs_alloc_arg {
 	char		isfl;		/* set if is freelist blocks - !acctg */
 	char		userdata;	/* set if this is user data */
 	xfs_fsblock_t	firstblock;	/* io first block allocated */
-	struct completion *done;
-	struct work_struct work;
-	int		result;
-	char		stack_switch;
 } xfs_alloc_arg_t;
 
 /*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 91259554df8b..83d0cf3df930 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2441,7 +2441,6 @@ xfs_bmap_btalloc(
 	args.tp = ap->tp;
 	args.mp = mp;
 	args.fsbno = ap->blkno;
-	args.stack_switch = ap->stack_switch;
 
 	/* Trim the allocation back to the maximum an AG can fit. */
 	args.maxlen = MIN(ap->length, XFS_ALLOC_AG_MAX_USABLE(mp));
@@ -4620,12 +4619,11 @@ xfs_bmapi_delay(
 
 
 STATIC int
-xfs_bmapi_allocate(
-	struct xfs_bmalloca	*bma,
-	int			flags)
+__xfs_bmapi_allocate(
+	struct xfs_bmalloca	*bma)
 {
 	struct xfs_mount	*mp = bma->ip->i_mount;
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
 						XFS_ATTR_FORK : XFS_DATA_FORK;
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 	int			tmp_logflags = 0;
@@ -4658,25 +4656,25 @@ xfs_bmapi_allocate(
 	 * Indicate if this is the first user data in the file, or just any
 	 * user data.
 	 */
-	if (!(flags & XFS_BMAPI_METADATA)) {
+	if (!(bma->flags & XFS_BMAPI_METADATA)) {
 		bma->userdata = (bma->offset == 0) ?
 			XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
 	}
 
-	bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
 
 	/*
 	 * Only want to do the alignment at the eof if it is userdata and
 	 * allocation length is larger than a stripe unit.
 	 */
 	if (mp->m_dalign && bma->length >= mp->m_dalign &&
-	    !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+	    !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
 		error = xfs_bmap_isaeof(bma, whichfork);
 		if (error)
 			return error;
 	}
 
-	if (flags & XFS_BMAPI_STACK_SWITCH)
+	if (bma->flags & XFS_BMAPI_STACK_SWITCH)
 		bma->stack_switch = 1;
 
 	error = xfs_bmap_alloc(bma);
@@ -4713,7 +4711,7 @@ xfs_bmapi_allocate(
 	 * A wasdelay extent has been initialized, so shouldn't be flagged
 	 * as unwritten.
 	 */
-	if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
+	if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
 	    xfs_sb_version_hasextflgbit(&mp->m_sb))
 		bma->got.br_state = XFS_EXT_UNWRITTEN;
 
@@ -4741,6 +4739,45 @@ xfs_bmapi_allocate(
 	return 0;
 }
 
+static void
+xfs_bmapi_allocate_worker(
+	struct work_struct	*work)
+{
+	struct xfs_bmalloca	*args = container_of(work,
+						struct xfs_bmalloca, work);
+	unsigned long		pflags;
+
+	/* we are in a transaction context here */
+	current_set_flags_nested(&pflags, PF_FSTRANS);
+
+	args->result = __xfs_bmapi_allocate(args);
+	complete(args->done);
+
+	current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+/*
+ * Some allocation requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. Otherwise just
+ * call directly to avoid the context switch overhead here.
+ */
+int
+xfs_bmapi_allocate(
+	struct xfs_bmalloca	*args)
+{
+	DECLARE_COMPLETION_ONSTACK(done);
+
+	if (!args->stack_switch)
+		return __xfs_bmapi_allocate(args);
+
+
+	args->done = &done;
+	INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
+	queue_work(xfs_alloc_wq, &args->work);
+	wait_for_completion(&done);
+	return args->result;
+}
+
 STATIC int
 xfs_bmapi_convert_unwritten(
 	struct xfs_bmalloca	*bma,
@@ -4926,6 +4963,7 @@ xfs_bmapi_write(
 			bma.conv = !!(flags & XFS_BMAPI_CONVERT);
 			bma.wasdel = wasdelay;
 			bma.offset = bno;
+			bma.flags = flags;
 
 			/*
 			 * There's a 32/64 bit type mismatch between the
@@ -4941,7 +4979,7 @@ xfs_bmapi_write(
 
 			ASSERT(len > 0);
 			ASSERT(bma.length > 0);
-			error = xfs_bmapi_allocate(&bma, flags);
+			error = xfs_bmapi_allocate(&bma);
 			if (error)
 				goto error0;
 			if (bma.blkno == NULLFSBLOCK)
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index b68c598034c1..5f469c3516eb 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -136,6 +136,10 @@ typedef struct xfs_bmalloca {
 	char			aeof;	/* allocated space at eof */
 	char			conv;	/* overwriting unwritten extents */
 	char			stack_switch;
+	int			flags;
+	struct completion	*done;
+	struct work_struct	work;
+	int			result;
 } xfs_bmalloca_t;
 
 /*
-- 
cgit v1.2.3


From 1375cb65e87b327a8dd4f920c3e3d837fb40e9c2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 9 Oct 2012 14:50:52 +1100
Subject: xfs: growfs: don't read garbage for new secondary superblocks

When updating new secondary superblocks in a growfs operation, the
superblock buffer is read from the newly grown region of the
underlying device. This is not guaranteed to be zero, so violates
the underlying assumption that the unused parts of superblocks are
zero filled. Get a new buffer for these secondary superblocks to
ensure that the unused regions are zero filled correctly.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fsops.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c25b094efbf7..4beaede43277 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -399,9 +399,26 @@ xfs_growfs_data_private(
 
 	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {
-		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+		error = 0;
+		/*
+		 * new secondary superblocks need to be zeroed, not read from
+		 * disk as the contents of the new area we are growing into is
+		 * completely unknown.
+		 */
+		if (agno < oagcount) {
+			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+		} else {
+			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
+				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+				  XFS_FSS_TO_BB(mp, 1), 0);
+			if (bp)
+				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+			else
+				error = ENOMEM;
+		}
+
 		if (error) {
 			xfs_warn(mp,
 		"error %d reading secondary superblock for ag %d",
@@ -423,7 +440,7 @@ xfs_growfs_data_private(
 			break; /* no point in continuing */
 		}
 	}
-	return 0;
+	return error;
 
  error0:
 	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-- 
cgit v1.2.3


From 531c3bdc8662e1a83f8ec80dc3346b1284877c0a Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 25 Oct 2012 17:22:30 +1100
Subject: xfs: silence uninitialised f.file warning.

Uninitialised variable build warning introduced by 2903ff0 ("switch
simple cases of fget_light to fdget"), gcc is not smart enough to
work out that the variable is not used uninitialised, and the commit
removed the initialisation at declaration that the old variable had.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8305f2ac6773..c1df3c623de2 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -70,7 +70,7 @@ xfs_find_handle(
 	int			hsize;
 	xfs_handle_t		handle;
 	struct inode		*inode;
-	struct fd		f;
+	struct fd		f = {0};
 	struct path		path;
 	int			error;
 	struct xfs_inode	*ip;
-- 
cgit v1.2.3


From cd856db69c88db438215244571957d812bdc6813 Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Sat, 20 Oct 2012 11:08:19 -0300
Subject: xfs: Update inode alloc comments

I found some out of date comments while studying the inode allocation
code, so I believe it's worth to have these comments updated.

It basically rewrites the comment regarding to "call_again" variable,
which is not used anymore, but instead, callers of xfs_ialloc() decides
if it needs to be called again relying only if ialloc_context is NULL or
not.

Also did some small changes in another comment that I thought to be
pertinent to the current behaviour of these functions and some alignment
on both comments.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ialloc.c |  6 +++---
 fs/xfs/xfs_inode.c  | 18 +++++++++---------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index c5c4ef4f2bdb..37753e1c8537 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -877,9 +877,9 @@ error0:
  * This function is designed to be called twice if it has to do an allocation
  * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
  * If an inode is available without having to performn an allocation, an inode
- * number is returned.  In this case, *IO_agbp would be NULL.  If an allocation
- * needes to be done, xfs_dialloc would return the current AGI buffer in
- * *IO_agbp.  The caller should then commit the current transaction, allocate a
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
  * new transaction, and call xfs_dialloc() again, passing in the previous value
  * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
  * buffer is locked across the two calls, the second call is guaranteed to have
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bba8f37525b3..95f7a73b05cb 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1104,16 +1104,16 @@ xfs_iread_extents(
  * set according to the contents of the given cred structure.
  *
  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
- * has a free inode available, call xfs_iget()
- * to obtain the in-core version of the allocated inode.  Finally,
- * fill in the inode and log its initial contents.  In this case,
- * ialloc_context would be set to NULL and call_again set to false.
+ * has a free inode available, call xfs_iget() to obtain the in-core
+ * version of the allocated inode.  Finally, fill in the inode and
+ * log its initial contents.  In this case, ialloc_context would be
+ * set to NULL.
  *
- * If xfs_dialloc() does not have an available inode,
- * it will replenish its supply by doing an allocation. Since we can
- * only do one allocation within a transaction without deadlocks, we
- * must commit the current transaction before returning the inode itself.
- * In this case, therefore, we will set call_again to true and return.
+ * If xfs_dialloc() does not have an available inode, it will replenish
+ * its supply by doing an allocation. Since we can only do one
+ * allocation within a transaction without deadlocks, we must commit
+ * the current transaction before returning the inode itself.
+ * In this case, therefore, we will set ialloc_context and return.
  * The caller should then commit the current transaction, start a new
  * transaction, and call xfs_ialloc() again to actually get the inode.
  *
-- 
cgit v1.2.3


From c99abb8f560798625323c7b21d5636259017825d Mon Sep 17 00:00:00 2001
From: Carlos Maiolino <cmaiolino@redhat.com>
Date: Thu, 18 Oct 2012 12:28:58 -0300
Subject: xfs: Update mount options documentation

Once inode64 is the default allocation mode now, kernel documentation should be
updated to match this behaviour.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 Documentation/filesystems/xfs.txt | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 3fc0c31a6f5d..17187750270a 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -72,8 +72,15 @@ When mounting an XFS filesystem, the following options are accepted.
 	Indicates that XFS is allowed to create inodes at any location
 	in the filesystem, including those which will result in inode
 	numbers occupying more than 32 bits of significance.  This is
-	provided for backwards compatibility, but causes problems for
-	backup applications that cannot handle large inode numbers.
+	the default allocation option. Applications which do not handle
+	inode numbers bigger than 32 bits, should use inode32 option.
+
+  inode32
+	Indicates that XFS is limited to create inodes at locations which
+	will not result in inode numbers with more than 32 bits of
+	significance. This is provided for backwards compatibility, since
+	64 bits inode numbers might cause problems for some applications
+	that cannot handle large inode numbers.
 
   largeio/nolargeio
 	If "nolargeio" is specified, the optimal I/O reported in
-- 
cgit v1.2.3


From 4c05f9ad4d168098b7ce3ffa7098283f94811ed6 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 2 Nov 2012 11:38:41 +1100
Subject: xfs: invalidate allocbt blocks moved to the free list

When we free a block from the alloc btree tree, we move it to the
freelist held in the AGFL and mark it busy in the busy extent tree.
This typically happens when we merge btree blocks.

Once the transaction is committed and checkpointed, the block can
remain on the free list for an indefinite amount of time.  Now, this
isn't the end of the world at this point - if the free list is
shortened, the buffer is invalidated in the transaction that moves
it back to free space. If the buffer is allocated as metadata from
the free list, then all the modifications getted logged, and we have
no issues, either. And if it gets allocated as userdata direct from
the freelist, it gets invalidated and so will never get written.

However, during the time it sits on the free list, pressure on the
log can cause the AIL to be pushed and the buffer that covers the
block gets pushed for write. IOWs, we end up writing a freed
metadata block to disk. Again, this isn't the end of the world
because we know from the above we are only writing to free space.

The problem, however, is for validation callbacks. If the block was
on old btree root block, then the level of the block is going to be
higher than the current tree root, and so will fail validation.
There may be other inconsistencies in the block as well, and
currently we don't care because the block is in free space. Shutting
down the filesystem because a freed block doesn't pass write
validation, OTOH, is rather unfriendly.

So, make sure we always invalidate buffers as they move from the
free space trees to the free list so that we guarantee they never
get written to disk while on the free list.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Phil White <pwhite@sgi.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc_btree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f1647caace8f..f7876c6d6165 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -121,6 +121,8 @@ xfs_allocbt_free_block(
 	xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
 			      XFS_EXTENT_BUSY_SKIP_DISCARD);
 	xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+	xfs_trans_binval(cur->bc_tp, bp);
 	return 0;
 }
 
-- 
cgit v1.2.3


From b6aff29f3af7437635ec3d66af9115bb17ba561f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 2 Nov 2012 11:38:42 +1100
Subject: xfs: don't vmap inode cluster buffers during free

Inode buffers do not need to be mapped as inodes are read or written
directly from/to the pages underlying the buffer. This fixes a
regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the
default behaviour").

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 95f7a73b05cb..965598eb308c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1760,7 +1760,8 @@ xfs_ifree_cluster(
 		 * to mark all the active inodes on the buffer stale.
 		 */
 		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
-					mp->m_bsize * blks_per_cluster, 0);
+					mp->m_bsize * blks_per_cluster,
+					XBF_UNMAPPED);
 
 		if (!bp)
 			return ENOMEM;
-- 
cgit v1.2.3


From 137fff09b7924507871f8e6294dfe57b7a880332 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Fri, 2 Nov 2012 14:23:12 +1100
Subject: xfs: fix buffer shudown reference count mismatch

When we shut down the filesystem, we have to unpin and free all the
buffers currently active in the CIL. To do this we unpin and remove
them in one operation as a result of a failed iclogbuf write. For
buffers, we do this removal via a simultated IO completion of after
marking the buffer stale.

At the time we do this, we have two references to the buffer - the
active LRU reference and the buf log item.  The LRU reference is
removed by marking the buffer stale, and the active CIL reference is
by the xfs_buf_iodone() callback that is run by
xfs_buf_do_callbacks() during ioend processing (via the bp->b_iodone
callback).

However, ioend processing requires one more reference - that of the
IO that it is completing. We don't have this reference, so we free
the buffer prematurely and use it after it is freed. For buffers
marked with XBF_ASYNC, this leads to assert failures in
xfs_buf_rele() on debug kernels because the b_hold count is zero.

Fix this by making sure we take the necessary IO reference before
starting IO completion processing on the stale buffer, and set the
XBF_ASYNC flag to ensure that IO completion processing removes all
the active references from the buffer to ensure it is fully torn
down.

Cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_buf_item.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a8d0ed911196..becf4a97efc6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -526,7 +526,25 @@ xfs_buf_item_unpin(
 		}
 		xfs_buf_relse(bp);
 	} else if (freed && remove) {
+		/*
+		 * There are currently two references to the buffer - the active
+		 * LRU reference and the buf log item. What we are about to do
+		 * here - simulate a failed IO completion - requires 3
+		 * references.
+		 *
+		 * The LRU reference is removed by the xfs_buf_stale() call. The
+		 * buf item reference is removed by the xfs_buf_iodone()
+		 * callback that is run by xfs_buf_do_callbacks() during ioend
+		 * processing (via the bp->b_iodone callback), and then finally
+		 * the ioend processing will drop the IO reference if the buffer
+		 * is marked XBF_ASYNC.
+		 *
+		 * Hence we need to take an additional reference here so that IO
+		 * completion processing doesn't free the buffer prematurely.
+		 */
 		xfs_buf_lock(bp);
+		xfs_buf_hold(bp);
+		bp->b_flags |= XBF_ASYNC;
 		xfs_buf_ioerror(bp, EIO);
 		XFS_BUF_UNDONE(bp);
 		xfs_buf_stale(bp);
-- 
cgit v1.2.3


From 009507b052fa391618eccf9e8c9f484407fd9018 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 2 Nov 2012 11:38:44 +1100
Subject: xfs: fix reading of wrapped log data

Commit 4439647 ("xfs: reset buffer pointers before freeing them") in
3.0-rc1 introduced a regression when recovering log buffers that
wrapped around the end of log. The second part of the log buffer at
the start of the physical log was being read into the header buffer
rather than the data buffer, and hence recovery was seeing garbage
in the data buffer when it got to the region of the log buffer that
was incorrectly read.

Cc: <stable@vger.kernel.org> # 3.0.x, 3.2.x, 3.4.x 3.6.x
Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log_recover.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 651c98859b04..3e06333d4bd1 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3542,7 +3542,7 @@ xlog_do_recovery_pass(
 				 *   - order is important.
 				 */
 				error = xlog_bread_offset(log, 0,
-						bblks - split_bblks, hbp,
+						bblks - split_bblks, dbp,
 						offset + BBTOB(split_bblks));
 				if (error)
 					goto bread_err2;
-- 
cgit v1.2.3


From 69a58a43f74eb2cb23d9bce2524dae33c289a40f Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 9 Oct 2012 14:11:45 -0500
Subject: xfs: report projid32bit feature in geometry call

When xfs gained the projid32bit feature, it was never added to
the FSGEOMETRY ioctl feature flags, so it's not queryable without
this patch.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fs.h    | 3 ++-
 fs/xfs/xfs_fsops.c | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394a..0948c043443b 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_LOGV2	0x0100	/* log format version 2	*/
 #define XFS_FSOP_GEOM_FLAGS_SECTOR	0x0200	/* sector sizes >1BB	*/
 #define XFS_FSOP_GEOM_FLAGS_ATTR2	0x0400	/* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32	0x0800  /* 32-bit project IDs	*/
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI	0x1000	/* ASCII only CI names	*/
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB	0x4000	/* lazy superblock counters */
 
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 4beaede43277..7b0a997cf62b 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
 			(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
 			(xfs_sb_version_hasattr2(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+				XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+			(xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
 		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
 				mp->m_sb.sb_logsectsize : BBSIZE;
 		geo->rtsectsize = mp->m_sb.sb_blocksize;
-- 
cgit v1.2.3


From 27b52867925e3aaed090063c1c58a7537e6373f3 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 6 Nov 2012 09:50:38 -0500
Subject: xfs: add EOFBLOCKS inode tagging/untagging

Add the XFS_ICI_EOFBLOCKS_TAG inode tag to identify inodes with
speculatively preallocated blocks beyond EOF. An inode is tagged
when speculative preallocation occurs and untagged either via
truncate down or when post-EOF blocks are freed via release or
reclaim.

The tag management is intentionally not aggressive to prefer
simplicity over the complexity of handling all the corner cases
under which post-EOF blocks could be freed (i.e., forward
truncation, fallocate, write error conditions, etc.). This means
that a tagged inode may or may not have post-EOF blocks after a
period of time. The tag is eventually cleared when the inode is
released or reclaimed.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ag.h       |  1 +
 fs/xfs/xfs_icache.c   | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.h   |  3 +++
 fs/xfs/xfs_iomap.c    |  8 +++++++
 fs/xfs/xfs_iops.c     |  4 ++++
 fs/xfs/xfs_trace.h    |  5 +++++
 fs/xfs/xfs_vnodeops.c |  2 ++
 7 files changed, 85 insertions(+)

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c0..22bd4db011c8 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -233,6 +233,7 @@ typedef struct xfs_perag {
 #define XFS_ICI_NO_TAG		(-1)	/* special flag for an untagged lookup
 					   in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */
 
 #define	XFS_AG_MAXLEVELS(mp)		((mp)->m_ag_maxlevels)
 #define	XFS_MIN_FREELIST_RAW(bl,cl,mp)	\
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 9c8703b5cd72..f9afc5ff0482 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1128,3 +1128,65 @@ xfs_reclaim_inodes_count(
 	return reclaimable;
 }
 
+void
+xfs_inode_set_eofblocks_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+	int tagged;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	trace_xfs_inode_set_eofblocks_tag(ip);
+
+	tagged = radix_tree_tagged(&pag->pag_ici_root,
+				   XFS_ICI_EOFBLOCKS_TAG);
+	radix_tree_tag_set(&pag->pag_ici_root,
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			   XFS_ICI_EOFBLOCKS_TAG);
+	if (!tagged) {
+		/* propagate the eofblocks tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				   XFS_ICI_EOFBLOCKS_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+
+		trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
+					      -1, _RET_IP_);
+	}
+
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
+void
+xfs_inode_clear_eofblocks_tag(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct xfs_perag *pag;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	spin_lock(&pag->pag_ici_lock);
+	trace_xfs_inode_clear_eofblocks_tag(ip);
+
+	radix_tree_tag_clear(&pag->pag_ici_root,
+			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+			     XFS_ICI_EOFBLOCKS_TAG);
+	if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+		/* clear the eofblocks tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				     XFS_ICI_EOFBLOCKS_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
+					       -1, _RET_IP_);
+	}
+
+	spin_unlock(&pag->pag_ici_lock);
+	xfs_perag_put(pag);
+}
+
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 222e22f16b4a..db3613075dc6 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -35,6 +35,9 @@ void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 
+void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index a066cf1766ab..add06b4e9a63 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
 #include "xfs_utils.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
@@ -450,6 +451,13 @@ retry:
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
 		return xfs_alert_fsblock_zero(ip, &imap[0]);
 
+	/*
+	 * Tag the inode as speculatively preallocated so we can reclaim this
+	 * space on demand, if necessary.
+	 */
+	if (prealloc)
+		xfs_inode_set_eofblocks_tag(ip);
+
 	*ret_imap = imap[0];
 	return 0;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4e00cf091d2c..81f5c4953287 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -854,6 +855,9 @@ xfs_setattr_size(
 		 * and do not wait the usual (long) time for writeout.
 		 */
 		xfs_iflags_set(ip, XFS_ITRUNCATED);
+
+		/* A truncate down always removes post-EOF blocks. */
+		xfs_inode_clear_eofblocks_tag(ip);
 	}
 
 	if (mask & ATTR_CTIME) {
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7d36ccf57f93..6f46e034b766 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -130,6 +130,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
 
 TRACE_EVENT(xfs_attr_list_node_descend,
 	TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -585,6 +587,9 @@ DEFINE_INODE_EVENT(xfs_update_time);
 DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
 DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
 
+DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
+
 DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
 	TP_ARGS(ip, caller_ip),
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2ee1f49da0aa..e6e1d11dfdf2 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -238,6 +238,8 @@ xfs_free_eofblocks(
 		} else {
 			error = xfs_trans_commit(tp,
 						XFS_TRANS_RELEASE_LOG_RES);
+			if (!error)
+				xfs_inode_clear_eofblocks_tag(ip);
 		}
 
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-- 
cgit v1.2.3


From a454f7428ffa03c8e1321124d9074101b7290be6 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 6 Nov 2012 09:50:39 -0500
Subject: xfs: support a tag-based inode_ag_iterator

Genericize xfs_inode_ag_walk() to support an optional radix tree tag
and args argument for the execute function. Create a new wrapper
called xfs_inode_ag_iterator_tag() that performs a tag based walk
of perag's and inodes.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_icache.c      | 56 ++++++++++++++++++++++++++++++++++++++++++------
 fs/xfs/xfs_icache.h      |  9 ++++++--
 fs/xfs/xfs_qm_syscalls.c |  5 +++--
 3 files changed, 59 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f9afc5ff0482..2a96dc48ebe6 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -516,8 +516,11 @@ xfs_inode_ag_walk(
 	struct xfs_mount	*mp,
 	struct xfs_perag	*pag,
 	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
+					   struct xfs_perag *pag, int flags,
+					   void *args),
+	int			flags,
+	void			*args,
+	int			tag)
 {
 	uint32_t		first_index;
 	int			last_error = 0;
@@ -536,9 +539,17 @@ restart:
 		int		i;
 
 		rcu_read_lock();
-		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+
+		if (tag == -1)
+			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
 					(void **)batch, first_index,
 					XFS_LOOKUP_BATCH);
+		else
+			nr_found = radix_tree_gang_lookup_tag(
+					&pag->pag_ici_root,
+					(void **) batch, first_index,
+					XFS_LOOKUP_BATCH, tag);
+
 		if (!nr_found) {
 			rcu_read_unlock();
 			break;
@@ -579,7 +590,7 @@ restart:
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
 				continue;
-			error = execute(batch[i], pag, flags);
+			error = execute(batch[i], pag, flags, args);
 			IRELE(batch[i]);
 			if (error == EAGAIN) {
 				skipped++;
@@ -608,8 +619,10 @@ int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
 	int			(*execute)(struct xfs_inode *ip,
-					   struct xfs_perag *pag, int flags),
-	int			flags)
+					   struct xfs_perag *pag, int flags,
+					   void *args),
+	int			flags,
+	void			*args)
 {
 	struct xfs_perag	*pag;
 	int			error = 0;
@@ -619,7 +632,36 @@ xfs_inode_ag_iterator(
 	ag = 0;
 	while ((pag = xfs_perag_get(mp, ag))) {
 		ag = pag->pag_agno + 1;
-		error = xfs_inode_ag_walk(mp, pag, execute, flags);
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+		xfs_perag_put(pag);
+		if (error) {
+			last_error = error;
+			if (error == EFSCORRUPTED)
+				break;
+		}
+	}
+	return XFS_ERROR(last_error);
+}
+
+int
+xfs_inode_ag_iterator_tag(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip,
+					   struct xfs_perag *pag, int flags,
+					   void *args),
+	int			flags,
+	void			*args,
+	int			tag)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			last_error = 0;
+	xfs_agnumber_t		ag;
+
+	ag = 0;
+	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
+		ag = pag->pag_agno + 1;
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index db3613075dc6..54c113478dfc 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,7 +40,12 @@ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
-	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-	int flags);
+	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+		int flags, void *args),
+	int flags, void *args);
+int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+		int flags, void *args),
+	int flags, void *args, int tag);
 
 #endif
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 7a9071f8855f..5f53e75409b8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -846,7 +846,8 @@ STATIC int
 xfs_dqrele_inode(
 	struct xfs_inode	*ip,
 	struct xfs_perag	*pag,
-	int			flags)
+	int			flags,
+	void			*args)
 {
 	/* skip quota inodes */
 	if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -882,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
+	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
 }
-- 
cgit v1.2.3


From 72b53efa4a6125a4c334871c58268c430605819a Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 6 Nov 2012 09:50:40 -0500
Subject: xfs: create helper to check whether to free eofblocks on inode

This check is used in multiple places to determine whether we
should check for (and potentially free) post EOF blocks on an
inode. Add a helper to consolidate the check.

Note that when we remove an inode from the cache (xfs_inactive()),
we are required to trim post-EOF blocks even if the inode is marked
preallocated or append-only to maintain correct space accounting.
The 'force' parameter to xfs_can_free_eofblocks() specifies whether
we should ignore the prealloc/append-only status of the inode.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c    | 37 +++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.h    |  1 +
 fs/xfs/xfs_vnodeops.c | 19 +++++++------------
 3 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 965598eb308c..7449cb943efd 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -3912,3 +3912,40 @@ xfs_iext_irec_update_extoffs(
 		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
 	}
 }
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+	/* prealloc/delalloc exists only on regular files */
+	if (!S_ISREG(ip->i_d.di_mode))
+		return false;
+
+	/*
+	 * Zero sized files with no cached pages and delalloc blocks will not
+	 * have speculative prealloc/delalloc blocks to remove.
+	 */
+	if (VFS_I(ip)->i_size == 0 &&
+	    VN_CACHED(VFS_I(ip)) == 0 &&
+	    ip->i_delayed_blks == 0)
+		return false;
+
+	/* If we haven't read in the extent list, then don't do it now. */
+	if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+		return false;
+
+	/*
+	 * Do not free real preallocated or append-only files unless the file
+	 * has delalloc blocks and we are forced to remove them.
+	 */
+	if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+		if (!force || ip->i_delayed_blks == 0)
+			return false;
+
+	return true;
+}
+
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1fc2065e010b..21b4de3df716 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -585,6 +585,7 @@ void		xfs_iext_irec_compact(xfs_ifork_t *);
 void		xfs_iext_irec_compact_pages(xfs_ifork_t *);
 void		xfs_iext_irec_compact_full(xfs_ifork_t *);
 void		xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
+bool		xfs_can_free_eofblocks(struct xfs_inode *, bool);
 
 #define xfs_ipincount(ip)	((unsigned int) atomic_read(&ip->i_pincount))
 
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index e6e1d11dfdf2..c4c153900205 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -436,11 +436,7 @@ xfs_release(
 	if (ip->i_d.di_nlink == 0)
 		return 0;
 
-	if ((S_ISREG(ip->i_d.di_mode) &&
-	     (VFS_I(ip)->i_size > 0 ||
-	      (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
-	     (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-	    (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+	if (xfs_can_free_eofblocks(ip, false)) {
 
 		/*
 		 * If we can't get the iolock just skip truncating the blocks
@@ -516,13 +512,12 @@ xfs_inactive(
 		goto out;
 
 	if (ip->i_d.di_nlink != 0) {
-		if ((S_ISREG(ip->i_d.di_mode) &&
-		    (VFS_I(ip)->i_size > 0 ||
-		     (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
-		    (ip->i_df.if_flags & XFS_IFEXTENTS) &&
-		    (!(ip->i_d.di_flags &
-				(XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-		     ip->i_delayed_blks != 0))) {
+		/*
+		 * force is true because we are evicting an inode from the
+		 * cache. Post-eof blocks must be freed, lest we end up with
+		 * broken free space accounting.
+		 */
+		if (xfs_can_free_eofblocks(ip, true)) {
 			error = xfs_free_eofblocks(mp, ip, false);
 			if (error)
 				return VN_INACTIVE_CACHE;
-- 
cgit v1.2.3


From 40165e27617e2a98bf8588001d2f2872fae2fee2 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 6 Nov 2012 09:50:41 -0500
Subject: xfs: make xfs_free_eofblocks() non-static, return EAGAIN on trylock
 failure

Turn xfs_free_eofblocks() into a non-static function, return EAGAIN to
indicate trylock failure and make sure this error is not propagated in
xfs_release().

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 6 +++---
 fs/xfs/xfs_vnodeops.h | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c4c153900205..c2ddd7a43942 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -151,7 +151,7 @@ xfs_readlink(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-STATIC int
+int
 xfs_free_eofblocks(
 	xfs_mount_t	*mp,
 	xfs_inode_t	*ip,
@@ -200,7 +200,7 @@ xfs_free_eofblocks(
 		if (need_iolock) {
 			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
 				xfs_trans_cancel(tp, 0);
-				return 0;
+				return EAGAIN;
 			}
 		}
 
@@ -463,7 +463,7 @@ xfs_release(
 			return 0;
 
 		error = xfs_free_eofblocks(mp, ip, true);
-		if (error)
+		if (error && error != EAGAIN)
 			return error;
 
 		/* delalloc blocks after truncation means it really is dirty */
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 447e146b2ba6..52fafc416a0c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -57,5 +57,6 @@ int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
 int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
 
 #endif /* _XFS_VNODEOPS_H */
-- 
cgit v1.2.3


From 41176a68e3f710630feace536d0277a092e206b5 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 6 Nov 2012 09:50:42 -0500
Subject: xfs: create function to scan and clear EOFBLOCKS inodes

xfs_inodes_free_eofblocks() implements scanning functionality for
EOFBLOCKS inodes. It uses the AG iterator to walk the tagged inodes
and free post-EOF blocks via the xfs_inode_free_eofblocks() execute
function. The scan can be invoked in best-effort mode or wait
(force) mode.

A best-effort scan (default) handles all inodes that do not have a
dirty cache and we successfully acquire the io lock via trylock. In
wait mode, we continue to cycle through an AG until all inodes are
handled.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_icache.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.h |  1 +
 fs/xfs/xfs_trace.h  |  1 +
 3 files changed, 45 insertions(+)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 2a96dc48ebe6..d115cb44b103 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1170,6 +1170,49 @@ xfs_reclaim_inodes_count(
 	return reclaimable;
 }
 
+STATIC int
+xfs_inode_free_eofblocks(
+	struct xfs_inode	*ip,
+	struct xfs_perag	*pag,
+	int			flags,
+	void			*args)
+{
+	int ret;
+
+	if (!xfs_can_free_eofblocks(ip, false)) {
+		/* inode could be preallocated or append-only */
+		trace_xfs_inode_free_eofblocks_invalid(ip);
+		xfs_inode_clear_eofblocks_tag(ip);
+		return 0;
+	}
+
+	/*
+	 * If the mapping is dirty the operation can block and wait for some
+	 * time. Unless we are waiting, skip it.
+	 */
+	if (!(flags & SYNC_WAIT) &&
+	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+		return 0;
+
+	ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+
+	/* don't revisit the inode if we're not waiting */
+	if (ret == EAGAIN && !(flags & SYNC_WAIT))
+		ret = 0;
+
+	return ret;
+}
+
+int
+xfs_icache_free_eofblocks(
+	struct xfs_mount	*mp,
+	int			flags)
+{
+	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
+	return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
+					 NULL, XFS_ICI_EOFBLOCKS_TAG);
+}
+
 void
 xfs_inode_set_eofblocks_tag(
 	xfs_inode_t	*ip)
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 54c113478dfc..cb6b8d0eee61 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -37,6 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 
 void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_eofblocks(struct xfs_mount *, int);
 
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 6f46e034b766..cb5234632072 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -589,6 +589,7 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
 
 DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
 
 DECLARE_EVENT_CLASS(xfs_iref_class,
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
-- 
cgit v1.2.3


From 8ca149de80478441352a8622ea15fae7de703ced Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 7 Nov 2012 12:21:12 -0500
Subject: xfs: add XFS_IOC_FREE_EOFBLOCKS ioctl

The XFS_IOC_FREE_EOFBLOCKS ioctl allows users to invoke an EOFBLOCKS
scan. The xfs_eofblocks structure is defined to support the command
parameters (scan mode).

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fs.h     | 17 +++++++++++++++++
 fs/xfs/xfs_icache.c | 10 +++++++---
 fs/xfs/xfs_icache.h |  2 +-
 fs/xfs/xfs_ioctl.c  | 20 ++++++++++++++++++++
 4 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 0948c043443b..0cfa30813b16 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -339,6 +339,22 @@ typedef struct xfs_error_injection {
 } xfs_error_injection_t;
 
 
+/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION		1
+struct xfs_eofblocks {
+	__u32		eof_version;
+	__u32		eof_flags;
+	__u64		pad[15];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC		(1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_VALID	\
+	(XFS_EOF_FLAGS_SYNC)
+
+
 /*
  * The user-level Handle Request interface structure.
  */
@@ -457,6 +473,7 @@ typedef struct xfs_handle {
 /*	XFS_IOC_GETBIOSIZE ---- deprecated 47	   */
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_eofblocks)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index d115cb44b103..fbb74c715266 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1206,11 +1206,15 @@ xfs_inode_free_eofblocks(
 int
 xfs_icache_free_eofblocks(
 	struct xfs_mount	*mp,
-	int			flags)
+	struct xfs_eofblocks	*eofb)
 {
-	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
+	int flags = SYNC_TRYLOCK;
+
+	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
+		flags = SYNC_WAIT;
+
 	return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
-					 NULL, XFS_ICI_EOFBLOCKS_TAG);
+					 eofb, XFS_ICI_EOFBLOCKS_TAG);
 }
 
 void
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index cb6b8d0eee61..4934a77024cf 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -37,7 +37,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 
 void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
-int xfs_icache_free_eofblocks(struct xfs_mount *, int);
+int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
 
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index c1df3c623de2..5b20ab0b4f9d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -1602,6 +1603,25 @@ xfs_file_ioctl(
 		error = xfs_errortag_clearall(mp, 1);
 		return -error;
 
+	case XFS_IOC_FREE_EOFBLOCKS: {
+		struct xfs_eofblocks eofb;
+
+		if (copy_from_user(&eofb, arg, sizeof(eofb)))
+			return -XFS_ERROR(EFAULT);
+
+		if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
+			return -XFS_ERROR(EINVAL);
+
+		if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
+			return -XFS_ERROR(EINVAL);
+
+		if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad)))
+			return -XFS_ERROR(EINVAL);
+
+		error = xfs_icache_free_eofblocks(mp, &eofb);
+		return -error;
+	}
+
 	default:
 		return -ENOTTY;
 	}
-- 
cgit v1.2.3


From 3e3f9f5863548e870edfcc72e7617ac8ddcad44a Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 7 Nov 2012 12:21:13 -0500
Subject: xfs: add inode id filtering to eofblocks scan

Support inode ID filtering in the eofblocks scan. The caller must
set the associated XFS_EOF_FLAGS_*ID bit and ID field.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fs.h     | 14 ++++++++++++--
 fs/xfs/xfs_icache.c | 22 ++++++++++++++++++++++
 fs/xfs/xfs_ioctl.c  |  3 ++-
 3 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 0cfa30813b16..a19f9b205c15 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -346,13 +346,23 @@ typedef struct xfs_error_injection {
 struct xfs_eofblocks {
 	__u32		eof_version;
 	__u32		eof_flags;
-	__u64		pad[15];
+	uid_t		eof_uid;
+	gid_t		eof_gid;
+	prid_t		eof_prid;
+	__u32		pad32;
+	__u64		pad64[13];
 };
 
 /* eof_flags values */
 #define XFS_EOF_FLAGS_SYNC		(1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID		(1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
 #define XFS_EOF_FLAGS_VALID	\
-	(XFS_EOF_FLAGS_SYNC)
+	(XFS_EOF_FLAGS_SYNC |	\
+	 XFS_EOF_FLAGS_UID |	\
+	 XFS_EOF_FLAGS_GID |	\
+	 XFS_EOF_FLAGS_PRID)
 
 
 /*
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index fbb74c715266..b239da91c43b 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1170,6 +1170,21 @@ xfs_reclaim_inodes_count(
 	return reclaimable;
 }
 
+STATIC int
+xfs_inode_match_id(
+	struct xfs_inode	*ip,
+	struct xfs_eofblocks	*eofb)
+{
+	if (eofb->eof_flags & XFS_EOF_FLAGS_UID)
+		return ip->i_d.di_uid == eofb->eof_uid;
+	else if (eofb->eof_flags & XFS_EOF_FLAGS_GID)
+		return ip->i_d.di_gid == eofb->eof_gid;
+	else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID)
+		return xfs_get_projid(ip) == eofb->eof_prid;
+
+	return 0;
+}
+
 STATIC int
 xfs_inode_free_eofblocks(
 	struct xfs_inode	*ip,
@@ -1178,6 +1193,7 @@ xfs_inode_free_eofblocks(
 	void			*args)
 {
 	int ret;
+	struct xfs_eofblocks *eofb = args;
 
 	if (!xfs_can_free_eofblocks(ip, false)) {
 		/* inode could be preallocated or append-only */
@@ -1194,6 +1210,12 @@ xfs_inode_free_eofblocks(
 	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
+	if (eofb &&
+	    (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID|
+			       XFS_EOF_FLAGS_PRID)) &&
+	    !xfs_inode_match_id(ip, eofb))
+		return 0;
+
 	ret = xfs_free_eofblocks(ip->i_mount, ip, true);
 
 	/* don't revisit the inode if we're not waiting */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 5b20ab0b4f9d..c1c3ef88a260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1615,7 +1615,8 @@ xfs_file_ioctl(
 		if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
 			return -XFS_ERROR(EINVAL);
 
-		if (memchr_inv(eofb.pad, 0, sizeof(eofb.pad)))
+		if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
+		    memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
 			return -XFS_ERROR(EINVAL);
 
 		error = xfs_icache_free_eofblocks(mp, &eofb);
-- 
cgit v1.2.3


From 1b5560488d1ab7c932f6f99385b41116838c3486 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 6 Nov 2012 09:50:45 -0500
Subject: xfs: support multiple inode id filtering in eofblocks scan

Enhance the eofblocks scan code to filter based on multiply specified
inode id values. When multiple inode id values are specified, only
inodes that match all id values are selected.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_icache.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index b239da91c43b..32908909815e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1175,14 +1175,19 @@ xfs_inode_match_id(
 	struct xfs_inode	*ip,
 	struct xfs_eofblocks	*eofb)
 {
-	if (eofb->eof_flags & XFS_EOF_FLAGS_UID)
-		return ip->i_d.di_uid == eofb->eof_uid;
-	else if (eofb->eof_flags & XFS_EOF_FLAGS_GID)
-		return ip->i_d.di_gid == eofb->eof_gid;
-	else if (eofb->eof_flags & XFS_EOF_FLAGS_PRID)
-		return xfs_get_projid(ip) == eofb->eof_prid;
+	if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
+	    ip->i_d.di_uid != eofb->eof_uid)
+		return 0;
 
-	return 0;
+	if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
+	    ip->i_d.di_gid != eofb->eof_gid)
+		return 0;
+
+	if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
+	    xfs_get_projid(ip) != eofb->eof_prid)
+		return 0;
+
+	return 1;
 }
 
 STATIC int
@@ -1210,10 +1215,7 @@ xfs_inode_free_eofblocks(
 	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
-	if (eofb &&
-	    (eofb->eof_flags & (XFS_EOF_FLAGS_UID|XFS_EOF_FLAGS_GID|
-			       XFS_EOF_FLAGS_PRID)) &&
-	    !xfs_inode_match_id(ip, eofb))
+	if (eofb && !xfs_inode_match_id(ip, eofb))
 		return 0;
 
 	ret = xfs_free_eofblocks(ip->i_mount, ip, true);
-- 
cgit v1.2.3


From 00ca79a04bef1a1b30ef8afd992d905b6d986caf Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 7 Nov 2012 12:21:14 -0500
Subject: xfs: add minimum file size filtering to eofblocks scan

Support minimum file size filtering in the eofblocks scan. The
caller must set the XFS_EOF_FLAGS_MINFILESIZE flags bit and minimum
file size value in bytes.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fs.h     |  7 +++++--
 fs/xfs/xfs_icache.c | 11 +++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index a19f9b205c15..6dda3f949b04 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -350,7 +350,8 @@ struct xfs_eofblocks {
 	gid_t		eof_gid;
 	prid_t		eof_prid;
 	__u32		pad32;
-	__u64		pad64[13];
+	__u64		eof_min_file_size;
+	__u64		pad64[12];
 };
 
 /* eof_flags values */
@@ -358,11 +359,13 @@ struct xfs_eofblocks {
 #define XFS_EOF_FLAGS_UID		(1 << 1) /* filter by uid */
 #define XFS_EOF_FLAGS_GID		(1 << 2) /* filter by gid */
 #define XFS_EOF_FLAGS_PRID		(1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE	(1 << 4) /* filter by min file size */
 #define XFS_EOF_FLAGS_VALID	\
 	(XFS_EOF_FLAGS_SYNC |	\
 	 XFS_EOF_FLAGS_UID |	\
 	 XFS_EOF_FLAGS_GID |	\
-	 XFS_EOF_FLAGS_PRID)
+	 XFS_EOF_FLAGS_PRID |	\
+	 XFS_EOF_FLAGS_MINFILESIZE)
 
 
 /*
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 32908909815e..906e6dcd2c55 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1215,8 +1215,15 @@ xfs_inode_free_eofblocks(
 	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
 
-	if (eofb && !xfs_inode_match_id(ip, eofb))
-		return 0;
+	if (eofb) {
+		if (!xfs_inode_match_id(ip, eofb))
+			return 0;
+
+		/* skip the inode if the file size is too small */
+		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+			return 0;
+	}
 
 	ret = xfs_free_eofblocks(ip->i_mount, ip, true);
 
-- 
cgit v1.2.3


From 579b62faa5fb16ffeeb88cda5e2c4e95730881af Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 6 Nov 2012 09:50:47 -0500
Subject: xfs: add background scanning to clear eofblocks inodes

Create a new mount workqueue and delayed_work to enable background
scanning and freeing of eofblocks inodes. The scanner kicks in once
speculative preallocation occurs and stops requeueing itself when
no eofblocks inodes exist.

The scan interval is based on the new
'speculative_prealloc_lifetime' tunable (default to 5m). The
background scanner performs unfiltered, best effort scans (which
skips inodes under lock contention or with a dirty cache mapping).

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_globals.c |  4 +++-
 fs/xfs/xfs_icache.c  | 29 +++++++++++++++++++++++++++++
 fs/xfs/xfs_icache.h  |  1 +
 fs/xfs/xfs_linux.h   |  1 +
 fs/xfs/xfs_mount.c   |  2 ++
 fs/xfs/xfs_mount.h   |  3 +++
 fs/xfs/xfs_super.c   |  9 +++++++++
 fs/xfs/xfs_sysctl.c  |  9 +++++++++
 fs/xfs/xfs_sysctl.h  |  1 +
 9 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..5399ef222dd7 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
  * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
  */
 xfs_param_t xfs_params = {
 			  /*	MIN		DFLT		MAX	*/
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
 	.rotorstep	= {	1,		1,		255	},
 	.inherit_nodfrg	= {	0,		1,		1	},
 	.fstrm_timer	= {	1,		30*100,		3600*100},
+	.eofb_timer	= {	1,		300,		3600*24},
 };
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 906e6dcd2c55..96e344e3e927 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -615,6 +615,32 @@ restart:
 	return last_error;
 }
 
+/*
+ * Background scanning to trim post-EOF preallocated space. This is queued
+ * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ */
+STATIC void
+xfs_queue_eofblocks(
+	struct xfs_mount *mp)
+{
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+		queue_delayed_work(mp->m_eofblocks_workqueue,
+				   &mp->m_eofblocks_work,
+				   msecs_to_jiffies(xfs_eofb_secs * 1000));
+	rcu_read_unlock();
+}
+
+void
+xfs_eofblocks_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+				struct xfs_mount, m_eofblocks_work);
+	xfs_icache_free_eofblocks(mp, NULL);
+	xfs_queue_eofblocks(mp);
+}
+
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
@@ -1273,6 +1299,9 @@ xfs_inode_set_eofblocks_tag(
 				   XFS_ICI_EOFBLOCKS_TAG);
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
+		/* kick off background trimming */
+		xfs_queue_eofblocks(ip->i_mount);
+
 		trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
 					      -1, _RET_IP_);
 	}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 4934a77024cf..e0f138c70a2f 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -38,6 +38,7 @@ void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
 void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
 int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+void xfs_eofblocks_worker(struct work_struct *);
 
 int xfs_sync_inode_grab(struct xfs_inode *ip);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d64..0a134ca5211c 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -118,6 +118,7 @@
 #define xfs_rotorstep		xfs_params.rotorstep.val
 #define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 #define xfs_fstrm_centisecs	xfs_params.fstrm_timer.val
+#define xfs_eofb_secs		xfs_params.eofb_timer.val
 
 #define current_cpu()		(raw_smp_processor_id())
 #define current_pid()		(current->pid)
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 6f1c997704cd..41ae7e1590f5 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1428,6 +1428,8 @@ xfs_unmountfs(
 	__uint64_t		resblks;
 	int			error;
 
+	cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
 	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a631ca3b9065..dc306a09f56f 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -196,6 +196,8 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;  /* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
+	struct delayed_work	m_eofblocks_work; /* background eof blocks
+						     trimming */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
@@ -207,6 +209,7 @@ typedef struct xfs_mount {
 	struct workqueue_struct	*m_cil_workqueue;
 	struct workqueue_struct	*m_reclaim_workqueue;
 	struct workqueue_struct	*m_log_workqueue;
+	struct workqueue_struct *m_eofblocks_workqueue;
 } xfs_mount_t;
 
 /*
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 3d9ea947e9f8..ab8839b26272 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -874,8 +874,15 @@ xfs_init_mount_workqueues(
 	if (!mp->m_log_workqueue)
 		goto out_destroy_reclaim;
 
+	mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
+			WQ_NON_REENTRANT, 0, mp->m_fsname);
+	if (!mp->m_eofblocks_workqueue)
+		goto out_destroy_log;
+
 	return 0;
 
+out_destroy_log:
+	destroy_workqueue(mp->m_log_workqueue);
 out_destroy_reclaim:
 	destroy_workqueue(mp->m_reclaim_workqueue);
 out_destroy_cil:
@@ -892,6 +899,7 @@ STATIC void
 xfs_destroy_mount_workqueues(
 	struct xfs_mount	*mp)
 {
+	destroy_workqueue(mp->m_eofblocks_workqueue);
 	destroy_workqueue(mp->m_log_workqueue);
 	destroy_workqueue(mp->m_reclaim_workqueue);
 	destroy_workqueue(mp->m_cil_workqueue);
@@ -1393,6 +1401,7 @@ xfs_fs_fill_super(
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
 
 	mp->m_super = sb;
 	sb->s_fs_info = mp;
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..2801b5ce6cdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
 		.extra1		= &xfs_params.fstrm_timer.min,
 		.extra2		= &xfs_params.fstrm_timer.max,
 	},
+	{
+		.procname	= "speculative_prealloc_lifetime",
+		.data		= &xfs_params.eofb_timer.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.eofb_timer.min,
+		.extra2		= &xfs_params.eofb_timer.max,
+	},
 	/* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
 	{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..bd8e157c20ef 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
 	xfs_sysctl_val_t rotorstep;	/* inode32 AG rotoring control knob */
 	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
+	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
 } xfs_param_t;
 
 /*
-- 
cgit v1.2.3


From 07428d7f0ca46087f7f1efa895322bb9dc1ac21d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:09:44 +1100
Subject: xfs: fix attr tree double split corruption

In certain circumstances, a double split of an attribute tree is
needed to insert or replace an attribute. In rare situations, this
can go wrong, leaving the attribute tree corrupted. In this case,
the attr being replaced is the last attr in a leaf node, and the
replacement is larger so doesn't fit in the same leaf node.
When we have the initial condition of a node format attribute
btree with two leaves at index 1 and 2. Call them L1 and L2.  The
leaf L1 is completely full, there is not a single byte of free space
in it. L2 is mostly empty.  The attribute being replaced - call it X
- is the last attribute in L1.

The way an attribute replace is executed is that the replacement
attribute - call it Y - is first inserted into the tree, but has an
INCOMPLETE flag set on it so that list traversals ignore it. Once
this transaction is committed, a second transaction it run to
atomically mark Y as COMPLETE and X as INCOMPLETE, so that a
traversal will now find Y and skip X. Once that transaction is
committed, attribute X is then removed.

So, the initial condition is:

     +--------+     +--------+
     |   L1   |     |   L2   |
     | fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |
     | fsp: 0 |     | fsp: N |
     |--------|     |--------|
     | attr A |     | attr 1 |
     |--------|     |--------|
     | attr B |     | attr 2 |
     |--------|     |--------|
     ..........     ..........
     |--------|     |--------|
     | attr X |     | attr n |
     +--------+     +--------+


So now we go to replace X, and see that L1:fsp = 0 - it is full so
we can't insert Y in the same leaf. So we record the the location of
attribute X so we can track it for later use, then we split L1 into
L1 and L3 and reblance across the two leafs. We end with:


     +--------+     +--------+     +--------+
     |   L1   |     |   L3   |     |   L2   |
     | fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|
     | attr A |     | attr X |     | attr 1 |
     |--------|     +--------+     |--------|
     | attr B |                    | attr 2 |
     |--------|                    |--------|
     ..........                    ..........
     |--------|                    |--------|
     | attr W |                    | attr n |
     +--------+                    +--------+


And we track that the original attribute is now at L3:0.

We then try to insert Y into L1 again, and find that there isn't
enough room because the new attribute is larger than the old one.
Hence we have to split again to make room for Y. We end up with
this:


     +--------+     +--------+     +--------+     +--------+
     |   L1   |     |   L4   |     |   L3   |     |   L2   |
     | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|     |--------|
     | attr A |     | attr Y |     | attr X |     | attr 1 |
     |--------|     + INCOMP +     +--------+     |--------|
     | attr B |     +--------+                    | attr 2 |
     |--------|                                   |--------|
     ..........                                   ..........
     |--------|                                   |--------|
     | attr W |                                   | attr n |
     +--------+                                   +--------+

And now we have the new (incomplete) attribute @ L4:0, and the
original attribute at L3:0. At this point, the first transaction is
committed, and we move to the flipping of the flags.

This is where we are supposed to end up with this:

     +--------+     +--------+     +--------+     +--------+
     |   L1   |     |   L4   |     |   L3   |     |   L2   |
     | fwd: 4 |---->| fwd: 3 |---->| fwd: 2 |---->| fwd: 0 |
     | bwd: 0 |<----| bwd: 1 |<----| bwd: 4 |<----| bwd: 3 |
     | fsp: M |     | fsp: J |     | fsp: J |     | fsp: N |
     |--------|     |--------|     |--------|     |--------|
     | attr A |     | attr Y |     | attr X |     | attr 1 |
     |--------|     +--------+     + INCOMP +     |--------|
     | attr B |                    +--------+     | attr 2 |
     |--------|                                   |--------|
     ..........                                   ..........
     |--------|                                   |--------|
     | attr W |                                   | attr n |
     +--------+                                   +--------+

But that doesn't happen properly - the attribute tracking indexes
are not pointing to the right locations. What we end up with is both
the old attribute to be removed pointing at L4:0 and the new
attribute at L4:1.  On a debug kernel, this assert fails like so:

XFS: Assertion failed: args->index2 < be16_to_cpu(leaf2->hdr.count), file: fs/xfs/xfs_attr_leaf.c, line: 2725

because the new attribute location does not exist. On a production
kernel, this goes unnoticed and the code proceeds ahead merrily and
removes L4 because it thinks that is the block that is no longer
needed. This leaves the hash index node pointing to entries
L1, L4 and L2, but only blocks L1, L3 and L2 to exist. Further, the
leaf level sibling list is L1 <-> L4 <-> L2, but L4 is now free
space, and so everything is busted. This corruption is caused by the
removal of the old attribute triggering a join - it joins everything
correctly but then frees the wrong block.

xfs_repair will report something like:

bad sibling back pointer for block 4 in attribute fork for inode 131
problem with attribute contents in inode 131
would clear attr fork
bad nblocks 8 for inode 131, would reset to 3
bad anextents 4 for inode 131, would reset to 0

The problem lies in the assignment of the old/new blocks for
tracking purposes when the double leaf split occurs. The first split
tries to place the new attribute inside the current leaf (i.e.
"inleaf == true") and moves the old attribute (X) to the new block.
This sets up the old block/index to L1:X, and newly allocated
block to L3:0. It then moves attr X to the new block and tries to
insert attr Y at the old index. That fails, so it splits again.

With the second split, the rebalance ends up placing the new attr in
the second new block - L4:0 - and this is where the code goes wrong.
What is does is it sets both the new and old block index to the
second new block. Hence it inserts attr Y at the right place (L4:0)
but overwrites the current location of the attr to replace that is
held in the new block index (currently L3:0). It over writes it with
L4:1 - the index we later assert fail on.

Hopefully this table will show this in a foramt that is a bit easier
to understand:

Split		old attr index		new attr index
		vanilla	patched		vanilla	patched
before 1st	L1:26	L1:26		N/A	N/A
after 1st	L3:0	L3:0		L1:26	L1:26
after 2nd	L4:0	L3:0		L4:1	L4:0
                ^^^^			^^^^
		wrong			wrong

The fix is surprisingly simple, for all this analysis - just stop
the rebalance on the out-of leaf case from overwriting the new attr
index - it's already correct for the double split case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr_leaf.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d330111ca738..70eec1829776 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1291,6 +1291,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	leaf2 = blk2->bp->b_addr;
 	ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+	ASSERT(leaf2->hdr.count == 0);
 	args = state->args;
 
 	trace_xfs_attr_leaf_rebalance(args);
@@ -1361,6 +1362,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		 * I assert that since all callers pass in an empty
 		 * second buffer, this code should never execute.
 		 */
+		ASSERT(0);
 
 		/*
 		 * Figure the total bytes to be added to the destination leaf.
@@ -1422,10 +1424,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 			args->index2 = 0;
 			args->blkno2 = blk2->blkno;
 		} else {
+			/*
+			 * On a double leaf split, the original attr location
+			 * is already stored in blkno2/index2, so don't
+			 * overwrite it overwise we corrupt the tree.
+			 */
 			blk2->index = blk1->index
 				    - be16_to_cpu(leaf1->hdr.count);
-			args->index = args->index2 = blk2->index;
-			args->blkno = args->blkno2 = blk2->blkno;
+			args->index = blk2->index;
+			args->blkno = blk2->blkno;
+			if (!state->extravalid) {
+				/*
+				 * set the new attr location to match the old
+				 * one and let the higher level split code
+				 * decide where in the leaf to place it.
+				 */
+				args->index2 = blk2->index;
+				args->blkno2 = blk2->blkno;
+			}
 		}
 	} else {
 		ASSERT(state->inleaf == 1);
-- 
cgit v1.2.3


From 7bf7f352194252e6f05981d44fb8cb55668606cd Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:09:45 +1100
Subject: xfs: fix broken error handling in xfs_vm_writepage

When we shut down the filesystem, it might first be detected in
writeback when we are allocating a inode size transaction. This
happens after we have moved all the pages into the writeback state
and unlocked them. Unfortunately, if we fail to set up the
transaction we then abort writeback and try to invalidate the
current page. This then triggers are BUG() in block_invalidatepage()
because we are trying to invalidate an unlocked page.

Fixing this is a bit of a chicken and egg problem - we can't
allocate the transaction until we've clustered all the pages into
the IO and we know the size of it (i.e. whether the last block of
the IO is beyond the current EOF or not). However, we don't want to
hold pages locked for long periods of time, especially while we lock
other pages to cluster them into the write.

To fix this, we need to make a clear delineation in writeback where
errors can only be handled by IO completion processing. That is,
once we have marked a page for writeback and unlocked it, we have to
report errors via IO completion because we've already started the
IO. We may not have submitted any IO, but we've changed the page
state to indicate that it is under IO so we must now use the IO
completion path to report errors.

To do this, add an error field to xfs_submit_ioend() to pass it the
error that occurred during the building on the ioend chain. When
this is non-zero, mark each ioend with the error and call
xfs_finish_ioend() directly rather than building bios. This will
immediately push the ioends through completion processing with the
error that has occurred.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c | 54 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 15 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e562dd43f41f..e57e2daa357c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -481,11 +481,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
  *
  * The fix is two passes across the ioend list - one to start writeback on the
  * buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked paged for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
  */
 STATIC void
 xfs_submit_ioend(
 	struct writeback_control *wbc,
-	xfs_ioend_t		*ioend)
+	xfs_ioend_t		*ioend,
+	int			fail)
 {
 	xfs_ioend_t		*head = ioend;
 	xfs_ioend_t		*next;
@@ -506,6 +512,18 @@ xfs_submit_ioend(
 		next = ioend->io_list;
 		bio = NULL;
 
+		/*
+		 * If we are failing the IO now, just mark the ioend with an
+		 * error and finish it. This will run IO completion immediately
+		 * as there is only one reference to the ioend at this point in
+		 * time.
+		 */
+		if (fail) {
+			ioend->io_error = -fail;
+			xfs_finish_ioend(ioend);
+			continue;
+		}
+
 		for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
 
 			if (!bio) {
@@ -1060,7 +1078,18 @@ xfs_vm_writepage(
 
 	xfs_start_page_writeback(page, 1, count);
 
-	if (ioend && imap_valid) {
+	/* if there is no IO to be submitted for this page, we are done */
+	if (!ioend)
+		return 0;
+
+	ASSERT(iohead);
+
+	/*
+	 * Any errors from this point onwards need tobe reported through the IO
+	 * completion path as we have marked the initial page as under writeback
+	 * and unlocked it.
+	 */
+	if (imap_valid) {
 		xfs_off_t		end_index;
 
 		end_index = imap.br_startoff + imap.br_blockcount;
@@ -1079,20 +1108,15 @@ xfs_vm_writepage(
 				  wbc, end_index);
 	}
 
-	if (iohead) {
-		/*
-		 * Reserve log space if we might write beyond the on-disk
-		 * inode size.
-		 */
-		if (ioend->io_type != XFS_IO_UNWRITTEN &&
-		    xfs_ioend_is_append(ioend)) {
-			err = xfs_setfilesize_trans_alloc(ioend);
-			if (err)
-				goto error;
-		}
 
-		xfs_submit_ioend(wbc, iohead);
-	}
+	/*
+	 * Reserve log space if we might write beyond the on-disk inode size.
+	 */
+	err = 0;
+	if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+		err = xfs_setfilesize_trans_alloc(ioend);
+
+	xfs_submit_ioend(wbc, iohead, err);
 
 	return 0;
 
-- 
cgit v1.2.3


From 37eb17e604ac7398bbb133c82f281475d704fff7 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:09:46 +1100
Subject: xfs: drop buffer io reference when a bad bio is built

Error handling in xfs_buf_ioapply_map() does not handle IO reference
counts correctly. We increment the b_io_remaining count before
building the bio, but then fail to decrement it in the failure case.
This leads to the buffer never running IO completion and releasing
the reference that the IO holds, so at unmount we can leak the
buffer. This leak is captured by this assert failure during unmount:

XFS: Assertion failed: atomic_read(&pag->pag_ref) == 0, file: fs/xfs/xfs_mount.c, line: 273

This is not a new bug - the b_io_remaining accounting has had this
problem for a long, long time - it's just very hard to get a
zero length bio being built by this code...

Further, the buffer IO error can be overwritten on a multi-segment
buffer by subsequent bio completions for partial sections of the
buffer. Hence we should only set the buffer error status if the
buffer is not already carrying an error status. This ensures that a
partial IO error on a multi-segment buffer will not be lost. This
part of the problem is a regression, however.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_buf.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 933b7930b863..4b0b8dd1b7b0 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1197,9 +1197,14 @@ xfs_buf_bio_end_io(
 {
 	xfs_buf_t		*bp = (xfs_buf_t *)bio->bi_private;
 
-	xfs_buf_ioerror(bp, -error);
+	/*
+	 * don't overwrite existing errors - otherwise we can lose errors on
+	 * buffers that require multiple bios to complete.
+	 */
+	if (!bp->b_error)
+		xfs_buf_ioerror(bp, -error);
 
-	if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+	if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
 		invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
 
 	_xfs_buf_ioend(bp, 1);
@@ -1279,6 +1284,11 @@ next_chunk:
 		if (size)
 			goto next_chunk;
 	} else {
+		/*
+		 * This is guaranteed not to be the last io reference count
+		 * because the caller (xfs_buf_iorequest) holds a count itself.
+		 */
+		atomic_dec(&bp->b_io_remaining);
 		xfs_buf_ioerror(bp, EIO);
 		bio_put(bio);
 	}
-- 
cgit v1.2.3


From ee73259b401317117e7f5d4834c270b10b12bc8e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:53:53 +1100
Subject: xfs: add more attribute tree trace points.

Added when debugging recent attribute tree problems to more finely
trace code execution through the maze of twisty passages that makes
up the attr code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr.c      | 18 +++++++++++++++++
 fs/xfs/xfs_attr_leaf.c | 37 ++++++++++++++++++++--------------
 fs/xfs/xfs_da_btree.c  |  6 ++++++
 fs/xfs/xfs_trace.h     | 54 +++++++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 99 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d2..55bbe98e8f82 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -1155,6 +1155,8 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 	struct xfs_buf *bp;
 	int error;
 
+	trace_xfs_attr_leaf_get(args);
+
 	args->blkno = 0;
 	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
 					     XFS_ATTR_FORK);
@@ -1185,6 +1187,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 	int error;
 	struct xfs_buf *bp;
 
+	trace_xfs_attr_leaf_list(context);
+
 	context->cursor->blkno = 0;
 	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
 	if (error)
@@ -1653,6 +1657,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
 	xfs_da_state_blk_t *blk;
 	int level;
 
+	trace_xfs_attr_fillstate(state->args);
+
 	/*
 	 * Roll down the "path" in the state structure, storing the on-disk
 	 * block number for those buffers in the "path".
@@ -1699,6 +1705,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	xfs_da_state_blk_t *blk;
 	int level, error;
 
+	trace_xfs_attr_refillstate(state->args);
+
 	/*
 	 * Roll down the "path" in the state structure, storing the on-disk
 	 * block number for those buffers in the "path".
@@ -1755,6 +1763,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
 	int error, retval;
 	int i;
 
+	trace_xfs_attr_node_get(args);
+
 	state = xfs_da_state_alloc();
 	state->args = args;
 	state->mp = args->dp->i_mount;
@@ -1804,6 +1814,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	int error, i;
 	struct xfs_buf *bp;
 
+	trace_xfs_attr_node_list(context);
+
 	cursor = context->cursor;
 	cursor->initted = 1;
 
@@ -1959,6 +1971,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 	int nmap, error, tmp, valuelen, blkcnt, i;
 	xfs_dablk_t lblkno;
 
+	trace_xfs_attr_rmtval_get(args);
+
 	ASSERT(!(args->flags & ATTR_KERNOVAL));
 
 	mp = args->dp->i_mount;
@@ -2014,6 +2028,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
 	xfs_dablk_t lblkno;
 	int blkcnt, valuelen, nmap, error, tmp, committed;
 
+	trace_xfs_attr_rmtval_set(args);
+
 	dp = args->dp;
 	mp = dp->i_mount;
 	src = args->value;
@@ -2143,6 +2159,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
 	xfs_dablk_t lblkno;
 	int valuelen, blkcnt, nmap, error, done, committed;
 
+	trace_xfs_attr_rmtval_remove(args);
+
 	mp = args->dp->i_mount;
 
 	/*
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 70eec1829776..4bfc732bc9c9 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
 				struct xfs_buf **bpp);
 STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
 				  xfs_da_args_t *args, int freemap_index);
-STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
+				  struct xfs_buf *leaf_buffer);
 STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
 						   xfs_da_state_blk_t *blk1,
 						   xfs_da_state_blk_t *blk2);
@@ -1071,7 +1072,7 @@ xfs_attr_leaf_add(
 	 * Compact the entries to coalesce free space.
 	 * This may change the hdr->count via dropping INCOMPLETE entries.
 	 */
-	xfs_attr_leaf_compact(args->trans, bp);
+	xfs_attr_leaf_compact(args, bp);
 
 	/*
 	 * After compaction, the block is guaranteed to have only one
@@ -1102,6 +1103,8 @@ xfs_attr_leaf_add_work(
 	xfs_mount_t *mp;
 	int tmp, i;
 
+	trace_xfs_attr_leaf_add_work(args);
+
 	leaf = bp->b_addr;
 	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	hdr = &leaf->hdr;
@@ -1214,15 +1217,17 @@ xfs_attr_leaf_add_work(
  */
 STATIC void
 xfs_attr_leaf_compact(
-	struct xfs_trans *trans,
-	struct xfs_buf	*bp)
+	struct xfs_da_args	*args,
+	struct xfs_buf		*bp)
 {
-	xfs_attr_leafblock_t *leaf_s, *leaf_d;
-	xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
-	xfs_mount_t *mp;
-	char *tmpbuffer;
+	xfs_attr_leafblock_t	*leaf_s, *leaf_d;
+	xfs_attr_leaf_hdr_t	*hdr_s, *hdr_d;
+	struct xfs_trans	*trans = args->trans;
+	struct xfs_mount	*mp = trans->t_mountp;
+	char			*tmpbuffer;
+
+	trace_xfs_attr_leaf_compact(args);
 
-	mp = trans->t_mountp;
 	tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
 	ASSERT(tmpbuffer != NULL);
 	memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1345,9 +1350,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		max  = be16_to_cpu(hdr2->firstused)
 						- sizeof(xfs_attr_leaf_hdr_t);
 		max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
-		if (space > max) {
-			xfs_attr_leaf_compact(args->trans, blk2->bp);
-		}
+		if (space > max)
+			xfs_attr_leaf_compact(args, blk2->bp);
 
 		/*
 		 * Move high entries from leaf1 to low end of leaf2.
@@ -1378,9 +1382,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 		max  = be16_to_cpu(hdr1->firstused)
 						- sizeof(xfs_attr_leaf_hdr_t);
 		max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
-		if (space > max) {
-			xfs_attr_leaf_compact(args->trans, blk1->bp);
-		}
+		if (space > max)
+			xfs_attr_leaf_compact(args, blk1->bp);
 
 		/*
 		 * Move low entries from leaf2 to high end of leaf1.
@@ -1577,6 +1580,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 	xfs_dablk_t blkno;
 	struct xfs_buf *bp;
 
+	trace_xfs_attr_leaf_toosmall(state->args);
+
 	/*
 	 * Check for the degenerate case of the block being over 50% full.
 	 * If so, it's not worth even looking to see if we might be able
@@ -1702,6 +1707,8 @@ xfs_attr_leaf_remove(
 	int tablesize, tmp, i;
 	xfs_mount_t *mp;
 
+	trace_xfs_attr_leaf_remove(args);
+
 	leaf = bp->b_addr;
 	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	hdr = &leaf->hdr;
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7bfb7dd334fc..c62e7e6ff50e 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -779,6 +779,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 	xfs_dablk_t blkno;
 	struct xfs_buf *bp;
 
+	trace_xfs_da_node_toosmall(state->args);
+
 	/*
 	 * Check for the degenerate case of the block being over 50% full.
 	 * If so, it's not worth even looking to see if we might be able
@@ -900,6 +902,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
 	xfs_dahash_t lasthash=0;
 	int level, count;
 
+	trace_xfs_da_fixhashpath(state->args);
+
 	level = path->active-1;
 	blk = &path->blk[ level ];
 	switch (blk->magic) {
@@ -1417,6 +1421,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 	xfs_dablk_t blkno=0;
 	int level, error;
 
+	trace_xfs_da_path_shift(state->args);
+
 	/*
 	 * Roll up the Btree looking for the first block where our
 	 * current index is not at the edge of the block.  Note that
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index cb5234632072..2e137d4a85ae 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
 
 DECLARE_EVENT_CLASS(xfs_perag_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -1502,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
 DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
 DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
 
+DECLARE_EVENT_CLASS(xfs_attr_class,
+	TP_PROTO(struct xfs_da_args *args),
+	TP_ARGS(args),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__dynamic_array(char, name, args->namelen)
+		__field(int, namelen)
+		__field(int, valuelen)
+		__field(xfs_dahash_t, hashval)
+		__field(int, op_flags)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+		__entry->ino = args->dp->i_ino;
+		if (args->namelen)
+			memcpy(__get_str(name), args->name, args->namelen);
+		__entry->namelen = args->namelen;
+		__entry->valuelen = args->valuelen;
+		__entry->hashval = args->hashval;
+		__entry->op_flags = args->op_flags;
+	),
+	TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
+		  "hashval 0x%x op_flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->namelen,
+		  __entry->namelen ? __get_str(name) : NULL,
+		  __entry->namelen,
+		  __entry->valuelen,
+		  __entry->hashval,
+		  __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
 #define DEFINE_ATTR_EVENT(name) \
-DEFINE_EVENT(xfs_da_class, name, \
+DEFINE_EVENT(xfs_attr_class, name, \
 	TP_PROTO(struct xfs_da_args *args), \
 	TP_ARGS(args))
 DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1517,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
@@ -1532,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
 
 DEFINE_ATTR_EVENT(xfs_attr_node_addname);
+DEFINE_ATTR_EVENT(xfs_attr_node_get);
 DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_node_replace);
 DEFINE_ATTR_EVENT(xfs_attr_node_removename);
 
+DEFINE_ATTR_EVENT(xfs_attr_fillstate);
+DEFINE_ATTR_EVENT(xfs_attr_refillstate);
+
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
+
 #define DEFINE_DA_EVENT(name) \
 DEFINE_EVENT(xfs_da_class, name, \
 	TP_PROTO(struct xfs_da_args *args), \
@@ -1556,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
 DEFINE_DA_EVENT(xfs_da_node_remove);
 DEFINE_DA_EVENT(xfs_da_node_rebalance);
 DEFINE_DA_EVENT(xfs_da_node_unbalance);
+DEFINE_DA_EVENT(xfs_da_node_toosmall);
 DEFINE_DA_EVENT(xfs_da_swap_lastblock);
 DEFINE_DA_EVENT(xfs_da_grow_inode);
 DEFINE_DA_EVENT(xfs_da_shrink_inode);
+DEFINE_DA_EVENT(xfs_da_fixhashpath);
+DEFINE_DA_EVENT(xfs_da_path_shift);
 
 DECLARE_EVENT_CLASS(xfs_dir2_space_class,
 	TP_PROTO(struct xfs_da_args *args, int idx),
-- 
cgit v1.2.3


From b64f3a390d3477517cbff7d613e551705540769b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 13 Nov 2012 16:40:27 -0600
Subject: xfs: use btree block initialisation functions in growfs

Factor xfs_btree_init_block() to be independent of the btree cursor,
and use the function to initialise btree blocks in the growfs code.
This makes adding support for different format btree blocks simple.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by Rich Johnston <rjohnston@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_btree.c | 33 ++++++++++++++++++++++++---------
 fs/xfs/xfs_btree.h | 11 +++++++++++
 fs/xfs/xfs_fsops.c | 37 +++++++++++++------------------------
 3 files changed, 48 insertions(+), 33 deletions(-)

diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b1582..121ea99e615a 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -853,18 +853,22 @@ xfs_btree_set_sibling(
 	}
 }
 
-STATIC void
+void
 xfs_btree_init_block(
-	struct xfs_btree_cur	*cur,
-	int			level,
-	int			numrecs,
-	struct xfs_btree_block	*new)	/* new block */
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	__u32		magic,
+	__u16		level,
+	__u16		numrecs,
+	unsigned int	flags)
 {
-	new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+	struct xfs_btree_block	*new = XFS_BUF_TO_BLOCK(bp);
+
+	new->bb_magic = cpu_to_be32(magic);
 	new->bb_level = cpu_to_be16(level);
 	new->bb_numrecs = cpu_to_be16(numrecs);
 
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+	if (flags & XFS_BTREE_LONG_PTRS) {
 		new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
 		new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
 	} else {
@@ -873,6 +877,17 @@ xfs_btree_init_block(
 	}
 }
 
+STATIC void
+xfs_btree_init_block_cur(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			numrecs,
+	struct xfs_buf		*bp)
+{
+	xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
+			       level, numrecs, cur->bc_flags);
+}
+
 /*
  * Return true if ptr is the last record in the btree and
  * we need to track updateѕ to this record.  The decision
@@ -2183,7 +2198,7 @@ xfs_btree_split(
 		goto error0;
 
 	/* Fill in the btree header for the new right block. */
-	xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+	xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
 
 	/*
 	 * Split the entries between the old and the new block evenly.
@@ -2492,7 +2507,7 @@ xfs_btree_new_root(
 		nptr = 2;
 	}
 	/* Fill in the new block's btree header and log it. */
-	xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+	xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
 	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
 	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
 			!xfs_btree_ptr_is_null(cur, &rptr));
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c0..c9cf2d00e236 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -378,6 +378,17 @@ xfs_btree_reada_bufs(
 	xfs_agblock_t		agbno,	/* allocation group block number */
 	xfs_extlen_t		count);	/* count of filesystem blocks */
 
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+	struct xfs_mount *mp,
+	struct xfs_buf	*bp,
+	__u32		magic,
+	__u16		level,
+	__u16		numrecs,
+	unsigned int	flags);
 
 /*
  * Common btree core entry points.
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7b0a997cf62b..a5034af35db7 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -125,7 +125,6 @@ xfs_growfs_data_private(
 	xfs_extlen_t		agsize;
 	xfs_extlen_t		tmpsize;
 	xfs_alloc_rec_t		*arec;
-	struct xfs_btree_block	*block;
 	xfs_buf_t		*bp;
 	int			bucket;
 	int			dpct;
@@ -263,17 +262,14 @@ xfs_growfs_data_private(
 			error = ENOMEM;
 			goto error0;
 		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+		xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
@@ -289,18 +285,15 @@ xfs_growfs_data_private(
 			error = ENOMEM;
 			goto error0;
 		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = cpu_to_be16(1);
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-		arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+		xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+
+		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
 			agsize - be32_to_cpu(arec->ar_startblock));
 		nfree += be32_to_cpu(arec->ar_blockcount);
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
@@ -316,13 +309,9 @@ xfs_growfs_data_private(
 			error = ENOMEM;
 			goto error0;
 		}
-		block = XFS_BUF_TO_BLOCK(bp);
-		memset(block, 0, mp->m_sb.sb_blocksize);
-		block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
-		block->bb_level = 0;
-		block->bb_numrecs = 0;
-		block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-		block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+		xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
 		if (error)
-- 
cgit v1.2.3


From fd23683c3b1ab905cba61ea2981c156f4bf52845 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:53:59 +1100
Subject: xfs: growfs: use uncached buffers for new headers

When writing the new AG headers to disk, we can't attach write
verifiers because they have a dependency on the struct xfs-perag
being attached to the buffer to be fully initialised and growfs
can't fully initialise them until later in the process.

The simplest way to avoid this problem is to use uncached buffers
for writing the new headers. These buffers don't have the xfs-perag
attached to them, so it's simple to detect in the write verifier and
be able to skip the checks that need the xfs-perag.

This enables us to attach the appropriate buffer ops to the buffer
and hence calculate CRCs on the way to disk. IT also means that the
buffer is torn down immediately, and so the first access to the AG
headers will re-read the header from disk and perform full
verification of the buffer. This way we also can catch corruptions
due to problems that went undetected in growfs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by Rich Johnston <rjohnston@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fsops.c | 63 +++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a5034af35db7..2196830bf5c0 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -114,6 +114,26 @@ xfs_fs_geometry(
 	return 0;
 }
 
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+	struct xfs_mount	*mp,
+	xfs_daddr_t		blkno,
+	size_t			numblks,
+	int			flags)
+{
+	struct xfs_buf		*bp;
+
+	bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+	if (!bp)
+		return NULL;
+
+	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+	bp->b_bn = blkno;
+	bp->b_maps[0].bm_bn = blkno;
+
+	return bp;
+}
+
 static int
 xfs_growfs_data_private(
 	xfs_mount_t		*mp,		/* mount point for filesystem */
@@ -189,15 +209,15 @@ xfs_growfs_data_private(
 		/*
 		 * AG freelist header block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-				 XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-				 XFS_FSS_TO_BB(mp, 1), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
+
 		agf = XFS_BUF_TO_AGF(bp);
-		memset(agf, 0, mp->m_sb.sb_sectsize);
 		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
 		agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
 		agf->agf_seqno = cpu_to_be32(agno);
@@ -226,15 +246,15 @@ xfs_growfs_data_private(
 		/*
 		 * AG inode header block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-				 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-				 XFS_FSS_TO_BB(mp, 1), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
+
 		agi = XFS_BUF_TO_AGI(bp);
-		memset(agi, 0, mp->m_sb.sb_sectsize);
 		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
 		agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
 		agi->agi_seqno = cpu_to_be32(agno);
@@ -255,16 +275,16 @@ xfs_growfs_data_private(
 		/*
 		 * BNO btree root block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-				 XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-				 BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0);
+
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-		xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
 
+		xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
 		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
@@ -278,16 +298,15 @@ xfs_growfs_data_private(
 		/*
 		 * CNT btree root block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-				 XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-				 BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-		xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
 
+		xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
 		arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
 		arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
 		arec->ar_blockcount = cpu_to_be32(
@@ -302,14 +321,14 @@ xfs_growfs_data_private(
 		/*
 		 * INO btree root block
 		 */
-		bp = xfs_buf_get(mp->m_ddev_targp,
-				 XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-				 BTOBB(mp->m_sb.sb_blocksize), 0);
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+
 		xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
 
 		error = xfs_bwrite(bp);
-- 
cgit v1.2.3


From de497688daaabbab425a8a969528272ec1d962a6 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:00 +1100
Subject: xfs: make growfs initialise the AGFL header

For verification purposes, AGFLs need to be initialised to a known
set of values. For upcoming CRC changes, they are also headers that
need to be initialised. Currently, growfs does neither for the AGFLs
- it ignores them completely. Add initialisation of the AGFL to be
full of invalid block numbers (NULLAGBLOCK) to put the
infrastructure in place needed for CRC support.

Includes a comment clarification from Jeff Liu.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by Rich Johnston <rjohnston@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fsops.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 2196830bf5c0..bd9cb7f0b073 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -140,6 +140,7 @@ xfs_growfs_data_private(
 	xfs_growfs_data_t	*in)		/* growfs data input struct */
 {
 	xfs_agf_t		*agf;
+	struct xfs_agfl		*agfl;
 	xfs_agi_t		*agi;
 	xfs_agnumber_t		agno;
 	xfs_extlen_t		agsize;
@@ -207,7 +208,7 @@ xfs_growfs_data_private(
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
 		/*
-		 * AG freelist header block
+		 * AG freespace header block
 		 */
 		bp = xfs_growfs_get_hdr_buf(mp,
 				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
@@ -243,6 +244,26 @@ xfs_growfs_data_private(
 		if (error)
 			goto error0;
 
+		/*
+		 * AG freelist header block
+		 */
+		bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+				XFS_FSS_TO_BB(mp, 1), 0);
+		if (!bp) {
+			error = ENOMEM;
+			goto error0;
+		}
+
+		agfl = XFS_BUF_TO_AGFL(bp);
+		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+		error = xfs_bwrite(bp);
+		xfs_buf_relse(bp);
+		if (error)
+			goto error0;
+
 		/*
 		 * AG inode header block
 		 */
-- 
cgit v1.2.3


From f5b8911b67eb4f15d95d5e5324d376d4a49d56e8 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 14 Nov 2012 17:42:47 +1100
Subject: xfs: remove xfs_tosspages

It's a buggy, unnecessary wrapper that is duplicating
truncate_pagecache_range().

When replacing the call in xfs_change_file_space(), also ensure that
the length being allocated/freed is always positive before making
any changes. These checks are done in the lower extent manipulation
functions, too, but we need to do them before any page cache
operations.

Reported-by: Andrew Dahl <adahl@sgi.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-By: Andrew Dahl <adahl@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dfrag.c    |  3 +--
 fs/xfs/xfs_fs_subr.c  | 12 ------------
 fs/xfs/xfs_vnodeops.c | 30 +++++++++++++++++++++++++-----
 fs/xfs/xfs_vnodeops.h |  2 --
 4 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b9b8646e62db..b2c63a28afa7 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -315,8 +315,7 @@ xfs_swap_extents(
 	 * are safe.  We don't really care if non-io related
 	 * fields change.
 	 */
-
-	xfs_tosspages(ip, 0, -1, FI_REMAPF);
+	truncate_pagecache_range(VFS_I(ip), 0, -1);
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
 	if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index 652b875a9d4c..d49de3d70456 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -25,18 +25,6 @@
  * note: all filemap functions return negative error codes. These
  * need to be inverted before returning to the xfs core functions.
  */
-void
-xfs_tosspages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	/* can't toss partial tail pages, so mask them out */
-	last &= ~(PAGE_SIZE - 1);
-	truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
 int
 xfs_flushinval_pages(
 	xfs_inode_t	*ip,
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c2ddd7a43942..de3702a57e55 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2118,7 +2118,7 @@ xfs_change_file_space(
 	xfs_fsize_t	fsize;
 	int		setprealloc;
 	xfs_off_t	startoffset;
-	xfs_off_t	llen;
+	xfs_off_t	end;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
 	int		prealloc_type;
@@ -2139,12 +2139,30 @@ xfs_change_file_space(
 		return XFS_ERROR(EINVAL);
 	}
 
-	llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
+	/*
+	 * length of <= 0 for resv/unresv/zero is invalid.  length for
+	 * alloc/free is ignored completely and we have no idea what userspace
+	 * might have set it to, so set it to zero to allow range
+	 * checks to pass.
+	 */
+	switch (cmd) {
+	case XFS_IOC_ZERO_RANGE:
+	case XFS_IOC_RESVSP:
+	case XFS_IOC_RESVSP64:
+	case XFS_IOC_UNRESVSP:
+	case XFS_IOC_UNRESVSP64:
+		if (bf->l_len <= 0)
+			return XFS_ERROR(EINVAL);
+		break;
+	default:
+		bf->l_len = 0;
+		break;
+	}
 
 	if (bf->l_start < 0 ||
 	    bf->l_start > mp->m_super->s_maxbytes ||
-	    bf->l_start + llen < 0 ||
-	    bf->l_start + llen > mp->m_super->s_maxbytes)
+	    bf->l_start + bf->l_len < 0 ||
+	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
 		return XFS_ERROR(EINVAL);
 
 	bf->l_whence = 0;
@@ -2169,7 +2187,9 @@ xfs_change_file_space(
 	switch (cmd) {
 	case XFS_IOC_ZERO_RANGE:
 		prealloc_type |= XFS_BMAPI_CONVERT;
-		xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
+		end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1;
+		if (startoffset > end)
+			truncate_pagecache_range(VFS_I(ip), startoffset, end);
 		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 52fafc416a0c..d48141d6bc3b 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
-void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
-		xfs_off_t last, int fiopt);
 int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
 		xfs_off_t last, int fiopt);
 int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
-- 
cgit v1.2.3


From d6638ae244f6323fcdf85e72eb4a5af6f6212893 Mon Sep 17 00:00:00 2001
From: Andrew Dahl <adahl@sgi.com>
Date: Wed, 14 Nov 2012 12:52:26 -0600
Subject: xfs: reverse the check on XFS_IOC_ZERO_RANGE

Reversing the check on XFS_IOC_ZERO_RANGE.

Range should be zeroed if the start is less than or equal to the end.

Signed-off-by: Andrew Dahl <adahl@sgi.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_vnodeops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index de3702a57e55..46a7a5de5d6d 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2188,7 +2188,7 @@ xfs_change_file_space(
 	case XFS_IOC_ZERO_RANGE:
 		prealloc_type |= XFS_BMAPI_CONVERT;
 		end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1;
-		if (startoffset > end)
+		if (startoffset <= end)
 			truncate_pagecache_range(VFS_I(ip), startoffset, end);
 		/* FALLTHRU */
 	case XFS_IOC_RESVSP:
-- 
cgit v1.2.3


From 95eacf0f71b7682a05b8242c49c68e8e4bb673e3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:53:55 +1100
Subject: xfs: remove xfs_wait_on_pages()

It's just a simple wrapper around a VFS function that is only called
by another function in xfs_fs_subr.c. Remove it and call the VFS
function directly.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Andrew Dahl <adahl@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fs_subr.c  | 18 ++----------------
 fs/xfs/xfs_vnodeops.h |  1 -
 2 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index d49de3d70456..33658234dfc5 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -62,23 +62,9 @@ xfs_flush_pages(
 				last == -1 ? LLONG_MAX : last);
 	if (flags & XBF_ASYNC)
 		return ret;
-	ret2 = xfs_wait_on_pages(ip, first, last);
+	ret2 = -filemap_fdatawait_range(mapping, first,
+				last == -1 ? XFS_ISIZE(ip) - 1 : last);
 	if (!ret)
 		ret = ret2;
 	return ret;
 }
-
-int
-xfs_wait_on_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-	if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
-		return -filemap_fdatawait_range(mapping, first,
-					last == -1 ? XFS_ISIZE(ip) - 1 : last);
-	}
-	return 0;
-}
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index d48141d6bc3b..c8ad48b61a25 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -52,7 +52,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
 		xfs_off_t last, int fiopt);
 int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
 		xfs_off_t last, uint64_t flags, int fiopt);
-int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-- 
cgit v1.2.3


From 4bc1ea6b8ddd4f2bd78944fbe5a1042ac14b1f5f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:53:56 +1100
Subject: xfs: remove xfs_flush_pages

It is a complex wrapper around VFS functions, but there are VFS
functions that provide exactly the same functionality. Call the VFS
functions directly and remove the unnecessary indirection and
complexity.

We don't need to care about clearing the XFS_ITRUNCATED flag, as
that is done during .writepages. Hence is cleared by the VFS
writeback path if there is anything to write back during the flush.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Andrew Dahl <adahl@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c     |  2 +-
 fs/xfs/xfs_bmap.c     |  2 +-
 fs/xfs/xfs_fs_subr.c  | 24 ------------------------
 fs/xfs/xfs_iops.c     |  4 ++--
 fs/xfs/xfs_vnodeops.c |  7 +++++--
 fs/xfs/xfs_vnodeops.h |  2 --
 6 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e57e2daa357c..71361da1f77c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1641,7 +1641,7 @@ xfs_vm_bmap(
 
 	trace_xfs_vm_bmap(XFS_I(inode));
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+	filemap_write_and_wait(mapping);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 83d0cf3df930..a60f3d1f151c 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5599,7 +5599,7 @@ xfs_getbmap(
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
 		if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
-			error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+			error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
 			if (error)
 				goto out_unlock_iolock;
 		}
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
index 33658234dfc5..b5380893728e 100644
--- a/fs/xfs/xfs_fs_subr.c
+++ b/fs/xfs/xfs_fs_subr.c
@@ -44,27 +44,3 @@ xfs_flushinval_pages(
 		truncate_inode_pages_range(mapping, first, last);
 	return -ret;
 }
-
-int
-xfs_flush_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	uint64_t	flags,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-	int		ret2;
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = -filemap_fdatawrite_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (flags & XBF_ASYNC)
-		return ret;
-	ret2 = -filemap_fdatawait_range(mapping, first,
-				last == -1 ? XFS_ISIZE(ip) - 1 : last);
-	if (!ret)
-		ret = ret2;
-	return ret;
-}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 81f5c4953287..d82efaa2ac73 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -780,8 +780,8 @@ xfs_setattr_size(
 	 * care about here.
 	 */
 	if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
-		error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
-					FI_NONE);
+		error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						      ip->i_d.di_size, newsize);
 		if (error)
 			goto out_unlock;
 	}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 46a7a5de5d6d..c00326afa7bf 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -428,8 +428,11 @@ xfs_release(
 		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
 		if (truncated) {
 			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-			if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
-				xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
+			if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+				error = -filemap_flush(VFS_I(ip)->i_mapping);
+				if (error)
+					return error;
+			}
 		}
 	}
 
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index c8ad48b61a25..73cb3cb15f75 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -50,8 +50,6 @@ int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
 int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
 		xfs_off_t last, int fiopt);
-int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
-		xfs_off_t last, uint64_t flags, int fiopt);
 
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-- 
cgit v1.2.3


From fb59581404ab7ec5075299065c22cb211a9262a9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:53:57 +1100
Subject: xfs: remove xfs_flushinval_pages

It's just a simple wrapper around VFS functionality, and is actually
bugging in that it doesn't remove mappings before invalidating the
page cache. Remove it and replace it with the correct VFS
functionality.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Andrew Dahl <adahl@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/Makefile       |  1 -
 fs/xfs/xfs_dfrag.c    | 10 ++++------
 fs/xfs/xfs_file.c     | 23 ++++++++++++-----------
 fs/xfs/xfs_fs_subr.c  | 46 ----------------------------------------------
 fs/xfs/xfs_vnodeops.c | 11 +++++------
 fs/xfs/xfs_vnodeops.h |  2 --
 6 files changed, 21 insertions(+), 72 deletions(-)
 delete mode 100644 fs/xfs/xfs_fs_subr.c

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index e65357bb3dc6..d02201df855b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,7 +37,6 @@ xfs-y				+= xfs_aops.o \
 				   xfs_file.o \
 				   xfs_filestream.o \
 				   xfs_fsops.o \
-				   xfs_fs_subr.o \
 				   xfs_globals.o \
 				   xfs_icache.o \
 				   xfs_ioctl.o \
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b2c63a28afa7..d0e9c74d3d96 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,12 +246,10 @@ xfs_swap_extents(
 		goto out_unlock;
 	}
 
-	if (VN_CACHED(VFS_I(tip)) != 0) {
-		error = xfs_flushinval_pages(tip, 0, -1,
-				FI_REMAPF_LOCKED);
-		if (error)
-			goto out_unlock;
-	}
+	error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		goto out_unlock;
+	truncate_pagecache_range(VFS_I(ip), 0, -1);
 
 	/* Verify O_DIRECT for ftmp */
 	if (VN_CACHED(VFS_I(tip)) != 0) {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index daf4066c24b2..c42f99e71f14 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -255,15 +255,14 @@ xfs_file_aio_read(
 		xfs_buftarg_t	*target =
 			XFS_IS_REALTIME_INODE(ip) ?
 				mp->m_rtdev_targp : mp->m_ddev_targp;
-		if ((iocb->ki_pos & target->bt_smask) ||
-		    (size & target->bt_smask)) {
-			if (iocb->ki_pos == i_size_read(inode))
+		if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+			if (pos == i_size_read(inode))
 				return 0;
 			return -XFS_ERROR(EINVAL);
 		}
 	}
 
-	n = mp->m_super->s_maxbytes - iocb->ki_pos;
+	n = mp->m_super->s_maxbytes - pos;
 	if (n <= 0 || size == 0)
 		return 0;
 
@@ -289,20 +288,21 @@ xfs_file_aio_read(
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
 		if (inode->i_mapping->nrpages) {
-			ret = -xfs_flushinval_pages(ip,
-					(iocb->ki_pos & PAGE_CACHE_MASK),
-					-1, FI_REMAPF_LOCKED);
+			ret = -filemap_write_and_wait_range(
+							VFS_I(ip)->i_mapping,
+							pos, -1);
 			if (ret) {
 				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
 				return ret;
 			}
+			truncate_pagecache_range(VFS_I(ip), pos, -1);
 		}
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
 
-	trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+	trace_xfs_file_read(ip, size, pos, ioflags);
 
-	ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+	ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -670,10 +670,11 @@ xfs_file_dio_aio_write(
 		goto out;
 
 	if (mapping->nrpages) {
-		ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
-							FI_REMAPF_LOCKED);
+		ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+						    pos, -1);
 		if (ret)
 			goto out;
+		truncate_pagecache_range(VFS_I(ip), pos, -1);
 	}
 
 	/*
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index b5380893728e..000000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-int
-xfs_flushinval_pages(
-	xfs_inode_t	*ip,
-	xfs_off_t	first,
-	xfs_off_t	last,
-	int		fiopt)
-{
-	struct address_space *mapping = VFS_I(ip)->i_mapping;
-	int		ret = 0;
-
-	trace_xfs_pagecache_inval(ip, first, last);
-
-	xfs_iflags_clear(ip, XFS_ITRUNCATED);
-	ret = filemap_write_and_wait_range(mapping, first,
-				last == -1 ? LLONG_MAX : last);
-	if (!ret)
-		truncate_inode_pages_range(mapping, first, last);
-	return -ret;
-}
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index c00326afa7bf..81c61fd17890 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -1958,12 +1958,11 @@ xfs_free_file_space(
 
 	rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
 	ioffset = offset & ~(rounding - 1);
-
-	if (VN_CACHED(VFS_I(ip)) != 0) {
-		error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
-		if (error)
-			goto out_unlock_iolock;
-	}
+	error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+					      ioffset, -1);
+	if (error)
+		goto out_unlock_iolock;
+	truncate_pagecache_range(VFS_I(ip), ioffset, -1);
 
 	/*
 	 * Need to zero the stuff we're not freeing, on disk.
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 73cb3cb15f75..91a03fa3814f 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,8 +48,6 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
-int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
-		xfs_off_t last, int fiopt);
 
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
-- 
cgit v1.2.3


From c3f8fc73ac97b76a12692088ef9cace9af8422c0 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:01 +1100
Subject: xfs: make buffer read verication an IO completion function

Add a verifier function callback capability to the buffer read
interfaces.  This will be used by the callers to supply a function
that verifies the contents of the buffer when it is read from disk.
This patch does not provide callback functions, but simply modifies
the interfaces to allow them to be called.

The reason for adding this to the read interfaces is that it is very
difficult to tell fom the outside is a buffer was just read from
disk or whether we just pulled it out of cache. Supplying a callbck
allows the buffer cache to use it's internal knowledge of the buffer
to execute it only when the buffer is read from disk.

It is intended that the verifier functions will mark the buffer with
an EFSCORRUPTED error when verification fails. This allows the
reading context to distinguish a verification error from an IO
error, and potentially take further actions on the buffer (e.g.
attempt repair) based on the error reported.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c       |  4 ++--
 fs/xfs/xfs_attr.c        |  2 +-
 fs/xfs/xfs_btree.c       | 21 ++++++++++++---------
 fs/xfs/xfs_buf.c         | 13 +++++++++----
 fs/xfs/xfs_buf.h         | 20 ++++++++++++--------
 fs/xfs/xfs_da_btree.c    |  4 ++--
 fs/xfs/xfs_dir2_leaf.c   |  2 +-
 fs/xfs/xfs_dquot.c       |  4 ++--
 fs/xfs/xfs_fsops.c       |  4 ++--
 fs/xfs/xfs_ialloc.c      |  2 +-
 fs/xfs/xfs_inode.c       |  2 +-
 fs/xfs/xfs_log.c         |  3 +--
 fs/xfs/xfs_log_recover.c |  8 +++++---
 fs/xfs/xfs_mount.c       |  6 +++---
 fs/xfs/xfs_qm.c          |  5 +++--
 fs/xfs/xfs_rtalloc.c     |  6 +++---
 fs/xfs/xfs_trans.h       | 19 ++++++++-----------
 fs/xfs/xfs_trans_buf.c   |  9 ++++++---
 fs/xfs/xfs_vnodeops.c    |  2 +-
 19 files changed, 75 insertions(+), 61 deletions(-)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 335206a9c698..21c3db08fd01 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -447,7 +447,7 @@ xfs_alloc_read_agfl(
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp);
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
@@ -2110,7 +2110,7 @@ xfs_read_agf(
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), flags, bpp);
+			XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL);
 	if (error)
 		return error;
 	if (!*bpp)
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 55bbe98e8f82..474c57a43cce 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -1994,7 +1994,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
 			dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
 			blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-						   dblkno, blkcnt, 0, &bp);
+						   dblkno, blkcnt, 0, &bp, NULL);
 			if (error)
 				return(error);
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 121ea99e615a..7e791160092d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,12 @@ xfs_btree_dup_cursor(
 	for (i = 0; i < new->bc_nlevels; i++) {
 		new->bc_ptrs[i] = cur->bc_ptrs[i];
 		new->bc_ra[i] = cur->bc_ra[i];
-		if ((bp = cur->bc_bufs[i])) {
-			if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-				XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+		bp = cur->bc_bufs[i];
+		if (bp) {
+			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+						   XFS_BUF_ADDR(bp), mp->m_bsize,
+						   0, &bp, NULL);
+			if (error) {
 				xfs_btree_del_cursor(new, error);
 				*ncur = NULL;
 				return error;
@@ -624,10 +627,10 @@ xfs_btree_read_bufl(
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-			mp->m_bsize, lock, &bp))) {
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+				   mp->m_bsize, lock, &bp, NULL);
+	if (error)
 		return error;
-	}
 	ASSERT(!xfs_buf_geterror(bp));
 	if (bp)
 		xfs_buf_set_ref(bp, refval);
@@ -650,7 +653,7 @@ xfs_btree_reada_bufl(
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL);
 }
 
 /*
@@ -670,7 +673,7 @@ xfs_btree_reada_bufs(
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agbno != NULLAGBLOCK);
 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL);
 }
 
 STATIC int
@@ -1013,7 +1016,7 @@ xfs_btree_read_buf_block(
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, flags, bpp);
+				   mp->m_bsize, flags, bpp, NULL);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 4b0b8dd1b7b0..0298dd684798 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -654,7 +654,8 @@ xfs_buf_read_map(
 	struct xfs_buftarg	*target,
 	struct xfs_buf_map	*map,
 	int			nmaps,
-	xfs_buf_flags_t		flags)
+	xfs_buf_flags_t		flags,
+	xfs_buf_iodone_t	verify)
 {
 	struct xfs_buf		*bp;
 
@@ -666,6 +667,7 @@ xfs_buf_read_map(
 
 		if (!XFS_BUF_ISDONE(bp)) {
 			XFS_STATS_INC(xb_get_read);
+			bp->b_iodone = verify;
 			_xfs_buf_read(bp, flags);
 		} else if (flags & XBF_ASYNC) {
 			/*
@@ -691,13 +693,14 @@ void
 xfs_buf_readahead_map(
 	struct xfs_buftarg	*target,
 	struct xfs_buf_map	*map,
-	int			nmaps)
+	int			nmaps,
+	xfs_buf_iodone_t	verify)
 {
 	if (bdi_read_congested(target->bt_bdi))
 		return;
 
 	xfs_buf_read_map(target, map, nmaps,
-		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
+		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify);
 }
 
 /*
@@ -709,7 +712,8 @@ xfs_buf_read_uncached(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		daddr,
 	size_t			numblks,
-	int			flags)
+	int			flags,
+	xfs_buf_iodone_t	verify)
 {
 	xfs_buf_t		*bp;
 	int			error;
@@ -723,6 +727,7 @@ xfs_buf_read_uncached(
 	bp->b_bn = daddr;
 	bp->b_maps[0].bm_bn = daddr;
 	bp->b_flags |= XBF_READ;
+	bp->b_iodone = verify;
 
 	xfsbdstrat(target->bt_mount, bp);
 	error = xfs_buf_iowait(bp);
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7c0b6a0a1557..677b1dc822f4 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
 
+
 #define XB_PAGES	2
 
 struct xfs_buf_map {
@@ -159,7 +160,6 @@ typedef struct xfs_buf {
 #endif
 } xfs_buf_t;
 
-
 /* Finding and Reading Buffers */
 struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
 			      struct xfs_buf_map *map, int nmaps,
@@ -196,9 +196,10 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
 			       xfs_buf_flags_t flags);
 struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
 			       struct xfs_buf_map *map, int nmaps,
-			       xfs_buf_flags_t flags);
+			       xfs_buf_flags_t flags, xfs_buf_iodone_t verify);
 void xfs_buf_readahead_map(struct xfs_buftarg *target,
-			       struct xfs_buf_map *map, int nmaps);
+			       struct xfs_buf_map *map, int nmaps,
+			       xfs_buf_iodone_t verify);
 
 static inline struct xfs_buf *
 xfs_buf_get(
@@ -216,20 +217,22 @@ xfs_buf_read(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		blkno,
 	size_t			numblks,
-	xfs_buf_flags_t		flags)
+	xfs_buf_flags_t		flags,
+	xfs_buf_iodone_t	verify)
 {
 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return xfs_buf_read_map(target, &map, 1, flags);
+	return xfs_buf_read_map(target, &map, 1, flags, verify);
 }
 
 static inline void
 xfs_buf_readahead(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		blkno,
-	size_t			numblks)
+	size_t			numblks,
+	xfs_buf_iodone_t	verify)
 {
 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return xfs_buf_readahead_map(target, &map, 1);
+	return xfs_buf_readahead_map(target, &map, 1, verify);
 }
 
 struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +242,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
 struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
 				int flags);
 struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
-				xfs_daddr_t daddr, size_t numblks, int flags);
+				xfs_daddr_t daddr, size_t numblks, int flags,
+				xfs_buf_iodone_t verify);
 void xfs_buf_hold(struct xfs_buf *bp);
 
 /* Releasing Buffers */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index c62e7e6ff50e..4af8bad7068c 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2161,7 +2161,7 @@ xfs_da_read_buf(
 
 	error = xfs_trans_read_buf_map(dp->i_mount, trans,
 					dp->i_mount->m_ddev_targp,
-					mapp, nmap, 0, &bp);
+					mapp, nmap, 0, &bp, NULL);
 	if (error)
 		goto out_free;
 
@@ -2237,7 +2237,7 @@ xfs_da_reada_buf(
 	}
 
 	mappedbno = mapp[0].bm_bn;
-	xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap);
+	xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL);
 
 out_free:
 	if (mapp != &map)
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0b296253bd01..bac86984e403 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -926,7 +926,7 @@ xfs_dir2_leaf_readbuf(
 				XFS_FSB_TO_DADDR(mp,
 					map[mip->ra_index].br_startblock +
 							mip->ra_offset),
-				(int)BTOBB(mp->m_dirblksize));
+				(int)BTOBB(mp->m_dirblksize), NULL);
 			mip->ra_current = i;
 		}
 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca4843..e95f800333d4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -439,7 +439,7 @@ xfs_qm_dqtobp(
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					   dqp->q_blkno,
 					   mp->m_quotainfo->qi_dqchunklen,
-					   0, &bp);
+					   0, &bp, NULL);
 		if (error || !bp)
 			return XFS_ERROR(error);
 	}
@@ -920,7 +920,7 @@ xfs_qm_dqflush(
 	 * Get the buffer containing the on-disk dquot
 	 */
 	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-				   mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+				   mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
 	if (error)
 		goto out_unlock;
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index bd9cb7f0b073..5440768ec41c 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -168,7 +168,7 @@ xfs_growfs_data_private(
 	dpct = pct - mp->m_sb.sb_imax_pct;
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
 				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-				XFS_FSS_TO_BB(mp, 1), 0);
+				XFS_FSS_TO_BB(mp, 1), 0, NULL);
 	if (!bp)
 		return EIO;
 	xfs_buf_relse(bp);
@@ -439,7 +439,7 @@ xfs_growfs_data_private(
 		if (agno < oagcount) {
 			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0, &bp);
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
 		} else {
 			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 37753e1c8537..12e3dead439d 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1490,7 +1490,7 @@ xfs_read_agi(
 
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp);
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7449cb943efd..8d6963010489 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -408,7 +408,7 @@ xfs_imap_to_bp(
 
 	buf_flags |= XBF_UNMAPPED;
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-				   (int)imap->im_len, buf_flags, &bp);
+				   (int)imap->im_len, buf_flags, &bp, NULL);
 	if (error) {
 		if (error != EAGAIN) {
 			xfs_warn(mp,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 46b6986e39b0..1d6d2ee08495 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1129,8 +1129,7 @@ xlog_iodone(xfs_buf_t *bp)
 	 * with it being freed after writing the unmount record to the
 	 * log.
 	 */
-
-}	/* xlog_iodone */
+}
 
 /*
  * Return size of each in-core log record buffer.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 3e06333d4bd1..eb1e29ff0c7c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2144,7 +2144,7 @@ xlog_recover_buffer_pass2(
 		buf_flags |= XBF_UNMAPPED;
 
 	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
-			  buf_flags);
+			  buf_flags, NULL);
 	if (!bp)
 		return XFS_ERROR(ENOMEM);
 	error = bp->b_error;
@@ -2237,7 +2237,8 @@ xlog_recover_inode_pass2(
 	}
 	trace_xfs_log_recover_inode_recover(log, in_f);
 
-	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
+	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
+			  NULL);
 	if (!bp) {
 		error = ENOMEM;
 		goto error;
@@ -2548,7 +2549,8 @@ xlog_recover_dquot_pass2(
 	ASSERT(dq_f->qlf_len == 1);
 
 	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
-				   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
+				   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+				   NULL);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 41ae7e1590f5..d5402b0eb6a3 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -652,7 +652,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 
 reread:
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-					BTOBB(sector_size), 0);
+					BTOBB(sector_size), 0, NULL);
 	if (!bp) {
 		if (loud)
 			xfs_warn(mp, "SB buffer read failed");
@@ -1002,7 +1002,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 	}
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp,
 					d - XFS_FSS_TO_BB(mp, 1),
-					XFS_FSS_TO_BB(mp, 1), 0);
+					XFS_FSS_TO_BB(mp, 1), 0, NULL);
 	if (!bp) {
 		xfs_warn(mp, "last sector read failed");
 		return EIO;
@@ -1017,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
 		}
 		bp = xfs_buf_read_uncached(mp->m_logdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
-					XFS_FSB_TO_BB(mp, 1), 0);
+					XFS_FSB_TO_BB(mp, 1), 0, NULL);
 		if (!bp) {
 			xfs_warn(mp, "log device read failed");
 			return EIO;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 48c750b0e830..688f608b3668 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -892,7 +892,7 @@ xfs_qm_dqiter_bufs(
 	while (blkcnt--) {
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 			      XFS_FSB_TO_DADDR(mp, bno),
-			      mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+			      mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
 		if (error)
 			break;
 
@@ -979,7 +979,8 @@ xfs_qm_dqiterate(
 				while (rablkcnt--) {
 					xfs_buf_readahead(mp->m_ddev_targp,
 					       XFS_FSB_TO_DADDR(mp, rablkno),
-					       mp->m_quotainfo->qi_dqchunklen);
+					       mp->m_quotainfo->qi_dqchunklen,
+					       NULL);
 					rablkno++;
 				}
 			}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index a69e0b4750a9..b271ed939d7b 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -870,7 +870,7 @@ xfs_rtbuf_get(
 	ASSERT(map.br_startblock != NULLFSBLOCK);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
-				   mp->m_bsize, 0, &bp);
+				   mp->m_bsize, 0, &bp, NULL);
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
@@ -1873,7 +1873,7 @@ xfs_growfs_rt(
 	 */
 	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
 				XFS_FSB_TO_BB(mp, nrblocks - 1),
-				XFS_FSB_TO_BB(mp, 1), 0);
+				XFS_FSB_TO_BB(mp, 1), 0, NULL);
 	if (!bp)
 		return EIO;
 	xfs_buf_relse(bp);
@@ -2220,7 +2220,7 @@ xfs_rtmount_init(
 	}
 	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
-					XFS_FSB_TO_BB(mp, 1), 0);
+					XFS_FSB_TO_BB(mp, 1), 0, NULL);
 	if (!bp) {
 		xfs_warn(mp, "realtime device size check failed");
 		return EIO;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index db056544cbb5..f02d40296506 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
 	int			numblks,
 	uint			flags)
 {
-	struct xfs_buf_map	map = {
-		.bm_bn = blkno,
-		.bm_len = numblks,
-	};
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 	return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
 }
 
@@ -476,7 +473,8 @@ int		xfs_trans_read_buf_map(struct xfs_mount *mp,
 				       struct xfs_buftarg *target,
 				       struct xfs_buf_map *map, int nmaps,
 				       xfs_buf_flags_t flags,
-				       struct xfs_buf **bpp);
+				       struct xfs_buf **bpp,
+				       xfs_buf_iodone_t verify);
 
 static inline int
 xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
 	xfs_daddr_t		blkno,
 	int			numblks,
 	xfs_buf_flags_t		flags,
-	struct xfs_buf		**bpp)
+	struct xfs_buf		**bpp,
+	xfs_buf_iodone_t	verify)
 {
-	struct xfs_buf_map	map = {
-		.bm_bn = blkno,
-		.bm_len = numblks,
-	};
-	return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
+	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+	return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
+				      flags, bpp, verify);
 }
 
 struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6311b99c267f..977628207b45 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
 	struct xfs_buf_map	*map,
 	int			nmaps,
 	xfs_buf_flags_t		flags,
-	struct xfs_buf		**bpp)
+	struct xfs_buf		**bpp,
+	xfs_buf_iodone_t	verify)
 {
 	xfs_buf_t		*bp;
 	xfs_buf_log_item_t	*bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
 
 	*bpp = NULL;
 	if (!tp) {
-		bp = xfs_buf_read_map(target, map, nmaps, flags);
+		bp = xfs_buf_read_map(target, map, nmaps, flags, verify);
 		if (!bp)
 			return (flags & XBF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
 		if (!(XFS_BUF_ISDONE(bp))) {
 			trace_xfs_trans_read_buf_io(bp, _RET_IP_);
 			ASSERT(!XFS_BUF_ISASYNC(bp));
+			ASSERT(bp->b_iodone == NULL);
 			XFS_BUF_READ(bp);
+			bp->b_iodone = verify;
 			xfsbdstrat(tp->t_mountp, bp);
 			error = xfs_buf_iowait(bp);
 			if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
 		return 0;
 	}
 
-	bp = xfs_buf_read_map(target, map, nmaps, flags);
+	bp = xfs_buf_read_map(target, map, nmaps, flags, verify);
 	if (bp == NULL) {
 		*bpp = NULL;
 		return (flags & XBF_TRYLOCK) ?
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 81c61fd17890..26880793feca 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -80,7 +80,7 @@ xfs_readlink_bmap(
 		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
 		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+		bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
 		if (!bp)
 			return XFS_ERROR(ENOMEM);
 		error = bp->b_error;
-- 
cgit v1.2.3


From eab4e63368b4cfa597dbdac66d1a7a836a693b7d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:02 +1100
Subject: xfs: uncached buffer reads need to return an error

With verification being done as an IO completion callback, different
errors can be returned from a read. Uncached reads only return a
buffer or NULL on failure, which means the verification error cannot
be returned to the caller.

Split the error handling for these reads into two - a failure to get
a buffer will still return NULL, but a read error will return a
referenced buffer with b_error set rather than NULL. The caller is
responsible for checking the error state of the buffer returned.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_buf.c     | 9 ++-------
 fs/xfs/xfs_fsops.c   | 5 +++++
 fs/xfs/xfs_mount.c   | 6 ++++++
 fs/xfs/xfs_rtalloc.c | 9 ++++++++-
 4 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 0298dd684798..fbc965fc075a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -715,8 +715,7 @@ xfs_buf_read_uncached(
 	int			flags,
 	xfs_buf_iodone_t	verify)
 {
-	xfs_buf_t		*bp;
-	int			error;
+	struct xfs_buf		*bp;
 
 	bp = xfs_buf_get_uncached(target, numblks, flags);
 	if (!bp)
@@ -730,11 +729,7 @@ xfs_buf_read_uncached(
 	bp->b_iodone = verify;
 
 	xfsbdstrat(target->bt_mount, bp);
-	error = xfs_buf_iowait(bp);
-	if (error) {
-		xfs_buf_relse(bp);
-		return NULL;
-	}
+	xfs_buf_iowait(bp);
 	return bp;
 }
 
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 5440768ec41c..f35f8d7731f0 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -171,6 +171,11 @@ xfs_growfs_data_private(
 				XFS_FSS_TO_BB(mp, 1), 0, NULL);
 	if (!bp)
 		return EIO;
+	if (bp->b_error) {
+		int	error = bp->b_error;
+		xfs_buf_relse(bp);
+		return error;
+	}
 	xfs_buf_relse(bp);
 
 	new = nb;	/* use new as a temporary here */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d5402b0eb6a3..df6d0b2aade1 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -658,6 +658,12 @@ reread:
 			xfs_warn(mp, "SB buffer read failed");
 		return EIO;
 	}
+	if (bp->b_error) {
+		error = bp->b_error;
+		if (loud)
+			xfs_warn(mp, "SB validate failed");
+		goto release_buf;
+	}
 
 	/*
 	 * Initialize the mount structure from the superblock.
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index b271ed939d7b..98dc670d3ee0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -1876,6 +1876,11 @@ xfs_growfs_rt(
 				XFS_FSB_TO_BB(mp, 1), 0, NULL);
 	if (!bp)
 		return EIO;
+	if (bp->b_error) {
+		error = bp->b_error;
+		xfs_buf_relse(bp);
+		return error;
+	}
 	xfs_buf_relse(bp);
 
 	/*
@@ -2221,8 +2226,10 @@ xfs_rtmount_init(
 	bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
 					d - XFS_FSB_TO_BB(mp, 1),
 					XFS_FSB_TO_BB(mp, 1), 0, NULL);
-	if (!bp) {
+	if (!bp || bp->b_error) {
 		xfs_warn(mp, "realtime device size check failed");
+		if (bp)
+			xfs_buf_relse(bp);
 		return EIO;
 	}
 	xfs_buf_relse(bp);
-- 
cgit v1.2.3


From 98021821a502db347bd9c7671beeee6e8ce07ea6 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:03 +1100
Subject: xfs: verify superblocks as they are read from disk

Add a superblock verify callback function and pass it into the
buffer read functions. Remove the now redundant verification code
that is currently in use.

Adding verification shows that secondary superblocks never have
their "sb_inprogress" flag cleared by mkfs.xfs, so when validating
the secondary superblocks during a grow operation we have to avoid
checking this field. Even if we fix mkfs, we will still have to
ignore this field for verification purposes unless a version of mkfs
that does not have this bug was used.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_fsops.c       |  4 +-
 fs/xfs/xfs_log_recover.c |  5 ++-
 fs/xfs/xfs_mount.c       | 98 ++++++++++++++++++++++++++++++------------------
 fs/xfs/xfs_mount.h       |  3 +-
 4 files changed, 69 insertions(+), 41 deletions(-)

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index f35f8d7731f0..cb65b067ed31 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -444,7 +444,8 @@ xfs_growfs_data_private(
 		if (agno < oagcount) {
 			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-				  XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
+				  xfs_sb_read_verify);
 		} else {
 			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
@@ -462,6 +463,7 @@ xfs_growfs_data_private(
 			break;
 		}
 		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
 		/*
 		 * If we get an error writing out the alternate superblocks,
 		 * just issue a warning and continue.  The real work is
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index eb1e29ff0c7c..924a4bc3d49a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3692,13 +3692,14 @@ xlog_do_recover(
 
 	/*
 	 * Now that we've finished replaying all buffer and inode
-	 * updates, re-read in the superblock.
+	 * updates, re-read in the superblock and reverify it.
 	 */
 	bp = xfs_getsb(log->l_mp, 0);
 	XFS_BUF_UNDONE(bp);
 	ASSERT(!(XFS_BUF_ISWRITE(bp)));
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
+	bp->b_iodone = xfs_sb_read_verify;
 	xfsbdstrat(log->l_mp, bp);
 	error = xfs_buf_iowait(bp);
 	if (error) {
@@ -3710,7 +3711,7 @@ xlog_do_recover(
 
 	/* Convert superblock from on-disk format */
 	sbp = &log->l_mp->m_sb;
-	xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
+	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
 	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
 	ASSERT(xfs_sb_good_version(sbp));
 	xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index df6d0b2aade1..bff18d73c610 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -304,9 +304,8 @@ STATIC int
 xfs_mount_validate_sb(
 	xfs_mount_t	*mp,
 	xfs_sb_t	*sbp,
-	int		flags)
+	bool		check_inprogress)
 {
-	int		loud = !(flags & XFS_MFSI_QUIET);
 
 	/*
 	 * If the log device and data device have the
@@ -316,21 +315,18 @@ xfs_mount_validate_sb(
 	 * a volume filesystem in a non-volume manner.
 	 */
 	if (sbp->sb_magicnum != XFS_SB_MAGIC) {
-		if (loud)
-			xfs_warn(mp, "bad magic number");
+		xfs_warn(mp, "bad magic number");
 		return XFS_ERROR(EWRONGFS);
 	}
 
 	if (!xfs_sb_good_version(sbp)) {
-		if (loud)
-			xfs_warn(mp, "bad version");
+		xfs_warn(mp, "bad version");
 		return XFS_ERROR(EWRONGFS);
 	}
 
 	if (unlikely(
 	    sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
-		if (loud)
-			xfs_warn(mp,
+		xfs_warn(mp,
 		"filesystem is marked as having an external log; "
 		"specify logdev on the mount command line.");
 		return XFS_ERROR(EINVAL);
@@ -338,8 +334,7 @@ xfs_mount_validate_sb(
 
 	if (unlikely(
 	    sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
-		if (loud)
-			xfs_warn(mp,
+		xfs_warn(mp,
 		"filesystem is marked as having an internal log; "
 		"do not specify logdev on the mount command line.");
 		return XFS_ERROR(EINVAL);
@@ -373,8 +368,7 @@ xfs_mount_validate_sb(
 	    sbp->sb_dblocks == 0					||
 	    sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)			||
 	    sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
-		if (loud)
-			XFS_CORRUPTION_ERROR("SB sanity check failed",
+		XFS_CORRUPTION_ERROR("SB sanity check failed",
 				XFS_ERRLEVEL_LOW, mp, sbp);
 		return XFS_ERROR(EFSCORRUPTED);
 	}
@@ -383,12 +377,10 @@ xfs_mount_validate_sb(
 	 * Until this is fixed only page-sized or smaller data blocks work.
 	 */
 	if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
-		if (loud) {
-			xfs_warn(mp,
+		xfs_warn(mp,
 		"File system with blocksize %d bytes. "
 		"Only pagesize (%ld) or less will currently work.",
 				sbp->sb_blocksize, PAGE_SIZE);
-		}
 		return XFS_ERROR(ENOSYS);
 	}
 
@@ -402,23 +394,20 @@ xfs_mount_validate_sb(
 	case 2048:
 		break;
 	default:
-		if (loud)
-			xfs_warn(mp, "inode size of %d bytes not supported",
+		xfs_warn(mp, "inode size of %d bytes not supported",
 				sbp->sb_inodesize);
 		return XFS_ERROR(ENOSYS);
 	}
 
 	if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
 	    xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
-		if (loud)
-			xfs_warn(mp,
+		xfs_warn(mp,
 		"file system too large to be mounted on this system.");
 		return XFS_ERROR(EFBIG);
 	}
 
-	if (unlikely(sbp->sb_inprogress)) {
-		if (loud)
-			xfs_warn(mp, "file system busy");
+	if (check_inprogress && sbp->sb_inprogress) {
+		xfs_warn(mp, "Offline file system operation in progress!");
 		return XFS_ERROR(EFSCORRUPTED);
 	}
 
@@ -426,9 +415,7 @@ xfs_mount_validate_sb(
 	 * Version 1 directory format has never worked on Linux.
 	 */
 	if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
-		if (loud)
-			xfs_warn(mp,
-				"file system using version 1 directory format");
+		xfs_warn(mp, "file system using version 1 directory format");
 		return XFS_ERROR(ENOSYS);
 	}
 
@@ -521,11 +508,9 @@ out_unwind:
 
 void
 xfs_sb_from_disk(
-	struct xfs_mount	*mp,
+	struct xfs_sb	*to,
 	xfs_dsb_t	*from)
 {
-	struct xfs_sb *to = &mp->m_sb;
-
 	to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
 	to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
 	to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -627,6 +612,50 @@ xfs_sb_to_disk(
 	}
 }
 
+void
+xfs_sb_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_sb	sb;
+	int		error;
+
+	xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+	/*
+	 * Only check the in progress field for the primary superblock as
+	 * mkfs.xfs doesn't clear it from secondary superblocks.
+	 */
+	error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
+	if (error)
+		xfs_buf_ioerror(bp, error);
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, the run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+void
+xfs_sb_quiet_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_sb	sb;
+
+	xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+	if (sb.sb_magicnum == XFS_SB_MAGIC) {
+		/* XFS filesystem, verify noisily! */
+		xfs_sb_read_verify(bp);
+		return;
+	}
+	/* quietly fail */
+	xfs_buf_ioerror(bp, EFSCORRUPTED);
+}
+
 /*
  * xfs_readsb
  *
@@ -652,7 +681,9 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 
 reread:
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-					BTOBB(sector_size), 0, NULL);
+				   BTOBB(sector_size), 0,
+				   loud ? xfs_sb_read_verify
+				        : xfs_sb_quiet_read_verify);
 	if (!bp) {
 		if (loud)
 			xfs_warn(mp, "SB buffer read failed");
@@ -667,15 +698,8 @@ reread:
 
 	/*
 	 * Initialize the mount structure from the superblock.
-	 * But first do some basic consistency checking.
 	 */
-	xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
-	error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
-	if (error) {
-		if (loud)
-			xfs_warn(mp, "SB validate failed");
-		goto release_buf;
-	}
+	xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
 
 	/*
 	 * We must be able to do sector-sized and sector-aligned IO.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index dc306a09f56f..de9089acc610 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -385,10 +385,11 @@ extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 
 #endif	/* __KERNEL__ */
 
+extern void	xfs_sb_read_verify(struct xfs_buf *);
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
 extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
 					xfs_agnumber_t *);
-extern void	xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
+extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
 #endif	/* __XFS_MOUNT_H__ */
-- 
cgit v1.2.3


From 5d5f527d13369d0047d52b7ac4ddee4f8c0ad173 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 14 Nov 2012 17:44:56 +1100
Subject: xfs: verify AGF blocks as they are read from disk

Add an AGF block verify callback function and pass it into the
buffer read functions. This replaces the existing verification that
is done after the read completes.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c | 68 +++++++++++++++++++++++++++++++++---------------------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 21c3db08fd01..c9eb955a49c7 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -2091,6 +2091,47 @@ xfs_alloc_put_freelist(
 	return 0;
 }
 
+static void
+xfs_agf_read_verify(
+	struct xfs_buf	*bp)
+ {
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agf	*agf;
+	int		agf_ok;
+
+	agf = XFS_BUF_TO_AGF(bp);
+
+	agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+		XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+		be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+		be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+		be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+		be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag)
+		agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
+						bp->b_pag->pag_agno;
+
+	if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+		agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+						be32_to_cpu(agf->agf_length);
+
+	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+			XFS_RANDOM_ALLOC_READ_AGF))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 /*
  * Read in the allocation group header (free/alloc section).
  */
@@ -2102,44 +2143,19 @@ xfs_read_agf(
 	int			flags,	/* XFS_BUF_ */
 	struct xfs_buf		**bpp)	/* buffer for the ag freelist header */
 {
-	struct xfs_agf	*agf;		/* ag freelist header */
-	int		agf_ok;		/* set if agf is consistent */
 	int		error;
 
 	ASSERT(agno != NULLAGNUMBER);
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), flags, bpp, NULL);
+			XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify);
 	if (error)
 		return error;
 	if (!*bpp)
 		return 0;
 
 	ASSERT(!(*bpp)->b_error);
-	agf = XFS_BUF_TO_AGF(*bpp);
-
-	/*
-	 * Validate the magic number of the agf block.
-	 */
-	agf_ok =
-		agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-		XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-		be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-		be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
-		be32_to_cpu(agf->agf_seqno) == agno;
-	if (xfs_sb_version_haslazysbcount(&mp->m_sb))
-		agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
-						be32_to_cpu(agf->agf_length);
-	if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
-			XFS_RANDOM_ALLOC_READ_AGF))) {
-		XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
-				     XFS_ERRLEVEL_LOW, mp, agf);
-		xfs_trans_brelse(tp, *bpp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
 	xfs_buf_set_ref(*bpp, XFS_AGF_REF);
 	return 0;
 }
-- 
cgit v1.2.3


From 3702ce6ed71cd60451ab278088863456dcb0dd99 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:05 +1100
Subject: xfs: verify AGI blocks as they are read from disk

Add an AGI block verify callback function and pass it into the
buffer read functions. Remove the now redundant verification code
that is currently in use.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 56 +++++++++++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 12e3dead439d..5bd255e5f7b8 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1472,6 +1472,40 @@ xfs_check_agi_unlinked(
 #define xfs_check_agi_unlinked(agi)
 #endif
 
+static void
+xfs_agi_read_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agi	*agi = XFS_BUF_TO_AGI(bp);
+	int		agi_ok;
+
+	/*
+	 * Validate the magic number of the agi block.
+	 */
+	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
+		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+
+	/*
+	 * during growfs operations, the perag is not fully initialised,
+	 * so we can't use it for any useful checking. growfs ensures we can't
+	 * use it by using uncached buffers that don't have the perag attached
+	 * so we can detect and avoid this problem.
+	 */
+	if (bp->b_pag)
+		agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
+						bp->b_pag->pag_agno;
+
+	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+			XFS_RANDOM_IALLOC_READ_AGI))) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+	xfs_check_agi_unlinked(agi);
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 /*
  * Read in the allocation group header (inode allocation section)
  */
@@ -1482,38 +1516,18 @@ xfs_read_agi(
 	xfs_agnumber_t		agno,	/* allocation group number */
 	struct xfs_buf		**bpp)	/* allocation group hdr buf */
 {
-	struct xfs_agi		*agi;	/* allocation group header */
-	int			agi_ok;	/* agi is consistent */
 	int			error;
 
 	ASSERT(agno != NULLAGNUMBER);
 
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp, NULL);
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify);
 	if (error)
 		return error;
 
 	ASSERT(!xfs_buf_geterror(*bpp));
-	agi = XFS_BUF_TO_AGI(*bpp);
-
-	/*
-	 * Validate the magic number of the agi block.
-	 */
-	agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
-		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
-		be32_to_cpu(agi->agi_seqno) == agno;
-	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
-			XFS_RANDOM_IALLOC_READ_AGI))) {
-		XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
-				     mp, agi);
-		xfs_trans_brelse(tp, *bpp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
-
 	xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-
-	xfs_check_agi_unlinked(agi);
 	return 0;
 }
 
-- 
cgit v1.2.3


From bb80c6d79a3b0f9b6c3236a4bec021c72615bfd1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:06 +1100
Subject: xfs: verify AGFL blocks as they are read from disk

Add an AGFL block verify callback function and pass it into the
buffer read functions.

While this commit adds verification code to the AGFL, it cannot be
used reliably until the CRC format change comes along as mkfs does
not initialise the full AGFL. Hence it can be full of garbage at the
first mount and will fail verification right now. CRC enabled
filesystems won't have this problem, so leave the code that has
already been written ifdef'd out until the proper time.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index c9eb955a49c7..38b4ab8957ff 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,43 @@ xfs_alloc_fixup_trees(
 	return 0;
 }
 
+void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+#ifdef WHEN_CRCS_COME_ALONG
+	/*
+	 * we cannot actually do any verification of the AGFL because mkfs does
+	 * not initialise the AGFL to zero or NULL. Hence the only valid part of
+	 * the AGFL is what the AGF says is active. We can't get to the AGF, so
+	 * we can't verify just those entries are valid.
+	 *
+	 * This problem goes away when the CRC format change comes along as that
+	 * requires the AGFL to be initialised by mkfs. At that point, we can
+	 * verify the blocks in the agfl -active or not- lie within the bounds
+	 * of the AG. Until then, just leave this check ifdef'd out.
+	 */
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
+	int		agfl_ok = 1;
+
+	int		i;
+
+	for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+		if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+		    be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+			agfl_ok = 0;
+	}
+
+	if (!agfl_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+#endif
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 /*
  * Read in the allocation group free block array.
  */
@@ -447,7 +484,7 @@ xfs_alloc_read_agfl(
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify);
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
-- 
cgit v1.2.3


From af133e8606d32c2aed43870491ebbdc56feec8a8 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:07 +1100
Subject: xfs: verify inode buffers as they are read from disk

Add an inode buffer verify callback function and pass it into the
buffer read functions. Inodes are special in that the verbose checks
will be done when reading the inode, but we still need to sanity
check the buffer when that is first read. Always verify the magic
numbers in all inodes in the buffer, rather than jus ton debug
kernels.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_inode.c | 100 +++++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 49 deletions(-)

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8d6963010489..514eac913f1c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -382,6 +382,46 @@ xfs_inobp_check(
 }
 #endif
 
+static void
+xfs_inode_buf_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount *mp = bp->b_target->bt_mount;
+	int		i;
+	int		ni;
+
+	/*
+	 * Validate the magic number and version of every inode in the buffer
+	 */
+	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+	for (i = 0; i < ni; i++) {
+		int		di_ok;
+		xfs_dinode_t	*dip;
+
+		dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+					(i << mp->m_sb.sb_inodelog));
+		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+			    XFS_DINODE_GOOD_VERSION(dip->di_version);
+		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+						XFS_ERRTAG_ITOBP_INOTOBP,
+						XFS_RANDOM_ITOBP_INOTOBP))) {
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+					     mp, dip);
+#ifdef DEBUG
+			xfs_emerg(mp,
+				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
+				(unsigned long long)bp->b_bn, i,
+				be16_to_cpu(dip->di_magic));
+			ASSERT(0);
+#endif
+		}
+	}
+	xfs_inobp_check(mp, bp);
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 /*
  * This routine is called to map an inode to the buffer containing the on-disk
  * version of the inode.  It returns a pointer to the buffer containing the
@@ -396,71 +436,33 @@ xfs_imap_to_bp(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
 	struct xfs_imap		*imap,
-	struct xfs_dinode	**dipp,
+	struct xfs_dinode       **dipp,
 	struct xfs_buf		**bpp,
 	uint			buf_flags,
 	uint			iget_flags)
 {
 	struct xfs_buf		*bp;
 	int			error;
-	int			i;
-	int			ni;
 
 	buf_flags |= XBF_UNMAPPED;
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-				   (int)imap->im_len, buf_flags, &bp, NULL);
+				   (int)imap->im_len, buf_flags, &bp,
+				   xfs_inode_buf_verify);
 	if (error) {
-		if (error != EAGAIN) {
-			xfs_warn(mp,
-				"%s: xfs_trans_read_buf() returned error %d.",
-				__func__, error);
-		} else {
+		if (error == EAGAIN) {
 			ASSERT(buf_flags & XBF_TRYLOCK);
+			return error;
 		}
-		return error;
-	}
 
-	/*
-	 * Validate the magic number and version of every inode in the buffer
-	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
-	 */
-#ifdef DEBUG
-	ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
-#else	/* usual case */
-	ni = 1;
-#endif
+		if (error == EFSCORRUPTED &&
+		    (iget_flags & XFS_IGET_UNTRUSTED))
+			return XFS_ERROR(EINVAL);
 
-	for (i = 0; i < ni; i++) {
-		int		di_ok;
-		xfs_dinode_t	*dip;
-
-		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-					(i << mp->m_sb.sb_inodelog));
-		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
-			    XFS_DINODE_GOOD_VERSION(dip->di_version);
-		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-						XFS_ERRTAG_ITOBP_INOTOBP,
-						XFS_RANDOM_ITOBP_INOTOBP))) {
-			if (iget_flags & XFS_IGET_UNTRUSTED) {
-				xfs_trans_brelse(tp, bp);
-				return XFS_ERROR(EINVAL);
-			}
-			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
-					     mp, dip);
-#ifdef DEBUG
-			xfs_emerg(mp,
-				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
-				(unsigned long long)imap->im_blkno, i,
-				be16_to_cpu(dip->di_magic));
-			ASSERT(0);
-#endif
-			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(EFSCORRUPTED);
-		}
+		xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+			__func__, error);
+		return error;
 	}
 
-	xfs_inobp_check(mp, bp);
-
 	*bpp = bp;
 	*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
 	return 0;
-- 
cgit v1.2.3


From 3d3e6f64e22c94115d47de670611bcd3ecda3796 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:08 +1100
Subject: xfs: verify btree blocks as they are read from disk

Add an btree block verify callback function and pass it into the
buffer read functions. Because each different btree block type
requires different verification, add a function to the ops structure
that is called from the generic code.

Also, propagate the verification callback functions through the
readahead functions, and into the external bmap and bulkstat inode
readahead code that uses the generic btree buffer read functions.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc_btree.c  | 61 +++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap.c         | 60 +++++++++++++++++++++++++-----------------
 fs/xfs/xfs_bmap_btree.c   | 47 +++++++++++++++++++++++++++++++++
 fs/xfs/xfs_bmap_btree.h   |  1 +
 fs/xfs/xfs_btree.c        | 66 ++++++++++++++++++++++++-----------------------
 fs/xfs/xfs_btree.h        | 10 ++++---
 fs/xfs/xfs_ialloc_btree.c | 40 ++++++++++++++++++++++++++++
 fs/xfs/xfs_inode.c        |  2 +-
 fs/xfs/xfs_inode.h        |  1 +
 fs/xfs/xfs_itable.c       |  3 ++-
 10 files changed, 230 insertions(+), 61 deletions(-)

diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f7876c6d6165..46961e52e9b8 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -272,6 +272,66 @@ xfs_allocbt_key_diff(
 	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
+void
+xfs_allocbt_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+	int			sblock_ok; /* block passes checks */
+
+	/*
+	 * magic number and level verification
+	 *
+	 * During growfs operations, we can't verify the exact level as the
+	 * perag is not fully initialised and hence not attached to the buffer.
+	 * In this case, check against the maximum tree depth.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	switch (block->bb_magic) {
+	case cpu_to_be32(XFS_ABTB_MAGIC):
+		if (pag)
+			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
+		else
+			sblock_ok = level < mp->m_ag_maxlevels;
+		break;
+	case cpu_to_be32(XFS_ABTC_MAGIC):
+		if (pag)
+			sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
+		else
+			sblock_ok = level < mp->m_ag_maxlevels;
+		break;
+	default:
+		sblock_ok = 0;
+		break;
+	}
+
+	/* numrecs verification */
+	sblock_ok = sblock_ok &&
+		be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+
+	/* sibling pointer verification */
+	sblock_ok = sblock_ok &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (!sblock_ok) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify",
+					XFS_ERRLEVEL_LOW, mp, block);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 #ifdef DEBUG
 STATIC int
 xfs_allocbt_keys_inorder(
@@ -327,6 +387,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
 	.key_diff		= xfs_allocbt_key_diff,
+	.read_verify		= xfs_allocbt_read_verify,
 #ifdef DEBUG
 	.keys_inorder		= xfs_allocbt_keys_inorder,
 	.recs_inorder		= xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index a60f3d1f151c..9ae7aba52e0f 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents(
 	if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
 		return error;
 #endif
-	if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
-			XFS_BMAP_BTREE_REF)))
+	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+				xfs_bmbt_read_verify);
+	if (error)
 		return error;
 	cblock = XFS_BUF_TO_BLOCK(cbp);
 	if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -4078,8 +4079,9 @@ xfs_bmap_read_extents(
 	 * pointer (leftmost) at each level.
 	 */
 	while (level-- > 0) {
-		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF)))
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify);
+		if (error)
 			return error;
 		block = XFS_BUF_TO_BLOCK(bp);
 		XFS_WANT_CORRUPTED_GOTO(
@@ -4124,7 +4126,8 @@ xfs_bmap_read_extents(
 		 */
 		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 		if (nextbno != NULLFSBLOCK)
-			xfs_btree_reada_bufl(mp, nextbno, 1);
+			xfs_btree_reada_bufl(mp, nextbno, 1,
+					     xfs_bmbt_read_verify);
 		/*
 		 * Copy records into the extent records.
 		 */
@@ -4156,8 +4159,9 @@ xfs_bmap_read_extents(
 		 */
 		if (bno == NULLFSBLOCK)
 			break;
-		if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF)))
+		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+				XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify);
+		if (error)
 			return error;
 		block = XFS_BUF_TO_BLOCK(bp);
 	}
@@ -5868,15 +5872,16 @@ xfs_bmap_check_leaf_extents(
 	 */
 	while (level-- > 0) {
 		/* See if buf is in cur first */
+		bp_release = 0;
 		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-		if (bp) {
-			bp_release = 0;
-		} else {
+		if (!bp) {
 			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						xfs_bmbt_read_verify);
+			if (error)
+				goto error_norelse;
 		}
-		if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF)))
-			goto error_norelse;
 		block = XFS_BUF_TO_BLOCK(bp);
 		XFS_WANT_CORRUPTED_GOTO(
 			xfs_bmap_sanity_check(mp, bp, level),
@@ -5953,15 +5958,16 @@ xfs_bmap_check_leaf_extents(
 		if (bno == NULLFSBLOCK)
 			break;
 
+		bp_release = 0;
 		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-		if (bp) {
-			bp_release = 0;
-		} else {
+		if (!bp) {
 			bp_release = 1;
+			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						xfs_bmbt_read_verify);
+			if (error)
+				goto error_norelse;
 		}
-		if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF)))
-			goto error_norelse;
 		block = XFS_BUF_TO_BLOCK(bp);
 	}
 	if (bp_release) {
@@ -6052,7 +6058,9 @@ xfs_bmap_count_tree(
 	struct xfs_btree_block	*block, *nextblock;
 	int			numrecs;
 
-	if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+						xfs_bmbt_read_verify);
+	if (error)
 		return error;
 	*count += 1;
 	block = XFS_BUF_TO_BLOCK(bp);
@@ -6061,8 +6069,10 @@ xfs_bmap_count_tree(
 		/* Not at node above leaves, count this level of nodes */
 		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 		while (nextbno != NULLFSBLOCK) {
-			if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
-				0, &nbp, XFS_BMAP_BTREE_REF)))
+			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+						XFS_BMAP_BTREE_REF,
+						xfs_bmbt_read_verify);
+			if (error)
 				return error;
 			*count += 1;
 			nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6091,8 +6101,10 @@ xfs_bmap_count_tree(
 			if (nextbno == NULLFSBLOCK)
 				break;
 			bno = nextbno;
-			if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF)))
+			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+						XFS_BMAP_BTREE_REF,
+						xfs_bmbt_read_verify);
+			if (error)
 				return error;
 			*count += 1;
 			block = XFS_BUF_TO_BLOCK(bp);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7e..bddca9b92869 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
+#include "xfs_trace.h"
 
 /*
  * Determine the extent state.
@@ -707,6 +708,51 @@ xfs_bmbt_key_diff(
 				      cur->bc_rec.b.br_startoff;
 }
 
+void
+xfs_bmbt_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	unsigned int		level;
+	int			lblock_ok; /* block passes checks */
+
+	/* magic number and level verification.
+	 *
+	 * We don't know waht fork we belong to, so just verify that the level
+	 * is less than the maximum of the two. Later checks will be more
+	 * precise.
+	 */
+	level = be16_to_cpu(block->bb_level);
+	lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
+		    level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
+
+	/* numrecs verification */
+	lblock_ok = lblock_ok &&
+		be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
+
+	/* sibling pointer verification */
+	lblock_ok = lblock_ok &&
+		block->bb_u.l.bb_leftsib &&
+		(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+		block->bb_u.l.bb_rightsib &&
+		(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+		 XFS_FSB_SANITY_CHECK(mp,
+			be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+	if (!lblock_ok) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify",
+					XFS_ERRLEVEL_LOW, mp, block);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 #ifdef DEBUG
 STATIC int
 xfs_bmbt_keys_inorder(
@@ -746,6 +792,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.init_rec_from_cur	= xfs_bmbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
 	.key_diff		= xfs_bmbt_key_diff,
+	.read_verify		= xfs_bmbt_read_verify,
 #ifdef DEBUG
 	.keys_inorder		= xfs_bmbt_keys_inorder,
 	.recs_inorder		= xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f85..1d00fbe9dd79 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -232,6 +232,7 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
 extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
 extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern void xfs_bmbt_read_verify(struct xfs_buf *bp);
 
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7e791160092d..ef1066078c33 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -270,7 +270,8 @@ xfs_btree_dup_cursor(
 		if (bp) {
 			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 						   XFS_BUF_ADDR(bp), mp->m_bsize,
-						   0, &bp, NULL);
+						   0, &bp,
+						   cur->bc_ops->read_verify);
 			if (error) {
 				xfs_btree_del_cursor(new, error);
 				*ncur = NULL;
@@ -612,23 +613,24 @@ xfs_btree_offsets(
  * Get a buffer for the block, return it read in.
  * Long-form addressing.
  */
-int					/* error */
+int
 xfs_btree_read_bufl(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_fsblock_t	fsbno,		/* file system block number */
-	uint		lock,		/* lock flags for read_buf */
-	xfs_buf_t	**bpp,		/* buffer for fsbno */
-	int		refval)		/* ref count value for buffer */
-{
-	xfs_buf_t	*bp;		/* return value */
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	uint			lock,		/* lock flags for read_buf */
+	struct xfs_buf		**bpp,		/* buffer for fsbno */
+	int			refval,		/* ref count value for buffer */
+	xfs_buf_iodone_t	verify)
+{
+	struct xfs_buf		*bp;		/* return value */
 	xfs_daddr_t		d;		/* real disk block address */
-	int		error;
+	int			error;
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, lock, &bp, NULL);
+				   mp->m_bsize, lock, &bp, verify);
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
@@ -645,15 +647,16 @@ xfs_btree_read_bufl(
 /* ARGSUSED */
 void
 xfs_btree_reada_bufl(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_fsblock_t	fsbno,		/* file system block number */
-	xfs_extlen_t	count)		/* count of filesystem blocks */
+	struct xfs_mount	*mp,		/* file system mount point */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	xfs_extlen_t		count,		/* count of filesystem blocks */
+	xfs_buf_iodone_t	verify)
 {
 	xfs_daddr_t		d;
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify);
 }
 
 /*
@@ -663,17 +666,18 @@ xfs_btree_reada_bufl(
 /* ARGSUSED */
 void
 xfs_btree_reada_bufs(
-	xfs_mount_t	*mp,		/* file system mount point */
-	xfs_agnumber_t	agno,		/* allocation group number */
-	xfs_agblock_t	agbno,		/* allocation group block number */
-	xfs_extlen_t	count)		/* count of filesystem blocks */
+	struct xfs_mount	*mp,		/* file system mount point */
+	xfs_agnumber_t		agno,		/* allocation group number */
+	xfs_agblock_t		agbno,		/* allocation group block number */
+	xfs_extlen_t		count,		/* count of filesystem blocks */
+	xfs_buf_iodone_t	verify)
 {
 	xfs_daddr_t		d;
 
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agbno != NULLAGBLOCK);
 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, NULL);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify);
 }
 
 STATIC int
@@ -687,12 +691,14 @@ xfs_btree_readahead_lblock(
 	xfs_dfsbno_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
-		xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+				     cur->bc_ops->read_verify);
 		rval++;
 	}
 
 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
-		xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+				     cur->bc_ops->read_verify);
 		rval++;
 	}
 
@@ -712,13 +718,13 @@ xfs_btree_readahead_sblock(
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				     left, 1);
+				     left, 1, cur->bc_ops->read_verify);
 		rval++;
 	}
 
 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				     right, 1);
+				     right, 1, cur->bc_ops->read_verify);
 		rval++;
 	}
 
@@ -1016,19 +1022,15 @@ xfs_btree_read_buf_block(
 
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, flags, bpp, NULL);
+				   mp->m_bsize, flags, bpp,
+				   cur->bc_ops->read_verify);
 	if (error)
 		return error;
 
 	ASSERT(!xfs_buf_geterror(*bpp));
-
 	xfs_btree_set_refs(cur, *bpp);
 	*block = XFS_BUF_TO_BLOCK(*bpp);
-
-	error = xfs_btree_check_block(cur, *block, level, *bpp);
-	if (error)
-		xfs_trans_brelse(cur->bc_tp, *bpp);
-	return error;
+	return 0;
 }
 
 /*
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index c9cf2d00e236..3a4c314047a0 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,7 @@ struct xfs_btree_ops {
 	__int64_t (*key_diff)(struct xfs_btree_cur *cur,
 			      union xfs_btree_key *key);
 
+	void	(*read_verify)(struct xfs_buf *bp);
 #ifdef DEBUG
 	/* check that k1 is lower than k2 */
 	int	(*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +356,8 @@ xfs_btree_read_bufl(
 	xfs_fsblock_t		fsbno,	/* file system block number */
 	uint			lock,	/* lock flags for read_buf */
 	struct xfs_buf		**bpp,	/* buffer for fsbno */
-	int			refval);/* ref count value for buffer */
+	int			refval,	/* ref count value for buffer */
+	xfs_buf_iodone_t	verify);
 
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +367,8 @@ void					/* error */
 xfs_btree_reada_bufl(
 	struct xfs_mount	*mp,	/* file system mount point */
 	xfs_fsblock_t		fsbno,	/* file system block number */
-	xfs_extlen_t		count);	/* count of filesystem blocks */
+	xfs_extlen_t		count,	/* count of filesystem blocks */
+	xfs_buf_iodone_t	verify);
 
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,7 +379,8 @@ xfs_btree_reada_bufs(
 	struct xfs_mount	*mp,	/* file system mount point */
 	xfs_agnumber_t		agno,	/* allocation group number */
 	xfs_agblock_t		agbno,	/* allocation group block number */
-	xfs_extlen_t		count);	/* count of filesystem blocks */
+	xfs_extlen_t		count,	/* count of filesystem blocks */
+	xfs_buf_iodone_t	verify);
 
 /*
  * Initialise a new btree block header
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa18..11306c6d61c7 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC int
@@ -181,6 +182,44 @@ xfs_inobt_key_diff(
 			  cur->bc_rec.i.ir_startino;
 }
 
+void
+xfs_inobt_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	unsigned int		level;
+	int			sblock_ok; /* block passes checks */
+
+	/* magic number and level verification */
+	level = be16_to_cpu(block->bb_level);
+	sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
+		    level < mp->m_in_maxlevels;
+
+	/* numrecs verification */
+	sblock_ok = sblock_ok &&
+		be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+
+	/* sibling pointer verification */
+	sblock_ok = sblock_ok &&
+		(block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_leftsib &&
+		(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+		 be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+		block->bb_u.s.bb_rightsib;
+
+	if (!sblock_ok) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		XFS_CORRUPTION_ERROR("xfs_inobt_read_verify",
+					XFS_ERRLEVEL_LOW, mp, block);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -218,6 +257,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
 	.key_diff		= xfs_inobt_key_diff,
+	.read_verify		= xfs_inobt_read_verify,
 #ifdef DEBUG
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 514eac913f1c..3a243d076950 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -382,7 +382,7 @@ xfs_inobp_check(
 }
 #endif
 
-static void
+void
 xfs_inode_buf_verify(
 	struct xfs_buf	*bp)
 {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 21b4de3df716..1a892114792f 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -554,6 +554,7 @@ int		xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
 			       struct xfs_buf **, uint, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, uint);
+void		xfs_inode_buf_verify(struct xfs_buf *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 3998fd2a7949..0f18d412e3e8 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -396,7 +396,8 @@ xfs_bulkstat(
 					if (xfs_inobt_maskn(chunkidx, nicluster)
 							& ~r.ir_free)
 						xfs_btree_reada_bufs(mp, agno,
-							agbno, nbcluster);
+							agbno, nbcluster,
+							xfs_inode_buf_verify);
 				}
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
-- 
cgit v1.2.3


From c6319198702350a2215a8c0cacd6cc4283728a1b Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 14 Nov 2012 17:50:13 +1100
Subject: xfs: verify dquot blocks as they are read from disk

Add a dquot buffer verify callback function and pass it into the
buffer read functions. This checks all the dquots in a buffer, but
cannot completely verify the dquot ids are correct. Also, errors
cannot be repaired, so an additional function is added to repair bad
dquots in the buffer if such an error is detected in a context where
repair is allowed.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dquot.c | 117 +++++++++++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_dquot.h |   1 +
 fs/xfs/xfs_qm.c    |   3 +-
 3 files changed, 98 insertions(+), 23 deletions(-)

diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index e95f800333d4..0ba0f0992d6e 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -360,6 +360,89 @@ xfs_qm_dqalloc(
 	return (error);
 }
 
+void
+xfs_dquot_read_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	struct xfs_disk_dquot	*ddq;
+	xfs_dqid_t		id = 0;
+	int			i;
+
+	/*
+	 * On the first read of the buffer, verify that each dquot is valid.
+	 * We don't know what the id of the dquot is supposed to be, just that
+	 * they should be increasing monotonically within the buffer. If the
+	 * first id is corrupt, then it will fail on the second dquot in the
+	 * buffer so corruptions could point to the wrong dquot in this case.
+	 */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		int	error;
+
+		ddq = &d[i].dd_diskdq;
+
+		if (i == 0)
+			id = be32_to_cpu(ddq->d_id);
+
+		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+					"xfs_dquot_read_verify");
+		if (error) {
+			XFS_CORRUPTION_ERROR("xfs_dquot_read_verify",
+					     XFS_ERRLEVEL_LOW, mp, d);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			break;
+		}
+	}
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+STATIC int
+xfs_qm_dqrepair(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_dquot	*dqp,
+	xfs_dqid_t		firstid,
+	struct xfs_buf		**bpp)
+{
+	int			error;
+	struct xfs_disk_dquot	*ddq;
+	struct xfs_dqblk	*d;
+	int			i;
+
+	/*
+	 * Read the buffer without verification so we get the corrupted
+	 * buffer returned to us.
+	 */
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+				   mp->m_quotainfo->qi_dqchunklen,
+				   0, bpp, NULL);
+
+	if (error) {
+		ASSERT(*bpp == NULL);
+		return XFS_ERROR(error);
+	}
+
+	ASSERT(xfs_buf_islocked(*bpp));
+	d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+	/* Do the actual repair of dquots in this buffer */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		ddq = &d[i].dd_diskdq;
+		error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+				       dqp->dq_flags & XFS_DQ_ALLTYPES,
+				       XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+		if (error) {
+			/* repair failed, we're screwed */
+			xfs_trans_brelse(tp, *bpp);
+			return XFS_ERROR(EIO);
+		}
+	}
+
+	return 0;
+}
+
 /*
  * Maps a dquot to the buffer containing its on-disk version.
  * This returns a ptr to the buffer containing the on-disk dquot
@@ -378,7 +461,6 @@ xfs_qm_dqtobp(
 	xfs_buf_t	*bp;
 	xfs_inode_t	*quotip = XFS_DQ_TO_QIP(dqp);
 	xfs_mount_t	*mp = dqp->q_mount;
-	xfs_disk_dquot_t *ddq;
 	xfs_dqid_t	id = be32_to_cpu(dqp->q_core.d_id);
 	xfs_trans_t	*tp = (tpp ? *tpp : NULL);
 
@@ -439,33 +521,24 @@ xfs_qm_dqtobp(
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					   dqp->q_blkno,
 					   mp->m_quotainfo->qi_dqchunklen,
-					   0, &bp, NULL);
-		if (error || !bp)
-			return XFS_ERROR(error);
-	}
+					   0, &bp, xfs_dquot_read_verify);
 
-	ASSERT(xfs_buf_islocked(bp));
-
-	/*
-	 * calculate the location of the dquot inside the buffer.
-	 */
-	ddq = bp->b_addr + dqp->q_bufoffset;
+		if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+						mp->m_quotainfo->qi_dqperchunk;
+			ASSERT(bp == NULL);
+			error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+		}
 
-	/*
-	 * A simple sanity check in case we got a corrupted dquot...
-	 */
-	error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
-			   flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
-			   "dqtobp");
-	if (error) {
-		if (!(flags & XFS_QMOPT_DQREPAIR)) {
-			xfs_trans_brelse(tp, bp);
-			return XFS_ERROR(EIO);
+		if (error) {
+			ASSERT(bp == NULL);
+			return XFS_ERROR(error);
 		}
 	}
 
+	ASSERT(xfs_buf_islocked(bp));
 	*O_bpp = bp;
-	*O_ddpp = ddq;
+	*O_ddpp = bp->b_addr + dqp->q_bufoffset;
 
 	return (0);
 }
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346d..a08ba92d7da0 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -140,6 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
 
 extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
 					uint, struct xfs_dquot	**);
+extern void		xfs_dquot_read_verify(struct xfs_buf *bp);
 extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int		xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 688f608b3668..a6dfb97490cc 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -892,7 +892,8 @@ xfs_qm_dqiter_bufs(
 	while (blkcnt--) {
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 			      XFS_FSB_TO_DADDR(mp, bno),
-			      mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
+			      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+			      xfs_dquot_read_verify);
 		if (error)
 			break;
 
-- 
cgit v1.2.3


From 4bb20a83a2a5ac4dcb62780c9950e47939956126 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:10 +1100
Subject: xfs: add verifier callback to directory read code

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr.c       | 23 ++++++++++++-----------
 fs/xfs/xfs_attr_leaf.c  | 18 +++++++++---------
 fs/xfs/xfs_da_btree.c   | 44 ++++++++++++++++++++++++++++----------------
 fs/xfs/xfs_da_btree.h   |  7 ++++---
 fs/xfs/xfs_dir2_block.c | 23 ++++++++++++-----------
 fs/xfs/xfs_dir2_leaf.c  | 33 ++++++++++++++++-----------------
 fs/xfs/xfs_dir2_node.c  | 43 ++++++++++++++++++++-----------------------
 fs/xfs/xfs_file.c       |  2 +-
 8 files changed, 102 insertions(+), 91 deletions(-)

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 474c57a43cce..cd5a9cd0ded0 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -904,7 +904,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	dp = args->dp;
 	args->blkno = 0;
 	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
+					     XFS_ATTR_FORK, NULL);
 	if (error)
 		return(error);
 	ASSERT(bp != NULL);
@@ -1032,7 +1032,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * remove the "old" attr from that block (neat, huh!)
 		 */
 		error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
-						     &bp, XFS_ATTR_FORK);
+						     &bp, XFS_ATTR_FORK, NULL);
 		if (error)
 			return(error);
 		ASSERT(bp != NULL);
@@ -1101,7 +1101,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 	dp = args->dp;
 	args->blkno = 0;
 	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
+					     XFS_ATTR_FORK, NULL);
 	if (error) {
 		return(error);
 	}
@@ -1159,7 +1159,7 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 
 	args->blkno = 0;
 	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
+					     XFS_ATTR_FORK, NULL);
 	if (error)
 		return(error);
 	ASSERT(bp != NULL);
@@ -1190,7 +1190,8 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 	trace_xfs_attr_leaf_list(context);
 
 	context->cursor->blkno = 0;
-	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK,
+				NULL);
 	if (error)
 		return XFS_ERROR(error);
 	ASSERT(bp != NULL);
@@ -1605,7 +1606,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		state->path.blk[0].bp = NULL;
 
 		error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-						     XFS_ATTR_FORK);
+						     XFS_ATTR_FORK, NULL);
 		if (error)
 			goto out;
 		ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
@@ -1718,7 +1719,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 			error = xfs_da_read_buf(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
-						&blk->bp, XFS_ATTR_FORK);
+						&blk->bp, XFS_ATTR_FORK, NULL);
 			if (error)
 				return(error);
 		} else {
@@ -1737,7 +1738,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 			error = xfs_da_read_buf(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
-						&blk->bp, XFS_ATTR_FORK);
+						&blk->bp, XFS_ATTR_FORK, NULL);
 			if (error)
 				return(error);
 		} else {
@@ -1827,7 +1828,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	bp = NULL;
 	if (cursor->blkno > 0) {
 		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
-					      &bp, XFS_ATTR_FORK);
+					      &bp, XFS_ATTR_FORK, NULL);
 		if ((error != 0) && (error != EFSCORRUPTED))
 			return(error);
 		if (bp) {
@@ -1870,7 +1871,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 		for (;;) {
 			error = xfs_da_read_buf(NULL, context->dp,
 						      cursor->blkno, -1, &bp,
-						      XFS_ATTR_FORK);
+						      XFS_ATTR_FORK, NULL);
 			if (error)
 				return(error);
 			if (unlikely(bp == NULL)) {
@@ -1937,7 +1938,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 		cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
 		xfs_trans_brelse(NULL, bp);
 		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
-					      &bp, XFS_ATTR_FORK);
+					      &bp, XFS_ATTR_FORK, NULL);
 		if (error)
 			return(error);
 		if (unlikely((bp == NULL))) {
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 4bfc732bc9c9..ba2b9a2cd236 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -871,7 +871,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
 	if (error)
 		goto out;
 	error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
-					     XFS_ATTR_FORK);
+					     XFS_ATTR_FORK, NULL);
 	if (error)
 		goto out;
 	ASSERT(bp1 != NULL);
@@ -1642,7 +1642,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 		if (blkno == 0)
 			continue;
 		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-					blkno, -1, &bp, XFS_ATTR_FORK);
+					blkno, -1, &bp, XFS_ATTR_FORK, NULL);
 		if (error)
 			return(error);
 		ASSERT(bp != NULL);
@@ -2519,7 +2519,7 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 	 * Set up the operation.
 	 */
 	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
+					     XFS_ATTR_FORK, NULL);
 	if (error) {
 		return(error);
 	}
@@ -2584,7 +2584,7 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
 	 * Set up the operation.
 	 */
 	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK);
+					     XFS_ATTR_FORK, NULL);
 	if (error) {
 		return(error);
 	}
@@ -2641,7 +2641,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	 * Read the block containing the "old" attr
 	 */
 	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
-					     XFS_ATTR_FORK);
+					     XFS_ATTR_FORK, NULL);
 	if (error) {
 		return(error);
 	}
@@ -2652,7 +2652,7 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	 */
 	if (args->blkno2 != args->blkno) {
 		error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
-					-1, &bp2, XFS_ATTR_FORK);
+					-1, &bp2, XFS_ATTR_FORK, NULL);
 		if (error) {
 			return(error);
 		}
@@ -2753,7 +2753,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
 	 * the extents in reverse order the extent containing
 	 * block 0 must still be there.
 	 */
-	error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+	error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL);
 	if (error)
 		return(error);
 	blkno = XFS_BUF_ADDR(bp);
@@ -2839,7 +2839,7 @@ xfs_attr_node_inactive(
 		 * before we come back to this one.
 		 */
 		error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
-						XFS_ATTR_FORK);
+						XFS_ATTR_FORK, NULL);
 		if (error)
 			return(error);
 		if (child_bp) {
@@ -2880,7 +2880,7 @@ xfs_attr_node_inactive(
 		 */
 		if ((i+1) < count) {
 			error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
-				&bp, XFS_ATTR_FORK);
+				&bp, XFS_ATTR_FORK, NULL);
 			if (error)
 				return(error);
 			child_fsb = be32_to_cpu(node->btree[i+1].before);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 4af8bad7068c..f9e9149de009 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -747,7 +747,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
 	child = be32_to_cpu(oldroot->btree[0].before);
 	ASSERT(child != 0);
 	error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
-					     args->whichfork);
+					     args->whichfork, NULL);
 	if (error)
 		return(error);
 	ASSERT(bp != NULL);
@@ -838,7 +838,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 		if (blkno == 0)
 			continue;
 		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-					blkno, -1, &bp, state->args->whichfork);
+					blkno, -1, &bp, state->args->whichfork,
+					NULL);
 		if (error)
 			return(error);
 		ASSERT(bp != NULL);
@@ -1084,7 +1085,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		 */
 		blk->blkno = blkno;
 		error = xfs_da_read_buf(args->trans, args->dp, blkno,
-					-1, &blk->bp, args->whichfork);
+					-1, &blk->bp, args->whichfork, NULL);
 		if (error) {
 			blk->blkno = 0;
 			state->path.active--;
@@ -1247,7 +1248,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 		if (old_info->back) {
 			error = xfs_da_read_buf(args->trans, args->dp,
 						be32_to_cpu(old_info->back),
-						-1, &bp, args->whichfork);
+						-1, &bp, args->whichfork, NULL);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1268,7 +1269,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 		if (old_info->forw) {
 			error = xfs_da_read_buf(args->trans, args->dp,
 						be32_to_cpu(old_info->forw),
-						-1, &bp, args->whichfork);
+						-1, &bp, args->whichfork, NULL);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1368,7 +1369,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		if (drop_info->back) {
 			error = xfs_da_read_buf(args->trans, args->dp,
 						be32_to_cpu(drop_info->back),
-						-1, &bp, args->whichfork);
+						-1, &bp, args->whichfork, NULL);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1385,7 +1386,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		if (drop_info->forw) {
 			error = xfs_da_read_buf(args->trans, args->dp,
 						be32_to_cpu(drop_info->forw),
-						-1, &bp, args->whichfork);
+						-1, &bp, args->whichfork, NULL);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1470,7 +1471,7 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 		 */
 		blk->blkno = blkno;
 		error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
-						     &blk->bp, args->whichfork);
+					&blk->bp, args->whichfork, NULL);
 		if (error)
 			return(error);
 		ASSERT(blk->bp != NULL);
@@ -1733,7 +1734,8 @@ xfs_da_swap_lastblock(
 	 * Read the last block in the btree space.
 	 */
 	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
-	if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+	error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL);
+	if (error)
 		return error;
 	/*
 	 * Copy the last block into the dead buffer and log it.
@@ -1759,7 +1761,9 @@ xfs_da_swap_lastblock(
 	 * If the moved block has a left sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
-		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+		error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w,
+					NULL);
+		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
 		if (unlikely(
@@ -1780,7 +1784,9 @@ xfs_da_swap_lastblock(
 	 * If the moved block has a right sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
-		if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+		error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w,
+					NULL);
+		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
 		if (unlikely(
@@ -1803,7 +1809,9 @@ xfs_da_swap_lastblock(
 	 * Walk down the tree looking for the parent of the moved block.
 	 */
 	for (;;) {
-		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+		error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w,
+					NULL);
+		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
 		if (unlikely(par_node->hdr.info.magic !=
@@ -1853,7 +1861,9 @@ xfs_da_swap_lastblock(
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
-		if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+		error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w,
+					NULL);
+		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
 		if (unlikely(
@@ -2139,7 +2149,8 @@ xfs_da_read_buf(
 	xfs_dablk_t		bno,
 	xfs_daddr_t		mappedbno,
 	struct xfs_buf		**bpp,
-	int			whichfork)
+	int			whichfork,
+	xfs_buf_iodone_t	verifier)
 {
 	struct xfs_buf		*bp;
 	struct xfs_buf_map	map;
@@ -2161,7 +2172,7 @@ xfs_da_read_buf(
 
 	error = xfs_trans_read_buf_map(dp->i_mount, trans,
 					dp->i_mount->m_ddev_targp,
-					mapp, nmap, 0, &bp, NULL);
+					mapp, nmap, 0, &bp, verifier);
 	if (error)
 		goto out_free;
 
@@ -2217,7 +2228,8 @@ xfs_da_reada_buf(
 	struct xfs_trans	*trans,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
-	int			whichfork)
+	int			whichfork,
+	xfs_buf_iodone_t	verifier)
 {
 	xfs_daddr_t		mappedbno = -1;
 	struct xfs_buf_map	map;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 132adafb041e..bf8bfaa0d356 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
 #ifndef __XFS_DA_BTREE_H__
 #define	__XFS_DA_BTREE_H__
 
-struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_inode;
 struct xfs_mount;
@@ -226,9 +225,11 @@ int	xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 			      struct xfs_buf **bp, int whichfork);
 int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
-			       struct xfs_buf **bpp, int whichfork);
+			       struct xfs_buf **bpp, int whichfork,
+			       xfs_buf_iodone_t verifier);
 xfs_daddr_t	xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
-			xfs_dablk_t bno, int whichfork);
+				xfs_dablk_t bno, int whichfork,
+				xfs_buf_iodone_t verifier);
 int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 					  struct xfs_buf *dead_buf);
 
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e93ca8f054f4..53666ca6c953 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -97,10 +97,10 @@ xfs_dir2_block_addname(
 	/*
 	 * Read the (one and only) directory block into dabuf bp.
 	 */
-	if ((error =
-	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+	error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp,
+				XFS_DATA_FORK, NULL);
+	if (error)
 		return error;
-	}
 	ASSERT(bp != NULL);
 	hdr = bp->b_addr;
 	/*
@@ -457,7 +457,7 @@ xfs_dir2_block_getdents(
 	 * Can't read the block, give up, else get dabuf in bp.
 	 */
 	error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
-				&bp, XFS_DATA_FORK);
+				&bp, XFS_DATA_FORK, NULL);
 	if (error)
 		return error;
 
@@ -640,10 +640,10 @@ xfs_dir2_block_lookup_int(
 	/*
 	 * Read the buffer, return error if we can't get it.
 	 */
-	if ((error =
-	    xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+	error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp,
+				XFS_DATA_FORK, NULL);
+	if (error)
 		return error;
-	}
 	ASSERT(bp != NULL);
 	hdr = bp->b_addr;
 	xfs_dir2_data_check(dp, bp);
@@ -917,10 +917,11 @@ xfs_dir2_leaf_to_block(
 	/*
 	 * Read the data block if we don't already have it, give up if it fails.
 	 */
-	if (dbp == NULL &&
-	    (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
-		    XFS_DATA_FORK))) {
-		return error;
+	if (!dbp) {
+		error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
+					XFS_DATA_FORK, NULL);
+		if (error)
+			return error;
 	}
 	hdr = dbp->b_addr;
 	ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index bac86984e403..86e3dc1de0e7 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -315,10 +315,9 @@ xfs_dir2_leaf_addname(
 	 * Read the leaf block.
 	 */
 	error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
-		XFS_DATA_FORK);
-	if (error) {
+				XFS_DATA_FORK, NULL);
+	if (error)
 		return error;
-	}
 	ASSERT(lbp != NULL);
 	/*
 	 * Look up the entry by hash value and name.
@@ -500,9 +499,9 @@ xfs_dir2_leaf_addname(
 	 * Just read that one in.
 	 */
 	else {
-		if ((error =
-		    xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
-			    -1, &dbp, XFS_DATA_FORK))) {
+		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
+					-1, &dbp, XFS_DATA_FORK, NULL);
+		if (error) {
 			xfs_trans_brelse(tp, lbp);
 			return error;
 		}
@@ -895,7 +894,7 @@ xfs_dir2_leaf_readbuf(
 	error = xfs_da_read_buf(NULL, dp, map->br_startoff,
 			map->br_blockcount >= mp->m_dirblkfsbs ?
 			    XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1,
-			&bp, XFS_DATA_FORK);
+			&bp, XFS_DATA_FORK, NULL);
 
 	/*
 	 * Should just skip over the data block instead of giving up.
@@ -938,7 +937,7 @@ xfs_dir2_leaf_readbuf(
 			xfs_da_reada_buf(NULL, dp,
 					map[mip->ra_index].br_startoff +
 							mip->ra_offset,
-					XFS_DATA_FORK);
+					XFS_DATA_FORK, NULL);
 			mip->ra_current = i;
 		}
 
@@ -1376,7 +1375,7 @@ xfs_dir2_leaf_lookup_int(
 	 * Read the leaf block into the buffer.
 	 */
 	error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
-							XFS_DATA_FORK);
+							XFS_DATA_FORK, NULL);
 	if (error)
 		return error;
 	*lbpp = lbp;
@@ -1411,7 +1410,7 @@ xfs_dir2_leaf_lookup_int(
 				xfs_trans_brelse(tp, dbp);
 			error = xfs_da_read_buf(tp, dp,
 						xfs_dir2_db_to_da(mp, newdb),
-						-1, &dbp, XFS_DATA_FORK);
+						-1, &dbp, XFS_DATA_FORK, NULL);
 			if (error) {
 				xfs_trans_brelse(tp, lbp);
 				return error;
@@ -1453,7 +1452,7 @@ xfs_dir2_leaf_lookup_int(
 			xfs_trans_brelse(tp, dbp);
 			error = xfs_da_read_buf(tp, dp,
 						xfs_dir2_db_to_da(mp, cidb),
-						-1, &dbp, XFS_DATA_FORK);
+						-1, &dbp, XFS_DATA_FORK, NULL);
 			if (error) {
 				xfs_trans_brelse(tp, lbp);
 				return error;
@@ -1738,10 +1737,10 @@ xfs_dir2_leaf_trim_data(
 	/*
 	 * Read the offending data block.  We need its buffer.
 	 */
-	if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
-			XFS_DATA_FORK))) {
+	error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
+				XFS_DATA_FORK, NULL);
+	if (error)
 		return error;
-	}
 
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1863,10 @@ xfs_dir2_node_to_leaf(
 	/*
 	 * Read the freespace block.
 	 */
-	if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
-			XFS_DATA_FORK))) {
+	error = xfs_da_read_buf(tp, dp,  mp->m_dirfreeblk, -1, &fbp,
+				XFS_DATA_FORK, NULL);
+	if (error)
 		return error;
-	}
 	free = fbp->b_addr;
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 	ASSERT(!free->hdr.firstdb);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 6c7052406605..290c2b1016ab 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -399,7 +399,7 @@ xfs_dir2_leafn_lookup_for_addname(
 				 */
 				error = xfs_da_read_buf(tp, dp,
 						xfs_dir2_db_to_da(mp, newfdb),
-						-1, &curbp, XFS_DATA_FORK);
+						-1, &curbp, XFS_DATA_FORK, NULL);
 				if (error)
 					return error;
 				free = curbp->b_addr;
@@ -536,7 +536,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			} else {
 				error = xfs_da_read_buf(tp, dp,
 						xfs_dir2_db_to_da(mp, newdb),
-						-1, &curbp, XFS_DATA_FORK);
+						-1, &curbp, XFS_DATA_FORK, NULL);
 				if (error)
 					return error;
 			}
@@ -915,10 +915,10 @@ xfs_dir2_leafn_remove(
 		 * read in the free block.
 		 */
 		fdb = xfs_dir2_db_to_fdb(mp, db);
-		if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
-				-1, &fbp, XFS_DATA_FORK))) {
+		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+					-1, &fbp, XFS_DATA_FORK, NULL);
+		if (error)
 			return error;
-		}
 		free = fbp->b_addr;
 		ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 		ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -1169,11 +1169,10 @@ xfs_dir2_leafn_toosmall(
 		/*
 		 * Read the sibling leaf block.
 		 */
-		if ((error =
-		    xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
-			    -1, &bp, XFS_DATA_FORK))) {
+		error = xfs_da_read_buf(state->args->trans, state->args->dp,
+					blkno, -1, &bp, XFS_DATA_FORK, NULL);
+		if (error)
 			return error;
-		}
 		ASSERT(bp != NULL);
 		/*
 		 * Count bytes in the two blocks combined.
@@ -1454,14 +1453,13 @@ xfs_dir2_node_addname_int(
 			 * This should be really rare, so there's no reason
 			 * to avoid it.
 			 */
-			if ((error = xfs_da_read_buf(tp, dp,
-					xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-					XFS_DATA_FORK))) {
+			error = xfs_da_read_buf(tp, dp,
+						xfs_dir2_db_to_da(mp, fbno), -2,
+						&fbp, XFS_DATA_FORK, NULL);
+			if (error)
 				return error;
-			}
-			if (unlikely(fbp == NULL)) {
+			if (!fbp)
 				continue;
-			}
 			free = fbp->b_addr;
 			ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 			findex = 0;
@@ -1520,9 +1518,9 @@ xfs_dir2_node_addname_int(
 		 * that was just allocated.
 		 */
 		fbno = xfs_dir2_db_to_fdb(mp, dbno);
-		if (unlikely(error = xfs_da_read_buf(tp, dp,
-				xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-				XFS_DATA_FORK)))
+		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2,
+					&fbp, XFS_DATA_FORK, NULL);
+		if (error)
 			return error;
 
 		/*
@@ -1631,7 +1629,7 @@ xfs_dir2_node_addname_int(
 		 * Read the data block in.
 		 */
 		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
-				-1, &dbp, XFS_DATA_FORK);
+					-1, &dbp, XFS_DATA_FORK, NULL);
 		if (error)
 			return error;
 		hdr = dbp->b_addr;
@@ -1917,11 +1915,10 @@ xfs_dir2_node_trim_free(
 	/*
 	 * Read the freespace block.
 	 */
-	if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
-			XFS_DATA_FORK))) {
+	error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
+				XFS_DATA_FORK, NULL);
+	if (error)
 		return error;
-	}
-
 	/*
 	 * There can be holes in freespace.  If fo is a hole, there's
 	 * nothing to do.
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index c42f99e71f14..f6dab7da7bcc 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -891,7 +891,7 @@ xfs_dir_open(
 	 */
 	mode = xfs_ilock_map_shared(ip);
 	if (ip->i_d.di_nextents > 0)
-		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL);
 	xfs_iunlock(ip, mode);
 	return 0;
 }
-- 
cgit v1.2.3


From 20f7e9f3726a27cccade65c28265eef8ca50eecb Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:11 +1100
Subject: xfs: factor dir2 block read operations

In preparation for verifying dir2 block format buffers, factor
the read operations out of the block operations (lookup, addname,
getdents) and some of the additional logic to make it easier to
understand an dmodify the code.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_block.c | 386 ++++++++++++++++++++++++++----------------------
 1 file changed, 209 insertions(+), 177 deletions(-)

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 53666ca6c953..25ce409487be 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,178 @@ xfs_dir_startup(void)
 	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
 }
 
+static int
+xfs_dir2_block_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	struct xfs_buf		**bpp)
+{
+	struct xfs_mount	*mp = dp->i_mount;
+
+	return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+					XFS_DATA_FORK, NULL);
+}
+
+static void
+xfs_dir2_block_need_space(
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,
+	__be16				**tagpp,
+	struct xfs_dir2_data_unused	**dupp,
+	struct xfs_dir2_data_unused	**enddupp,
+	int				*compact,
+	int				len)
+{
+	struct xfs_dir2_data_free	*bf;
+	__be16				*tagp = NULL;
+	struct xfs_dir2_data_unused	*dup = NULL;
+	struct xfs_dir2_data_unused	*enddup = NULL;
+
+	*compact = 0;
+	bf = hdr->bestfree;
+
+	/*
+	 * If there are stale entries we'll use one for the leaf.
+	 */
+	if (btp->stale) {
+		if (be16_to_cpu(bf[0].length) >= len) {
+			/*
+			 * The biggest entry enough to avoid compaction.
+			 */
+			dup = (xfs_dir2_data_unused_t *)
+			      ((char *)hdr + be16_to_cpu(bf[0].offset));
+			goto out;
+		}
+
+		/*
+		 * Will need to compact to make this work.
+		 * Tag just before the first leaf entry.
+		 */
+		*compact = 1;
+		tagp = (__be16 *)blp - 1;
+
+		/* Data object just before the first leaf entry.  */
+		dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+		/*
+		 * If it's not free then the data will go where the
+		 * leaf data starts now, if it works at all.
+		 */
+		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+			if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+			    (uint)sizeof(*blp) < len)
+				dup = NULL;
+		} else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+			dup = NULL;
+		else
+			dup = (xfs_dir2_data_unused_t *)blp;
+		goto out;
+	}
+
+	/*
+	 * no stale entries, so just use free space.
+	 * Tag just before the first leaf entry.
+	 */
+	tagp = (__be16 *)blp - 1;
+
+	/* Data object just before the first leaf entry.  */
+	enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+	/*
+	 * If it's not free then can't do this add without cleaning up:
+	 * the space before the first leaf entry needs to be free so it
+	 * can be expanded to hold the pointer to the new entry.
+	 */
+	if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+		/*
+		 * Check out the biggest freespace and see if it's the same one.
+		 */
+		dup = (xfs_dir2_data_unused_t *)
+		      ((char *)hdr + be16_to_cpu(bf[0].offset));
+		if (dup != enddup) {
+			/*
+			 * Not the same free entry, just check its length.
+			 */
+			if (be16_to_cpu(dup->length) < len)
+				dup = NULL;
+			goto out;
+		}
+
+		/*
+		 * It is the biggest freespace, can it hold the leaf too?
+		 */
+		if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+			/*
+			 * Yes, use the second-largest entry instead if it works.
+			 */
+			if (be16_to_cpu(bf[1].length) >= len)
+				dup = (xfs_dir2_data_unused_t *)
+				      ((char *)hdr + be16_to_cpu(bf[1].offset));
+			else
+				dup = NULL;
+		}
+	}
+out:
+	*tagpp = tagp;
+	*dupp = dup;
+	*enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+	struct xfs_trans		*tp,
+	struct xfs_buf			*bp,
+	struct xfs_dir2_data_hdr	*hdr,
+	struct xfs_dir2_block_tail	*btp,
+	struct xfs_dir2_leaf_entry	*blp,
+	int				*needlog,
+	int				*lfloghigh,
+	int				*lfloglow)
+{
+	int			fromidx;	/* source leaf index */
+	int			toidx;		/* target leaf index */
+	int			needscan = 0;
+	int			highstale;	/* high stale index */
+
+	fromidx = toidx = be32_to_cpu(btp->count) - 1;
+	highstale = *lfloghigh = -1;
+	for (; fromidx >= 0; fromidx--) {
+		if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+			if (highstale == -1)
+				highstale = toidx;
+			else {
+				if (*lfloghigh == -1)
+					*lfloghigh = toidx;
+				continue;
+			}
+		}
+		if (fromidx < toidx)
+			blp[toidx] = blp[fromidx];
+		toidx--;
+	}
+	*lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+	*lfloghigh -= be32_to_cpu(btp->stale) - 1;
+	be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+	xfs_dir2_data_make_free(tp, bp,
+		(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+		(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+		needlog, &needscan);
+	blp += be32_to_cpu(btp->stale) - 1;
+	btp->stale = cpu_to_be32(1);
+	/*
+	 * If we now need to rebuild the bestfree map, do so.
+	 * This needs to happen before the next call to use_free.
+	 */
+	if (needscan)
+		xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
+}
+
 /*
  * Add an entry to a block directory.
  */
@@ -63,7 +235,6 @@ int						/* error */
 xfs_dir2_block_addname(
 	xfs_da_args_t		*args)		/* directory op arguments */
 {
-	xfs_dir2_data_free_t	*bf;		/* bestfree table in block */
 	xfs_dir2_data_hdr_t	*hdr;		/* block header */
 	xfs_dir2_leaf_entry_t	*blp;		/* block leaf entries */
 	struct xfs_buf		*bp;		/* buffer for block */
@@ -94,134 +265,44 @@ xfs_dir2_block_addname(
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
-	/*
-	 * Read the (one and only) directory block into dabuf bp.
-	 */
-	error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp,
-				XFS_DATA_FORK, NULL);
+
+	/* Read the (one and only) directory block into bp. */
+	error = xfs_dir2_block_read(tp, dp, &bp);
 	if (error)
 		return error;
-	ASSERT(bp != NULL);
-	hdr = bp->b_addr;
-	/*
-	 * Check the magic number, corrupted if wrong.
-	 */
-	if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
-		XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
-				     XFS_ERRLEVEL_LOW, mp, hdr);
-		xfs_trans_brelse(tp, bp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
+
 	len = xfs_dir2_data_entsize(args->namelen);
+
 	/*
 	 * Set up pointers to parts of the block.
 	 */
-	bf = hdr->bestfree;
+	hdr = bp->b_addr;
 	btp = xfs_dir2_block_tail_p(mp, hdr);
 	blp = xfs_dir2_block_leaf_p(btp);
+
 	/*
-	 * No stale entries?  Need space for entry and new leaf.
-	 */
-	if (!btp->stale) {
-		/*
-		 * Tag just before the first leaf entry.
-		 */
-		tagp = (__be16 *)blp - 1;
-		/*
-		 * Data object just before the first leaf entry.
-		 */
-		enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-		/*
-		 * If it's not free then can't do this add without cleaning up:
-		 * the space before the first leaf entry needs to be free so it
-		 * can be expanded to hold the pointer to the new entry.
-		 */
-		if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
-			dup = enddup = NULL;
-		/*
-		 * Check out the biggest freespace and see if it's the same one.
-		 */
-		else {
-			dup = (xfs_dir2_data_unused_t *)
-			      ((char *)hdr + be16_to_cpu(bf[0].offset));
-			if (dup == enddup) {
-				/*
-				 * It is the biggest freespace, is it too small
-				 * to hold the new leaf too?
-				 */
-				if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
-					/*
-					 * Yes, we use the second-largest
-					 * entry instead if it works.
-					 */
-					if (be16_to_cpu(bf[1].length) >= len)
-						dup = (xfs_dir2_data_unused_t *)
-						      ((char *)hdr +
-						       be16_to_cpu(bf[1].offset));
-					else
-						dup = NULL;
-				}
-			} else {
-				/*
-				 * Not the same free entry,
-				 * just check its length.
-				 */
-				if (be16_to_cpu(dup->length) < len) {
-					dup = NULL;
-				}
-			}
-		}
-		compact = 0;
-	}
-	/*
-	 * If there are stale entries we'll use one for the leaf.
-	 * Is the biggest entry enough to avoid compaction?
-	 */
-	else if (be16_to_cpu(bf[0].length) >= len) {
-		dup = (xfs_dir2_data_unused_t *)
-		      ((char *)hdr + be16_to_cpu(bf[0].offset));
-		compact = 0;
-	}
-	/*
-	 * Will need to compact to make this work.
+	 * Find out if we can reuse stale entries or whether we need extra
+	 * space for entry and new leaf.
 	 */
-	else {
-		/*
-		 * Tag just before the first leaf entry.
-		 */
-		tagp = (__be16 *)blp - 1;
-		/*
-		 * Data object just before the first leaf entry.
-		 */
-		dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-		/*
-		 * If it's not free then the data will go where the
-		 * leaf data starts now, if it works at all.
-		 */
-		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-			if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
-			    (uint)sizeof(*blp) < len)
-				dup = NULL;
-		} else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
-			dup = NULL;
-		else
-			dup = (xfs_dir2_data_unused_t *)blp;
-		compact = 1;
-	}
+	xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
+				  &enddup, &compact, len);
+
 	/*
-	 * If this isn't a real add, we're done with the buffer.
+	 * Done everything we need for a space check now.
 	 */
-	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+	if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
 		xfs_trans_brelse(tp, bp);
+		if (!dup)
+			return XFS_ERROR(ENOSPC);
+		return 0;
+	}
+
 	/*
 	 * If we don't have space for the new entry & leaf ...
 	 */
 	if (!dup) {
-		/*
-		 * Not trying to actually do anything, or don't have
-		 * a space reservation: return no-space.
-		 */
-		if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+		/* Don't have a space reservation: return no-space.  */
+		if (args->total == 0)
 			return XFS_ERROR(ENOSPC);
 		/*
 		 * Convert to the next larger format.
@@ -232,65 +313,24 @@ xfs_dir2_block_addname(
 			return error;
 		return xfs_dir2_leaf_addname(args);
 	}
-	/*
-	 * Just checking, and it would work, so say so.
-	 */
-	if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-		return 0;
+
 	needlog = needscan = 0;
+
 	/*
 	 * If need to compact the leaf entries, do it now.
-	 * Leave the highest-numbered stale entry stale.
-	 * XXX should be the one closest to mid but mid is not yet computed.
-	 */
-	if (compact) {
-		int	fromidx;		/* source leaf index */
-		int	toidx;			/* target leaf index */
-
-		for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
-			highstale = lfloghigh = -1;
-		     fromidx >= 0;
-		     fromidx--) {
-			if (blp[fromidx].address ==
-			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
-				if (highstale == -1)
-					highstale = toidx;
-				else {
-					if (lfloghigh == -1)
-						lfloghigh = toidx;
-					continue;
-				}
-			}
-			if (fromidx < toidx)
-				blp[toidx] = blp[fromidx];
-			toidx--;
-		}
-		lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
-		lfloghigh -= be32_to_cpu(btp->stale) - 1;
-		be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
-		xfs_dir2_data_make_free(tp, bp,
-			(xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
-			(xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
-			&needlog, &needscan);
-		blp += be32_to_cpu(btp->stale) - 1;
-		btp->stale = cpu_to_be32(1);
-		/*
-		 * If we now need to rebuild the bestfree map, do so.
-		 * This needs to happen before the next call to use_free.
-		 */
-		if (needscan) {
-			xfs_dir2_data_freescan(mp, hdr, &needlog);
-			needscan = 0;
-		}
-	}
-	/*
-	 * Set leaf logging boundaries to impossible state.
-	 * For the no-stale case they're set explicitly.
 	 */
+	if (compact)
+		xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
+				      &lfloghigh, &lfloglow);
 	else if (btp->stale) {
+		/*
+		 * Set leaf logging boundaries to impossible state.
+		 * For the no-stale case they're set explicitly.
+		 */
 		lfloglow = be32_to_cpu(btp->count);
 		lfloghigh = -1;
 	}
+
 	/*
 	 * Find the slot that's first lower than our hash value, -1 if none.
 	 */
@@ -450,18 +490,13 @@ xfs_dir2_block_getdents(
 	/*
 	 * If the block number in the offset is out of range, we're done.
 	 */
-	if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) {
+	if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
 		return 0;
-	}
-	/*
-	 * Can't read the block, give up, else get dabuf in bp.
-	 */
-	error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
-				&bp, XFS_DATA_FORK, NULL);
+
+	error = xfs_dir2_block_read(NULL, dp, &bp);
 	if (error)
 		return error;
 
-	ASSERT(bp != NULL);
 	/*
 	 * Extract the byte offset we start at from the seek pointer.
 	 * We'll skip entries before this.
@@ -637,14 +672,11 @@ xfs_dir2_block_lookup_int(
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
-	/*
-	 * Read the buffer, return error if we can't get it.
-	 */
-	error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp,
-				XFS_DATA_FORK, NULL);
+
+	error = xfs_dir2_block_read(tp, dp, &bp);
 	if (error)
 		return error;
-	ASSERT(bp != NULL);
+
 	hdr = bp->b_addr;
 	xfs_dir2_data_check(dp, bp);
 	btp = xfs_dir2_block_tail_p(mp, hdr);
-- 
cgit v1.2.3


From 82025d7f79148fe66a1594a0ebe4ab38152cf9e6 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:12 +1100
Subject: xfs: verify dir2 block format buffers

Add a dir2 block format read verifier. To fully verify every block
when read, call xfs_dir2_data_check() on them. Change
xfs_dir2_data_check() to do runtime checking, convert ASSERT()
checks to XFS_WANT_CORRUPTED_RETURN(), which will trigger an ASSERT
failure on debug kernels, but on production kernels will dump an
error to dmesg and return EFSCORRUPTED to the caller.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_block.c | 22 ++++++++++++++-
 fs/xfs/xfs_dir2_data.c  | 73 +++++++++++++++++++++++++++++--------------------
 fs/xfs/xfs_dir2_priv.h  |  4 ++-
 3 files changed, 68 insertions(+), 31 deletions(-)

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 25ce409487be..57351b868861 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,26 @@ xfs_dir_startup(void)
 	xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
 }
 
+static void
+xfs_dir2_block_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+	int			block_ok = 0;
+
+	block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+	block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+	if (!block_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
 static int
 xfs_dir2_block_read(
 	struct xfs_trans	*tp,
@@ -65,7 +85,7 @@ xfs_dir2_block_read(
 	struct xfs_mount	*mp = dp->i_mount;
 
 	return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
-					XFS_DATA_FORK, NULL);
+					XFS_DATA_FORK, xfs_dir2_block_verify);
 }
 
 static void
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 44ffd4d6bc91..cb117234e32e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,14 +34,13 @@
 STATIC xfs_dir2_data_free_t *
 xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
 
-#ifdef DEBUG
 /*
  * Check the consistency of the data block.
  * The input can also be a block-format directory.
- * Pop an assert if we find anything bad.
+ * Return 0 is the buffer is good, otherwise an error.
  */
-void
-xfs_dir2_data_check(
+int
+__xfs_dir2_data_check(
 	struct xfs_inode	*dp,		/* incore inode pointer */
 	struct xfs_buf		*bp)		/* data block's buffer */
 {
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
 	int			stale;		/* count of stale leaves */
 	struct xfs_name		name;
 
-	mp = dp->i_mount;
+	mp = bp->b_target->bt_mount;
 	hdr = bp->b_addr;
 	bf = hdr->bestfree;
 	p = (char *)(hdr + 1);
 
-	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
 		btp = xfs_dir2_block_tail_p(mp, hdr);
 		lep = xfs_dir2_block_leaf_p(btp);
 		endp = (char *)lep;
-	} else {
-		ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+		break;
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
 		endp = (char *)hdr + mp->m_dirblksize;
+		break;
+	default:
+		XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+		return EFSCORRUPTED;
 	}
 
 	count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
 	 * Account for zero bestfree entries.
 	 */
 	if (!bf[0].length) {
-		ASSERT(!bf[0].offset);
+		XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
 		freeseen |= 1 << 0;
 	}
 	if (!bf[1].length) {
-		ASSERT(!bf[1].offset);
+		XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
 		freeseen |= 1 << 1;
 	}
 	if (!bf[2].length) {
-		ASSERT(!bf[2].offset);
+		XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
 		freeseen |= 1 << 2;
 	}
-	ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length));
-	ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length));
+
+	XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+						be16_to_cpu(bf[1].length));
+	XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+						be16_to_cpu(bf[2].length));
 	/*
 	 * Loop over the data/unused entries.
 	 */
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
 		 * doesn't need to be there.
 		 */
 		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-			ASSERT(lastfree == 0);
-			ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
-			       (char *)dup - (char *)hdr);
+			XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+			XFS_WANT_CORRUPTED_RETURN(
+				be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+					       (char *)dup - (char *)hdr);
 			dfp = xfs_dir2_data_freefind(hdr, dup);
 			if (dfp) {
 				i = (int)(dfp - bf);
-				ASSERT((freeseen & (1 << i)) == 0);
+				XFS_WANT_CORRUPTED_RETURN(
+					(freeseen & (1 << i)) == 0);
 				freeseen |= 1 << i;
 			} else {
-				ASSERT(be16_to_cpu(dup->length) <=
-				       be16_to_cpu(bf[2].length));
+				XFS_WANT_CORRUPTED_RETURN(
+					be16_to_cpu(dup->length) <=
+						be16_to_cpu(bf[2].length));
 			}
 			p += be16_to_cpu(dup->length);
 			lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
 		 * The linear search is crude but this is DEBUG code.
 		 */
 		dep = (xfs_dir2_data_entry_t *)p;
-		ASSERT(dep->namelen != 0);
-		ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
-		ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
-		       (char *)dep - (char *)hdr);
+		XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+		XFS_WANT_CORRUPTED_RETURN(
+			!xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+		XFS_WANT_CORRUPTED_RETURN(
+			be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+					       (char *)dep - (char *)hdr);
 		count++;
 		lastfree = 0;
 		if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,30 @@ xfs_dir2_data_check(
 				    be32_to_cpu(lep[i].hashval) == hash)
 					break;
 			}
-			ASSERT(i < be32_to_cpu(btp->count));
+			XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
 		}
 		p += xfs_dir2_data_entsize(dep->namelen);
 	}
 	/*
 	 * Need to have seen all the entries and all the bestfree slots.
 	 */
-	ASSERT(freeseen == 7);
+	XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
 	if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
 		for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
 			if (lep[i].address ==
 			    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
 				stale++;
 			if (i > 0)
-				ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval));
+				XFS_WANT_CORRUPTED_RETURN(
+					be32_to_cpu(lep[i].hashval) >=
+						be32_to_cpu(lep[i - 1].hashval));
 		}
-		ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
-		ASSERT(stale == be32_to_cpu(btp->stale));
+		XFS_WANT_CORRUPTED_RETURN(count ==
+			be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+		XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
 	}
+	return 0;
 }
-#endif
 
 /*
  * Given a data block and an unused entry from that block,
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3523d3e15aa8..93b8f66ae788 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -41,10 +41,12 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 
 /* xfs_dir2_data.c */
 #ifdef DEBUG
-extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#define	xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp);
 #else
 #define	xfs_dir2_data_check(dp,bp)
 #endif
+extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
 		struct xfs_dir2_data_unused *dup, int *loghead);
-- 
cgit v1.2.3


From 2025207ca6738a1217126ef14af9d104433f9824 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:13 +1100
Subject: xfs: factor dir2 free block reading

Also factor out the updating of the free block when removing entries
from leaf blocks, and add a verifier callback for reads.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_leaf.c |   3 +-
 fs/xfs/xfs_dir2_node.c | 218 +++++++++++++++++++++++++++++++------------------
 fs/xfs/xfs_dir2_priv.h |   2 +
 3 files changed, 143 insertions(+), 80 deletions(-)

diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 86e3dc1de0e7..6c1359dc9898 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -1863,8 +1863,7 @@ xfs_dir2_node_to_leaf(
 	/*
 	 * Read the freespace block.
 	 */
-	error = xfs_da_read_buf(tp, dp,  mp->m_dirfreeblk, -1, &fbp,
-				XFS_DATA_FORK, NULL);
+	error = xfs_dir2_free_read(tp, dp,  mp->m_dirfreeblk, &fbp);
 	if (error)
 		return error;
 	free = fbp->b_addr;
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 290c2b1016ab..d7f899dfbff5 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -55,6 +55,57 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
 static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
 				     xfs_da_state_blk_t *fblk);
 
+static void
+xfs_dir2_free_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+	int			block_ok = 0;
+
+	block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
+	if (!block_ok) {
+		XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
+				     XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+static int
+__xfs_dir2_free_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+					XFS_DATA_FORK, xfs_dir2_free_verify);
+}
+
+int
+xfs_dir2_free_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	struct xfs_buf		**bpp)
+{
+	return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
+}
+
 /*
  * Log entries from a freespace block.
  */
@@ -394,12 +445,10 @@ xfs_dir2_leafn_lookup_for_addname(
 				 */
 				if (curbp)
 					xfs_trans_brelse(tp, curbp);
-				/*
-				 * Read the free block.
-				 */
-				error = xfs_da_read_buf(tp, dp,
+
+				error = xfs_dir2_free_read(tp, dp,
 						xfs_dir2_db_to_da(mp, newfdb),
-						-1, &curbp, XFS_DATA_FORK, NULL);
+						&curbp);
 				if (error)
 					return error;
 				free = curbp->b_addr;
@@ -825,6 +874,77 @@ xfs_dir2_leafn_rebalance(
 	}
 }
 
+static int
+xfs_dir2_data_block_free(
+	xfs_da_args_t		*args,
+	struct xfs_dir2_data_hdr *hdr,
+	struct xfs_dir2_free	*free,
+	xfs_dir2_db_t		fdb,
+	int			findex,
+	struct xfs_buf		*fbp,
+	int			longest)
+{
+	struct xfs_trans	*tp = args->trans;
+	int			logfree = 0;
+
+	if (!hdr) {
+		/* One less used entry in the free table.  */
+		be32_add_cpu(&free->hdr.nused, -1);
+		xfs_dir2_free_log_header(tp, fbp);
+
+		/*
+		 * If this was the last entry in the table, we can trim the
+		 * table size back.  There might be other entries at the end
+		 * referring to non-existent data blocks, get those too.
+		 */
+		if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
+			int	i;		/* free entry index */
+
+			for (i = findex - 1; i >= 0; i--) {
+				if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
+					break;
+			}
+			free->hdr.nvalid = cpu_to_be32(i + 1);
+			logfree = 0;
+		} else {
+			/* Not the last entry, just punch it out.  */
+			free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+			logfree = 1;
+		}
+		/*
+		 * If there are no useful entries left in the block,
+		 * get rid of the block if we can.
+		 */
+		if (!free->hdr.nused) {
+			int error;
+
+			error = xfs_dir2_shrink_inode(args, fdb, fbp);
+			if (error == 0) {
+				fbp = NULL;
+				logfree = 0;
+			} else if (error != ENOSPC || args->total != 0)
+				return error;
+			/*
+			 * It's possible to get ENOSPC if there is no
+			 * space reservation.  In this case some one
+			 * else will eventually get rid of this block.
+			 */
+		}
+	} else {
+		/*
+		 * Data block is not empty, just set the free entry to the new
+		 * value.
+		 */
+		free->bests[findex] = cpu_to_be16(longest);
+		logfree = 1;
+	}
+
+	/* Log the free entry that changed, unless we got rid of it.  */
+	if (logfree)
+		xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+	return 0;
+}
+
 /*
  * Remove an entry from a node directory.
  * This removes the leaf entry and the data entry,
@@ -908,15 +1028,14 @@ xfs_dir2_leafn_remove(
 		xfs_dir2_db_t	fdb;		/* freeblock block number */
 		int		findex;		/* index in freeblock entries */
 		xfs_dir2_free_t	*free;		/* freeblock structure */
-		int		logfree;	/* need to log free entry */
 
 		/*
 		 * Convert the data block number to a free block,
 		 * read in the free block.
 		 */
 		fdb = xfs_dir2_db_to_fdb(mp, db);
-		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
-					-1, &fbp, XFS_DATA_FORK, NULL);
+		error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+					   &fbp);
 		if (error)
 			return error;
 		free = fbp->b_addr;
@@ -954,68 +1073,12 @@ xfs_dir2_leafn_remove(
 		 * If we got rid of the data block, we can eliminate that entry
 		 * in the free block.
 		 */
-		if (hdr == NULL) {
-			/*
-			 * One less used entry in the free table.
-			 */
-			be32_add_cpu(&free->hdr.nused, -1);
-			xfs_dir2_free_log_header(tp, fbp);
-			/*
-			 * If this was the last entry in the table, we can
-			 * trim the table size back.  There might be other
-			 * entries at the end referring to non-existent
-			 * data blocks, get those too.
-			 */
-			if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
-				int	i;		/* free entry index */
-
-				for (i = findex - 1;
-				     i >= 0 &&
-				     free->bests[i] == cpu_to_be16(NULLDATAOFF);
-				     i--)
-					continue;
-				free->hdr.nvalid = cpu_to_be32(i + 1);
-				logfree = 0;
-			}
-			/*
-			 * Not the last entry, just punch it out.
-			 */
-			else {
-				free->bests[findex] = cpu_to_be16(NULLDATAOFF);
-				logfree = 1;
-			}
-			/*
-			 * If there are no useful entries left in the block,
-			 * get rid of the block if we can.
-			 */
-			if (!free->hdr.nused) {
-				error = xfs_dir2_shrink_inode(args, fdb, fbp);
-				if (error == 0) {
-					fbp = NULL;
-					logfree = 0;
-				} else if (error != ENOSPC || args->total != 0)
-					return error;
-				/*
-				 * It's possible to get ENOSPC if there is no
-				 * space reservation.  In this case some one
-				 * else will eventually get rid of this block.
-				 */
-			}
-		}
-		/*
-		 * Data block is not empty, just set the free entry to
-		 * the new value.
-		 */
-		else {
-			free->bests[findex] = cpu_to_be16(longest);
-			logfree = 1;
-		}
-		/*
-		 * Log the free entry that changed, unless we got rid of it.
-		 */
-		if (logfree)
-			xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+		error = xfs_dir2_data_block_free(args, hdr, free,
+						 fdb, findex, fbp, longest);
+		if (error)
+			return error;
 	}
+
 	xfs_dir2_leafn_check(dp, bp);
 	/*
 	 * Return indication of whether this leaf block is empty enough
@@ -1453,9 +1516,9 @@ xfs_dir2_node_addname_int(
 			 * This should be really rare, so there's no reason
 			 * to avoid it.
 			 */
-			error = xfs_da_read_buf(tp, dp,
-						xfs_dir2_db_to_da(mp, fbno), -2,
-						&fbp, XFS_DATA_FORK, NULL);
+			error = xfs_dir2_free_try_read(tp, dp,
+						xfs_dir2_db_to_da(mp, fbno),
+						&fbp);
 			if (error)
 				return error;
 			if (!fbp)
@@ -1518,8 +1581,9 @@ xfs_dir2_node_addname_int(
 		 * that was just allocated.
 		 */
 		fbno = xfs_dir2_db_to_fdb(mp, dbno);
-		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fbno), -2,
-					&fbp, XFS_DATA_FORK, NULL);
+		error = xfs_dir2_free_try_read(tp, dp,
+					       xfs_dir2_db_to_da(mp, fbno),
+					       &fbp);
 		if (error)
 			return error;
 
@@ -1915,17 +1979,15 @@ xfs_dir2_node_trim_free(
 	/*
 	 * Read the freespace block.
 	 */
-	error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
-				XFS_DATA_FORK, NULL);
+	error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
 	if (error)
 		return error;
 	/*
 	 * There can be holes in freespace.  If fo is a hole, there's
 	 * nothing to do.
 	 */
-	if (bp == NULL) {
+	if (!bp)
 		return 0;
-	}
 	free = bp->b_addr;
 	ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
 	/*
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 93b8f66ae788..263a63287910 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -117,6 +117,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
 extern int xfs_dir2_node_replace(struct xfs_da_args *args);
 extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
 		int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, struct xfs_buf **bpp);
 
 /* xfs_dir2_sf.c */
 extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
-- 
cgit v1.2.3


From e4813572640e27d3a5cce3f06751a9f54f77aaa5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:14 +1100
Subject: xfs: factor out dir2 data block reading

And add a verifier callback function while there.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_block.c |  3 +--
 fs/xfs/xfs_dir2_data.c  | 32 ++++++++++++++++++++++++++++++++
 fs/xfs/xfs_dir2_leaf.c  | 38 +++++++++++++++++---------------------
 fs/xfs/xfs_dir2_node.c  |  8 ++++----
 fs/xfs/xfs_dir2_priv.h  |  2 ++
 5 files changed, 56 insertions(+), 27 deletions(-)

diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 57351b868861..ca03b109772d 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -970,8 +970,7 @@ xfs_dir2_leaf_to_block(
 	 * Read the data block if we don't already have it, give up if it fails.
 	 */
 	if (!dbp) {
-		error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
-					XFS_DATA_FORK, NULL);
+		error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
 		if (error)
 			return error;
 	}
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index cb117234e32e..0ef04f1bf511 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -185,6 +185,38 @@ __xfs_dir2_data_check(
 	return 0;
 }
 
+static void
+xfs_dir2_data_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+	int			block_ok = 0;
+
+	block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+	block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+	if (!block_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+int
+xfs_dir2_data_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mapped_bno,
+	struct xfs_buf		**bpp)
+{
+	return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+					XFS_DATA_FORK, xfs_dir2_data_verify);
+}
+
 /*
  * Given a data block and an unused entry from that block,
  * return the bestfree entry if any that corresponds to it.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 6c1359dc9898..0fdf765c917f 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -493,14 +493,14 @@ xfs_dir2_leaf_addname(
 		hdr = dbp->b_addr;
 		bestsp[use_block] = hdr->bestfree[0].length;
 		grown = 1;
-	}
-	/*
-	 * Already had space in some data block.
-	 * Just read that one in.
-	 */
-	else {
-		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
-					-1, &dbp, XFS_DATA_FORK, NULL);
+	} else {
+		/*
+		 * Already had space in some data block.
+		 * Just read that one in.
+		 */
+		error = xfs_dir2_data_read(tp, dp,
+					   xfs_dir2_db_to_da(mp, use_block),
+					   -1, &dbp);
 		if (error) {
 			xfs_trans_brelse(tp, lbp);
 			return error;
@@ -508,7 +508,6 @@ xfs_dir2_leaf_addname(
 		hdr = dbp->b_addr;
 		grown = 0;
 	}
-	xfs_dir2_data_check(dp, dbp);
 	/*
 	 * Point to the biggest freespace in our data block.
 	 */
@@ -891,10 +890,9 @@ xfs_dir2_leaf_readbuf(
 	 * Read the directory block starting at the first mapping.
 	 */
 	mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
-	error = xfs_da_read_buf(NULL, dp, map->br_startoff,
+	error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
 			map->br_blockcount >= mp->m_dirblkfsbs ?
-			    XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1,
-			&bp, XFS_DATA_FORK, NULL);
+			    XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
 
 	/*
 	 * Should just skip over the data block instead of giving up.
@@ -1408,14 +1406,13 @@ xfs_dir2_leaf_lookup_int(
 		if (newdb != curdb) {
 			if (dbp)
 				xfs_trans_brelse(tp, dbp);
-			error = xfs_da_read_buf(tp, dp,
-						xfs_dir2_db_to_da(mp, newdb),
-						-1, &dbp, XFS_DATA_FORK, NULL);
+			error = xfs_dir2_data_read(tp, dp,
+						   xfs_dir2_db_to_da(mp, newdb),
+						   -1, &dbp);
 			if (error) {
 				xfs_trans_brelse(tp, lbp);
 				return error;
 			}
-			xfs_dir2_data_check(dp, dbp);
 			curdb = newdb;
 		}
 		/*
@@ -1450,9 +1447,9 @@ xfs_dir2_leaf_lookup_int(
 		ASSERT(cidb != -1);
 		if (cidb != curdb) {
 			xfs_trans_brelse(tp, dbp);
-			error = xfs_da_read_buf(tp, dp,
-						xfs_dir2_db_to_da(mp, cidb),
-						-1, &dbp, XFS_DATA_FORK, NULL);
+			error = xfs_dir2_data_read(tp, dp,
+						   xfs_dir2_db_to_da(mp, cidb),
+						   -1, &dbp);
 			if (error) {
 				xfs_trans_brelse(tp, lbp);
 				return error;
@@ -1737,8 +1734,7 @@ xfs_dir2_leaf_trim_data(
 	/*
 	 * Read the offending data block.  We need its buffer.
 	 */
-	error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
-				XFS_DATA_FORK, NULL);
+	error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index d7f899dfbff5..67b811c17eaa 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -583,9 +583,9 @@ xfs_dir2_leafn_lookup_for_entry(
 				ASSERT(state->extravalid);
 				curbp = state->extrablk.bp;
 			} else {
-				error = xfs_da_read_buf(tp, dp,
+				error = xfs_dir2_data_read(tp, dp,
 						xfs_dir2_db_to_da(mp, newdb),
-						-1, &curbp, XFS_DATA_FORK, NULL);
+						-1, &curbp);
 				if (error)
 					return error;
 			}
@@ -1692,8 +1692,8 @@ xfs_dir2_node_addname_int(
 		/*
 		 * Read the data block in.
 		 */
-		error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
-					-1, &dbp, XFS_DATA_FORK, NULL);
+		error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+					   -1, &dbp);
 		if (error)
 			return error;
 		hdr = dbp->b_addr;
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 263a63287910..71ec82839107 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -46,6 +46,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 #define	xfs_dir2_data_check(dp,bp)
 #endif
 extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
 
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
-- 
cgit v1.2.3


From e6f7667c4eef42b6f5bc6cdeb31d0bab62fe5f79 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:15 +1100
Subject: xfs: factor dir2 leaf read

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_dir2_leaf.c | 73 ++++++++++++++++++++++++++++++++++++++++++--------
 fs/xfs/xfs_dir2_node.c |  6 ++---
 fs/xfs/xfs_dir2_priv.h |  2 ++
 3 files changed, 67 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0fdf765c917f..97408e3287ed 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -48,6 +48,62 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
 				    int first, int last);
 static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
 
+static void
+xfs_dir2_leaf_verify(
+	struct xfs_buf		*bp,
+	__be16			magic)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
+	int			block_ok = 0;
+
+	block_ok = hdr->info.magic == magic;
+	if (!block_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+static void
+xfs_dir2_leaf1_verify(
+	struct xfs_buf		*bp)
+{
+	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
+
+static void
+xfs_dir2_leafn_verify(
+	struct xfs_buf		*bp)
+{
+	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+}
+
+static int
+xfs_dir2_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+					XFS_DATA_FORK, xfs_dir2_leaf1_verify);
+}
+
+int
+xfs_dir2_leafn_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		fbno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+					XFS_DATA_FORK, xfs_dir2_leafn_verify);
+}
 
 /*
  * Convert a block form directory to a leaf form directory.
@@ -311,14 +367,11 @@ xfs_dir2_leaf_addname(
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
-	/*
-	 * Read the leaf block.
-	 */
-	error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
-				XFS_DATA_FORK, NULL);
+
+	error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
 	if (error)
 		return error;
-	ASSERT(lbp != NULL);
+
 	/*
 	 * Look up the entry by hash value and name.
 	 * We know it's not there, our caller has already done a lookup.
@@ -1369,13 +1422,11 @@ xfs_dir2_leaf_lookup_int(
 	dp = args->dp;
 	tp = args->trans;
 	mp = dp->i_mount;
-	/*
-	 * Read the leaf block into the buffer.
-	 */
-	error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
-							XFS_DATA_FORK, NULL);
+
+	error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
 	if (error)
 		return error;
+
 	*lbpp = lbp;
 	leaf = lbp->b_addr;
 	xfs_dir2_leaf_check(dp, lbp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 67b811c17eaa..7c6f95697e28 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -1232,11 +1232,11 @@ xfs_dir2_leafn_toosmall(
 		/*
 		 * Read the sibling leaf block.
 		 */
-		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-					blkno, -1, &bp, XFS_DATA_FORK, NULL);
+		error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+					    blkno, -1, &bp);
 		if (error)
 			return error;
-		ASSERT(bp != NULL);
+
 		/*
 		 * Count bytes in the two blocks combined.
 		 */
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 71ec82839107..4560825d099c 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -70,6 +70,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
 		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
+extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
 		struct xfs_buf *dbp);
 extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
-- 
cgit v1.2.3


From ad14c33ac862601c4c22755ed3b59f1906b134e5 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:16 +1100
Subject: xfs: factor and verify attr leaf reads

Some reads are not converted yet because it isn't obvious ahead of
time what the format of the block is going to be. Need to determine
how to tell if the first block in the tree is a node or leaf format
block. That will be done in later patches.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr.c      | 70 ++++++++++++--------------------------------
 fs/xfs/xfs_attr_leaf.c | 78 +++++++++++++++++++++++++++++---------------------
 fs/xfs/xfs_attr_leaf.h |  3 ++
 3 files changed, 66 insertions(+), 85 deletions(-)

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index cd5a9cd0ded0..d644915367e3 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 	 */
 	dp = args->dp;
 	args->blkno = 0;
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK, NULL);
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
-		return(error);
-	ASSERT(bp != NULL);
+		return error;
 
 	/*
 	 * Look up the given attribute in the leaf block.  Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 * Read in the block containing the "old" attr, then
 		 * remove the "old" attr from that block (neat, huh!)
 		 */
-		error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
-						     &bp, XFS_ATTR_FORK, NULL);
+		error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+					   -1, &bp);
 		if (error)
-			return(error);
-		ASSERT(bp != NULL);
-		(void)xfs_attr_leaf_remove(bp, args);
+			return error;
+
+		xfs_attr_leaf_remove(bp, args);
 
 		/*
 		 * If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 	 */
 	dp = args->dp;
 	args->blkno = 0;
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK, NULL);
-	if (error) {
-		return(error);
-	}
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
+		return error;
 
-	ASSERT(bp != NULL);
 	error = xfs_attr_leaf_lookup_int(bp, args);
 	if (error == ENOATTR) {
 		xfs_trans_brelse(args->trans, bp);
 		return(error);
 	}
 
-	(void)xfs_attr_leaf_remove(bp, args);
+	xfs_attr_leaf_remove(bp, args);
 
 	/*
 	 * If the result is small enough, shrink it all into the inode.
@@ -1158,11 +1153,9 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 	trace_xfs_attr_leaf_get(args);
 
 	args->blkno = 0;
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK, NULL);
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
 	if (error)
-		return(error);
-	ASSERT(bp != NULL);
+		return error;
 
 	error = xfs_attr_leaf_lookup_int(bp, args);
 	if (error != EEXIST)  {
@@ -1183,25 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 STATIC int
 xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 {
-	xfs_attr_leafblock_t *leaf;
 	int error;
 	struct xfs_buf *bp;
 
 	trace_xfs_attr_leaf_list(context);
 
 	context->cursor->blkno = 0;
-	error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK,
-				NULL);
+	error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
 	if (error)
 		return XFS_ERROR(error);
-	ASSERT(bp != NULL);
-	leaf = bp->b_addr;
-	if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
-		XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
-				     context->dp->i_mount, leaf);
-		xfs_trans_brelse(NULL, bp);
-		return XFS_ERROR(EFSCORRUPTED);
-	}
 
 	error = xfs_attr_leaf_list_int(bp, context);
 	xfs_trans_brelse(NULL, bp);
@@ -1605,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 		ASSERT(state->path.blk[0].bp);
 		state->path.blk[0].bp = NULL;
 
-		error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-						     XFS_ATTR_FORK, NULL);
+		error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
 		if (error)
 			goto out;
-		ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
-		       cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 
 		if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
 			xfs_bmap_init(args->flist, args->firstblock);
@@ -1920,14 +1900,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	for (;;) {
 		leaf = bp->b_addr;
-		if (unlikely(leaf->hdr.info.magic !=
-			     cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
-			XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
-					     XFS_ERRLEVEL_LOW,
-					     context->dp->i_mount, leaf);
-			xfs_trans_brelse(NULL, bp);
-			return(XFS_ERROR(EFSCORRUPTED));
-		}
 		error = xfs_attr_leaf_list_int(bp, context);
 		if (error) {
 			xfs_trans_brelse(NULL, bp);
@@ -1937,16 +1909,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 			break;
 		cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
 		xfs_trans_brelse(NULL, bp);
-		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
-					      &bp, XFS_ATTR_FORK, NULL);
+		error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
+					   &bp);
 		if (error)
-			return(error);
-		if (unlikely((bp == NULL))) {
-			XFS_ERROR_REPORT("xfs_attr_node_list(5)",
-					 XFS_ERRLEVEL_LOW,
-					 context->dp->i_mount);
-			return(XFS_ERROR(EFSCORRUPTED));
-		}
+			return error;
 	}
 	xfs_trans_brelse(NULL, bp);
 	return(0);
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index ba2b9a2cd236..357971536d50 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -88,6 +88,36 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
 					 xfs_mount_t *mp);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
 
+static void
+xfs_attr_leaf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
+	int			block_ok = 0;
+
+	block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
+	if (!block_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+int
+xfs_attr_leaf_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp)
+{
+	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+					XFS_ATTR_FORK, xfs_attr_leaf_verify);
+}
+
 /*========================================================================
  * Namespace helper routines
  *========================================================================*/
@@ -870,11 +900,10 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
 	error = xfs_da_grow_inode(args, &blkno);
 	if (error)
 		goto out;
-	error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
-					     XFS_ATTR_FORK, NULL);
+	error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
 	if (error)
 		goto out;
-	ASSERT(bp1 != NULL);
+
 	bp2 = NULL;
 	error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
 					    XFS_ATTR_FORK);
@@ -1641,18 +1670,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
 			blkno = be32_to_cpu(info->back);
 		if (blkno == 0)
 			continue;
-		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-					blkno, -1, &bp, XFS_ATTR_FORK, NULL);
+		error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+					blkno, -1, &bp);
 		if (error)
 			return(error);
-		ASSERT(bp != NULL);
 
 		leaf = (xfs_attr_leafblock_t *)info;
 		count  = be16_to_cpu(leaf->hdr.count);
 		bytes  = state->blocksize - (state->blocksize>>2);
 		bytes -= be16_to_cpu(leaf->hdr.usedbytes);
 		leaf = bp->b_addr;
-		ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 		count += be16_to_cpu(leaf->hdr.count);
 		bytes -= be16_to_cpu(leaf->hdr.usedbytes);
 		bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -2518,15 +2545,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
 	/*
 	 * Set up the operation.
 	 */
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK, NULL);
-	if (error) {
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
 		return(error);
-	}
-	ASSERT(bp != NULL);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
 	ASSERT(args->index >= 0);
 	entry = &leaf->entries[ args->index ];
@@ -2583,15 +2606,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
 	/*
 	 * Set up the operation.
 	 */
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-					     XFS_ATTR_FORK, NULL);
-	if (error) {
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+	if (error)
 		return(error);
-	}
-	ASSERT(bp != NULL);
 
 	leaf = bp->b_addr;
-	ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
 	ASSERT(args->index >= 0);
 	entry = &leaf->entries[ args->index ];
@@ -2640,35 +2659,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
 	/*
 	 * Read the block containing the "old" attr
 	 */
-	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
-					     XFS_ATTR_FORK, NULL);
-	if (error) {
-		return(error);
-	}
-	ASSERT(bp1 != NULL);
+	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+	if (error)
+		return error;
 
 	/*
 	 * Read the block containing the "new" attr, if it is different
 	 */
 	if (args->blkno2 != args->blkno) {
-		error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
-					-1, &bp2, XFS_ATTR_FORK, NULL);
-		if (error) {
-			return(error);
-		}
-		ASSERT(bp2 != NULL);
+		error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+					   -1, &bp2);
+		if (error)
+			return error;
 	} else {
 		bp2 = bp1;
 	}
 
 	leaf1 = bp1->b_addr;
-	ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
 	ASSERT(args->index >= 0);
 	entry1 = &leaf1->entries[ args->index ];
 
 	leaf2 = bp2->b_addr;
-	ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 	ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
 	ASSERT(args->index2 >= 0);
 	entry2 = &leaf2->entries[ args->index2 ];
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index dea17722945e..8f7ab986f45d 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -227,6 +227,9 @@ int	xfs_attr_leaf_to_shortform(struct xfs_buf *bp,
 int	xfs_attr_leaf_clearflag(struct xfs_da_args *args);
 int	xfs_attr_leaf_setflag(struct xfs_da_args *args);
 int	xfs_attr_leaf_flipflags(xfs_da_args_t *args);
+int	xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			struct xfs_buf **bpp);
 
 /*
  * Routines used for growing the Btree.
-- 
cgit v1.2.3


From d9392a4bb75503fc2adbb5237c3df940c6467eb2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:17 +1100
Subject: xfs: add xfs_da_node verification

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_attr.c      |  22 ++++------
 fs/xfs/xfs_attr_leaf.c |  12 +++---
 fs/xfs/xfs_attr_leaf.h |   8 ++--
 fs/xfs/xfs_da_btree.c  | 109 +++++++++++++++++++++++++++++++++++++------------
 fs/xfs/xfs_da_btree.h  |   3 ++
 fs/xfs/xfs_dir2_leaf.c |   2 +-
 fs/xfs/xfs_dir2_priv.h |   1 +
 7 files changed, 107 insertions(+), 50 deletions(-)

diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index d644915367e3..aaf472532b3c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -1696,10 +1696,10 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
 	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
 		if (blk->disk_blkno) {
-			error = xfs_da_read_buf(state->args->trans,
+			error = xfs_da_node_read(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
-						&blk->bp, XFS_ATTR_FORK, NULL);
+						&blk->bp, XFS_ATTR_FORK);
 			if (error)
 				return(error);
 		} else {
@@ -1715,10 +1715,10 @@ xfs_attr_refillstate(xfs_da_state_t *state)
 	ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
 	for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
 		if (blk->disk_blkno) {
-			error = xfs_da_read_buf(state->args->trans,
+			error = xfs_da_node_read(state->args->trans,
 						state->args->dp,
 						blk->blkno, blk->disk_blkno,
-						&blk->bp, XFS_ATTR_FORK, NULL);
+						&blk->bp, XFS_ATTR_FORK);
 			if (error)
 				return(error);
 		} else {
@@ -1807,8 +1807,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	 */
 	bp = NULL;
 	if (cursor->blkno > 0) {
-		error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
-					      &bp, XFS_ATTR_FORK, NULL);
+		error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
+					      &bp, XFS_ATTR_FORK);
 		if ((error != 0) && (error != EFSCORRUPTED))
 			return(error);
 		if (bp) {
@@ -1849,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
 	if (bp == NULL) {
 		cursor->blkno = 0;
 		for (;;) {
-			error = xfs_da_read_buf(NULL, context->dp,
+			error = xfs_da_node_read(NULL, context->dp,
 						      cursor->blkno, -1, &bp,
-						      XFS_ATTR_FORK, NULL);
+						      XFS_ATTR_FORK);
 			if (error)
 				return(error);
-			if (unlikely(bp == NULL)) {
-				XFS_ERROR_REPORT("xfs_attr_node_list(2)",
-						 XFS_ERRLEVEL_LOW,
-						 context->dp->i_mount);
-				return(XFS_ERROR(EFSCORRUPTED));
-			}
 			node = bp->b_addr;
 			if (node->hdr.info.magic ==
 			    cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 357971536d50..efe170da2881 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
 					 xfs_mount_t *mp);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
 
-static void
+void
 xfs_attr_leaf_verify(
 	struct xfs_buf		*bp)
 {
@@ -2765,7 +2765,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
 	 * the extents in reverse order the extent containing
 	 * block 0 must still be there.
 	 */
-	error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK, NULL);
+	error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
 	if (error)
 		return(error);
 	blkno = XFS_BUF_ADDR(bp);
@@ -2850,8 +2850,8 @@ xfs_attr_node_inactive(
 		 * traversal of the tree so we may deal with many blocks
 		 * before we come back to this one.
 		 */
-		error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
-						XFS_ATTR_FORK, NULL);
+		error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
+						XFS_ATTR_FORK);
 		if (error)
 			return(error);
 		if (child_bp) {
@@ -2891,8 +2891,8 @@ xfs_attr_node_inactive(
 		 * child block number.
 		 */
 		if ((i+1) < count) {
-			error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
-				&bp, XFS_ATTR_FORK, NULL);
+			error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
+						 &bp, XFS_ATTR_FORK);
 			if (error)
 				return(error);
 			child_fsb = be32_to_cpu(node->btree[i+1].before);
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 8f7ab986f45d..098e9a58ad9f 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -227,9 +227,6 @@ int	xfs_attr_leaf_to_shortform(struct xfs_buf *bp,
 int	xfs_attr_leaf_clearflag(struct xfs_da_args *args);
 int	xfs_attr_leaf_setflag(struct xfs_da_args *args);
 int	xfs_attr_leaf_flipflags(xfs_da_args_t *args);
-int	xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
-			xfs_dablk_t bno, xfs_daddr_t mappedbno,
-			struct xfs_buf **bpp);
 
 /*
  * Routines used for growing the Btree.
@@ -264,4 +261,9 @@ int	xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
 				   struct xfs_buf *leaf2_bp);
 int	xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
 					int *local);
+int	xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			struct xfs_buf **bpp);
+void	xfs_attr_leaf_verify(struct xfs_buf *bp);
+
 #endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index f9e9149de009..1b84fc50a053 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -91,6 +91,68 @@ STATIC int	xfs_da_blk_unlink(xfs_da_state_t *state,
 				  xfs_da_state_blk_t *save_blk);
 STATIC void	xfs_da_state_kill_altpath(xfs_da_state_t *state);
 
+static void
+__xfs_da_node_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_da_node_hdr *hdr = bp->b_addr;
+	int			block_ok = 0;
+
+	block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
+	block_ok = block_ok &&
+			be16_to_cpu(hdr->level) > 0 &&
+			be16_to_cpu(hdr->count) > 0 ;
+	if (!block_ok) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+	}
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+static void
+xfs_da_node_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_da_blkinfo	*info = bp->b_addr;
+
+	switch (be16_to_cpu(info->magic)) {
+		case XFS_DA_NODE_MAGIC:
+			__xfs_da_node_verify(bp);
+			return;
+		case XFS_ATTR_LEAF_MAGIC:
+			xfs_attr_leaf_verify(bp);
+			return;
+		case XFS_DIR2_LEAFN_MAGIC:
+			xfs_dir2_leafn_verify(bp);
+			return;
+		default:
+			break;
+	}
+
+	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info);
+	xfs_buf_ioerror(bp, EFSCORRUPTED);
+
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
+
+int
+xfs_da_node_read(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
+	struct xfs_buf		**bpp,
+	int			which_fork)
+{
+	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+					which_fork, xfs_da_node_verify);
+}
+
 /*========================================================================
  * Routines used for growing the Btree.
  *========================================================================*/
@@ -746,8 +808,8 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
 	 */
 	child = be32_to_cpu(oldroot->btree[0].before);
 	ASSERT(child != 0);
-	error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
-					     args->whichfork, NULL);
+	error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
+					     args->whichfork);
 	if (error)
 		return(error);
 	ASSERT(bp != NULL);
@@ -837,9 +899,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
 			blkno = be32_to_cpu(info->back);
 		if (blkno == 0)
 			continue;
-		error = xfs_da_read_buf(state->args->trans, state->args->dp,
-					blkno, -1, &bp, state->args->whichfork,
-					NULL);
+		error = xfs_da_node_read(state->args->trans, state->args->dp,
+					blkno, -1, &bp, state->args->whichfork);
 		if (error)
 			return(error);
 		ASSERT(bp != NULL);
@@ -1084,8 +1145,8 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
 		 * Read the next node down in the tree.
 		 */
 		blk->blkno = blkno;
-		error = xfs_da_read_buf(args->trans, args->dp, blkno,
-					-1, &blk->bp, args->whichfork, NULL);
+		error = xfs_da_node_read(args->trans, args->dp, blkno,
+					-1, &blk->bp, args->whichfork);
 		if (error) {
 			blk->blkno = 0;
 			state->path.active--;
@@ -1246,9 +1307,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 		new_info->forw = cpu_to_be32(old_blk->blkno);
 		new_info->back = old_info->back;
 		if (old_info->back) {
-			error = xfs_da_read_buf(args->trans, args->dp,
+			error = xfs_da_node_read(args->trans, args->dp,
 						be32_to_cpu(old_info->back),
-						-1, &bp, args->whichfork, NULL);
+						-1, &bp, args->whichfork);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1267,9 +1328,9 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 		new_info->forw = old_info->forw;
 		new_info->back = cpu_to_be32(old_blk->blkno);
 		if (old_info->forw) {
-			error = xfs_da_read_buf(args->trans, args->dp,
+			error = xfs_da_node_read(args->trans, args->dp,
 						be32_to_cpu(old_info->forw),
-						-1, &bp, args->whichfork, NULL);
+						-1, &bp, args->whichfork);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1367,9 +1428,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		trace_xfs_da_unlink_back(args);
 		save_info->back = drop_info->back;
 		if (drop_info->back) {
-			error = xfs_da_read_buf(args->trans, args->dp,
+			error = xfs_da_node_read(args->trans, args->dp,
 						be32_to_cpu(drop_info->back),
-						-1, &bp, args->whichfork, NULL);
+						-1, &bp, args->whichfork);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1384,9 +1445,9 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
 		trace_xfs_da_unlink_forward(args);
 		save_info->forw = drop_info->forw;
 		if (drop_info->forw) {
-			error = xfs_da_read_buf(args->trans, args->dp,
+			error = xfs_da_node_read(args->trans, args->dp,
 						be32_to_cpu(drop_info->forw),
-						-1, &bp, args->whichfork, NULL);
+						-1, &bp, args->whichfork);
 			if (error)
 				return(error);
 			ASSERT(bp != NULL);
@@ -1470,8 +1531,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
 		 * Read the next child block.
 		 */
 		blk->blkno = blkno;
-		error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
-					&blk->bp, args->whichfork, NULL);
+		error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
+					&blk->bp, args->whichfork);
 		if (error)
 			return(error);
 		ASSERT(blk->bp != NULL);
@@ -1734,7 +1795,7 @@ xfs_da_swap_lastblock(
 	 * Read the last block in the btree space.
 	 */
 	last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
-	error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w, NULL);
+	error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
 	if (error)
 		return error;
 	/*
@@ -1761,8 +1822,7 @@ xfs_da_swap_lastblock(
 	 * If the moved block has a left sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->back))) {
-		error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w,
-					NULL);
+		error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
 		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
@@ -1784,8 +1844,7 @@ xfs_da_swap_lastblock(
 	 * If the moved block has a right sibling, fix up the pointers.
 	 */
 	if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
-		error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w,
-					NULL);
+		error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
 		if (error)
 			goto done;
 		sib_info = sib_buf->b_addr;
@@ -1809,8 +1868,7 @@ xfs_da_swap_lastblock(
 	 * Walk down the tree looking for the parent of the moved block.
 	 */
 	for (;;) {
-		error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w,
-					NULL);
+		error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
@@ -1861,8 +1919,7 @@ xfs_da_swap_lastblock(
 			error = XFS_ERROR(EFSCORRUPTED);
 			goto done;
 		}
-		error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w,
-					NULL);
+		error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
 		if (error)
 			goto done;
 		par_node = par_buf->b_addr;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index bf8bfaa0d356..2d1bec4b7595 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -213,6 +213,9 @@ int	xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
  */
 int	xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
 				       xfs_da_state_blk_t *new_blk);
+int	xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+			 xfs_dablk_t bno, xfs_daddr_t mappedbno,
+			 struct xfs_buf **bpp, int which_fork);
 
 /*
  * Utility routines.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 97408e3287ed..67cc21c2a45d 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -74,7 +74,7 @@ xfs_dir2_leaf1_verify(
 	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
 }
 
-static void
+void
 xfs_dir2_leafn_verify(
 	struct xfs_buf		*bp)
 {
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 4560825d099c..e0b96e7693ea 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -70,6 +70,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
 		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
+extern void xfs_dir2_leafn_verify(struct xfs_buf *bp);
 extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
-- 
cgit v1.2.3


From da6958c873ecd846d71fafbfe0f6168bb9c2c99e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:18 +1100
Subject: xfs: Add verifiers to dir2 data readahead.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_da_btree.c  |  4 ++--
 fs/xfs/xfs_da_btree.h  |  4 ++--
 fs/xfs/xfs_dir2_data.c | 13 ++++++++++++-
 fs/xfs/xfs_dir2_leaf.c | 11 +++++------
 fs/xfs/xfs_dir2_priv.h |  2 ++
 fs/xfs/xfs_file.c      |  4 +++-
 6 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 1b84fc50a053..93ebc0fc6dd9 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2285,10 +2285,10 @@ xfs_da_reada_buf(
 	struct xfs_trans	*trans,
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
+	xfs_daddr_t		mappedbno,
 	int			whichfork,
 	xfs_buf_iodone_t	verifier)
 {
-	xfs_daddr_t		mappedbno = -1;
 	struct xfs_buf_map	map;
 	struct xfs_buf_map	*mapp;
 	int			nmap;
@@ -2296,7 +2296,7 @@ xfs_da_reada_buf(
 
 	mapp = &map;
 	nmap = 1;
-	error = xfs_dabuf_map(trans, dp, bno, -1, whichfork,
+	error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
 				&mapp, &nmap);
 	if (error) {
 		/* mapping a hole is not an error, but we don't continue */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 2d1bec4b7595..521b008445ab 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -231,8 +231,8 @@ int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 			       struct xfs_buf **bpp, int whichfork,
 			       xfs_buf_iodone_t verifier);
 xfs_daddr_t	xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
-				xfs_dablk_t bno, int whichfork,
-				xfs_buf_iodone_t verifier);
+				xfs_dablk_t bno, xfs_daddr_t mapped_bno,
+				int whichfork, xfs_buf_iodone_t verifier);
 int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 					  struct xfs_buf *dead_buf);
 
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 0ef04f1bf511..1a43c8593c00 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -185,7 +185,7 @@ __xfs_dir2_data_check(
 	return 0;
 }
 
-static void
+void
 xfs_dir2_data_verify(
 	struct xfs_buf		*bp)
 {
@@ -217,6 +217,17 @@ xfs_dir2_data_read(
 					XFS_DATA_FORK, xfs_dir2_data_verify);
 }
 
+int
+xfs_dir2_data_readahead(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	xfs_dablk_t		bno,
+	xfs_daddr_t		mapped_bno)
+{
+	return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
+					XFS_DATA_FORK, xfs_dir2_data_verify);
+}
+
 /*
  * Given a data block and an unused entry from that block,
  * return the bestfree entry if any that corresponds to it.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 67cc21c2a45d..8a95547d42ac 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -972,11 +972,11 @@ xfs_dir2_leaf_readbuf(
 		 */
 		if (i > mip->ra_current &&
 		    map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
-			xfs_buf_readahead(mp->m_ddev_targp,
+			xfs_dir2_data_readahead(NULL, dp,
+				map[mip->ra_index].br_startoff + mip->ra_offset,
 				XFS_FSB_TO_DADDR(mp,
 					map[mip->ra_index].br_startblock +
-							mip->ra_offset),
-				(int)BTOBB(mp->m_dirblksize), NULL);
+							mip->ra_offset));
 			mip->ra_current = i;
 		}
 
@@ -985,10 +985,9 @@ xfs_dir2_leaf_readbuf(
 		 * use our mapping, but this is a very rare case.
 		 */
 		else if (i > mip->ra_current) {
-			xfs_da_reada_buf(NULL, dp,
+			xfs_dir2_data_readahead(NULL, dp,
 					map[mip->ra_index].br_startoff +
-							mip->ra_offset,
-					XFS_DATA_FORK, NULL);
+							mip->ra_offset, -1);
 			mip->ra_current = i;
 		}
 
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index e0b96e7693ea..daf5d0fc6165 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -48,6 +48,8 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
 extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+		xfs_dablk_t bno, xfs_daddr_t mapped_bno);
 
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f6dab7da7bcc..400b187595bb 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,8 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 
@@ -891,7 +893,7 @@ xfs_dir_open(
 	 */
 	mode = xfs_ilock_map_shared(ip);
 	if (ip->i_d.di_nextents > 0)
-		xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK, NULL);
+		xfs_dir2_data_readahead(NULL, ip, 0, -1);
 	xfs_iunlock(ip, mode);
 	return 0;
 }
-- 
cgit v1.2.3


From cfb02852226aa449fe27075caffe88726507668c Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 12 Nov 2012 22:54:19 +1100
Subject: xfs: add buffer pre-write callback

Add a callback to the buffer write path to enable verification of
the buffer and CRC calculation prior to issuing the write to the
underlying storage.

If the callback function detects some kind of failure or error
condition, it must mark the buffer with an error so that the caller
can take appropriate action. In the case of xfs_buf_ioapply(), a
corrupt metadta buffer willt rigger a shutdown of the filesystem,
because something is clearly wrong and we can't allow corrupt
metadata to be written to disk.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_buf.c | 16 ++++++++++++++++
 fs/xfs/xfs_buf.h |  3 +++
 2 files changed, 19 insertions(+)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index fbc965fc075a..bd1a948ee39c 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -569,7 +569,9 @@ found:
 	 */
 	if (bp->b_flags & XBF_STALE) {
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+		ASSERT(bp->b_iodone == NULL);
 		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+		bp->b_pre_io = NULL;
 	}
 
 	trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -1323,6 +1325,20 @@ _xfs_buf_ioapply(
 	/* we only use the buffer cache for meta-data */
 	rw |= REQ_META;
 
+	/*
+	 * run the pre-io callback function if it exists. If this function
+	 * fails it will mark the buffer with an error and the IO should
+	 * not be dispatched.
+	 */
+	if (bp->b_pre_io) {
+		bp->b_pre_io(bp);
+		if (bp->b_error) {
+			xfs_force_shutdown(bp->b_target->bt_mount,
+					   SHUTDOWN_CORRUPT_INCORE);
+			return;
+		}
+	}
+
 	/*
 	 * Walk all the vectors issuing IO on them. Set up the initial offset
 	 * into the buffer and the desired IO size before we start -
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 677b1dc822f4..51bc16a1cd9c 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -155,6 +155,9 @@ typedef struct xfs_buf {
 	unsigned int		b_offset;	/* page offset in first page */
 	unsigned short		b_error;	/* error code on I/O */
 
+	void			(*b_pre_io)(struct xfs_buf *);
+						/* pre-io callback function */
+
 #ifdef XFS_BUF_LOCK_TRACKING
 	int			b_last_holder;
 #endif
-- 
cgit v1.2.3


From 612cfbfe174a89d565363fff7f3961a2dda5fb71 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 14 Nov 2012 17:52:32 +1100
Subject: xfs: add pre-write metadata buffer verifier callbacks

These verifiers are essentially the same code as the read verifiers,
but do not require ioend processing. Hence factor the read verifier
functions and add a new write verifier wrapper that is used as the
callback.

This is done as one large patch for all verifiers rather than one
patch per verifier as the change is largely mechanical. This
includes hooking up the write verifier via the read verifier
function.

Hooking up the write verifier for buffers obtained via
xfs_trans_get_buf() will be done in a separate patch as that touches
code in many different places rather than just the verifier
functions.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c        | 35 ++++++++++++++++++++++++++++++++---
 fs/xfs/xfs_alloc_btree.c  | 21 +++++++++++++++++----
 fs/xfs/xfs_attr_leaf.c    | 19 +++++++++++++++++--
 fs/xfs/xfs_attr_leaf.h    |  2 +-
 fs/xfs/xfs_bmap_btree.c   | 21 +++++++++++++++++----
 fs/xfs/xfs_da_btree.c     | 37 +++++++++++++++++++++++++------------
 fs/xfs/xfs_dir2_block.c   | 16 +++++++++++++++-
 fs/xfs/xfs_dir2_data.c    | 19 +++++++++++++++++--
 fs/xfs/xfs_dir2_leaf.c    | 31 ++++++++++++++++++++++++-------
 fs/xfs/xfs_dir2_node.c    | 17 ++++++++++++++++-
 fs/xfs/xfs_dir2_priv.h    |  2 +-
 fs/xfs/xfs_dquot.c        | 27 +++++++++++++++++++++------
 fs/xfs/xfs_dquot.h        |  2 +-
 fs/xfs/xfs_ialloc.c       | 17 ++++++++++++++++-
 fs/xfs/xfs_ialloc_btree.c | 19 ++++++++++++++++---
 fs/xfs/xfs_inode.c        | 19 +++++++++++++++++--
 fs/xfs/xfs_inode.h        |  2 +-
 fs/xfs/xfs_itable.c       |  2 +-
 fs/xfs/xfs_mount.c        | 19 +++++++++++++++++--
 fs/xfs/xfs_qm.c           |  2 +-
 20 files changed, 273 insertions(+), 56 deletions(-)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 38b4ab8957ff..d12bbedf6fe5 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,8 +430,8 @@ xfs_alloc_fixup_trees(
 	return 0;
 }
 
-void
-xfs_agfl_read_verify(
+static void
+xfs_agfl_verify(
 	struct xfs_buf	*bp)
 {
 #ifdef WHEN_CRCS_COME_ALONG
@@ -463,6 +463,21 @@ xfs_agfl_read_verify(
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
 #endif
+}
+
+static void
+xfs_agfl_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agfl_verify(bp);
+}
+
+void
+xfs_agfl_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agfl_verify(bp);
+	bp->b_pre_io = xfs_agfl_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
@@ -2129,7 +2144,7 @@ xfs_alloc_put_freelist(
 }
 
 static void
-xfs_agf_read_verify(
+xfs_agf_verify(
 	struct xfs_buf	*bp)
  {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
@@ -2164,7 +2179,21 @@ xfs_agf_read_verify(
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
+
+static void
+xfs_agf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agf_verify(bp);
+}
 
+void
+xfs_agf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agf_verify(bp);
+	bp->b_pre_io = xfs_agf_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 46961e52e9b8..6e98b22ebde0 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -272,8 +272,8 @@ xfs_allocbt_key_diff(
 	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
-void
-xfs_allocbt_read_verify(
+static void
+xfs_allocbt_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
@@ -323,11 +323,24 @@ xfs_allocbt_read_verify(
 
 	if (!sblock_ok) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR("xfs_allocbt_read_verify",
-					XFS_ERRLEVEL_LOW, mp, block);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
 
+static void
+xfs_allocbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_allocbt_verify(bp);
+}
+
+void
+xfs_allocbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_allocbt_verify(bp);
+	bp->b_pre_io = xfs_allocbt_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index efe170da2881..57729d71ab1a 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -88,7 +88,7 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
 					 xfs_mount_t *mp);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
 
-void
+static void
 xfs_attr_leaf_verify(
 	struct xfs_buf		*bp)
 {
@@ -101,11 +101,26 @@ xfs_attr_leaf_verify(
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
+
+static void
+xfs_attr_leaf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_attr_leaf_verify(bp);
+}
 
+void
+xfs_attr_leaf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_attr_leaf_verify(bp);
+	bp->b_pre_io = xfs_attr_leaf_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
 
+
 int
 xfs_attr_leaf_read(
 	struct xfs_trans	*tp,
@@ -115,7 +130,7 @@ xfs_attr_leaf_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-					XFS_ATTR_FORK, xfs_attr_leaf_verify);
+				XFS_ATTR_FORK, xfs_attr_leaf_read_verify);
 }
 
 /*========================================================================
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 098e9a58ad9f..3bbf6277e43c 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -264,6 +264,6 @@ int	xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
 int	xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
 			xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			struct xfs_buf **bpp);
-void	xfs_attr_leaf_verify(struct xfs_buf *bp);
+void	xfs_attr_leaf_read_verify(struct xfs_buf *bp);
 
 #endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bddca9b92869..17d7423e7503 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -708,8 +708,8 @@ xfs_bmbt_key_diff(
 				      cur->bc_rec.b.br_startoff;
 }
 
-void
-xfs_bmbt_read_verify(
+static void
+xfs_bmbt_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
@@ -744,11 +744,24 @@ xfs_bmbt_read_verify(
 
 	if (!lblock_ok) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR("xfs_bmbt_read_verify",
-					XFS_ERRLEVEL_LOW, mp, block);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
 
+static void
+xfs_bmbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_bmbt_verify(bp);
+}
+
+void
+xfs_bmbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_bmbt_verify(bp);
+	bp->b_pre_io = xfs_bmbt_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 93ebc0fc6dd9..6bb0a59eaaee 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -92,7 +92,7 @@ STATIC int	xfs_da_blk_unlink(xfs_da_state_t *state,
 STATIC void	xfs_da_state_kill_altpath(xfs_da_state_t *state);
 
 static void
-__xfs_da_node_verify(
+xfs_da_node_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
@@ -108,12 +108,17 @@ __xfs_da_node_verify(
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
 
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
 static void
-xfs_da_node_verify(
+xfs_da_node_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_da_node_verify(bp);
+}
+
+static void
+xfs_da_node_read_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
@@ -121,21 +126,22 @@ xfs_da_node_verify(
 
 	switch (be16_to_cpu(info->magic)) {
 		case XFS_DA_NODE_MAGIC:
-			__xfs_da_node_verify(bp);
-			return;
+			xfs_da_node_verify(bp);
+			break;
 		case XFS_ATTR_LEAF_MAGIC:
-			xfs_attr_leaf_verify(bp);
+			xfs_attr_leaf_read_verify(bp);
 			return;
 		case XFS_DIR2_LEAFN_MAGIC:
-			xfs_dir2_leafn_verify(bp);
+			xfs_dir2_leafn_read_verify(bp);
 			return;
 		default:
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+					     mp, info);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
 			break;
 	}
 
-	XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, info);
-	xfs_buf_ioerror(bp, EFSCORRUPTED);
-
+	bp->b_pre_io = xfs_da_node_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
@@ -150,7 +156,7 @@ xfs_da_node_read(
 	int			which_fork)
 {
 	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-					which_fork, xfs_da_node_verify);
+					which_fork, xfs_da_node_read_verify);
 }
 
 /*========================================================================
@@ -816,7 +822,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
 	xfs_da_blkinfo_onlychild_validate(bp->b_addr,
 					be16_to_cpu(oldroot->hdr.level));
 
+	/*
+	 * This could be copying a leaf back into the root block in the case of
+	 * there only being a single leaf block left in the tree. Hence we have
+	 * to update the pre_io pointer as well to match the buffer type change
+	 * that could occur.
+	 */
 	memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
+	root_blk->bp->b_pre_io = bp->b_pre_io;
 	xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
 	error = xfs_da_shrink_inode(args, child, bp);
 	return(error);
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index ca03b109772d..0f8793c74fe2 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -71,7 +71,21 @@ xfs_dir2_block_verify(
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
 
+static void
+xfs_dir2_block_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_block_verify(bp);
+}
+
+void
+xfs_dir2_block_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_block_verify(bp);
+	bp->b_pre_io = xfs_dir2_block_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
@@ -85,7 +99,7 @@ xfs_dir2_block_read(
 	struct xfs_mount	*mp = dp->i_mount;
 
 	return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
-					XFS_DATA_FORK, xfs_dir2_block_verify);
+				XFS_DATA_FORK, xfs_dir2_block_read_verify);
 }
 
 static void
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 1a43c8593c00..b555585f5ab6 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -200,11 +200,26 @@ xfs_dir2_data_verify(
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
+
+static void
+xfs_dir2_data_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_data_verify(bp);
+}
 
+void
+xfs_dir2_data_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_data_verify(bp);
+	bp->b_pre_io = xfs_dir2_data_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
 
+
 int
 xfs_dir2_data_read(
 	struct xfs_trans	*tp,
@@ -214,7 +229,7 @@ xfs_dir2_data_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
-					XFS_DATA_FORK, xfs_dir2_data_verify);
+				XFS_DATA_FORK, xfs_dir2_data_read_verify);
 }
 
 int
@@ -225,7 +240,7 @@ xfs_dir2_data_readahead(
 	xfs_daddr_t		mapped_bno)
 {
 	return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
-					XFS_DATA_FORK, xfs_dir2_data_verify);
+				XFS_DATA_FORK, xfs_dir2_data_read_verify);
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 8a95547d42ac..5b3bcab2a656 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -62,23 +62,40 @@ xfs_dir2_leaf_verify(
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
+
+static void
+xfs_dir2_leaf1_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
 
+static void
+xfs_dir2_leaf1_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+	bp->b_pre_io = xfs_dir2_leaf1_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
 
 static void
-xfs_dir2_leaf1_verify(
-	struct xfs_buf		*bp)
+xfs_dir2_leafn_write_verify(
+	struct xfs_buf	*bp)
 {
-	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
 }
 
 void
-xfs_dir2_leafn_verify(
-	struct xfs_buf		*bp)
+xfs_dir2_leafn_read_verify(
+	struct xfs_buf	*bp)
 {
 	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+	bp->b_pre_io = xfs_dir2_leafn_write_verify;
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
 }
 
 static int
@@ -90,7 +107,7 @@ xfs_dir2_leaf_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-					XFS_DATA_FORK, xfs_dir2_leaf1_verify);
+				XFS_DATA_FORK, xfs_dir2_leaf1_read_verify);
 }
 
 int
@@ -102,7 +119,7 @@ xfs_dir2_leafn_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-					XFS_DATA_FORK, xfs_dir2_leafn_verify);
+				XFS_DATA_FORK, xfs_dir2_leafn_read_verify);
 }
 
 /*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 7c6f95697e28..a58abe1fc0d0 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -69,11 +69,26 @@ xfs_dir2_free_verify(
 				     XFS_ERRLEVEL_LOW, mp, hdr);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
+
+static void
+xfs_dir2_free_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_free_verify(bp);
+}
 
+void
+xfs_dir2_free_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dir2_free_verify(bp);
+	bp->b_pre_io = xfs_dir2_free_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
 
+
 static int
 __xfs_dir2_free_read(
 	struct xfs_trans	*tp,
@@ -83,7 +98,7 @@ __xfs_dir2_free_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-					XFS_DATA_FORK, xfs_dir2_free_verify);
+				XFS_DATA_FORK, xfs_dir2_free_read_verify);
 }
 
 int
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index daf5d0fc6165..7ec61af8449f 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -72,7 +72,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
 		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
-extern void xfs_dir2_leafn_verify(struct xfs_buf *bp);
+extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp);
 extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 0ba0f0992d6e..b38a10e6f2e0 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -360,8 +360,8 @@ xfs_qm_dqalloc(
 	return (error);
 }
 
-void
-xfs_dquot_read_verify(
+static void
+xfs_dquot_buf_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
@@ -388,12 +388,26 @@ xfs_dquot_read_verify(
 		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
 					"xfs_dquot_read_verify");
 		if (error) {
-			XFS_CORRUPTION_ERROR("xfs_dquot_read_verify",
-					     XFS_ERRLEVEL_LOW, mp, d);
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
 			xfs_buf_ioerror(bp, EFSCORRUPTED);
 			break;
 		}
 	}
+}
+
+static void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
+
+void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+	bp->b_pre_io = xfs_dquot_buf_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
@@ -413,7 +427,7 @@ xfs_qm_dqrepair(
 
 	/*
 	 * Read the buffer without verification so we get the corrupted
-	 * buffer returned to us.
+	 * buffer returned to us. make sure we verify it on write, though.
 	 */
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
 				   mp->m_quotainfo->qi_dqchunklen,
@@ -423,6 +437,7 @@ xfs_qm_dqrepair(
 		ASSERT(*bpp == NULL);
 		return XFS_ERROR(error);
 	}
+	(*bpp)->b_pre_io = xfs_dquot_buf_write_verify;
 
 	ASSERT(xfs_buf_islocked(*bpp));
 	d = (struct xfs_dqblk *)(*bpp)->b_addr;
@@ -521,7 +536,7 @@ xfs_qm_dqtobp(
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					   dqp->q_blkno,
 					   mp->m_quotainfo->qi_dqchunklen,
-					   0, &bp, xfs_dquot_read_verify);
+					   0, &bp, xfs_dquot_buf_read_verify);
 
 		if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
 			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index a08ba92d7da0..5438d883b628 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -140,7 +140,7 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
 
 extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
 					uint, struct xfs_dquot	**);
-extern void		xfs_dquot_read_verify(struct xfs_buf *bp);
+extern void		xfs_dquot_buf_read_verify(struct xfs_buf *bp);
 extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int		xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5bd255e5f7b8..070f41845572 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1473,7 +1473,7 @@ xfs_check_agi_unlinked(
 #endif
 
 static void
-xfs_agi_read_verify(
+xfs_agi_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
@@ -1502,6 +1502,21 @@ xfs_agi_read_verify(
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
 	xfs_check_agi_unlinked(agi);
+}
+
+static void
+xfs_agi_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+}
+
+void
+xfs_agi_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_agi_verify(bp);
+	bp->b_pre_io = xfs_agi_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 11306c6d61c7..15a79f8ca03c 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -183,7 +183,7 @@ xfs_inobt_key_diff(
 }
 
 void
-xfs_inobt_read_verify(
+xfs_inobt_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
@@ -211,11 +211,24 @@ xfs_inobt_read_verify(
 
 	if (!sblock_ok) {
 		trace_xfs_btree_corrupt(bp, _RET_IP_);
-		XFS_CORRUPTION_ERROR("xfs_inobt_read_verify",
-					XFS_ERRLEVEL_LOW, mp, block);
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
 		xfs_buf_ioerror(bp, EFSCORRUPTED);
 	}
+}
+
+static void
+xfs_inobt_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+}
 
+void
+xfs_inobt_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inobt_verify(bp);
+	bp->b_pre_io = xfs_inobt_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 3a243d076950..910b2da01042 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -382,7 +382,7 @@ xfs_inobp_check(
 }
 #endif
 
-void
+static void
 xfs_inode_buf_verify(
 	struct xfs_buf	*bp)
 {
@@ -418,6 +418,21 @@ xfs_inode_buf_verify(
 		}
 	}
 	xfs_inobp_check(mp, bp);
+}
+
+static void
+xfs_inode_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp);
+}
+
+void
+xfs_inode_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_inode_buf_verify(bp);
+	bp->b_pre_io = xfs_inode_buf_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
@@ -447,7 +462,7 @@ xfs_imap_to_bp(
 	buf_flags |= XBF_UNMAPPED;
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 				   (int)imap->im_len, buf_flags, &bp,
-				   xfs_inode_buf_verify);
+				   xfs_inode_buf_read_verify);
 	if (error) {
 		if (error == EAGAIN) {
 			ASSERT(buf_flags & XBF_TRYLOCK);
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 1a892114792f..a322c19723a3 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -554,7 +554,7 @@ int		xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
 			       struct xfs_buf **, uint, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, uint);
-void		xfs_inode_buf_verify(struct xfs_buf *);
+void		xfs_inode_buf_read_verify(struct xfs_buf *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 0f18d412e3e8..7f86fdaab7ae 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -397,7 +397,7 @@ xfs_bulkstat(
 							& ~r.ir_free)
 						xfs_btree_reada_bufs(mp, agno,
 							agbno, nbcluster,
-							xfs_inode_buf_verify);
+							xfs_inode_buf_read_verify);
 				}
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bff18d73c610..c85da75e4a43 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -612,8 +612,8 @@ xfs_sb_to_disk(
 	}
 }
 
-void
-xfs_sb_read_verify(
+static void
+xfs_sb_verify(
 	struct xfs_buf	*bp)
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
@@ -629,6 +629,21 @@ xfs_sb_read_verify(
 	error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
 	if (error)
 		xfs_buf_ioerror(bp, error);
+}
+
+static void
+xfs_sb_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_sb_verify(bp);
+}
+
+void
+xfs_sb_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_sb_verify(bp);
+	bp->b_pre_io = xfs_sb_write_verify;
 	bp->b_iodone = NULL;
 	xfs_buf_ioend(bp, 0);
 }
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index a6dfb97490cc..bd40ae9624e5 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs(
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 			      XFS_FSB_TO_DADDR(mp, bno),
 			      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
-			      xfs_dquot_read_verify);
+			      xfs_dquot_buf_read_verify);
 		if (error)
 			break;
 
-- 
cgit v1.2.3


From b0f539de9fcc543a3ffa40bc22bf51aca6ea6183 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 14 Nov 2012 17:53:49 +1100
Subject: xfs: connect up write verifiers to new buffers

Metadata buffers that are read from disk have write verifiers
already attached to them, but newly allocated buffers do not. Add
appropriate write verifiers to all new metadata buffers.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_alloc.c        |   8 ++--
 fs/xfs/xfs_alloc.h        |   3 ++
 fs/xfs/xfs_alloc_btree.c  |   1 +
 fs/xfs/xfs_attr_leaf.c    |   4 +-
 fs/xfs/xfs_bmap.c         |   2 +
 fs/xfs/xfs_bmap_btree.c   |   3 +-
 fs/xfs/xfs_bmap_btree.h   |   1 +
 fs/xfs/xfs_btree.c        |   1 +
 fs/xfs/xfs_btree.h        |   2 +
 fs/xfs/xfs_da_btree.c     |   3 ++
 fs/xfs/xfs_dir2_block.c   |   2 +
 fs/xfs/xfs_dir2_data.c    |  11 +++--
 fs/xfs/xfs_dir2_leaf.c    |  19 +++++----
 fs/xfs/xfs_dir2_node.c    |  24 +++++++----
 fs/xfs/xfs_dir2_priv.h    |   2 +
 fs/xfs/xfs_dquot.c        | 104 +++++++++++++++++++++++-----------------------
 fs/xfs/xfs_fsops.c        |   8 +++-
 fs/xfs/xfs_ialloc.c       |   5 ++-
 fs/xfs/xfs_ialloc.h       |   4 +-
 fs/xfs/xfs_ialloc_btree.c |   1 +
 fs/xfs/xfs_inode.c        |  14 ++++++-
 fs/xfs/xfs_inode.h        |   1 +
 fs/xfs/xfs_mount.c        |   2 +-
 fs/xfs/xfs_mount.h        |   1 +
 24 files changed, 137 insertions(+), 89 deletions(-)

diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index d12bbedf6fe5..545a6c4c2366 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -465,14 +465,14 @@ xfs_agfl_verify(
 #endif
 }
 
-static void
+void
 xfs_agfl_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agfl_verify(bp);
 }
 
-void
+static void
 xfs_agfl_read_verify(
 	struct xfs_buf	*bp)
 {
@@ -2181,14 +2181,14 @@ xfs_agf_verify(
 	}
 }
 
-static void
+void
 xfs_agf_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agf_verify(bp);
 }
 
-void
+static void
 xfs_agf_read_verify(
 	struct xfs_buf	*bp)
 {
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index feacb061bab7..f32811f50f43 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
 	xfs_extlen_t		*len,	/* output: length of extent */
 	int			*stat);	/* output: success/failure */
 
+void xfs_agf_write_verify(struct xfs_buf *bp);
+void xfs_agfl_write_verify(struct xfs_buf *bp);
+
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 6e98b22ebde0..b83396524913 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -401,6 +401,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
 	.key_diff		= xfs_allocbt_key_diff,
 	.read_verify		= xfs_allocbt_read_verify,
+	.write_verify		= xfs_allocbt_write_verify,
 #ifdef DEBUG
 	.keys_inorder		= xfs_allocbt_keys_inorder,
 	.recs_inorder		= xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 57729d71ab1a..5cd5b0c1d17a 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -924,7 +924,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
 					    XFS_ATTR_FORK);
 	if (error)
 		goto out;
-	ASSERT(bp2 != NULL);
+	bp2->b_pre_io = bp1->b_pre_io;
 	memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
 	bp1 = NULL;
 	xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -978,7 +978,7 @@ xfs_attr_leaf_create(
 					    XFS_ATTR_FORK);
 	if (error)
 		return(error);
-	ASSERT(bp != NULL);
+	bp->b_pre_io = xfs_attr_leaf_write_verify;
 	leaf = bp->b_addr;
 	memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
 	hdr = &leaf->hdr;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 9ae7aba52e0f..6a0f3f9f39d3 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -3124,6 +3124,7 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the child block.
 	 */
+	abp->b_pre_io = xfs_bmbt_write_verify;
 	ablock = XFS_BUF_TO_BLOCK(abp);
 	ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
 	ablock->bb_level = 0;
@@ -3270,6 +3271,7 @@ xfs_bmap_local_to_extents(
 		ASSERT(args.len == 1);
 		*firstblock = args.fsbno;
 		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+		bp->b_pre_io = xfs_bmbt_write_verify;
 		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
 		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
 		xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 17d7423e7503..79758e1e4f74 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -749,7 +749,7 @@ xfs_bmbt_verify(
 	}
 }
 
-static void
+void
 xfs_bmbt_write_verify(
 	struct xfs_buf	*bp)
 {
@@ -806,6 +806,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
 	.key_diff		= xfs_bmbt_key_diff,
 	.read_verify		= xfs_bmbt_read_verify,
+	.write_verify		= xfs_bmbt_write_verify,
 #ifdef DEBUG
 	.keys_inorder		= xfs_bmbt_keys_inorder,
 	.recs_inorder		= xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 1d00fbe9dd79..938c85986549 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -233,6 +233,7 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
 extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 extern void xfs_bmbt_read_verify(struct xfs_buf *bp);
+extern void xfs_bmbt_write_verify(struct xfs_buf *bp);
 
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index ef1066078c33..1e2d89eed2a4 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -996,6 +996,7 @@ xfs_btree_get_buf_block(
 	if (!*bpp)
 		return ENOMEM;
 
+	(*bpp)->b_pre_io = cur->bc_ops->write_verify;
 	*block = XFS_BUF_TO_BLOCK(*bpp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 3a4c314047a0..458ab3550898 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -189,6 +189,8 @@ struct xfs_btree_ops {
 			      union xfs_btree_key *key);
 
 	void	(*read_verify)(struct xfs_buf *bp);
+	void	(*write_verify)(struct xfs_buf *bp);
+
 #ifdef DEBUG
 	/* check that k1 is lower than k2 */
 	int	(*keys_inorder)(struct xfs_btree_cur *cur,
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 6bb0a59eaaee..087950fc2eb7 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -193,6 +193,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
 	xfs_trans_log_buf(tp, bp,
 		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
 
+	bp->b_pre_io = xfs_da_node_write_verify;
 	*bpp = bp;
 	return(0);
 }
@@ -392,6 +393,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	}
 	memcpy(node, oldroot, size);
 	xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+	bp->b_pre_io = blk1->bp->b_pre_io;
 	blk1->bp = bp;
 	blk1->blkno = blkno;
 
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index 0f8793c74fe2..e2fdc6f03d8a 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -1010,6 +1010,7 @@ xfs_dir2_leaf_to_block(
 	/*
 	 * Start converting it to block form.
 	 */
+	dbp->b_pre_io = xfs_dir2_block_write_verify;
 	hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
 	needlog = 1;
 	needscan = 0;
@@ -1139,6 +1140,7 @@ xfs_dir2_sf_to_block(
 		kmem_free(sfp);
 		return error;
 	}
+	bp->b_pre_io = xfs_dir2_block_write_verify;
 	hdr = bp->b_addr;
 	hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
 	/*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index b555585f5ab6..dcb8a873ab92 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -185,7 +185,7 @@ __xfs_dir2_data_check(
 	return 0;
 }
 
-void
+static void
 xfs_dir2_data_verify(
 	struct xfs_buf		*bp)
 {
@@ -202,14 +202,14 @@ xfs_dir2_data_verify(
 	}
 }
 
-static void
+void
 xfs_dir2_data_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_data_verify(bp);
 }
 
-void
+static void
 xfs_dir2_data_read_verify(
 	struct xfs_buf	*bp)
 {
@@ -482,10 +482,9 @@ xfs_dir2_data_init(
 	 */
 	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
 		XFS_DATA_FORK);
-	if (error) {
+	if (error)
 		return error;
-	}
-	ASSERT(bp != NULL);
+	bp->b_pre_io = xfs_dir2_data_write_verify;
 
 	/*
 	 * Initialize the header.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 5b3bcab2a656..3002ab7d54c3 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -81,7 +81,7 @@ xfs_dir2_leaf1_read_verify(
 	xfs_buf_ioend(bp, 0);
 }
 
-static void
+void
 xfs_dir2_leafn_write_verify(
 	struct xfs_buf	*bp)
 {
@@ -198,6 +198,7 @@ xfs_dir2_block_to_leaf(
 	/*
 	 * Fix up the block header, make it a data block.
 	 */
+	dbp->b_pre_io = xfs_dir2_data_write_verify;
 	hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
 	if (needscan)
 		xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -1243,15 +1244,14 @@ xfs_dir2_leaf_init(
 	 * Get the buffer for the block.
 	 */
 	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
-		XFS_DATA_FORK);
-	if (error) {
+			       XFS_DATA_FORK);
+	if (error)
 		return error;
-	}
-	ASSERT(bp != NULL);
-	leaf = bp->b_addr;
+
 	/*
 	 * Initialize the header.
 	 */
+	leaf = bp->b_addr;
 	leaf->hdr.info.magic = cpu_to_be16(magic);
 	leaf->hdr.info.forw = 0;
 	leaf->hdr.info.back = 0;
@@ -1264,10 +1264,12 @@ xfs_dir2_leaf_init(
 	 * the block.
 	 */
 	if (magic == XFS_DIR2_LEAF1_MAGIC) {
+		bp->b_pre_io = xfs_dir2_leaf1_write_verify;
 		ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 		ltp->bestcount = 0;
 		xfs_dir2_leaf_log_tail(tp, bp);
-	}
+	} else
+		bp->b_pre_io = xfs_dir2_leafn_write_verify;
 	*bpp = bp;
 	return 0;
 }
@@ -1951,7 +1953,10 @@ xfs_dir2_node_to_leaf(
 		xfs_dir2_leaf_compact(args, lbp);
 	else
 		xfs_dir2_leaf_log_header(tp, lbp);
+
+	lbp->b_pre_io = xfs_dir2_leaf1_write_verify;
 	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+
 	/*
 	 * Set up the leaf tail from the freespace block.
 	 */
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index a58abe1fc0d0..da90a91f4420 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -197,11 +197,12 @@ xfs_dir2_leaf_to_node(
 	/*
 	 * Get the buffer for the new freespace block.
 	 */
-	if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
-			XFS_DATA_FORK))) {
+	error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
+				XFS_DATA_FORK);
+	if (error)
 		return error;
-	}
-	ASSERT(fbp != NULL);
+	fbp->b_pre_io = xfs_dir2_free_write_verify;
+
 	free = fbp->b_addr;
 	leaf = lbp->b_addr;
 	ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -223,7 +224,10 @@ xfs_dir2_leaf_to_node(
 		*to = cpu_to_be16(off);
 	}
 	free->hdr.nused = cpu_to_be32(n);
+
+	lbp->b_pre_io = xfs_dir2_leafn_write_verify;
 	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+
 	/*
 	 * Log everything.
 	 */
@@ -632,6 +636,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = (int)((char *)dep -
 							(char *)curbp->b_addr);
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_pre_io = xfs_dir2_data_write_verify;
 			if (cmp == XFS_CMP_EXACT)
 				return XFS_ERROR(EEXIST);
 		}
@@ -646,6 +651,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = -1;
 			state->extrablk.blkno = curdb;
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+			curbp->b_pre_io = xfs_dir2_data_write_verify;
 		} else {
 			/* If the curbp is not the CI match block, drop it */
 			if (state->extrablk.bp != curbp)
@@ -1638,12 +1644,12 @@ xfs_dir2_node_addname_int(
 			/*
 			 * Get a buffer for the new block.
 			 */
-			if ((error = xfs_da_get_buf(tp, dp,
-						   xfs_dir2_db_to_da(mp, fbno),
-						   -1, &fbp, XFS_DATA_FORK))) {
+			error = xfs_da_get_buf(tp, dp,
+					       xfs_dir2_db_to_da(mp, fbno),
+					       -1, &fbp, XFS_DATA_FORK);
+			if (error)
 				return error;
-			}
-			ASSERT(fbp != NULL);
+			fbp->b_pre_io = xfs_dir2_free_write_verify;
 
 			/*
 			 * Initialize the new block to be empty, and remember
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 7ec61af8449f..01b82dcddc3e 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -45,6 +45,7 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 #else
 #define	xfs_dir2_data_check(dp,bp)
 #endif
+extern void xfs_dir2_data_write_verify(struct xfs_buf *bp);
 extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
 extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
@@ -73,6 +74,7 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
 
 /* xfs_dir2_leaf.c */
 extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp);
+extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp);
 extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b38a10e6f2e0..1b06aa051074 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,57 @@ xfs_qm_init_dquot_blk(
 	xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
 
+static void
+xfs_dquot_buf_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
+	struct xfs_disk_dquot	*ddq;
+	xfs_dqid_t		id = 0;
+	int			i;
+
+	/*
+	 * On the first read of the buffer, verify that each dquot is valid.
+	 * We don't know what the id of the dquot is supposed to be, just that
+	 * they should be increasing monotonically within the buffer. If the
+	 * first id is corrupt, then it will fail on the second dquot in the
+	 * buffer so corruptions could point to the wrong dquot in this case.
+	 */
+	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+		int	error;
+
+		ddq = &d[i].dd_diskdq;
+
+		if (i == 0)
+			id = be32_to_cpu(ddq->d_id);
+
+		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+					"xfs_dquot_read_verify");
+		if (error) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
+			xfs_buf_ioerror(bp, EFSCORRUPTED);
+			break;
+		}
+	}
+}
+
+static void
+xfs_dquot_buf_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+}
 
+void
+xfs_dquot_buf_read_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_dquot_buf_verify(bp);
+	bp->b_pre_io = xfs_dquot_buf_write_verify;
+	bp->b_iodone = NULL;
+	xfs_buf_ioend(bp, 0);
+}
 
 /*
  * Allocate a block and fill it with dquots.
@@ -315,6 +365,7 @@ xfs_qm_dqalloc(
 	error = xfs_buf_geterror(bp);
 	if (error)
 		goto error1;
+	bp->b_pre_io = xfs_dquot_buf_write_verify;
 
 	/*
 	 * Make a chunk of dquots out of this buffer and log
@@ -359,59 +410,6 @@ xfs_qm_dqalloc(
 
 	return (error);
 }
-
-static void
-xfs_dquot_buf_verify(
-	struct xfs_buf		*bp)
-{
-	struct xfs_mount	*mp = bp->b_target->bt_mount;
-	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
-	struct xfs_disk_dquot	*ddq;
-	xfs_dqid_t		id = 0;
-	int			i;
-
-	/*
-	 * On the first read of the buffer, verify that each dquot is valid.
-	 * We don't know what the id of the dquot is supposed to be, just that
-	 * they should be increasing monotonically within the buffer. If the
-	 * first id is corrupt, then it will fail on the second dquot in the
-	 * buffer so corruptions could point to the wrong dquot in this case.
-	 */
-	for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
-		int	error;
-
-		ddq = &d[i].dd_diskdq;
-
-		if (i == 0)
-			id = be32_to_cpu(ddq->d_id);
-
-		error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
-					"xfs_dquot_read_verify");
-		if (error) {
-			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
-			xfs_buf_ioerror(bp, EFSCORRUPTED);
-			break;
-		}
-	}
-}
-
-static void
-xfs_dquot_buf_write_verify(
-	struct xfs_buf	*bp)
-{
-	xfs_dquot_buf_verify(bp);
-}
-
-void
-xfs_dquot_buf_read_verify(
-	struct xfs_buf	*bp)
-{
-	xfs_dquot_buf_verify(bp);
-	bp->b_pre_io = xfs_dquot_buf_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
-}
-
 STATIC int
 xfs_qm_dqrepair(
 	struct xfs_mount	*mp,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cb65b067ed31..5d6d6b9d369d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -222,6 +222,7 @@ xfs_growfs_data_private(
 			error = ENOMEM;
 			goto error0;
 		}
+		bp->b_pre_io = xfs_agf_write_verify;
 
 		agf = XFS_BUF_TO_AGF(bp);
 		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -259,6 +260,7 @@ xfs_growfs_data_private(
 			error = ENOMEM;
 			goto error0;
 		}
+		bp->b_pre_io = xfs_agfl_write_verify;
 
 		agfl = XFS_BUF_TO_AGFL(bp);
 		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
@@ -279,6 +281,7 @@ xfs_growfs_data_private(
 			error = ENOMEM;
 			goto error0;
 		}
+		bp->b_pre_io = xfs_agi_write_verify;
 
 		agi = XFS_BUF_TO_AGI(bp);
 		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -450,9 +453,10 @@ xfs_growfs_data_private(
 			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 				  XFS_FSS_TO_BB(mp, 1), 0);
-			if (bp)
+			if (bp) {
 				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-			else
+				bp->b_pre_io = xfs_sb_write_verify;
+			} else
 				error = ENOMEM;
 		}
 
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 070f41845572..faf68600d3a6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -210,6 +210,7 @@ xfs_ialloc_inode_init(
 		 *	to log a whole cluster of inodes instead of all the
 		 *	individual transactions causing a lot of log traffic.
 		 */
+		fbuf->b_pre_io = xfs_inode_buf_write_verify;
 		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
 		for (i = 0; i < ninodes; i++) {
 			int	ioffset = i << mp->m_sb.sb_inodelog;
@@ -1504,14 +1505,14 @@ xfs_agi_verify(
 	xfs_check_agi_unlinked(agi);
 }
 
-static void
+void
 xfs_agi_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agi_verify(bp);
 }
 
-void
+static void
 xfs_agi_read_verify(
 	struct xfs_buf	*bp)
 {
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 1fd6ea4e9c91..7a169e34e30e 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 /*
  * Get the data from the pointed-to record.
  */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
 		xfs_inobt_rec_incore_t *rec, int *stat);
 
+void xfs_agi_write_verify(struct xfs_buf *bp);
+
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 15a79f8ca03c..7761e1ebeff7 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -271,6 +271,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
 	.key_diff		= xfs_inobt_key_diff,
 	.read_verify		= xfs_inobt_read_verify,
+	.write_verify		= xfs_inobt_write_verify,
 #ifdef DEBUG
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 910b2da01042..dfcbe73f1db4 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -420,7 +420,7 @@ xfs_inode_buf_verify(
 	xfs_inobp_check(mp, bp);
 }
 
-static void
+void
 xfs_inode_buf_write_verify(
 	struct xfs_buf	*bp)
 {
@@ -1782,6 +1782,18 @@ xfs_ifree_cluster(
 
 		if (!bp)
 			return ENOMEM;
+
+		/*
+		 * This buffer may not have been correctly initialised as we
+		 * didn't read it from disk. That's not important because we are
+		 * only using to mark the buffer as stale in the log, and to
+		 * attach stale cached inodes on it. That means it will never be
+		 * dispatched for IO. If it is, we want to know about it, and we
+		 * want it to fail. We can acheive this by adding a write
+		 * verifier to the buffer.
+		 */
+		 bp->b_pre_io = xfs_inode_buf_write_verify;
+
 		/*
 		 * Walk the inodes already attached to the buffer and mark them
 		 * stale. These will all have the flush locks held, so an
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a322c19723a3..482214d120a7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -555,6 +555,7 @@ int		xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, uint);
 void		xfs_inode_buf_read_verify(struct xfs_buf *);
+void		xfs_inode_buf_write_verify(struct xfs_buf *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c85da75e4a43..152a7fc843f9 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -631,7 +631,7 @@ xfs_sb_verify(
 		xfs_buf_ioerror(bp, error);
 }
 
-static void
+void
 xfs_sb_write_verify(
 	struct xfs_buf	*bp)
 {
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index de9089acc610..29c1b3ac920e 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -386,6 +386,7 @@ extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 #endif	/* __KERNEL__ */
 
 extern void	xfs_sb_read_verify(struct xfs_buf *);
+extern void	xfs_sb_write_verify(struct xfs_buf *bp);
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
 extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
 					xfs_agnumber_t *);
-- 
cgit v1.2.3


From 1813dd64057490e7a0678a885c4fe6d02f78bdc1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 14 Nov 2012 17:54:40 +1100
Subject: xfs: convert buffer verifiers to an ops structure.

To separate the verifiers from iodone functions and associate read
and write verifiers at the same time, introduce a buffer verifier
operations structure to the xfs_buf.

This avoids the need for assigning the write verifier, clearing the
iodone function and re-running ioend processing in the read
verifier, and gets rid of the nasty "b_pre_io" name for the write
verifier function pointer. If we ever need to, it will also be
easier to add further content specific callbacks to a buffer with an
ops structure in place.

We also avoid needing to export verifier functions, instead we
can simply export the ops structures for those that are needed
outside the function they are defined in.

This patch also fixes a directory block readahead verifier issue
it exposed.

This patch also adds ops callbacks to the inode/alloc btree blocks
initialised by growfs. These will need more work before they will
work with CRCs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Phil White <pwhite@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ag.h           |  4 +++
 fs/xfs/xfs_alloc.c        | 28 ++++++++++++---------
 fs/xfs/xfs_alloc.h        |  4 +--
 fs/xfs/xfs_alloc_btree.c  | 18 ++++++++------
 fs/xfs/xfs_alloc_btree.h  |  2 ++
 fs/xfs/xfs_attr_leaf.c    | 19 +++++++-------
 fs/xfs/xfs_attr_leaf.h    |  3 ++-
 fs/xfs/xfs_bmap.c         | 22 ++++++++---------
 fs/xfs/xfs_bmap_btree.c   | 20 ++++++++-------
 fs/xfs/xfs_bmap_btree.h   |  3 +--
 fs/xfs/xfs_btree.c        | 26 +++++++++----------
 fs/xfs/xfs_btree.h        |  9 +++----
 fs/xfs/xfs_buf.c          | 63 ++++++++++++++++++++++++++++-------------------
 fs/xfs/xfs_buf.h          | 24 ++++++++++--------
 fs/xfs/xfs_da_btree.c     | 40 +++++++++++++++++++-----------
 fs/xfs/xfs_da_btree.h     |  4 +--
 fs/xfs/xfs_dir2_block.c   | 20 ++++++++-------
 fs/xfs/xfs_dir2_data.c    | 52 +++++++++++++++++++++++++++++++-------
 fs/xfs/xfs_dir2_leaf.c    | 36 +++++++++++++++------------
 fs/xfs/xfs_dir2_node.c    | 26 ++++++++++---------
 fs/xfs/xfs_dir2_priv.h    | 10 +++++---
 fs/xfs/xfs_dquot.c        | 18 ++++++++------
 fs/xfs/xfs_dquot.h        |  3 ++-
 fs/xfs/xfs_fsops.c        | 29 +++++++++++++---------
 fs/xfs/xfs_ialloc.c       | 18 ++++++++------
 fs/xfs/xfs_ialloc.h       |  2 +-
 fs/xfs/xfs_ialloc_btree.c | 17 +++++++------
 fs/xfs/xfs_ialloc_btree.h |  2 ++
 fs/xfs/xfs_inode.c        | 22 ++++++++++-------
 fs/xfs/xfs_inode.h        |  3 +--
 fs/xfs/xfs_itable.c       |  2 +-
 fs/xfs/xfs_log_recover.c  |  2 +-
 fs/xfs/xfs_mount.c        | 35 +++++++++++++++-----------
 fs/xfs/xfs_mount.h        |  4 +--
 fs/xfs/xfs_qm.c           |  2 +-
 fs/xfs/xfs_trans.h        |  6 ++---
 fs/xfs/xfs_trans_buf.c    |  8 +++---
 37 files changed, 357 insertions(+), 249 deletions(-)

diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 22bd4db011c8..f2aeedb6a579 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
 extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+
 /*
  * Size of the unlinked inode hash table in the agi.
  */
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
 extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
 				xfs_agnumber_t agno, struct xfs_buf **bpp);
 
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
 /*
  * The third a.g. block contains the a.g. freelist, an array
  * of block pointers to blocks owned by the allocation btree code.
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 545a6c4c2366..393055fe3aef 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -465,7 +465,7 @@ xfs_agfl_verify(
 #endif
 }
 
-void
+static void
 xfs_agfl_write_verify(
 	struct xfs_buf	*bp)
 {
@@ -477,11 +477,13 @@ xfs_agfl_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agfl_verify(bp);
-	bp->b_pre_io = xfs_agfl_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+	.verify_read = xfs_agfl_read_verify,
+	.verify_write = xfs_agfl_write_verify,
+};
+
 /*
  * Read in the allocation group free block array.
  */
@@ -499,7 +501,7 @@ xfs_alloc_read_agfl(
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, &bp, xfs_agfl_read_verify);
+			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
@@ -2181,23 +2183,25 @@ xfs_agf_verify(
 	}
 }
 
-void
-xfs_agf_write_verify(
+static void
+xfs_agf_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agf_verify(bp);
 }
 
 static void
-xfs_agf_read_verify(
+xfs_agf_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agf_verify(bp);
-	bp->b_pre_io = xfs_agf_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+	.verify_read = xfs_agf_read_verify,
+	.verify_write = xfs_agf_write_verify,
+};
+
 /*
  * Read in the allocation group header (free/alloc section).
  */
@@ -2215,7 +2219,7 @@ xfs_read_agf(
 	error = xfs_trans_read_buf(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), flags, bpp, xfs_agf_read_verify);
+			XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
 	if (error)
 		return error;
 	if (!*bpp)
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index f32811f50f43..99d0a6101558 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -231,7 +231,7 @@ xfs_alloc_get_rec(
 	xfs_extlen_t		*len,	/* output: length of extent */
 	int			*stat);	/* output: success/failure */
 
-void xfs_agf_write_verify(struct xfs_buf *bp);
-void xfs_agfl_write_verify(struct xfs_buf *bp);
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
 
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b83396524913..b1ddef6b2689 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -329,22 +329,25 @@ xfs_allocbt_verify(
 }
 
 static void
-xfs_allocbt_write_verify(
+xfs_allocbt_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_allocbt_verify(bp);
 }
 
-void
-xfs_allocbt_read_verify(
+static void
+xfs_allocbt_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_allocbt_verify(bp);
-	bp->b_pre_io = xfs_allocbt_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+	.verify_read = xfs_allocbt_read_verify,
+	.verify_write = xfs_allocbt_write_verify,
+};
+
+
 #ifdef DEBUG
 STATIC int
 xfs_allocbt_keys_inorder(
@@ -400,8 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
 	.key_diff		= xfs_allocbt_key_diff,
-	.read_verify		= xfs_allocbt_read_verify,
-	.write_verify		= xfs_allocbt_write_verify,
+	.buf_ops		= &xfs_allocbt_buf_ops,
 #ifdef DEBUG
 	.keys_inorder		= xfs_allocbt_keys_inorder,
 	.recs_inorder		= xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed876..7e89a2b429dd 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
 		xfs_agnumber_t, xfs_btnum_t);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+
 #endif	/* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 5cd5b0c1d17a..ee24993c7d12 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -104,22 +104,23 @@ xfs_attr_leaf_verify(
 }
 
 static void
-xfs_attr_leaf_write_verify(
+xfs_attr_leaf_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_attr_leaf_verify(bp);
 }
 
-void
-xfs_attr_leaf_read_verify(
+static void
+xfs_attr_leaf_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_attr_leaf_verify(bp);
-	bp->b_pre_io = xfs_attr_leaf_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
+	.verify_read = xfs_attr_leaf_read_verify,
+	.verify_write = xfs_attr_leaf_write_verify,
+};
 
 int
 xfs_attr_leaf_read(
@@ -130,7 +131,7 @@ xfs_attr_leaf_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-				XFS_ATTR_FORK, xfs_attr_leaf_read_verify);
+				XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
 }
 
 /*========================================================================
@@ -924,7 +925,7 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
 					    XFS_ATTR_FORK);
 	if (error)
 		goto out;
-	bp2->b_pre_io = bp1->b_pre_io;
+	bp2->b_ops = bp1->b_ops;
 	memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
 	bp1 = NULL;
 	xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -978,7 +979,7 @@ xfs_attr_leaf_create(
 					    XFS_ATTR_FORK);
 	if (error)
 		return(error);
-	bp->b_pre_io = xfs_attr_leaf_write_verify;
+	bp->b_ops = &xfs_attr_leaf_buf_ops;
 	leaf = bp->b_addr;
 	memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
 	hdr = &leaf->hdr;
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index 3bbf6277e43c..77de139a58f0 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -264,6 +264,7 @@ int	xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
 int	xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
 			xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			struct xfs_buf **bpp);
-void	xfs_attr_leaf_read_verify(struct xfs_buf *bp);
+
+extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
 
 #endif	/* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 6a0f3f9f39d3..0e92d12765d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2663,7 +2663,7 @@ xfs_bmap_btree_to_extents(
 		return error;
 #endif
 	error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
-				xfs_bmbt_read_verify);
+				&xfs_bmbt_buf_ops);
 	if (error)
 		return error;
 	cblock = XFS_BUF_TO_BLOCK(cbp);
@@ -3124,7 +3124,7 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the child block.
 	 */
-	abp->b_pre_io = xfs_bmbt_write_verify;
+	abp->b_ops = &xfs_bmbt_buf_ops;
 	ablock = XFS_BUF_TO_BLOCK(abp);
 	ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
 	ablock->bb_level = 0;
@@ -3271,7 +3271,7 @@ xfs_bmap_local_to_extents(
 		ASSERT(args.len == 1);
 		*firstblock = args.fsbno;
 		bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
-		bp->b_pre_io = xfs_bmbt_write_verify;
+		bp->b_ops = &xfs_bmbt_buf_ops;
 		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
 		xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
 		xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4082,7 +4082,7 @@ xfs_bmap_read_extents(
 	 */
 	while (level-- > 0) {
 		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify);
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
 		if (error)
 			return error;
 		block = XFS_BUF_TO_BLOCK(bp);
@@ -4129,7 +4129,7 @@ xfs_bmap_read_extents(
 		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
 		if (nextbno != NULLFSBLOCK)
 			xfs_btree_reada_bufl(mp, nextbno, 1,
-					     xfs_bmbt_read_verify);
+					     &xfs_bmbt_buf_ops);
 		/*
 		 * Copy records into the extent records.
 		 */
@@ -4162,7 +4162,7 @@ xfs_bmap_read_extents(
 		if (bno == NULLFSBLOCK)
 			break;
 		error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-				XFS_BMAP_BTREE_REF, xfs_bmbt_read_verify);
+				XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
 		if (error)
 			return error;
 		block = XFS_BUF_TO_BLOCK(bp);
@@ -5880,7 +5880,7 @@ xfs_bmap_check_leaf_extents(
 			bp_release = 1;
 			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
 						XFS_BMAP_BTREE_REF,
-						xfs_bmbt_read_verify);
+						&xfs_bmbt_buf_ops);
 			if (error)
 				goto error_norelse;
 		}
@@ -5966,7 +5966,7 @@ xfs_bmap_check_leaf_extents(
 			bp_release = 1;
 			error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
 						XFS_BMAP_BTREE_REF,
-						xfs_bmbt_read_verify);
+						&xfs_bmbt_buf_ops);
 			if (error)
 				goto error_norelse;
 		}
@@ -6061,7 +6061,7 @@ xfs_bmap_count_tree(
 	int			numrecs;
 
 	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
-						xfs_bmbt_read_verify);
+						&xfs_bmbt_buf_ops);
 	if (error)
 		return error;
 	*count += 1;
@@ -6073,7 +6073,7 @@ xfs_bmap_count_tree(
 		while (nextbno != NULLFSBLOCK) {
 			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
 						XFS_BMAP_BTREE_REF,
-						xfs_bmbt_read_verify);
+						&xfs_bmbt_buf_ops);
 			if (error)
 				return error;
 			*count += 1;
@@ -6105,7 +6105,7 @@ xfs_bmap_count_tree(
 			bno = nextbno;
 			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
 						XFS_BMAP_BTREE_REF,
-						xfs_bmbt_read_verify);
+						&xfs_bmbt_buf_ops);
 			if (error)
 				return error;
 			*count += 1;
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 79758e1e4f74..061b45cbe614 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -749,23 +749,26 @@ xfs_bmbt_verify(
 	}
 }
 
-void
-xfs_bmbt_write_verify(
+static void
+xfs_bmbt_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_bmbt_verify(bp);
 }
 
-void
-xfs_bmbt_read_verify(
+static void
+xfs_bmbt_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_bmbt_verify(bp);
-	bp->b_pre_io = xfs_bmbt_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+	.verify_read = xfs_bmbt_read_verify,
+	.verify_write = xfs_bmbt_write_verify,
+};
+
+
 #ifdef DEBUG
 STATIC int
 xfs_bmbt_keys_inorder(
@@ -805,8 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.init_rec_from_cur	= xfs_bmbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
 	.key_diff		= xfs_bmbt_key_diff,
-	.read_verify		= xfs_bmbt_read_verify,
-	.write_verify		= xfs_bmbt_write_verify,
+	.buf_ops		= &xfs_bmbt_buf_ops,
 #ifdef DEBUG
 	.keys_inorder		= xfs_bmbt_keys_inorder,
 	.recs_inorder		= xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 938c85986549..88469ca08696 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -232,11 +232,10 @@ extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
 extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
 extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
-extern void xfs_bmbt_read_verify(struct xfs_buf *bp);
-extern void xfs_bmbt_write_verify(struct xfs_buf *bp);
 
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
 
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
 
 #endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 1e2d89eed2a4..db010408d701 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -271,7 +271,7 @@ xfs_btree_dup_cursor(
 			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 						   XFS_BUF_ADDR(bp), mp->m_bsize,
 						   0, &bp,
-						   cur->bc_ops->read_verify);
+						   cur->bc_ops->buf_ops);
 			if (error) {
 				xfs_btree_del_cursor(new, error);
 				*ncur = NULL;
@@ -621,7 +621,7 @@ xfs_btree_read_bufl(
 	uint			lock,		/* lock flags for read_buf */
 	struct xfs_buf		**bpp,		/* buffer for fsbno */
 	int			refval,		/* ref count value for buffer */
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;		/* return value */
 	xfs_daddr_t		d;		/* real disk block address */
@@ -630,7 +630,7 @@ xfs_btree_read_bufl(
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, lock, &bp, verify);
+				   mp->m_bsize, lock, &bp, ops);
 	if (error)
 		return error;
 	ASSERT(!xfs_buf_geterror(bp));
@@ -650,13 +650,13 @@ xfs_btree_reada_bufl(
 	struct xfs_mount	*mp,		/* file system mount point */
 	xfs_fsblock_t		fsbno,		/* file system block number */
 	xfs_extlen_t		count,		/* count of filesystem blocks */
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	xfs_daddr_t		d;
 
 	ASSERT(fsbno != NULLFSBLOCK);
 	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 }
 
 /*
@@ -670,14 +670,14 @@ xfs_btree_reada_bufs(
 	xfs_agnumber_t		agno,		/* allocation group number */
 	xfs_agblock_t		agbno,		/* allocation group block number */
 	xfs_extlen_t		count,		/* count of filesystem blocks */
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	xfs_daddr_t		d;
 
 	ASSERT(agno != NULLAGNUMBER);
 	ASSERT(agbno != NULLAGBLOCK);
 	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, verify);
+	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 }
 
 STATIC int
@@ -692,13 +692,13 @@ xfs_btree_readahead_lblock(
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
 		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
-				     cur->bc_ops->read_verify);
+				     cur->bc_ops->buf_ops);
 		rval++;
 	}
 
 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
 		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
-				     cur->bc_ops->read_verify);
+				     cur->bc_ops->buf_ops);
 		rval++;
 	}
 
@@ -718,13 +718,13 @@ xfs_btree_readahead_sblock(
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				     left, 1, cur->bc_ops->read_verify);
+				     left, 1, cur->bc_ops->buf_ops);
 		rval++;
 	}
 
 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
 		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-				     right, 1, cur->bc_ops->read_verify);
+				     right, 1, cur->bc_ops->buf_ops);
 		rval++;
 	}
 
@@ -996,7 +996,7 @@ xfs_btree_get_buf_block(
 	if (!*bpp)
 		return ENOMEM;
 
-	(*bpp)->b_pre_io = cur->bc_ops->write_verify;
+	(*bpp)->b_ops = cur->bc_ops->buf_ops;
 	*block = XFS_BUF_TO_BLOCK(*bpp);
 	return 0;
 }
@@ -1024,7 +1024,7 @@ xfs_btree_read_buf_block(
 	d = xfs_btree_ptr_to_daddr(cur, ptr);
 	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
 				   mp->m_bsize, flags, bpp,
-				   cur->bc_ops->read_verify);
+				   cur->bc_ops->buf_ops);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 458ab3550898..f932897194eb 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,8 +188,7 @@ struct xfs_btree_ops {
 	__int64_t (*key_diff)(struct xfs_btree_cur *cur,
 			      union xfs_btree_key *key);
 
-	void	(*read_verify)(struct xfs_buf *bp);
-	void	(*write_verify)(struct xfs_buf *bp);
+	const struct xfs_buf_ops	*buf_ops;
 
 #ifdef DEBUG
 	/* check that k1 is lower than k2 */
@@ -359,7 +358,7 @@ xfs_btree_read_bufl(
 	uint			lock,	/* lock flags for read_buf */
 	struct xfs_buf		**bpp,	/* buffer for fsbno */
 	int			refval,	/* ref count value for buffer */
-	xfs_buf_iodone_t	verify);
+	const struct xfs_buf_ops *ops);
 
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -370,7 +369,7 @@ xfs_btree_reada_bufl(
 	struct xfs_mount	*mp,	/* file system mount point */
 	xfs_fsblock_t		fsbno,	/* file system block number */
 	xfs_extlen_t		count,	/* count of filesystem blocks */
-	xfs_buf_iodone_t	verify);
+	const struct xfs_buf_ops *ops);
 
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -382,7 +381,7 @@ xfs_btree_reada_bufs(
 	xfs_agnumber_t		agno,	/* allocation group number */
 	xfs_agblock_t		agbno,	/* allocation group block number */
 	xfs_extlen_t		count,	/* count of filesystem blocks */
-	xfs_buf_iodone_t	verify);
+	const struct xfs_buf_ops *ops);
 
 /*
  * Initialise a new btree block header
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bd1a948ee39c..26673a0b20e7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -571,7 +571,7 @@ found:
 		ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
 		ASSERT(bp->b_iodone == NULL);
 		bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
-		bp->b_pre_io = NULL;
+		bp->b_ops = NULL;
 	}
 
 	trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -657,7 +657,7 @@ xfs_buf_read_map(
 	struct xfs_buf_map	*map,
 	int			nmaps,
 	xfs_buf_flags_t		flags,
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
@@ -669,7 +669,7 @@ xfs_buf_read_map(
 
 		if (!XFS_BUF_ISDONE(bp)) {
 			XFS_STATS_INC(xb_get_read);
-			bp->b_iodone = verify;
+			bp->b_ops = ops;
 			_xfs_buf_read(bp, flags);
 		} else if (flags & XBF_ASYNC) {
 			/*
@@ -696,13 +696,13 @@ xfs_buf_readahead_map(
 	struct xfs_buftarg	*target,
 	struct xfs_buf_map	*map,
 	int			nmaps,
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	if (bdi_read_congested(target->bt_bdi))
 		return;
 
 	xfs_buf_read_map(target, map, nmaps,
-		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, verify);
+		     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
 }
 
 /*
@@ -715,7 +715,7 @@ xfs_buf_read_uncached(
 	xfs_daddr_t		daddr,
 	size_t			numblks,
 	int			flags,
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
@@ -728,7 +728,7 @@ xfs_buf_read_uncached(
 	bp->b_bn = daddr;
 	bp->b_maps[0].bm_bn = daddr;
 	bp->b_flags |= XBF_READ;
-	bp->b_iodone = verify;
+	bp->b_ops = ops;
 
 	xfsbdstrat(target->bt_mount, bp);
 	xfs_buf_iowait(bp);
@@ -1001,27 +1001,37 @@ STATIC void
 xfs_buf_iodone_work(
 	struct work_struct	*work)
 {
-	xfs_buf_t		*bp =
+	struct xfs_buf		*bp =
 		container_of(work, xfs_buf_t, b_iodone_work);
+	bool			read = !!(bp->b_flags & XBF_READ);
+
+	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+	if (read && bp->b_ops)
+		bp->b_ops->verify_read(bp);
 
 	if (bp->b_iodone)
 		(*(bp->b_iodone))(bp);
 	else if (bp->b_flags & XBF_ASYNC)
 		xfs_buf_relse(bp);
+	else {
+		ASSERT(read && bp->b_ops);
+		complete(&bp->b_iowait);
+	}
 }
 
 void
 xfs_buf_ioend(
-	xfs_buf_t		*bp,
-	int			schedule)
+	struct xfs_buf	*bp,
+	int		schedule)
 {
+	bool		read = !!(bp->b_flags & XBF_READ);
+
 	trace_xfs_buf_iodone(bp, _RET_IP_);
 
-	bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 	if (bp->b_error == 0)
 		bp->b_flags |= XBF_DONE;
 
-	if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
+	if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
 		if (schedule) {
 			INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
 			queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1029,6 +1039,7 @@ xfs_buf_ioend(
 			xfs_buf_iodone_work(&bp->b_iodone_work);
 		}
 	} else {
+		bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 		complete(&bp->b_iowait);
 	}
 }
@@ -1316,6 +1327,20 @@ _xfs_buf_ioapply(
 			rw |= REQ_FUA;
 		if (bp->b_flags & XBF_FLUSH)
 			rw |= REQ_FLUSH;
+
+		/*
+		 * Run the write verifier callback function if it exists. If
+		 * this function fails it will mark the buffer with an error and
+		 * the IO should not be dispatched.
+		 */
+		if (bp->b_ops) {
+			bp->b_ops->verify_write(bp);
+			if (bp->b_error) {
+				xfs_force_shutdown(bp->b_target->bt_mount,
+						   SHUTDOWN_CORRUPT_INCORE);
+				return;
+			}
+		}
 	} else if (bp->b_flags & XBF_READ_AHEAD) {
 		rw = READA;
 	} else {
@@ -1325,20 +1350,6 @@ _xfs_buf_ioapply(
 	/* we only use the buffer cache for meta-data */
 	rw |= REQ_META;
 
-	/*
-	 * run the pre-io callback function if it exists. If this function
-	 * fails it will mark the buffer with an error and the IO should
-	 * not be dispatched.
-	 */
-	if (bp->b_pre_io) {
-		bp->b_pre_io(bp);
-		if (bp->b_error) {
-			xfs_force_shutdown(bp->b_target->bt_mount,
-					   SHUTDOWN_CORRUPT_INCORE);
-			return;
-		}
-	}
-
 	/*
 	 * Walk all the vectors issuing IO on them. Set up the initial offset
 	 * into the buffer and the desired IO size before we start -
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 51bc16a1cd9c..23f5642480bb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -111,6 +111,11 @@ struct xfs_buf_map {
 #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
 	struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
 
+struct xfs_buf_ops {
+	void (*verify_read)(struct xfs_buf *);
+	void (*verify_write)(struct xfs_buf *);
+};
+
 typedef struct xfs_buf {
 	/*
 	 * first cacheline holds all the fields needed for an uncontended cache
@@ -154,9 +159,7 @@ typedef struct xfs_buf {
 	unsigned int		b_page_count;	/* size of page array */
 	unsigned int		b_offset;	/* page offset in first page */
 	unsigned short		b_error;	/* error code on I/O */
-
-	void			(*b_pre_io)(struct xfs_buf *);
-						/* pre-io callback function */
+	const struct xfs_buf_ops	*b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
 	int			b_last_holder;
@@ -199,10 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
 			       xfs_buf_flags_t flags);
 struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
 			       struct xfs_buf_map *map, int nmaps,
-			       xfs_buf_flags_t flags, xfs_buf_iodone_t verify);
+			       xfs_buf_flags_t flags,
+			       const struct xfs_buf_ops *ops);
 void xfs_buf_readahead_map(struct xfs_buftarg *target,
 			       struct xfs_buf_map *map, int nmaps,
-			       xfs_buf_iodone_t verify);
+			       const struct xfs_buf_ops *ops);
 
 static inline struct xfs_buf *
 xfs_buf_get(
@@ -221,10 +225,10 @@ xfs_buf_read(
 	xfs_daddr_t		blkno,
 	size_t			numblks,
 	xfs_buf_flags_t		flags,
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return xfs_buf_read_map(target, &map, 1, flags, verify);
+	return xfs_buf_read_map(target, &map, 1, flags, ops);
 }
 
 static inline void
@@ -232,10 +236,10 @@ xfs_buf_readahead(
 	struct xfs_buftarg	*target,
 	xfs_daddr_t		blkno,
 	size_t			numblks,
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-	return xfs_buf_readahead_map(target, &map, 1, verify);
+	return xfs_buf_readahead_map(target, &map, 1, ops);
 }
 
 struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -246,7 +250,7 @@ struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
 				int flags);
 struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
 				xfs_daddr_t daddr, size_t numblks, int flags,
-				xfs_buf_iodone_t verify);
+				const struct xfs_buf_ops *ops);
 void xfs_buf_hold(struct xfs_buf *bp);
 
 /* Releasing Buffers */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 087950fc2eb7..4d7696a02418 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -117,6 +117,12 @@ xfs_da_node_write_verify(
 	xfs_da_node_verify(bp);
 }
 
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
 static void
 xfs_da_node_read_verify(
 	struct xfs_buf		*bp)
@@ -129,10 +135,12 @@ xfs_da_node_read_verify(
 			xfs_da_node_verify(bp);
 			break;
 		case XFS_ATTR_LEAF_MAGIC:
-			xfs_attr_leaf_read_verify(bp);
+			bp->b_ops = &xfs_attr_leaf_buf_ops;
+			bp->b_ops->verify_read(bp);
 			return;
 		case XFS_DIR2_LEAFN_MAGIC:
-			xfs_dir2_leafn_read_verify(bp);
+			bp->b_ops = &xfs_dir2_leafn_buf_ops;
+			bp->b_ops->verify_read(bp);
 			return;
 		default:
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
@@ -140,12 +148,14 @@ xfs_da_node_read_verify(
 			xfs_buf_ioerror(bp, EFSCORRUPTED);
 			break;
 	}
-
-	bp->b_pre_io = xfs_da_node_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_da_node_buf_ops = {
+	.verify_read = xfs_da_node_read_verify,
+	.verify_write = xfs_da_node_write_verify,
+};
+
+
 int
 xfs_da_node_read(
 	struct xfs_trans	*tp,
@@ -156,7 +166,7 @@ xfs_da_node_read(
 	int			which_fork)
 {
 	return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
-					which_fork, xfs_da_node_read_verify);
+					which_fork, &xfs_da_node_buf_ops);
 }
 
 /*========================================================================
@@ -193,7 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
 	xfs_trans_log_buf(tp, bp,
 		XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
 
-	bp->b_pre_io = xfs_da_node_write_verify;
+	bp->b_ops = &xfs_da_node_buf_ops;
 	*bpp = bp;
 	return(0);
 }
@@ -394,7 +404,7 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
 	memcpy(node, oldroot, size);
 	xfs_trans_log_buf(tp, bp, 0, size - 1);
 
-	bp->b_pre_io = blk1->bp->b_pre_io;
+	bp->b_ops = blk1->bp->b_ops;
 	blk1->bp = bp;
 	blk1->blkno = blkno;
 
@@ -828,11 +838,11 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
 	/*
 	 * This could be copying a leaf back into the root block in the case of
 	 * there only being a single leaf block left in the tree. Hence we have
-	 * to update the pre_io pointer as well to match the buffer type change
+	 * to update the b_ops pointer as well to match the buffer type change
 	 * that could occur.
 	 */
 	memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
-	root_blk->bp->b_pre_io = bp->b_pre_io;
+	root_blk->bp->b_ops = bp->b_ops;
 	xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
 	error = xfs_da_shrink_inode(args, child, bp);
 	return(error);
@@ -2223,7 +2233,7 @@ xfs_da_read_buf(
 	xfs_daddr_t		mappedbno,
 	struct xfs_buf		**bpp,
 	int			whichfork,
-	xfs_buf_iodone_t	verifier)
+	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 	struct xfs_buf_map	map;
@@ -2245,7 +2255,7 @@ xfs_da_read_buf(
 
 	error = xfs_trans_read_buf_map(dp->i_mount, trans,
 					dp->i_mount->m_ddev_targp,
-					mapp, nmap, 0, &bp, verifier);
+					mapp, nmap, 0, &bp, ops);
 	if (error)
 		goto out_free;
 
@@ -2303,7 +2313,7 @@ xfs_da_reada_buf(
 	xfs_dablk_t		bno,
 	xfs_daddr_t		mappedbno,
 	int			whichfork,
-	xfs_buf_iodone_t	verifier)
+	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf_map	map;
 	struct xfs_buf_map	*mapp;
@@ -2322,7 +2332,7 @@ xfs_da_reada_buf(
 	}
 
 	mappedbno = mapp[0].bm_bn;
-	xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, NULL);
+	xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
 
 out_free:
 	if (mapp != &map)
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 521b008445ab..ee5170c46ae1 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -229,10 +229,10 @@ int	xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 			       xfs_dablk_t bno, xfs_daddr_t mappedbno,
 			       struct xfs_buf **bpp, int whichfork,
-			       xfs_buf_iodone_t verifier);
+			       const struct xfs_buf_ops *ops);
 xfs_daddr_t	xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 				xfs_dablk_t bno, xfs_daddr_t mapped_bno,
-				int whichfork, xfs_buf_iodone_t verifier);
+				int whichfork, const struct xfs_buf_ops *ops);
 int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 					  struct xfs_buf *dead_buf);
 
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e2fdc6f03d8a..7536faaa61e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -74,22 +74,24 @@ xfs_dir2_block_verify(
 }
 
 static void
-xfs_dir2_block_write_verify(
+xfs_dir2_block_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_block_verify(bp);
 }
 
-void
-xfs_dir2_block_read_verify(
+static void
+xfs_dir2_block_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_block_verify(bp);
-	bp->b_pre_io = xfs_dir2_block_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
+	.verify_read = xfs_dir2_block_read_verify,
+	.verify_write = xfs_dir2_block_write_verify,
+};
+
 static int
 xfs_dir2_block_read(
 	struct xfs_trans	*tp,
@@ -99,7 +101,7 @@ xfs_dir2_block_read(
 	struct xfs_mount	*mp = dp->i_mount;
 
 	return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
-				XFS_DATA_FORK, xfs_dir2_block_read_verify);
+				XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
 }
 
 static void
@@ -1010,7 +1012,7 @@ xfs_dir2_leaf_to_block(
 	/*
 	 * Start converting it to block form.
 	 */
-	dbp->b_pre_io = xfs_dir2_block_write_verify;
+	dbp->b_ops = &xfs_dir2_block_buf_ops;
 	hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
 	needlog = 1;
 	needscan = 0;
@@ -1140,7 +1142,7 @@ xfs_dir2_sf_to_block(
 		kmem_free(sfp);
 		return error;
 	}
-	bp->b_pre_io = xfs_dir2_block_write_verify;
+	bp->b_ops = &xfs_dir2_block_buf_ops;
 	hdr = bp->b_addr;
 	hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
 	/*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index dcb8a873ab92..ffcf1774152e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -202,23 +202,57 @@ xfs_dir2_data_verify(
 	}
 }
 
-void
-xfs_dir2_data_write_verify(
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir2_data_reada_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+	switch (hdr->magic) {
+	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+		bp->b_ops = &xfs_dir2_block_buf_ops;
+		bp->b_ops->verify_read(bp);
+		return;
+	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+		xfs_dir2_data_verify(bp);
+		return;
+	default:
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+		xfs_buf_ioerror(bp, EFSCORRUPTED);
+		break;
+	}
+}
+
+static void
+xfs_dir2_data_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_data_verify(bp);
 }
 
 static void
-xfs_dir2_data_read_verify(
+xfs_dir2_data_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_data_verify(bp);
-	bp->b_pre_io = xfs_dir2_data_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
+	.verify_read = xfs_dir2_data_read_verify,
+	.verify_write = xfs_dir2_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
+	.verify_read = xfs_dir2_data_reada_verify,
+	.verify_write = xfs_dir2_data_write_verify,
+};
+
 
 int
 xfs_dir2_data_read(
@@ -229,7 +263,7 @@ xfs_dir2_data_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
-				XFS_DATA_FORK, xfs_dir2_data_read_verify);
+				XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
 }
 
 int
@@ -240,7 +274,7 @@ xfs_dir2_data_readahead(
 	xfs_daddr_t		mapped_bno)
 {
 	return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
-				XFS_DATA_FORK, xfs_dir2_data_read_verify);
+				XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
 }
 
 /*
@@ -484,7 +518,7 @@ xfs_dir2_data_init(
 		XFS_DATA_FORK);
 	if (error)
 		return error;
-	bp->b_pre_io = xfs_dir2_data_write_verify;
+	bp->b_ops = &xfs_dir2_data_buf_ops;
 
 	/*
 	 * Initialize the header.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 3002ab7d54c3..60cd2fa4e047 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -65,39 +65,43 @@ xfs_dir2_leaf_verify(
 }
 
 static void
-xfs_dir2_leaf1_write_verify(
+xfs_dir2_leaf1_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
 }
 
 static void
-xfs_dir2_leaf1_read_verify(
+xfs_dir2_leaf1_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
-	bp->b_pre_io = xfs_dir2_leaf1_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
 void
-xfs_dir2_leafn_write_verify(
+xfs_dir2_leafn_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
 }
 
 void
-xfs_dir2_leafn_read_verify(
+xfs_dir2_leafn_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
-	bp->b_pre_io = xfs_dir2_leafn_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
+	.verify_read = xfs_dir2_leaf1_read_verify,
+	.verify_write = xfs_dir2_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
+	.verify_read = xfs_dir2_leafn_read_verify,
+	.verify_write = xfs_dir2_leafn_write_verify,
+};
+
 static int
 xfs_dir2_leaf_read(
 	struct xfs_trans	*tp,
@@ -107,7 +111,7 @@ xfs_dir2_leaf_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, xfs_dir2_leaf1_read_verify);
+				XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
 }
 
 int
@@ -119,7 +123,7 @@ xfs_dir2_leafn_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, xfs_dir2_leafn_read_verify);
+				XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
 }
 
 /*
@@ -198,7 +202,7 @@ xfs_dir2_block_to_leaf(
 	/*
 	 * Fix up the block header, make it a data block.
 	 */
-	dbp->b_pre_io = xfs_dir2_data_write_verify;
+	dbp->b_ops = &xfs_dir2_data_buf_ops;
 	hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
 	if (needscan)
 		xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -1264,12 +1268,12 @@ xfs_dir2_leaf_init(
 	 * the block.
 	 */
 	if (magic == XFS_DIR2_LEAF1_MAGIC) {
-		bp->b_pre_io = xfs_dir2_leaf1_write_verify;
+		bp->b_ops = &xfs_dir2_leaf1_buf_ops;
 		ltp = xfs_dir2_leaf_tail_p(mp, leaf);
 		ltp->bestcount = 0;
 		xfs_dir2_leaf_log_tail(tp, bp);
 	} else
-		bp->b_pre_io = xfs_dir2_leafn_write_verify;
+		bp->b_ops = &xfs_dir2_leafn_buf_ops;
 	*bpp = bp;
 	return 0;
 }
@@ -1954,7 +1958,7 @@ xfs_dir2_node_to_leaf(
 	else
 		xfs_dir2_leaf_log_header(tp, lbp);
 
-	lbp->b_pre_io = xfs_dir2_leaf1_write_verify;
+	lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
 	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
 
 	/*
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index da90a91f4420..5980f9b7fa9b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -72,22 +72,24 @@ xfs_dir2_free_verify(
 }
 
 static void
-xfs_dir2_free_write_verify(
+xfs_dir2_free_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_free_verify(bp);
 }
 
-void
-xfs_dir2_free_read_verify(
+static void
+xfs_dir2_free_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dir2_free_verify(bp);
-	bp->b_pre_io = xfs_dir2_free_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
+	.verify_read = xfs_dir2_free_read_verify,
+	.verify_write = xfs_dir2_free_write_verify,
+};
+
 
 static int
 __xfs_dir2_free_read(
@@ -98,7 +100,7 @@ __xfs_dir2_free_read(
 	struct xfs_buf		**bpp)
 {
 	return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
-				XFS_DATA_FORK, xfs_dir2_free_read_verify);
+				XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
 }
 
 int
@@ -201,7 +203,7 @@ xfs_dir2_leaf_to_node(
 				XFS_DATA_FORK);
 	if (error)
 		return error;
-	fbp->b_pre_io = xfs_dir2_free_write_verify;
+	fbp->b_ops = &xfs_dir2_free_buf_ops;
 
 	free = fbp->b_addr;
 	leaf = lbp->b_addr;
@@ -225,7 +227,7 @@ xfs_dir2_leaf_to_node(
 	}
 	free->hdr.nused = cpu_to_be32(n);
 
-	lbp->b_pre_io = xfs_dir2_leafn_write_verify;
+	lbp->b_ops = &xfs_dir2_leafn_buf_ops;
 	leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
 
 	/*
@@ -636,7 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = (int)((char *)dep -
 							(char *)curbp->b_addr);
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-			curbp->b_pre_io = xfs_dir2_data_write_verify;
+			curbp->b_ops = &xfs_dir2_data_buf_ops;
 			if (cmp == XFS_CMP_EXACT)
 				return XFS_ERROR(EEXIST);
 		}
@@ -651,7 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
 			state->extrablk.index = -1;
 			state->extrablk.blkno = curdb;
 			state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
-			curbp->b_pre_io = xfs_dir2_data_write_verify;
+			curbp->b_ops = &xfs_dir2_data_buf_ops;
 		} else {
 			/* If the curbp is not the CI match block, drop it */
 			if (state->extrablk.bp != curbp)
@@ -1649,7 +1651,7 @@ xfs_dir2_node_addname_int(
 					       -1, &fbp, XFS_DATA_FORK);
 			if (error)
 				return error;
-			fbp->b_pre_io = xfs_dir2_free_write_verify;
+			fbp->b_ops = &xfs_dir2_free_buf_ops;
 
 			/*
 			 * Initialize the new block to be empty, and remember
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 01b82dcddc3e..7da79f6515fd 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
 				const unsigned char *name, int len);
 
 /* xfs_dir2_block.c */
+extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
 extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
 		xfs_off_t *offset, filldir_t filldir);
@@ -45,7 +47,9 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 #else
 #define	xfs_dir2_data_check(dp,bp)
 #endif
-extern void xfs_dir2_data_write_verify(struct xfs_buf *bp);
+
+extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+
 extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
 extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
@@ -73,8 +77,8 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
 		xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
-extern void xfs_dir2_leafn_read_verify(struct xfs_buf *bp);
-extern void xfs_dir2_leafn_write_verify(struct xfs_buf *bp);
+extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+
 extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
 		xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1b06aa051074..9e1bf5294c91 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -284,22 +284,24 @@ xfs_dquot_buf_verify(
 }
 
 static void
-xfs_dquot_buf_write_verify(
+xfs_dquot_buf_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dquot_buf_verify(bp);
 }
 
 void
-xfs_dquot_buf_read_verify(
+xfs_dquot_buf_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_dquot_buf_verify(bp);
-	bp->b_pre_io = xfs_dquot_buf_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.verify_read = xfs_dquot_buf_read_verify,
+	.verify_write = xfs_dquot_buf_write_verify,
+};
+
 /*
  * Allocate a block and fill it with dquots.
  * This is called when the bmapi finds a hole.
@@ -365,7 +367,7 @@ xfs_qm_dqalloc(
 	error = xfs_buf_geterror(bp);
 	if (error)
 		goto error1;
-	bp->b_pre_io = xfs_dquot_buf_write_verify;
+	bp->b_ops = &xfs_dquot_buf_ops;
 
 	/*
 	 * Make a chunk of dquots out of this buffer and log
@@ -435,7 +437,7 @@ xfs_qm_dqrepair(
 		ASSERT(*bpp == NULL);
 		return XFS_ERROR(error);
 	}
-	(*bpp)->b_pre_io = xfs_dquot_buf_write_verify;
+	(*bpp)->b_ops = &xfs_dquot_buf_ops;
 
 	ASSERT(xfs_buf_islocked(*bpp));
 	d = (struct xfs_dqblk *)(*bpp)->b_addr;
@@ -534,7 +536,7 @@ xfs_qm_dqtobp(
 		error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 					   dqp->q_blkno,
 					   mp->m_quotainfo->qi_dqchunklen,
-					   0, &bp, xfs_dquot_buf_read_verify);
+					   0, &bp, &xfs_dquot_buf_ops);
 
 		if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
 			xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 5438d883b628..c694a8469c4a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -140,7 +140,6 @@ static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
 
 extern int		xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint,
 					uint, struct xfs_dquot	**);
-extern void		xfs_dquot_buf_read_verify(struct xfs_buf *bp);
 extern void		xfs_qm_dqdestroy(xfs_dquot_t *);
 extern int		xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
 extern void		xfs_qm_dqunpin_wait(xfs_dquot_t *);
@@ -162,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
 	return dqp;
 }
 
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
 #endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 5d6d6b9d369d..94eaeedc5498 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -119,7 +119,8 @@ xfs_growfs_get_hdr_buf(
 	struct xfs_mount	*mp,
 	xfs_daddr_t		blkno,
 	size_t			numblks,
-	int			flags)
+	int			flags,
+	const struct xfs_buf_ops *ops)
 {
 	struct xfs_buf		*bp;
 
@@ -130,6 +131,7 @@ xfs_growfs_get_hdr_buf(
 	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
 	bp->b_bn = blkno;
 	bp->b_maps[0].bm_bn = blkno;
+	bp->b_ops = ops;
 
 	return bp;
 }
@@ -217,12 +219,12 @@ xfs_growfs_data_private(
 		 */
 		bp = xfs_growfs_get_hdr_buf(mp,
 				XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-				XFS_FSS_TO_BB(mp, 1), 0);
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agf_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		bp->b_pre_io = xfs_agf_write_verify;
 
 		agf = XFS_BUF_TO_AGF(bp);
 		agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
@@ -255,12 +257,12 @@ xfs_growfs_data_private(
 		 */
 		bp = xfs_growfs_get_hdr_buf(mp,
 				XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-				XFS_FSS_TO_BB(mp, 1), 0);
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agfl_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		bp->b_pre_io = xfs_agfl_write_verify;
 
 		agfl = XFS_BUF_TO_AGFL(bp);
 		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
@@ -276,12 +278,12 @@ xfs_growfs_data_private(
 		 */
 		bp = xfs_growfs_get_hdr_buf(mp,
 				XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-				XFS_FSS_TO_BB(mp, 1), 0);
+				XFS_FSS_TO_BB(mp, 1), 0,
+				&xfs_agi_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
 		}
-		bp->b_pre_io = xfs_agi_write_verify;
 
 		agi = XFS_BUF_TO_AGI(bp);
 		agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
@@ -306,7 +308,8 @@ xfs_growfs_data_private(
 		 */
 		bp = xfs_growfs_get_hdr_buf(mp,
 				XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0);
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
 
 		if (!bp) {
 			error = ENOMEM;
@@ -329,7 +332,8 @@ xfs_growfs_data_private(
 		 */
 		bp = xfs_growfs_get_hdr_buf(mp,
 				XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0);
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_allocbt_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
@@ -352,7 +356,8 @@ xfs_growfs_data_private(
 		 */
 		bp = xfs_growfs_get_hdr_buf(mp,
 				XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-				BTOBB(mp->m_sb.sb_blocksize), 0);
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_inobt_buf_ops);
 		if (!bp) {
 			error = ENOMEM;
 			goto error0;
@@ -448,14 +453,14 @@ xfs_growfs_data_private(
 			error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 				  XFS_FSS_TO_BB(mp, 1), 0, &bp,
-				  xfs_sb_read_verify);
+				  &xfs_sb_buf_ops);
 		} else {
 			bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
 				  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 				  XFS_FSS_TO_BB(mp, 1), 0);
 			if (bp) {
+				bp->b_ops = &xfs_sb_buf_ops;
 				xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-				bp->b_pre_io = xfs_sb_write_verify;
 			} else
 				error = ENOMEM;
 		}
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index faf68600d3a6..2d6495eaaa34 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -210,7 +210,7 @@ xfs_ialloc_inode_init(
 		 *	to log a whole cluster of inodes instead of all the
 		 *	individual transactions causing a lot of log traffic.
 		 */
-		fbuf->b_pre_io = xfs_inode_buf_write_verify;
+		fbuf->b_ops = &xfs_inode_buf_ops;
 		xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
 		for (i = 0; i < ninodes; i++) {
 			int	ioffset = i << mp->m_sb.sb_inodelog;
@@ -1505,23 +1505,25 @@ xfs_agi_verify(
 	xfs_check_agi_unlinked(agi);
 }
 
-void
-xfs_agi_write_verify(
+static void
+xfs_agi_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agi_verify(bp);
 }
 
 static void
-xfs_agi_read_verify(
+xfs_agi_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_agi_verify(bp);
-	bp->b_pre_io = xfs_agi_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+	.verify_read = xfs_agi_read_verify,
+	.verify_write = xfs_agi_write_verify,
+};
+
 /*
  * Read in the allocation group header (inode allocation section)
  */
@@ -1538,7 +1540,7 @@ xfs_read_agi(
 
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-			XFS_FSS_TO_BB(mp, 1), 0, bpp, xfs_agi_read_verify);
+			XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 7a169e34e30e..c8da3df271e6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -150,6 +150,6 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
 		xfs_inobt_rec_incore_t *rec, int *stat);
 
-void xfs_agi_write_verify(struct xfs_buf *bp);
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
 
 #endif	/* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 7761e1ebeff7..bec344b36507 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -217,22 +217,24 @@ xfs_inobt_verify(
 }
 
 static void
-xfs_inobt_write_verify(
+xfs_inobt_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_inobt_verify(bp);
 }
 
-void
-xfs_inobt_read_verify(
+static void
+xfs_inobt_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_inobt_verify(bp);
-	bp->b_pre_io = xfs_inobt_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+	.verify_read = xfs_inobt_read_verify,
+	.verify_write = xfs_inobt_write_verify,
+};
+
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -270,8 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 	.init_rec_from_cur	= xfs_inobt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_inobt_init_ptr_from_cur,
 	.key_diff		= xfs_inobt_key_diff,
-	.read_verify		= xfs_inobt_read_verify,
-	.write_verify		= xfs_inobt_write_verify,
+	.buf_ops		= &xfs_inobt_buf_ops,
 #ifdef DEBUG
 	.keys_inorder		= xfs_inobt_keys_inorder,
 	.recs_inorder		= xfs_inobt_recs_inorder,
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c4769..25c0239a8eab 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+
 #endif	/* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dfcbe73f1db4..66282dcb821b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -420,23 +420,27 @@ xfs_inode_buf_verify(
 	xfs_inobp_check(mp, bp);
 }
 
-void
-xfs_inode_buf_write_verify(
+
+static void
+xfs_inode_buf_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_inode_buf_verify(bp);
 }
 
-void
-xfs_inode_buf_read_verify(
+static void
+xfs_inode_buf_write_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_inode_buf_verify(bp);
-	bp->b_pre_io = xfs_inode_buf_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+	.verify_read = xfs_inode_buf_read_verify,
+	.verify_write = xfs_inode_buf_write_verify,
+};
+
+
 /*
  * This routine is called to map an inode to the buffer containing the on-disk
  * version of the inode.  It returns a pointer to the buffer containing the
@@ -462,7 +466,7 @@ xfs_imap_to_bp(
 	buf_flags |= XBF_UNMAPPED;
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 				   (int)imap->im_len, buf_flags, &bp,
-				   xfs_inode_buf_read_verify);
+				   &xfs_inode_buf_ops);
 	if (error) {
 		if (error == EAGAIN) {
 			ASSERT(buf_flags & XBF_TRYLOCK);
@@ -1792,7 +1796,7 @@ xfs_ifree_cluster(
 		 * want it to fail. We can acheive this by adding a write
 		 * verifier to the buffer.
 		 */
-		 bp->b_pre_io = xfs_inode_buf_write_verify;
+		 bp->b_ops = &xfs_inode_buf_ops;
 
 		/*
 		 * Walk the inodes already attached to the buffer and mark them
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 482214d120a7..22baf6ea4fac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -554,8 +554,6 @@ int		xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
 			       struct xfs_buf **, uint, uint);
 int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
 			  struct xfs_inode *, uint);
-void		xfs_inode_buf_read_verify(struct xfs_buf *);
-void		xfs_inode_buf_write_verify(struct xfs_buf *);
 void		xfs_dinode_to_disk(struct xfs_dinode *,
 				   struct xfs_icdinode *);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
@@ -600,5 +598,6 @@ void		xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 extern struct kmem_zone	*xfs_ifork_zone;
 extern struct kmem_zone	*xfs_inode_zone;
 extern struct kmem_zone	*xfs_ili_zone;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
 
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 7f86fdaab7ae..2ea7d402188d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -397,7 +397,7 @@ xfs_bulkstat(
 							& ~r.ir_free)
 						xfs_btree_reada_bufs(mp, agno,
 							agbno, nbcluster,
-							xfs_inode_buf_read_verify);
+							&xfs_inode_buf_ops);
 				}
 				irbp->ir_startino = r.ir_startino;
 				irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 924a4bc3d49a..931e8e23f192 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3699,7 +3699,7 @@ xlog_do_recover(
 	ASSERT(!(XFS_BUF_ISWRITE(bp)));
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
-	bp->b_iodone = xfs_sb_read_verify;
+	bp->b_ops = &xfs_sb_buf_ops;
 	xfsbdstrat(log->l_mp, bp);
 	error = xfs_buf_iowait(bp);
 	if (error) {
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 152a7fc843f9..da508463ff10 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -631,21 +631,11 @@ xfs_sb_verify(
 		xfs_buf_ioerror(bp, error);
 }
 
-void
-xfs_sb_write_verify(
-	struct xfs_buf	*bp)
-{
-	xfs_sb_verify(bp);
-}
-
-void
+static void
 xfs_sb_read_verify(
 	struct xfs_buf	*bp)
 {
 	xfs_sb_verify(bp);
-	bp->b_pre_io = xfs_sb_write_verify;
-	bp->b_iodone = NULL;
-	xfs_buf_ioend(bp, 0);
 }
 
 /*
@@ -654,7 +644,7 @@ xfs_sb_read_verify(
  * If we find an XFS superblock, the run a normal, noisy mount because we are
  * really going to mount it and want to know about errors.
  */
-void
+static void
 xfs_sb_quiet_read_verify(
 	struct xfs_buf	*bp)
 {
@@ -671,6 +661,23 @@ xfs_sb_quiet_read_verify(
 	xfs_buf_ioerror(bp, EFSCORRUPTED);
 }
 
+static void
+xfs_sb_write_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_sb_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+	.verify_read = xfs_sb_read_verify,
+	.verify_write = xfs_sb_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+	.verify_read = xfs_sb_quiet_read_verify,
+	.verify_write = xfs_sb_write_verify,
+};
+
 /*
  * xfs_readsb
  *
@@ -697,8 +704,8 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 reread:
 	bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
 				   BTOBB(sector_size), 0,
-				   loud ? xfs_sb_read_verify
-				        : xfs_sb_quiet_read_verify);
+				   loud ? &xfs_sb_buf_ops
+				        : &xfs_sb_quiet_buf_ops);
 	if (!bp) {
 		if (loud)
 			xfs_warn(mp, "SB buffer read failed");
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 29c1b3ac920e..bab8314507e4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -385,12 +385,12 @@ extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
 
 #endif	/* __KERNEL__ */
 
-extern void	xfs_sb_read_verify(struct xfs_buf *);
-extern void	xfs_sb_write_verify(struct xfs_buf *bp);
 extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
 extern int	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
 					xfs_agnumber_t *);
 extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+
 #endif	/* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index bd40ae9624e5..e6a0af0ba007 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -893,7 +893,7 @@ xfs_qm_dqiter_bufs(
 		error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
 			      XFS_FSB_TO_DADDR(mp, bno),
 			      mp->m_quotainfo->qi_dqchunklen, 0, &bp,
-			      xfs_dquot_buf_read_verify);
+			      &xfs_dquot_buf_ops);
 		if (error)
 			break;
 
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f02d40296506..c6c0601abd7a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -474,7 +474,7 @@ int		xfs_trans_read_buf_map(struct xfs_mount *mp,
 				       struct xfs_buf_map *map, int nmaps,
 				       xfs_buf_flags_t flags,
 				       struct xfs_buf **bpp,
-				       xfs_buf_iodone_t verify);
+				       const struct xfs_buf_ops *ops);
 
 static inline int
 xfs_trans_read_buf(
@@ -485,11 +485,11 @@ xfs_trans_read_buf(
 	int			numblks,
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp,
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 	return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
-				      flags, bpp, verify);
+				      flags, bpp, ops);
 }
 
 struct xfs_buf	*xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 977628207b45..4fc17d479d42 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -258,7 +258,7 @@ xfs_trans_read_buf_map(
 	int			nmaps,
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp,
-	xfs_buf_iodone_t	verify)
+	const struct xfs_buf_ops *ops)
 {
 	xfs_buf_t		*bp;
 	xfs_buf_log_item_t	*bip;
@@ -266,7 +266,7 @@ xfs_trans_read_buf_map(
 
 	*bpp = NULL;
 	if (!tp) {
-		bp = xfs_buf_read_map(target, map, nmaps, flags, verify);
+		bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
 		if (!bp)
 			return (flags & XBF_TRYLOCK) ?
 					EAGAIN : XFS_ERROR(ENOMEM);
@@ -315,7 +315,7 @@ xfs_trans_read_buf_map(
 			ASSERT(!XFS_BUF_ISASYNC(bp));
 			ASSERT(bp->b_iodone == NULL);
 			XFS_BUF_READ(bp);
-			bp->b_iodone = verify;
+			bp->b_ops = ops;
 			xfsbdstrat(tp->t_mountp, bp);
 			error = xfs_buf_iowait(bp);
 			if (error) {
@@ -352,7 +352,7 @@ xfs_trans_read_buf_map(
 		return 0;
 	}
 
-	bp = xfs_buf_read_map(target, map, nmaps, flags, verify);
+	bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
 	if (bp == NULL) {
 		*bpp = NULL;
 		return (flags & XBF_TRYLOCK) ?
-- 
cgit v1.2.3


From bc02e8693d875c2a9b0037cfd37fe0b726d26403 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 16 Nov 2012 09:20:37 +1100
Subject: xfs: add CRC infrastructure

 - add a mount feature bit for CRC enabled filesystems
 - add some helpers for generating and verifying the CRCs
 - add a copy_uuid helper

The checksumming helpers are loosely based on similar ones in sctp,
all other bits come from Dave Chinner.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/Kconfig     |  1 +
 fs/xfs/uuid.h      |  6 ++++++
 fs/xfs/xfs_cksum.h | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_linux.h |  1 +
 fs/xfs/xfs_sb.h    |  7 ++++++
 5 files changed, 78 insertions(+)
 create mode 100644 fs/xfs/xfs_cksum.h

diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d4..5a7ffe54f5d5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
 	tristate "XFS filesystem support"
 	depends on BLOCK
 	select EXPORTFS
+	select LIBCRC32C
 	help
 	  XFS is a high performance journaling filesystem which originated
 	  on the SGI IRIX platform.  It is completely multi-threaded, can
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..104db0f3bed6 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
 extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
 extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
 
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+	memcpy(dst, src, sizeof(uuid_t));
+}
+
 #endif	/* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED	(~(__uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it.  The offset of the 32bit crc fields is passed as the
+ * cksum_offset parameter.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__uint32_t zero = 0;
+	__uint32_t crc;
+
+	/* Calculate CRC up to the checksum. */
+	crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+	/* Skip checksum field */
+	crc = crc32c(crc, &zero, sizeof(__u32));
+
+	/* Calculate the rest of the CRC. */
+	return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+		      length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+	return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+	*(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+	__uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+	return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
+
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 0a134ca5211c..fe7e4df85a7b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/crc32c.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d325..a05b45175fb0 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_ATTR2BIT	0x00000008	/* Inline attr rework */
 #define XFS_SB_VERSION2_PARENTBIT	0x00000010	/* parent pointers */
 #define XFS_SB_VERSION2_PROJID32BIT	0x00000080	/* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT		0x00000100	/* metadata CRCs */
 
 #define	XFS_SB_VERSION2_OKREALFBITS	\
 	(XFS_SB_VERSION2_LAZYSBCOUNTBIT	| \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
 		(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
 }
 
+static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+{
+	return (xfs_sb_version_hasmorebits(sbp) &&
+		(sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
+}
+
 /*
  * end of superblock version macros
  */
-- 
cgit v1.2.3


From 0e446be44806240c779666591bb9e8cb0e86a50d Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 12 Nov 2012 22:54:24 +1100
Subject: xfs: add CRC checks to the log

Implement CRCs for the log buffers.  We re-use a field in
struct xlog_rec_header that was used for a weak checksum of the
log buffer payload in debug builds before.

The new checksumming uses the crc32c checksum we will use elsewhere
in XFS, and also protects the record header and addition cycle data.

Due to this there are some interesting changes in xlog_sync, as we
need to do the cycle wrapping for the split buffer case much earlier,
as we would touch the buffer after generating the checksum otherwise.

The CRC calculation is always enabled, even for non-CRC filesystems,
as adding this CRC does not change the log format. On non-CRC
filesystems, only issue an alert if a CRC mismatch is found and
allow recovery to continue - this will act as an indicator that
log recovery problems are a result of log corruption. On CRC enabled
filesystems, however, log recovery will fail.

Note that existing debug kernels will write a simple checksum value
to the log, so the first time this is run on a filesystem taht was
last used on a debug kernel it will through CRC mismatch warning
errors. These can be ignored.

Initially based on a patch from Dave Chinner, then modified
significantly by Christoph Hellwig.  Modified again by Dave Chinner
to get to this version.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c         | 132 ++++++++++++++++++++++++++++++++++++++---------
 fs/xfs/xfs_log_priv.h    |  11 ++--
 fs/xfs/xfs_log_recover.c | 132 ++++++++++++++++++++++-------------------------
 3 files changed, 176 insertions(+), 99 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 1d6d2ee08495..c6d6e136ba77 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -35,6 +35,7 @@
 #include "xfs_inode.h"
 #include "xfs_trace.h"
 #include "xfs_fsops.h"
+#include "xfs_cksum.h"
 
 kmem_zone_t	*xfs_log_ticket_zone;
 
@@ -1489,6 +1490,84 @@ xlog_grant_push_ail(
 		xfs_ail_push(log->l_ailp, threshold_lsn);
 }
 
+/*
+ * Stamp cycle number in every block
+ */
+STATIC void
+xlog_pack_data(
+	struct xlog		*log,
+	struct xlog_in_core	*iclog,
+	int			roundoff)
+{
+	int			i, j, k;
+	int			size = iclog->ic_offset + roundoff;
+	__be32			cycle_lsn;
+	xfs_caddr_t		dp;
+
+	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+	dp = iclog->ic_datap;
+	for (i = 0; i < BTOBB(size); i++) {
+		if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
+			break;
+		iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+		*(__be32 *)dp = cycle_lsn;
+		dp += BBSIZE;
+	}
+
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+		for ( ; i < BTOBB(size); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+			*(__be32 *)dp = cycle_lsn;
+			dp += BBSIZE;
+		}
+
+		for (i = 1; i < log->l_iclog_heads; i++)
+			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+	}
+}
+
+/*
+ * Calculate the checksum for a log buffer.
+ *
+ * This is a little more complicated than it should be because the various
+ * headers and the actual data are non-contiguous.
+ */
+__be32
+xlog_cksum(
+	struct xlog		*log,
+	struct xlog_rec_header	*rhead,
+	char			*dp,
+	int			size)
+{
+	__uint32_t		crc;
+
+	/* first generate the crc for the record header ... */
+	crc = xfs_start_cksum((char *)rhead,
+			      sizeof(struct xlog_rec_header),
+			      offsetof(struct xlog_rec_header, h_crc));
+
+	/* ... then for additional cycle data for v2 logs ... */
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
+		int		i;
+
+		for (i = 1; i < log->l_iclog_heads; i++) {
+			crc = crc32c(crc, &xhdr[i].hic_xheader,
+				     sizeof(struct xlog_rec_ext_header));
+		}
+	}
+
+	/* ... and finally for the payload */
+	crc = crc32c(crc, dp, size);
+
+	return xfs_end_cksum(crc);
+}
+
 /*
  * The bdstrat callback function for log bufs. This gives us a central
  * place to trap bufs in case we get hit by a log I/O error and need to
@@ -1549,7 +1628,6 @@ xlog_sync(
 	struct xlog		*log,
 	struct xlog_in_core	*iclog)
 {
-	xfs_caddr_t	dptr;		/* pointer to byte sized element */
 	xfs_buf_t	*bp;
 	int		i;
 	uint		count;		/* byte count of bwrite */
@@ -1558,6 +1636,7 @@ xlog_sync(
 	int		split = 0;	/* split write into two regions */
 	int		error;
 	int		v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
+	int		size;
 
 	XFS_STATS_INC(xs_log_writes);
 	ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1588,13 +1667,10 @@ xlog_sync(
 	xlog_pack_data(log, iclog, roundoff); 
 
 	/* real byte length */
-	if (v2) {
-		iclog->ic_header.h_len =
-			cpu_to_be32(iclog->ic_offset + roundoff);
-	} else {
-		iclog->ic_header.h_len =
-			cpu_to_be32(iclog->ic_offset);
-	}
+	size = iclog->ic_offset;
+	if (v2)
+		size += roundoff;
+	iclog->ic_header.h_len = cpu_to_be32(size);
 
 	bp = iclog->ic_bp;
 	XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1603,12 +1679,36 @@ xlog_sync(
 
 	/* Do we need to split this write into 2 parts? */
 	if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+		char		*dptr;
+
 		split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
 		count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
-		iclog->ic_bwritecnt = 2;	/* split into 2 writes */
+		iclog->ic_bwritecnt = 2;
+
+		/*
+		 * Bump the cycle numbers at the start of each block in the
+		 * part of the iclog that ends up in the buffer that gets
+		 * written to the start of the log.
+		 *
+		 * Watch out for the header magic number case, though.
+		 */
+		dptr = (char *)&iclog->ic_header + count;
+		for (i = 0; i < split; i += BBSIZE) {
+			__uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
+			if (++cycle == XLOG_HEADER_MAGIC_NUM)
+				cycle++;
+			*(__be32 *)dptr = cpu_to_be32(cycle);
+
+			dptr += BBSIZE;
+		}
 	} else {
 		iclog->ic_bwritecnt = 1;
 	}
+
+	/* calculcate the checksum */
+	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
+					    iclog->ic_datap, size);
+
 	bp->b_io_length = BTOBB(count);
 	bp->b_fspriv = iclog;
 	XFS_BUF_ZEROFLAGS(bp);
@@ -1662,19 +1762,6 @@ xlog_sync(
 		bp->b_flags |= XBF_SYNCIO;
 		if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
 			bp->b_flags |= XBF_FUA;
-		dptr = bp->b_addr;
-		/*
-		 * Bump the cycle numbers at the start of each block
-		 * since this part of the buffer is at the start of
-		 * a new cycle.  Watch out for the header magic number
-		 * case, though.
-		 */
-		for (i = 0; i < split; i += BBSIZE) {
-			be32_add_cpu((__be32 *)dptr, 1);
-			if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
-				be32_add_cpu((__be32 *)dptr, 1);
-			dptr += BBSIZE;
-		}
 
 		ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
 		ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1691,7 +1778,6 @@ xlog_sync(
 	return 0;
 }	/* xlog_sync */
 
-
 /*
  * Deallocate a log structure
  */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9a4e0e5ec322..dc3498bf17c2 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
 /*
  * Flags for log structure
  */
-#define XLOG_CHKSUM_MISMATCH	0x1	/* used only during recovery */
 #define XLOG_ACTIVE_RECOVERY	0x2	/* in the middle of recovery */
 #define	XLOG_RECOVERY_NEEDED	0x4	/* log was recovered */
 #define XLOG_IO_ERROR		0x8	/* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
 	__be32	  h_len;	/* len in bytes; should be 64-bit aligned: 4 */
 	__be64	  h_lsn;	/* lsn of this LR			:  8 */
 	__be64	  h_tail_lsn;	/* lsn of 1st LR w/ buffers not committed: 8 */
-	__be32	  h_chksum;	/* may not be used; non-zero if used	:  4 */
+	__le32	  h_crc;	/* crc of log record                    :  4 */
 	__be32	  h_prev_block; /* block number to previous LR		:  4 */
 	__be32	  h_num_logops;	/* number of log operations in this LR	:  4 */
 	__be32	  h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -555,11 +554,9 @@ xlog_recover(
 extern int
 xlog_recover_finish(
 	struct xlog		*log);
-extern void
-xlog_pack_data(
-	struct xlog		*log,
-	struct xlog_in_core	*iclog,
-	int);
+
+extern __be32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+			    char *dp, int size);
 
 extern kmem_zone_t *xfs_log_ticket_zone;
 struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 931e8e23f192..9c3651c9e75b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,6 +41,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
+#include "xfs_cksum.h"
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 
@@ -3216,80 +3217,58 @@ xlog_recover_process_iunlinks(
 	mp->m_dmevmask = mp_dmevmask;
 }
 
-
-#ifdef DEBUG
-STATIC void
-xlog_pack_data_checksum(
-	struct xlog		*log,
-	struct xlog_in_core	*iclog,
-	int			size)
-{
-	int		i;
-	__be32		*up;
-	uint		chksum = 0;
-
-	up = (__be32 *)iclog->ic_datap;
-	/* divide length by 4 to get # words */
-	for (i = 0; i < (size >> 2); i++) {
-		chksum ^= be32_to_cpu(*up);
-		up++;
-	}
-	iclog->ic_header.h_chksum = cpu_to_be32(chksum);
-}
-#else
-#define xlog_pack_data_checksum(log, iclog, size)
-#endif
-
 /*
- * Stamp cycle number in every block
+ * Upack the log buffer data and crc check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
+ *
+ * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
+ * corruption failure
  */
-void
-xlog_pack_data(
-	struct xlog		*log,
-	struct xlog_in_core	*iclog,
-	int			roundoff)
+STATIC int
+xlog_unpack_data_crc(
+	struct xlog_rec_header	*rhead,
+	xfs_caddr_t		dp,
+	struct xlog		*log)
 {
-	int			i, j, k;
-	int			size = iclog->ic_offset + roundoff;
-	__be32			cycle_lsn;
-	xfs_caddr_t		dp;
-
-	xlog_pack_data_checksum(log, iclog, size);
-
-	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
-
-	dp = iclog->ic_datap;
-	for (i = 0; i < BTOBB(size) &&
-		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
-		iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
-		*(__be32 *)dp = cycle_lsn;
-		dp += BBSIZE;
-	}
-
-	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-		xlog_in_core_2_t *xhdr = iclog->ic_data;
-
-		for ( ; i < BTOBB(size); i++) {
-			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
-			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
-			xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
-			*(__be32 *)dp = cycle_lsn;
-			dp += BBSIZE;
+	__be32			crc;
+
+	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+	if (crc != rhead->h_crc) {
+		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+			xfs_alert(log->l_mp,
+		"log record CRC mismatch: found 0x%x, expected 0x%x.\n",
+					be32_to_cpu(rhead->h_crc),
+					be32_to_cpu(crc));
+			xfs_hex_dump(dp, 32);
 		}
 
-		for (i = 1; i < log->l_iclog_heads; i++) {
-			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
-		}
+		/*
+		 * If we've detected a log record corruption, then we can't
+		 * recover past this point. Abort recovery if we are enforcing
+		 * CRC protection by punting an error back up the stack.
+		 */
+		if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+			return EFSCORRUPTED;
 	}
+
+	return 0;
 }
 
-STATIC void
+STATIC int
 xlog_unpack_data(
 	struct xlog_rec_header	*rhead,
 	xfs_caddr_t		dp,
 	struct xlog		*log)
 {
 	int			i, j, k;
+	int			error;
+
+	error = xlog_unpack_data_crc(rhead, dp, log);
+	if (error)
+		return error;
 
 	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
 		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3306,6 +3285,8 @@ xlog_unpack_data(
 			dp += BBSIZE;
 		}
 	}
+
+	return 0;
 }
 
 STATIC int
@@ -3437,9 +3418,13 @@ xlog_do_recovery_pass(
 			if (error)
 				goto bread_err2;
 
-			xlog_unpack_data(rhead, offset, log);
-			if ((error = xlog_recover_process_data(log,
-						rhash, rhead, offset, pass)))
+			error = xlog_unpack_data(rhead, offset, log);
+			if (error)
+				goto bread_err2;
+
+			error = xlog_recover_process_data(log,
+						rhash, rhead, offset, pass);
+			if (error)
 				goto bread_err2;
 			blk_no += bblks + hblks;
 		}
@@ -3549,9 +3534,14 @@ xlog_do_recovery_pass(
 				if (error)
 					goto bread_err2;
 			}
-			xlog_unpack_data(rhead, offset, log);
-			if ((error = xlog_recover_process_data(log, rhash,
-							rhead, offset, pass)))
+
+			error = xlog_unpack_data(rhead, offset, log);
+			if (error)
+				goto bread_err2;
+
+			error = xlog_recover_process_data(log, rhash,
+							rhead, offset, pass);
+			if (error)
 				goto bread_err2;
 			blk_no += bblks;
 		}
@@ -3576,9 +3566,13 @@ xlog_do_recovery_pass(
 			if (error)
 				goto bread_err2;
 
-			xlog_unpack_data(rhead, offset, log);
-			if ((error = xlog_recover_process_data(log, rhash,
-							rhead, offset, pass)))
+			error = xlog_unpack_data(rhead, offset, log);
+			if (error)
+				goto bread_err2;
+
+			error = xlog_recover_process_data(log, rhash,
+							rhead, offset, pass);
+			if (error)
 				goto bread_err2;
 			blk_no += bblks + hblks;
 		}
-- 
cgit v1.2.3


From 0acba3cd018c51e529a6f008c092629ad2e9e0a4 Mon Sep 17 00:00:00 2001
From: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Date: Mon, 26 Nov 2012 15:25:05 +0900
Subject: xfs: Remove the description of nodelaylog mount option from xfs.txt

nodelaylog mount option is removed by commit 93b8a585. But there still be
the description about it in the xfs document. This patch removes it.

Signed-off-by: Satoru Takeuchi <takeuchi_satoru@jp.fujitsu.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 Documentation/filesystems/xfs.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt
index 17187750270a..3e4b3dd1e046 100644
--- a/Documentation/filesystems/xfs.txt
+++ b/Documentation/filesystems/xfs.txt
@@ -43,7 +43,7 @@ When mounting an XFS filesystem, the following options are accepted.
 	Issue command to let the block device reclaim space freed by the
 	filesystem.  This is useful for SSD devices, thinly provisioned
 	LUNs and virtual machine images, but may have a performance
-	impact.  This option is incompatible with the nodelaylog option.
+	impact.
 
   dmapi
 	Enable the DMAPI (Data Management API) event callouts.
-- 
cgit v1.2.3


From 7c4cebe8e02dd0b0e655605442bbe9268db9ed4f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 23 Nov 2012 14:24:23 +1100
Subject: xfs: inode allocation should use unmapped buffers.

Inode buffers do not need to be mapped as inodes are read or written
directly from/to the pages underlying the buffer. This fixes a
regression introduced by commit 611c994 ("xfs: make XBF_MAPPED the
default behaviour").

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_ialloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 2d6495eaaa34..a815412eab80 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
 		 */
 		d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
 		fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					 mp->m_bsize * blks_per_cluster, 0);
+					 mp->m_bsize * blks_per_cluster,
+					 XBF_UNMAPPED);
 		if (!fbuf)
 			return ENOMEM;
 		/*
-- 
cgit v1.2.3


From ef9d873344ff9f5084eacb9f3735982314dfda9e Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Thu, 29 Nov 2012 15:26:33 +1100
Subject: xfs: byte range granularity for XFS_IOC_ZERO_RANGE

XFS_IOC_ZERO_RANGE simply does not work properly for non page cache
aligned ranges. Neither test 242 or 290 exercise this correctly, so
the behaviour is completely busted even though the tests pass.

Fix it to support full byte range granularity as was originally
intended for this ioctl.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_file.c     |  2 +-
 fs/xfs/xfs_vnodeops.c | 96 ++++++++++++++++++++++++++++++++++++++++-----------
 fs/xfs/xfs_vnodeops.h |  1 +
 3 files changed, 77 insertions(+), 22 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 400b187595bb..67284edb84d7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -86,7 +86,7 @@ xfs_rw_ilock_demote(
  *	valid before the operation, it will be read from disk before
  *	being partially zeroed.
  */
-STATIC int
+int
 xfs_iozero(
 	struct xfs_inode	*ip,	/* inode			*/
 	loff_t			pos,	/* offset in file		*/
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 26880793feca..d95f565a390e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
 	return error;
 }
 
+
+STATIC int
+xfs_zero_file_space(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len,
+	int			attr_flags)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	uint			granularity;
+	xfs_off_t		start_boundary;
+	xfs_off_t		end_boundary;
+	int			error;
+
+	granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+	/*
+	 * Round the range of extents we are going to convert inwards.  If the
+	 * offset is aligned, then it doesn't get changed so we zero from the
+	 * start of the block offset points to.
+	 */
+	start_boundary = round_up(offset, granularity);
+	end_boundary = round_down(offset + len, granularity);
+
+	ASSERT(start_boundary >= offset);
+	ASSERT(end_boundary <= offset + len);
+
+	if (!(attr_flags & XFS_ATTR_NOLOCK))
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	if (start_boundary < end_boundary - 1) {
+		/* punch out the page cache over the conversion range */
+		truncate_pagecache_range(VFS_I(ip), start_boundary,
+					 end_boundary - 1);
+		/* convert the blocks */
+		error = xfs_alloc_file_space(ip, start_boundary,
+					end_boundary - start_boundary - 1,
+					XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
+					attr_flags);
+		if (error)
+			goto out_unlock;
+
+		/* We've handled the interior of the range, now for the edges */
+		if (start_boundary != offset)
+			error = xfs_iozero(ip, offset, start_boundary - offset);
+		if (error)
+			goto out_unlock;
+
+		if (end_boundary != offset + len)
+			error = xfs_iozero(ip, end_boundary,
+					   offset + len - end_boundary);
+
+	} else {
+		/*
+		 * It's either a sub-granularity range or the range spanned lies
+		 * partially across two adjacent blocks.
+		 */
+		error = xfs_iozero(ip, offset, len);
+	}
+
+out_unlock:
+	if (!(attr_flags & XFS_ATTR_NOLOCK))
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+
+}
+
 /*
  * xfs_change_file_space()
  *      This routine allocates or frees disk space for the given file.
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
 	xfs_fsize_t	fsize;
 	int		setprealloc;
 	xfs_off_t	startoffset;
-	xfs_off_t	end;
 	xfs_trans_t	*tp;
 	struct iattr	iattr;
-	int		prealloc_type;
 
 	if (!S_ISREG(ip->i_d.di_mode))
 		return XFS_ERROR(EINVAL);
@@ -2172,31 +2237,20 @@ xfs_change_file_space(
 	startoffset = bf->l_start;
 	fsize = XFS_ISIZE(ip);
 
-	/*
-	 * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
-	 * file space.
-	 * These calls do NOT zero the data space allocated to the file,
-	 * nor do they change the file size.
-	 *
-	 * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
-	 * space.
-	 * These calls cause the new file data to be zeroed and the file
-	 * size to be changed.
-	 */
 	setprealloc = clrprealloc = 0;
-	prealloc_type = XFS_BMAPI_PREALLOC;
-
 	switch (cmd) {
 	case XFS_IOC_ZERO_RANGE:
-		prealloc_type |= XFS_BMAPI_CONVERT;
-		end = round_down(startoffset + bf->l_len, PAGE_SIZE) - 1;
-		if (startoffset <= end)
-			truncate_pagecache_range(VFS_I(ip), startoffset, end);
-		/* FALLTHRU */
+		error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+						attr_flags);
+		if (error)
+			return error;
+		setprealloc = 1;
+		break;
+
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
 		error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-						prealloc_type, attr_flags);
+						XFS_BMAPI_PREALLOC, attr_flags);
 		if (error)
 			return error;
 		setprealloc = 1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 91a03fa3814f..5163022d9808 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -49,6 +49,7 @@ int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
 		int flags, struct attrlist_cursor_kern *cursor);
 
+int xfs_iozero(struct xfs_inode *, loff_t, size_t);
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
 
-- 
cgit v1.2.3


From 437a255aa23766666aec78af63be4c253faa8d57 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 28 Nov 2012 13:01:00 +1100
Subject: xfs: fix direct IO nested transaction deadlock.

The direct IO path can do a nested transaction reservation when
writing past the EOF. The first transaction is the append
transaction for setting the filesize at IO completion, but we can
also need a transaction for allocation of blocks. If the log is low
on space due to reservations and small log, the append transaction
can be granted after wating for space as the only active transaction
in the system. This then attempts a reservation for an allocation,
which there isn't space in the log for, and the reservation sleeps.
The result is that there is nothing left in the system to wake up
all the processes waiting for log space to come free.

The stack trace that shows this deadlock is relatively innocuous:

 xlog_grant_head_wait
 xlog_grant_head_check
 xfs_log_reserve
 xfs_trans_reserve
 xfs_iomap_write_direct
 __xfs_get_blocks
 xfs_get_blocks_direct
 do_blockdev_direct_IO
 __blockdev_direct_IO
 xfs_vm_direct_IO
 generic_file_direct_write
 xfs_file_dio_aio_writ
 xfs_file_aio_write
 do_sync_write
 vfs_write

This was discovered on a filesystem with a log of only 10MB, and a
log stripe unit of 256k whih increased the base reservations by
512k. Hence a allocation transaction requires 1.2MB of log space to
be available instead of only 260k, and so greatly increased the
chance that there wouldn't be enough log space available for the
nested transaction to succeed. The key to reproducing it is this
mkfs command:

mkfs.xfs -f -d agcount=16,su=256k,sw=12 -l su=256k,size=2560b $SCRATCH_DEV

The test case was a 1000 fsstress processes running with random
freeze and unfreezes every few seconds. Thanks to Eryu Guan
(eguan@redhat.com) for writing the test that found this on a system
with a somewhat unique default configuration....

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Andrew Dahl <adahl@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_aops.c | 81 ++++++++++++++++++++-----------------------------------
 fs/xfs/xfs_log.c  |  3 ++-
 2 files changed, 31 insertions(+), 53 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 71361da1f77c..4111a40ebe1a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
 	ioend->io_append_trans = tp;
 
 	/*
-	 * We will pass freeze protection with a transaction.  So tell lockdep
+	 * We may pass freeze protection with a transaction.  So tell lockdep
 	 * we released it.
 	 */
 	rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
 	xfs_fsize_t		isize;
 
 	/*
-	 * The transaction was allocated in the I/O submission thread,
-	 * thus we need to mark ourselves as beeing in a transaction
-	 * manually.
+	 * The transaction may have been allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as beeing in a transaction manually.
+	 * Similarly for freeze protection.
 	 */
 	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			   0, 1, _THIS_IP_);
 
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
 
 		if (ioend->io_type == XFS_IO_UNWRITTEN)
 			queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-		else if (ioend->io_append_trans)
+		else if (ioend->io_append_trans ||
+			 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
 			queue_work(mp->m_data_workqueue, &ioend->io_work);
 		else
 			xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
 	struct xfs_inode *ip = XFS_I(ioend->io_inode);
 	int		error = 0;
 
-	if (ioend->io_append_trans) {
-		/*
-		 * We've got freeze protection passed with the transaction.
-		 * Tell lockdep about it.
-		 */
-		rwsem_acquire_read(
-			&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			0, 1, _THIS_IP_);
-	}
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 		ioend->io_error = -EIO;
 		goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
 	 * range to normal written extens after the data I/O has finished.
 	 */
 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
+		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+						  ioend->io_size);
+	} else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
 		/*
-		 * For buffered I/O we never preallocate a transaction when
-		 * doing the unwritten extent conversion, but for direct I/O
-		 * we do not know if we are converting an unwritten extent
-		 * or not at the point where we preallocate the transaction.
+		 * For direct I/O we do not know if we need to allocate blocks
+		 * or not so we can't preallocate an append transaction as that
+		 * results in nested reservations and log space deadlocks. Hence
+		 * allocate the transaction here. While this is sub-optimal and
+		 * can block IO completion for some time, we're stuck with doing
+		 * it this way until we can pass the ioend to the direct IO
+		 * allocation callbacks and avoid nesting that way.
 		 */
-		if (ioend->io_append_trans) {
-			ASSERT(ioend->io_isdirect);
-
-			current_set_flags_nested(
-				&ioend->io_append_trans->t_pflags, PF_FSTRANS);
-			xfs_trans_cancel(ioend->io_append_trans, 0);
-		}
-
-		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
-						 ioend->io_size);
-		if (error) {
-			ioend->io_error = -error;
+		error = xfs_setfilesize_trans_alloc(ioend);
+		if (error)
 			goto done;
-		}
+		error = xfs_setfilesize(ioend);
 	} else if (ioend->io_append_trans) {
 		error = xfs_setfilesize(ioend);
-		if (error)
-			ioend->io_error = -error;
 	} else {
 		ASSERT(!xfs_ioend_is_append(ioend));
 	}
 
 done:
+	if (error)
+		ioend->io_error = -error;
 	xfs_destroy_ioend(ioend);
 }
 
@@ -1432,25 +1422,21 @@ xfs_vm_direct_IO(
 		size_t size = iov_length(iov, nr_segs);
 
 		/*
-		 * We need to preallocate a transaction for a size update
-		 * here.  In the case that this write both updates the size
-		 * and converts at least on unwritten extent we will cancel
-		 * the still clean transaction after the I/O has finished.
+		 * We cannot preallocate a size update transaction here as we
+		 * don't know whether allocation is necessary or not. Hence we
+		 * can only tell IO completion that one is necessary if we are
+		 * not doing unwritten extent conversion.
 		 */
 		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
-		if (offset + size > XFS_I(inode)->i_d.di_size) {
-			ret = xfs_setfilesize_trans_alloc(ioend);
-			if (ret)
-				goto out_destroy_ioend;
+		if (offset + size > XFS_I(inode)->i_d.di_size)
 			ioend->io_isdirect = 1;
-		}
 
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
 					    xfs_get_blocks_direct,
 					    xfs_end_io_direct_write, NULL, 0);
 		if (ret != -EIOCBQUEUED && iocb->private)
-			goto out_trans_cancel;
+			goto out_destroy_ioend;
 	} else {
 		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
 					    offset, nr_segs,
@@ -1460,15 +1446,6 @@ xfs_vm_direct_IO(
 
 	return ret;
 
-out_trans_cancel:
-	if (ioend->io_append_trans) {
-		current_set_flags_nested(&ioend->io_append_trans->t_pflags,
-					 PF_FSTRANS);
-		rwsem_acquire_read(
-			&inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			0, 1, _THIS_IP_);
-		xfs_trans_cancel(ioend->io_append_trans, 0);
-	}
 out_destroy_ioend:
 	xfs_destroy_ioend(ioend);
 	return ret;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c6d6e136ba77..c49e2c12dba4 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -460,7 +460,8 @@ xfs_log_reserve(
 	tic->t_trans_type = t_type;
 	*ticp = tic;
 
-	xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+	xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+					    : tic->t_unit_res);
 
 	trace_xfs_log_reserve(log, tic);
 
-- 
cgit v1.2.3


From b870553cdecb26d5291af09602352b763e323df2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 28 Nov 2012 13:01:02 +1100
Subject: xfs: fix stray dquot unlock when reclaiming dquots

When we fail to get a dquot lock during reclaim, we jump to an error
handler that unlocks the dquot. This is wrong as we didn't lock the
dquot, and unlocking it means who-ever is holding the lock has had
it silently taken away, and hence it results in a lock imbalance.

Found by inspection while modifying the code for the numa-lru
patchset. This fixes a random hang I've been seeing on xfstest 232
for the past several months.

cc: <stable@vger.kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_qm.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index e6a0af0ba007..60eff4763156 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1456,7 +1456,7 @@ xfs_qm_dqreclaim_one(
 	int			error;
 
 	if (!xfs_dqlock_nowait(dqp))
-		goto out_busy;
+		goto out_move_tail;
 
 	/*
 	 * This dquot has acquired a reference in the meantime remove it from
@@ -1479,7 +1479,7 @@ xfs_qm_dqreclaim_one(
 	 * getting flushed to disk, we don't want to reclaim it.
 	 */
 	if (!xfs_dqflock_nowait(dqp))
-		goto out_busy;
+		goto out_unlock_move_tail;
 
 	if (XFS_DQ_IS_DIRTY(dqp)) {
 		struct xfs_buf	*bp = NULL;
@@ -1490,7 +1490,7 @@ xfs_qm_dqreclaim_one(
 		if (error) {
 			xfs_warn(mp, "%s: dquot %p flush failed",
 				 __func__, dqp);
-			goto out_busy;
+			goto out_unlock_move_tail;
 		}
 
 		xfs_buf_delwri_queue(bp, buffer_list);
@@ -1499,7 +1499,7 @@ xfs_qm_dqreclaim_one(
 		 * Give the dquot another try on the freelist, as the
 		 * flushing will take some time.
 		 */
-		goto out_busy;
+		goto out_unlock_move_tail;
 	}
 	xfs_dqfunlock(dqp);
 
@@ -1518,14 +1518,13 @@ xfs_qm_dqreclaim_one(
 	XFS_STATS_INC(xs_qm_dqreclaims);
 	return;
 
-out_busy:
-	xfs_dqunlock(dqp);
-
 	/*
 	 * Move the dquot to the tail of the list so that we don't spin on it.
 	 */
+out_unlock_move_tail:
+	xfs_dqunlock(dqp);
+out_move_tail:
 	list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-
 	trace_xfs_dqreclaim_busy(dqp);
 	XFS_STATS_INC(xs_qm_dqreclaim_misses);
 }
-- 
cgit v1.2.3


From f9668a09e32ac6d2aa22f44cc310e430a8f4a40f Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Wed, 28 Nov 2012 13:01:03 +1100
Subject: xfs: fix sparse reported log CRC endian issue

Not a bug as such, just warning noise from the xlog_cksum()
returning a __be32 type when it should be returning a __le32 type.

On Wed, Nov 28, 2012 at 08:30:59AM -0500, Christoph Hellwig wrote:
> But why are we storing the crc field little endian while all other on
> disk formats are big endian? (And yes I realize it might as well have
> been me who did that back in the idea, but I still have no idea why)

Because the CRC always returns the calcuation LE format, even on BE
systems. So rather than always having to byte swap it everywhere and
have all the force casts and anootations for sparse, it seems simpler to
just make it a __le32 everywhere....

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
---
 fs/xfs/xfs_log.c         | 2 +-
 fs/xfs/xfs_log_priv.h    | 2 +-
 fs/xfs/xfs_log_recover.c | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index c49e2c12dba4..46bd9d52ab51 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1538,7 +1538,7 @@ xlog_pack_data(
  * This is a little more complicated than it should be because the various
  * headers and the actual data are non-contiguous.
  */
-__be32
+__le32
 xlog_cksum(
 	struct xlog		*log,
 	struct xlog_rec_header	*rhead,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index dc3498bf17c2..16d8d12ea3b4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -555,7 +555,7 @@ extern int
 xlog_recover_finish(
 	struct xlog		*log);
 
-extern __be32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+extern __le32	 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
 			    char *dp, int size);
 
 extern kmem_zone_t *xfs_log_ticket_zone;
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 9c3651c9e75b..96fcbb85ff83 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3233,15 +3233,15 @@ xlog_unpack_data_crc(
 	xfs_caddr_t		dp,
 	struct xlog		*log)
 {
-	__be32			crc;
+	__le32			crc;
 
 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
 	if (crc != rhead->h_crc) {
 		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
 			xfs_alert(log->l_mp,
 		"log record CRC mismatch: found 0x%x, expected 0x%x.\n",
-					be32_to_cpu(rhead->h_crc),
-					be32_to_cpu(crc));
+					le32_to_cpu(rhead->h_crc),
+					le32_to_cpu(crc));
 			xfs_hex_dump(dp, 32);
 		}
 
-- 
cgit v1.2.3