summary | refs | log | tree | commit | diff
path: root/fs/xfs/linux-2.6
diff options
context:
space:
mode:
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c 2
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.c 370
-rw-r--r-- fs/xfs/linux-2.6/xfs_buf.h 40
-rw-r--r-- fs/xfs/linux-2.6/xfs_file.c 8
-rw-r--r-- fs/xfs/linux-2.6/xfs_ioctl.c 4
-rw-r--r-- fs/xfs/linux-2.6/xfs_iops.c 2
-rw-r--r-- fs/xfs/linux-2.6/xfs_message.c 31
-rw-r--r-- fs/xfs/linux-2.6/xfs_message.h 24
-rw-r--r-- fs/xfs/linux-2.6/xfs_super.c 165
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.c 261
-rw-r--r-- fs/xfs/linux-2.6/xfs_sync.h 2
11 files changed, 354 insertions, 555 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 52dbd14260ba..79ce38be15a1 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1295,7 +1295,7 @@ xfs_get_blocks_direct(
* If the private argument is non-NULL __xfs_get_blocks signals us that we
* need to issue a transaction to convert the range from unwritten to written
* extents. In case this is regular synchronous I/O we just call xfs_end_io
- * to do this and we are done. But in case this was a successfull AIO
+ * to do this and we are done. But in case this was a successful AIO
* request this handler is called from interrupt context, from which we
* can't start transactions. In that case offload the I/O completion to
* the workqueues we also use for buffered I/O completion.
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index c05324d3282c..9ef9ed2cfe2e 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -94,75 +94,6 @@ xfs_buf_vmap_len(
}
/*
- * Page Region interfaces.
- *
- * For pages in filesystems where the blocksize is smaller than the
- * pagesize, we use the page->private field (long) to hold a bitmap
- * of uptodate regions within the page.
- *
- * Each such region is "bytes per page / bits per long" bytes long.
- *
- * NBPPR == number-of-bytes-per-page-region
- * BTOPR == bytes-to-page-region (rounded up)
- * BTOPRT == bytes-to-page-region-truncated (rounded down)
- */
-#if (BITS_PER_LONG == 32)
-#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */
-#elif (BITS_PER_LONG == 64)
-#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */
-#else
-#error BITS_PER_LONG must be 32 or 64
-#endif
-#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG)
-#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
-#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT))
-
-STATIC unsigned long
-page_region_mask(
- size_t offset,
- size_t length)
-{
- unsigned long mask;
- int first, final;
-
- first = BTOPR(offset);
- final = BTOPRT(offset + length - 1);
- first = min(first, final);
-
- mask = ~0UL;
- mask <<= BITS_PER_LONG - (final - first);
- mask >>= BITS_PER_LONG - (final);
-
- ASSERT(offset + length <= PAGE_CACHE_SIZE);
- ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
-
- return mask;
-}
-
-STATIC void
-set_page_region(
- struct page *page,
- size_t offset,
- size_t length)
-{
- set_page_private(page,
- page_private(page) | page_region_mask(offset, length));
- if (page_private(page) == ~0UL)
- SetPageUptodate(page);
-}
-
-STATIC int
-test_page_region(
- struct page *page,
- size_t offset,
- size_t length)
-{
- unsigned long mask = page_region_mask(offset, length);
-
- return (mask && (page_private(page) & mask) == mask);
-}
-
-/*
* xfs_buf_lru_add - add a buffer to the LRU.
*
* The LRU takes a new reference to the buffer so that it will only be freed
@@ -189,7 +120,7 @@ xfs_buf_lru_add(
* The unlocked check is safe here because it only occurs when there are not
* b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
* to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unneccessary round trip on the
+ * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
* bt_lru_lock.
*/
STATIC void
@@ -332,7 +263,7 @@ xfs_buf_free(
ASSERT(list_empty(&bp->b_lru));
- if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
+ if (bp->b_flags & _XBF_PAGES) {
uint i;
if (xfs_buf_is_vmapped(bp))
@@ -342,56 +273,77 @@ xfs_buf_free(
for (i = 0; i < bp->b_page_count; i++) {
struct page *page = bp->b_pages[i];
- if (bp->b_flags & _XBF_PAGE_CACHE)
- ASSERT(!PagePrivate(page));
- page_cache_release(page);
+ __free_page(page);
}
- }
+ } else if (bp->b_flags & _XBF_KMEM)
+ kmem_free(bp->b_addr);
_xfs_buf_free_pages(bp);
xfs_buf_deallocate(bp);
}
/*
- * Finds all pages for buffer in question and builds it's page list.
+ * Allocates all the pages for the buffer in question and builds its page list.
*/
STATIC int
-_xfs_buf_lookup_pages(
+xfs_buf_allocate_memory(
xfs_buf_t *bp,
uint flags)
{
- struct address_space *mapping = bp->b_target->bt_mapping;
- size_t blocksize = bp->b_target->bt_bsize;
size_t size = bp->b_count_desired;
size_t nbytes, offset;
gfp_t gfp_mask = xb_to_gfp(flags);
unsigned short page_count, i;
- pgoff_t first;
xfs_off_t end;
int error;
+ /*
+ * for buffers that are contained within a single page, just allocate
+ * the memory from the heap - there's no need for the complexity of
+ * page arrays to keep allocation down to order 0.
+ */
+ if (bp->b_buffer_length < PAGE_SIZE) {
+ bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
+ if (!bp->b_addr) {
+ /* low memory - use alloc_page loop instead */
+ goto use_alloc_page;
+ }
+
+ if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
+ PAGE_MASK) !=
+ ((unsigned long)bp->b_addr & PAGE_MASK)) {
+ /* b_addr spans two pages - use alloc_page instead */
+ kmem_free(bp->b_addr);
+ bp->b_addr = NULL;
+ goto use_alloc_page;
+ }
+ bp->b_offset = offset_in_page(bp->b_addr);
+ bp->b_pages = bp->b_page_array;
+ bp->b_pages[0] = virt_to_page(bp->b_addr);
+ bp->b_page_count = 1;
+ bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
+ return 0;
+ }
+
+use_alloc_page:
end = bp->b_file_offset + bp->b_buffer_length;
page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
-
error = _xfs_buf_get_pages(bp, page_count, flags);
if (unlikely(error))
return error;
- bp->b_flags |= _XBF_PAGE_CACHE;
offset = bp->b_offset;
- first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
+ bp->b_flags |= _XBF_PAGES;
for (i = 0; i < bp->b_page_count; i++) {
struct page *page;
uint retries = 0;
-
- retry:
- page = find_or_create_page(mapping, first + i, gfp_mask);
+retry:
+ page = alloc_page(gfp_mask);
if (unlikely(page == NULL)) {
if (flags & XBF_READ_AHEAD) {
bp->b_page_count = i;
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- return -ENOMEM;
+ error = ENOMEM;
+ goto out_free_pages;
}
/*
@@ -412,52 +364,44 @@ _xfs_buf_lookup_pages(
XFS_STATS_INC(xb_page_found);
- nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
+ nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
-
- ASSERT(!PagePrivate(page));
- if (!PageUptodate(page)) {
- page_count--;
- if (blocksize >= PAGE_CACHE_SIZE) {
- if (flags & XBF_READ)
- bp->b_flags |= _XBF_PAGE_LOCKED;
- } else if (!PagePrivate(page)) {
- if (test_page_region(page, offset, nbytes))
- page_count++;
- }
- }
-
bp->b_pages[i] = page;
offset = 0;
}
+ return 0;
- if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- }
-
- if (page_count == bp->b_page_count)
- bp->b_flags |= XBF_DONE;
-
+out_free_pages:
+ for (i = 0; i < bp->b_page_count; i++)
+ __free_page(bp->b_pages[i]);
return error;
}
/*
- * Map buffer into kernel address-space if nessecary.
+ * Map buffer into kernel address-space if necessary.
*/
STATIC int
_xfs_buf_map_pages(
xfs_buf_t *bp,
uint flags)
{
- /* A single page buffer is always mappable */
+ ASSERT(bp->b_flags & _XBF_PAGES);
if (bp->b_page_count == 1) {
+ /* A single page buffer is always mappable */
bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
bp->b_flags |= XBF_MAPPED;
} else if (flags & XBF_MAPPED) {
- bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
- -1, PAGE_KERNEL);
- if (unlikely(bp->b_addr == NULL))
+ int retried = 0;
+
+ do {
+ bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+ -1, PAGE_KERNEL);
+ if (bp->b_addr)
+ break;
+ vm_unmap_aliases();
+ } while (retried++ <= 1);
+
+ if (!bp->b_addr)
return -ENOMEM;
bp->b_addr += bp->b_offset;
bp->b_flags |= XBF_MAPPED;
@@ -568,9 +512,14 @@ found:
}
}
+ /*
+ * if the buffer is stale, clear all the external state associated with
+ * it. We need to keep flags such as how we allocated the buffer memory
+ * intact here.
+ */
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
- bp->b_flags &= XBF_MAPPED;
+ bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -591,7 +540,7 @@ xfs_buf_get(
xfs_buf_flags_t flags)
{
xfs_buf_t *bp, *new_bp;
- int error = 0, i;
+ int error = 0;
new_bp = xfs_buf_allocate(flags);
if (unlikely(!new_bp))
@@ -599,7 +548,7 @@ xfs_buf_get(
bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
if (bp == new_bp) {
- error = _xfs_buf_lookup_pages(bp, flags);
+ error = xfs_buf_allocate_memory(bp, flags);
if (error)
goto no_buffer;
} else {
@@ -608,9 +557,6 @@ xfs_buf_get(
return NULL;
}
- for (i = 0; i < bp->b_page_count; i++)
- mark_page_accessed(bp->b_pages[i]);
-
if (!(bp->b_flags & XBF_MAPPED)) {
error = _xfs_buf_map_pages(bp, flags);
if (unlikely(error)) {
@@ -709,10 +655,7 @@ xfs_buf_readahead(
xfs_off_t ioff,
size_t isize)
{
- struct backing_dev_info *bdi;
-
- bdi = target->bt_mapping->backing_dev_info;
- if (bdi_read_congested(bdi))
+ if (bdi_read_congested(target->bt_bdi))
return;
xfs_buf_read(target, ioff, isize,
@@ -790,10 +733,10 @@ xfs_buf_associate_memory(
size_t buflen;
int page_count;
- pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
+ pageaddr = (unsigned long)mem & PAGE_MASK;
offset = (unsigned long)mem - pageaddr;
- buflen = PAGE_CACHE_ALIGN(len + offset);
- page_count = buflen >> PAGE_CACHE_SHIFT;
+ buflen = PAGE_ALIGN(len + offset);
+ page_count = buflen >> PAGE_SHIFT;
/* Free any previous set of page pointers */
if (bp->b_pages)
@@ -810,13 +753,12 @@ xfs_buf_associate_memory(
for (i = 0; i < bp->b_page_count; i++) {
bp->b_pages[i] = mem_to_page((void *)pageaddr);
- pageaddr += PAGE_CACHE_SIZE;
+ pageaddr += PAGE_SIZE;
}
bp->b_count_desired = len;
bp->b_buffer_length = buflen;
bp->b_flags |= XBF_MAPPED;
- bp->b_flags &= ~_XBF_PAGE_LOCKED;
return 0;
}
@@ -923,20 +865,7 @@ xfs_buf_rele(
/*
- * Mutual exclusion on buffers. Locking model:
- *
- * Buffers associated with inodes for which buffer locking
- * is not enabled are not protected by semaphores, and are
- * assumed to be exclusively owned by the caller. There is a
- * spinlock in the buffer, used by the caller when concurrent
- * access is possible.
- */
-
-/*
- * Locks a buffer object, if it is not already locked. Note that this in
- * no way locks the underlying pages, so it is only useful for
- * synchronizing concurrent use of buffer objects, not for synchronizing
- * independent access to the underlying pages.
+ * Lock a buffer object, if it is not already locked.
*
* If we come across a stale, pinned, locked buffer, we know that we are
* being asked to lock a buffer that has been reallocated. Because it is
@@ -970,10 +899,7 @@ xfs_buf_lock_value(
}
/*
- * Locks a buffer object.
- * Note that this in no way locks the underlying pages, so it is only
- * useful for synchronizing concurrent use of buffer objects, not for
- * synchronizing independent access to the underlying pages.
+ * Lock a buffer object.
*
* If we come across a stale, pinned, locked buffer, we know that we
* are being asked to lock a buffer that has been reallocated. Because
@@ -989,8 +915,6 @@ xfs_buf_lock(
if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
xfs_log_force(bp->b_target->bt_mount, 0);
- if (atomic_read(&bp->b_io_remaining))
- blk_flush_plug(current);
down(&bp->b_sema);
XB_SET_OWNER(bp);
@@ -1246,10 +1170,8 @@ _xfs_buf_ioend(
xfs_buf_t *bp,
int schedule)
{
- if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
- bp->b_flags &= ~_XBF_PAGE_LOCKED;
+ if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
xfs_buf_ioend(bp, schedule);
- }
}
STATIC void
@@ -1258,35 +1180,12 @@ xfs_buf_bio_end_io(
int error)
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
- unsigned int blocksize = bp->b_target->bt_bsize;
- struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
xfs_buf_ioerror(bp, -error);
if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
- do {
- struct page *page = bvec->bv_page;
-
- ASSERT(!PagePrivate(page));
- if (unlikely(bp->b_error)) {
- if (bp->b_flags & XBF_READ)
- ClearPageUptodate(page);
- } else if (blocksize >= PAGE_CACHE_SIZE) {
- SetPageUptodate(page);
- } else if (!PagePrivate(page) &&
- (bp->b_flags & _XBF_PAGE_CACHE)) {
- set_page_region(page, bvec->bv_offset, bvec->bv_len);
- }
-
- if (--bvec >= bio->bi_io_vec)
- prefetchw(&bvec->bv_page->flags);
-
- if (bp->b_flags & _XBF_PAGE_LOCKED)
- unlock_page(page);
- } while (bvec >= bio->bi_io_vec);
-
_xfs_buf_ioend(bp, 1);
bio_put(bio);
}
@@ -1300,7 +1199,6 @@ _xfs_buf_ioapply(
int offset = bp->b_offset;
int size = bp->b_count_desired;
sector_t sector = bp->b_bn;
- unsigned int blocksize = bp->b_target->bt_bsize;
total_nr_pages = bp->b_page_count;
map_i = 0;
@@ -1321,29 +1219,6 @@ _xfs_buf_ioapply(
(bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
}
- /* Special code path for reading a sub page size buffer in --
- * we populate up the whole page, and hence the other metadata
- * in the same page. This optimization is only valid when the
- * filesystem block size is not smaller than the page size.
- */
- if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
- ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
- (XBF_READ|_XBF_PAGE_LOCKED)) &&
- (blocksize >= PAGE_CACHE_SIZE)) {
- bio = bio_alloc(GFP_NOIO, 1);
-
- bio->bi_bdev = bp->b_target->bt_bdev;
- bio->bi_sector = sector - (offset >> BBSHIFT);
- bio->bi_end_io = xfs_buf_bio_end_io;
- bio->bi_private = bp;
-
- bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
- size = 0;
-
- atomic_inc(&bp->b_io_remaining);
-
- goto submit_io;
- }
next_chunk:
atomic_inc(&bp->b_io_remaining);
@@ -1357,8 +1232,9 @@ next_chunk:
bio->bi_end_io = xfs_buf_bio_end_io;
bio->bi_private = bp;
+
for (; size && nr_pages; nr_pages--, map_i++) {
- int rbytes, nbytes = PAGE_CACHE_SIZE - offset;
+ int rbytes, nbytes = PAGE_SIZE - offset;
if (nbytes > size)
nbytes = size;
@@ -1373,7 +1249,6 @@ next_chunk:
total_nr_pages--;
}
-submit_io:
if (likely(bio->bi_size)) {
if (xfs_buf_is_vmapped(bp)) {
flush_kernel_vmap_range(bp->b_addr,
@@ -1383,18 +1258,7 @@ submit_io:
if (size)
goto next_chunk;
} else {
- /*
- * if we get here, no pages were added to the bio. However,
- * we can't just error out here - if the pages are locked then
- * we have to unlock them otherwise we can hang on a later
- * access to the page.
- */
xfs_buf_ioerror(bp, EIO);
- if (bp->b_flags & _XBF_PAGE_LOCKED) {
- int i;
- for (i = 0; i < bp->b_page_count; i++)
- unlock_page(bp->b_pages[i]);
- }
bio_put(bio);
}
}
@@ -1439,8 +1303,6 @@ xfs_buf_iowait(
{
trace_xfs_buf_iowait(bp, _RET_IP_);
- if (atomic_read(&bp->b_io_remaining))
- blk_flush_plug(current);
wait_for_completion(&bp->b_iowait);
trace_xfs_buf_iowait_done(bp, _RET_IP_);
@@ -1458,8 +1320,8 @@ xfs_buf_offset(
return XFS_BUF_PTR(bp) + offset;
offset += bp->b_offset;
- page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
- return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
+ page = bp->b_pages[offset >> PAGE_SHIFT];
+ return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
}
/*
@@ -1481,9 +1343,9 @@ xfs_buf_iomove(
page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
cpoff = xfs_buf_poff(boff + bp->b_offset);
csize = min_t(size_t,
- PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
+ PAGE_SIZE-cpoff, bp->b_count_desired-boff);
- ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
+ ASSERT(((csize + cpoff) <= PAGE_SIZE));
switch (mode) {
case XBRW_ZERO:
@@ -1596,7 +1458,6 @@ xfs_free_buftarg(
xfs_flush_buftarg(btp, 1);
if (mp->m_flags & XFS_MOUNT_BARRIER)
xfs_blkdev_issue_flush(btp);
- iput(btp->bt_mapping->host);
kthread_stop(btp->bt_task);
kmem_free(btp);
@@ -1620,15 +1481,6 @@ xfs_setsize_buftarg_flags(
return EINVAL;
}
- if (verbose &&
- (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
- printk(KERN_WARNING
- "XFS: %u byte sectors in use on device %s. "
- "This is suboptimal; %u or greater is ideal.\n",
- sectorsize, XFS_BUFTARG_NAME(btp),
- (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
- }
-
return 0;
}
@@ -1643,7 +1495,7 @@ xfs_setsize_buftarg_early(
struct block_device *bdev)
{
return xfs_setsize_buftarg_flags(btp,
- PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
+ PAGE_SIZE, bdev_logical_block_size(bdev), 0);
}
int
@@ -1656,40 +1508,6 @@ xfs_setsize_buftarg(
}
STATIC int
-xfs_mapping_buftarg(
- xfs_buftarg_t *btp,
- struct block_device *bdev)
-{
- struct backing_dev_info *bdi;
- struct inode *inode;
- struct address_space *mapping;
- static const struct address_space_operations mapping_aops = {
- .migratepage = fail_migrate_page,
- };
-
- inode = new_inode(bdev->bd_inode->i_sb);
- if (!inode) {
- printk(KERN_WARNING
- "XFS: Cannot allocate mapping inode for device %s\n",
- XFS_BUFTARG_NAME(btp));
- return ENOMEM;
- }
- inode->i_ino = get_next_ino();
- inode->i_mode = S_IFBLK;
- inode->i_bdev = bdev;
- inode->i_rdev = bdev->bd_dev;
- bdi = blk_get_backing_dev_info(bdev);
- if (!bdi)
- bdi = &default_backing_dev_info;
- mapping = &inode->i_data;
- mapping->a_ops = &mapping_aops;
- mapping->backing_dev_info = bdi;
- mapping_set_gfp_mask(mapping, GFP_NOFS);
- btp->bt_mapping = mapping;
- return 0;
-}
-
-STATIC int
xfs_alloc_delwrite_queue(
xfs_buftarg_t *btp,
const char *fsname)
@@ -1717,12 +1535,14 @@ xfs_alloc_buftarg(
btp->bt_mount = mp;
btp->bt_dev = bdev->bd_dev;
btp->bt_bdev = bdev;
+ btp->bt_bdi = blk_get_backing_dev_info(bdev);
+ if (!btp->bt_bdi)
+ goto error;
+
INIT_LIST_HEAD(&btp->bt_lru);
spin_lock_init(&btp->bt_lru_lock);
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;
- if (xfs_mapping_buftarg(btp, bdev))
- goto error;
if (xfs_alloc_delwrite_queue(btp, fsname))
goto error;
btp->bt_shrinker.shrink = xfs_buftarg_shrink;
@@ -1919,8 +1739,8 @@ xfsbufd(
do {
long age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
- int count = 0;
struct list_head tmp;
+ struct blk_plug plug;
if (unlikely(freezing(current))) {
set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
@@ -1936,16 +1756,15 @@ xfsbufd(
xfs_buf_delwri_split(target, &tmp, age);
list_sort(NULL, &tmp, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
while (!list_empty(&tmp)) {
struct xfs_buf *bp;
bp = list_first_entry(&tmp, struct xfs_buf, b_list);
list_del_init(&bp->b_list);
xfs_bdstrat_cb(bp);
- count++;
}
- if (count)
- blk_flush_plug(current);
-
+ blk_finish_plug(&plug);
} while (!kthread_should_stop());
return 0;
@@ -1965,6 +1784,7 @@ xfs_flush_buftarg(
int pincount = 0;
LIST_HEAD(tmp_list);
LIST_HEAD(wait_list);
+ struct blk_plug plug;
xfs_buf_runall_queues(xfsconvertd_workqueue);
xfs_buf_runall_queues(xfsdatad_workqueue);
@@ -1979,6 +1799,8 @@ xfs_flush_buftarg(
* we do that after issuing all the IO.
*/
list_sort(NULL, &tmp_list, xfs_buf_cmp);
+
+ blk_start_plug(&plug);
while (!list_empty(&tmp_list)) {
bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
ASSERT(target == bp->b_target);
@@ -1989,10 +1811,10 @@ xfs_flush_buftarg(
}
xfs_bdstrat_cb(bp);
}
+ blk_finish_plug(&plug);
if (wait) {
- /* Expedite and wait for IO to complete. */
- blk_flush_plug(current);
+ /* Wait for IO to complete. */
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index cbe65950e524..a9a1c4512645 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -61,30 +61,11 @@ typedef enum {
#define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */
/* flags used only internally */
-#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */
#define _XBF_PAGES (1 << 18)/* backed by refcounted pages */
#define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */
+#define _XBF_KMEM (1 << 20)/* backed by heap memory */
#define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */
-/*
- * Special flag for supporting metadata blocks smaller than a FSB.
- *
- * In this case we can have multiple xfs_buf_t on a single page and
- * need to lock out concurrent xfs_buf_t readers as they only
- * serialise access to the buffer.
- *
- * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation
- * between reads of the page. Hence we can have one thread read the
- * page and modify it, but then race with another thread that thinks
- * the page is not up-to-date and hence reads it again.
- *
- * The result is that the first modifcation to the page is lost.
- * This sort of AGF/AGI reading race can happen when unlinking inodes
- * that require truncation and results in the AGI unlinked list
- * modifications being lost.
- */
-#define _XBF_PAGE_LOCKED (1 << 22)
-
typedef unsigned int xfs_buf_flags_t;
#define XFS_BUF_FLAGS \
@@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t;
{ XBF_LOCK, "LOCK" }, /* should never be set */\
{ XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\
{ XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\
- { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \
{ _XBF_PAGES, "PAGES" }, \
{ _XBF_RUN_QUEUES, "RUN_QUEUES" }, \
- { _XBF_DELWRI_Q, "DELWRI_Q" }, \
- { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }
-
+ { _XBF_KMEM, "KMEM" }, \
+ { _XBF_DELWRI_Q, "DELWRI_Q" }
typedef enum {
XBT_FORCE_SLEEP = 0,
@@ -120,7 +99,7 @@ typedef struct xfs_bufhash {
typedef struct xfs_buftarg {
dev_t bt_dev;
struct block_device *bt_bdev;
- struct address_space *bt_mapping;
+ struct backing_dev_info *bt_bdi;
struct xfs_mount *bt_mount;
unsigned int bt_bsize;
unsigned int bt_sshift;
@@ -139,17 +118,6 @@ typedef struct xfs_buftarg {
unsigned int bt_lru_nr;
} xfs_buftarg_t;
-/*
- * xfs_buf_t: Buffer structure for pagecache-based buffers
- *
- * This buffer structure is used by the pagecache buffer management routines
- * to refer to an assembly of pages forming a logical buffer.
- *
- * The buffer structure is used on a temporary basis only, and discarded when
- * released. The real data storage is recorded in the pagecache. Buffers are
- * hashed to the block device on which the file system resides.
- */
-
struct xfs_buf;
typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index a55c1b46b219..f4213ba1ff85 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -381,7 +381,7 @@ xfs_aio_write_isize_update(
/*
* If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occured. In
+ * part of the I/O may have been written to disk before the error occurred. In
* this case the on-disk file size may have been adjusted beyond the in-memory
* file size and now needs to be truncated back.
*/
@@ -896,6 +896,7 @@ xfs_file_fallocate(
xfs_flock64_t bf;
xfs_inode_t *ip = XFS_I(inode);
int cmd = XFS_IOC_RESVSP;
+ int attr_flags = XFS_ATTR_NOLOCK;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -918,7 +919,10 @@ xfs_file_fallocate(
goto out_unlock;
}
- error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK);
+ if (file->f_flags & O_DSYNC)
+ attr_flags |= XFS_ATTR_SYNC;
+
+ error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
if (error)
goto out_unlock;
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 0ca0e3c024d7..acca2c5ca3fa 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -624,6 +624,10 @@ xfs_ioc_space(
if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
attr_flags |= XFS_ATTR_NONBLOCK;
+
+ if (filp->f_flags & O_DSYNC)
+ attr_flags |= XFS_ATTR_SYNC;
+
if (ioflags & IO_INVIS)
attr_flags |= XFS_ATTR_DMI;
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9ff7fc603d2f..dd21784525a8 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -70,7 +70,7 @@ xfs_synchronize_times(
/*
* If the linux inode is valid, mark it dirty.
- * Used when commiting a dirty inode into a transaction so that
+ * Used when committing a dirty inode into a transaction so that
* the inode will get written back by the linux code
*/
void
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
index 508e06fd7d1e..9f76cceb678d 100644
--- a/fs/xfs/linux-2.6/xfs_message.c
+++ b/fs/xfs/linux-2.6/xfs_message.c
@@ -28,53 +28,49 @@
/*
* XFS logging functions
*/
-static int
+static void
__xfs_printk(
const char *level,
const struct xfs_mount *mp,
struct va_format *vaf)
{
- if (mp && mp->m_fsname)
- return printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
- return printk("%sXFS: %pV\n", level, vaf);
+ if (mp && mp->m_fsname) {
+ printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+ return;
+ }
+ printk("%sXFS: %pV\n", level, vaf);
}
-int xfs_printk(
+void xfs_printk(
const char *level,
const struct xfs_mount *mp,
const char *fmt, ...)
{
struct va_format vaf;
va_list args;
- int r;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- r = __xfs_printk(level, mp, &vaf);
+ __xfs_printk(level, mp, &vaf);
va_end(args);
-
- return r;
}
#define define_xfs_printk_level(func, kern_level) \
-int func(const struct xfs_mount *mp, const char *fmt, ...) \
+void func(const struct xfs_mount *mp, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
- int r; \
\
va_start(args, fmt); \
\
vaf.fmt = fmt; \
vaf.va = &args; \
\
- r = __xfs_printk(kern_level, mp, &vaf); \
+ __xfs_printk(kern_level, mp, &vaf); \
va_end(args); \
- \
- return r; \
} \
define_xfs_printk_level(xfs_emerg, KERN_EMERG);
@@ -88,7 +84,7 @@ define_xfs_printk_level(xfs_info, KERN_INFO);
define_xfs_printk_level(xfs_debug, KERN_DEBUG);
#endif
-int
+void
xfs_alert_tag(
const struct xfs_mount *mp,
int panic_tag,
@@ -97,7 +93,6 @@ xfs_alert_tag(
struct va_format vaf;
va_list args;
int do_panic = 0;
- int r;
if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
xfs_printk(KERN_ALERT, mp,
@@ -110,12 +105,10 @@ xfs_alert_tag(
vaf.fmt = fmt;
vaf.va = &args;
- r = __xfs_printk(KERN_ALERT, mp, &vaf);
+ __xfs_printk(KERN_ALERT, mp, &vaf);
va_end(args);
BUG_ON(do_panic);
-
- return r;
}
void
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
index e77ffa16745b..f1b3fc1b6c4e 100644
--- a/fs/xfs/linux-2.6/xfs_message.h
+++ b/fs/xfs/linux-2.6/xfs_message.h
@@ -3,32 +3,34 @@
struct xfs_mount;
-extern int xfs_printk(const char *level, const struct xfs_mount *mp,
+extern void xfs_printk(const char *level, const struct xfs_mount *mp,
const char *fmt, ...)
__attribute__ ((format (printf, 3, 4)));
-extern int xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
-extern int xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
-extern int xfs_alert_tag(const struct xfs_mount *mp, int tag,
+extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
const char *fmt, ...)
__attribute__ ((format (printf, 3, 4)));
-extern int xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
-extern int xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
-extern int xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
-extern int xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
-extern int xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
#ifdef DEBUG
-extern int xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
__attribute__ ((format (printf, 2, 3)));
#else
-#define xfs_debug(mp, fmt, ...) (0)
+static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+{
+}
#endif
extern void assfail(char *expr, char *f, int l);
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 818c4cf2de86..b38e58d02299 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -816,75 +816,6 @@ xfs_setup_devices(
return 0;
}
-/*
- * XFS AIL push thread support
- */
-void
-xfsaild_wakeup(
- struct xfs_ail *ailp,
- xfs_lsn_t threshold_lsn)
-{
- /* only ever move the target forwards */
- if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) {
- ailp->xa_target = threshold_lsn;
- wake_up_process(ailp->xa_task);
- }
-}
-
-STATIC int
-xfsaild(
- void *data)
-{
- struct xfs_ail *ailp = data;
- xfs_lsn_t last_pushed_lsn = 0;
- long tout = 0; /* milliseconds */
-
- while (!kthread_should_stop()) {
- /*
- * for short sleeps indicating congestion, don't allow us to
- * get woken early. Otherwise all we do is bang on the AIL lock
- * without making progress.
- */
- if (tout && tout <= 20)
- __set_current_state(TASK_KILLABLE);
- else
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(tout ?
- msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
-
- /* swsusp */
- try_to_freeze();
-
- ASSERT(ailp->xa_mount->m_log);
- if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
- continue;
-
- tout = xfsaild_push(ailp, &last_pushed_lsn);
- }
-
- return 0;
-} /* xfsaild */
-
-int
-xfsaild_start(
- struct xfs_ail *ailp)
-{
- ailp->xa_target = 0;
- ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
- ailp->xa_mount->m_fsname);
- if (IS_ERR(ailp->xa_task))
- return -PTR_ERR(ailp->xa_task);
- return 0;
-}
-
-void
-xfsaild_stop(
- struct xfs_ail *ailp)
-{
- kthread_stop(ailp->xa_task);
-}
-
-
/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -1078,7 +1009,7 @@ xfs_fs_write_inode(
error = 0;
goto out_unlock;
}
- error = xfs_iflush(ip, 0);
+ error = xfs_iflush(ip, SYNC_TRYLOCK);
}
out_unlock:
@@ -1191,22 +1122,12 @@ xfs_fs_sync_fs(
return -error;
if (laptop_mode) {
- int prev_sync_seq = mp->m_sync_seq;
-
/*
* The disk must be active because we're syncing.
* We schedule xfssyncd now (now that the disk is
* active) instead of later (when it might not be).
*/
- wake_up_process(mp->m_sync_task);
- /*
- * We have to wait for the sync iteration to complete.
- * If we don't, the disk activity caused by the sync
- * will come after the sync is completed, and that
- * triggers another sync from laptop mode.
- */
- wait_event(mp->m_wait_single_sync_task,
- mp->m_sync_seq != prev_sync_seq);
+ flush_delayed_work_sync(&mp->m_sync_work);
}
return 0;
@@ -1490,9 +1411,6 @@ xfs_fs_fill_super(
spin_lock_init(&mp->m_sb_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
- INIT_LIST_HEAD(&mp->m_sync_list);
- spin_lock_init(&mp->m_sync_lock);
- init_waitqueue_head(&mp->m_wait_single_sync_task);
mp->m_super = sb;
sb->s_fs_info = mp;
@@ -1539,10 +1457,14 @@ xfs_fs_fill_super(
if (error)
goto out_free_sb;
- error = xfs_mountfs(mp);
- if (error)
- goto out_filestream_unmount;
-
+ /*
+ * we must configure the block size in the superblock before we run the
+ * full mount process as the mount process can lookup and cache inodes.
+ * For the same reason we must also initialise the syncd and register
+ * the inode cache shrinker so that inodes can be reclaimed during
+ * operations like a quotacheck that iterate all inodes in the
+ * filesystem.
+ */
sb->s_magic = XFS_SB_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1550,6 +1472,16 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
+ error = xfs_syncd_init(mp);
+ if (error)
+ goto out_filestream_unmount;
+
+ xfs_inode_shrinker_register(mp);
+
+ error = xfs_mountfs(mp);
+ if (error)
+ goto out_syncd_stop;
+
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
error = ENOENT;
@@ -1565,14 +1497,11 @@ xfs_fs_fill_super(
goto fail_vnrele;
}
- error = xfs_syncd_init(mp);
- if (error)
- goto fail_vnrele;
-
- xfs_inode_shrinker_register(mp);
-
return 0;
+ out_syncd_stop:
+ xfs_inode_shrinker_unregister(mp);
+ xfs_syncd_stop(mp);
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
@@ -1596,6 +1525,9 @@ xfs_fs_fill_super(
}
fail_unmount:
+ xfs_inode_shrinker_unregister(mp);
+ xfs_syncd_stop(mp);
+
/*
* Blow away any referenced inode in the filestreams cache.
* This can and will cause log traffic as inodes go inactive
@@ -1785,6 +1717,38 @@ xfs_destroy_zones(void)
}
STATIC int __init
+xfs_init_workqueues(void)
+{
+ /*
+ * max_active is set to 8 to give enough concurrency to allow
+ * multiple work operations on each CPU to run. This allows multiple
+ * filesystems to be running sync work concurrently, and scales with
+ * the number of CPUs in the system.
+ */
+ xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
+ if (!xfs_syncd_wq)
+ goto out;
+
+ xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
+ if (!xfs_ail_wq)
+ goto out_destroy_syncd;
+
+ return 0;
+
+out_destroy_syncd:
+ destroy_workqueue(xfs_syncd_wq);
+out:
+ return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_workqueues(void)
+{
+ destroy_workqueue(xfs_ail_wq);
+ destroy_workqueue(xfs_syncd_wq);
+}
+
+STATIC int __init
init_xfs_fs(void)
{
int error;
@@ -1799,10 +1763,14 @@ init_xfs_fs(void)
if (error)
goto out;
- error = xfs_mru_cache_init();
+ error = xfs_init_workqueues();
if (error)
goto out_destroy_zones;
+ error = xfs_mru_cache_init();
+ if (error)
+ goto out_destroy_wq;
+
error = xfs_filestream_init();
if (error)
goto out_mru_cache_uninit;
@@ -1819,6 +1787,10 @@ init_xfs_fs(void)
if (error)
goto out_cleanup_procfs;
+ error = xfs_init_workqueues();
+ if (error)
+ goto out_sysctl_unregister;
+
vfs_initquota();
error = register_filesystem(&xfs_fs_type);
@@ -1836,6 +1808,8 @@ init_xfs_fs(void)
xfs_filestream_uninit();
out_mru_cache_uninit:
xfs_mru_cache_uninit();
+ out_destroy_wq:
+ xfs_destroy_workqueues();
out_destroy_zones:
xfs_destroy_zones();
out:
@@ -1852,6 +1826,7 @@ exit_xfs_fs(void)
xfs_buf_terminate();
xfs_filestream_uninit();
xfs_mru_cache_uninit();
+ xfs_destroy_workqueues();
xfs_destroy_zones();
}
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 6c10f1d2e3d3..3e898a48122d 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -22,6 +22,7 @@
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
@@ -39,6 +40,8 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
+struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+
/*
* The inode lookup is done in batches to keep the amount of lock traffic and
* radix tree lookups to a minimum. The batch size is a trade off between
@@ -401,7 +404,7 @@ xfs_quiesce_fs(
/*
* Second stage of a quiesce. The data is already synced, now we have to take
* care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceding.
+ * wait for any remaining transactions to drain out before proceeding.
*/
void
xfs_quiesce_attr(
@@ -431,62 +434,12 @@ xfs_quiesce_attr(
xfs_unmountfs_writesb(mp);
}
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
- struct xfs_mount *mp,
- void *data,
- void (*syncer)(struct xfs_mount *, void *),
- struct completion *completion)
-{
- struct xfs_sync_work *work;
-
- work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
- INIT_LIST_HEAD(&work->w_list);
- work->w_syncer = syncer;
- work->w_data = data;
- work->w_mount = mp;
- work->w_completion = completion;
- spin_lock(&mp->m_sync_lock);
- list_add_tail(&work->w_list, &mp->m_sync_list);
- spin_unlock(&mp->m_sync_lock);
- wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inodes_work(
- struct xfs_mount *mp,
- void *arg)
-{
- struct inode *inode = arg;
- xfs_sync_data(mp, SYNC_TRYLOCK);
- xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
- iput(inode);
-}
-
-void
-xfs_flush_inodes(
- xfs_inode_t *ip)
+static void
+xfs_syncd_queue_sync(
+ struct xfs_mount *mp)
{
- struct inode *inode = VFS_I(ip);
- DECLARE_COMPLETION_ONSTACK(completion);
-
- igrab(inode);
- xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
- wait_for_completion(&completion);
- xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
+ queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+ msecs_to_jiffies(xfs_syncd_centisecs * 10));
}
/*
@@ -496,9 +449,10 @@ xfs_flush_inodes(
*/
STATIC void
xfs_sync_worker(
- struct xfs_mount *mp,
- void *unused)
+ struct work_struct *work)
{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_sync_work);
int error;
if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
@@ -508,73 +462,106 @@ xfs_sync_worker(
error = xfs_fs_log_dummy(mp);
else
xfs_log_force(mp, 0);
- xfs_reclaim_inodes(mp, 0);
error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+ /* start pushing all the metadata that is currently dirty */
+ xfs_ail_push_all(mp->m_ail);
}
- mp->m_sync_seq++;
- wake_up(&mp->m_wait_single_sync_task);
+
+ /* queue us up again */
+ xfs_syncd_queue_sync(mp);
}
-STATIC int
-xfssyncd(
- void *arg)
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+ struct xfs_mount *mp)
{
- struct xfs_mount *mp = arg;
- long timeleft;
- xfs_sync_work_t *work, *n;
- LIST_HEAD (tmp);
-
- set_freezable();
- timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
- for (;;) {
- if (list_empty(&mp->m_sync_list))
- timeleft = schedule_timeout_interruptible(timeleft);
- /* swsusp */
- try_to_freeze();
- if (kthread_should_stop() && list_empty(&mp->m_sync_list))
- break;
- spin_lock(&mp->m_sync_lock);
- /*
- * We can get woken by laptop mode, to do a sync -
- * that's the (only!) case where the list would be
- * empty with time remaining.
- */
- if (!timeleft || list_empty(&mp->m_sync_list)) {
- if (!timeleft)
- timeleft = xfs_syncd_centisecs *
- msecs_to_jiffies(10);
- INIT_LIST_HEAD(&mp->m_sync_work.w_list);
- list_add_tail(&mp->m_sync_work.w_list,
- &mp->m_sync_list);
- }
- list_splice_init(&mp->m_sync_list, &tmp);
- spin_unlock(&mp->m_sync_lock);
+ /*
+ * We can have inodes enter reclaim after we've shut down the syncd
+ * workqueue during unmount, so don't allow reclaim work to be queued
+ * during unmount.
+ */
+ if (!(mp->m_super->s_flags & MS_ACTIVE))
+ return;
- list_for_each_entry_safe(work, n, &tmp, w_list) {
- (*work->w_syncer)(mp, work->w_data);
- list_del(&work->w_list);
- if (work == &mp->m_sync_work)
- continue;
- if (work->w_completion)
- complete(work->w_completion);
- kmem_free(work);
- }
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+ queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+ msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
}
+ rcu_read_unlock();
+}
- return 0;
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_reclaim_work);
+
+ xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+ xfs_syncd_queue_reclaim(mp);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations. At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room.
+ *
+ * Queue a new data flush if there isn't one already in progress and
+ * wait for completion of the flush. This means that we only ever have one
+ * inode flush in progress no matter how many ENOSPC events are occurring and
+ * so will prevent the system from bogging down due to every concurrent
+ * ENOSPC event scanning all the active inodes in the system for writeback.
+ */
+void
+xfs_flush_inodes(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+
+ queue_work(xfs_syncd_wq, &mp->m_flush_work);
+ flush_work_sync(&mp->m_flush_work);
+}
+
+STATIC void
+xfs_flush_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(work,
+ struct xfs_mount, m_flush_work);
+
+ xfs_sync_data(mp, SYNC_TRYLOCK);
+ xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
}
int
xfs_syncd_init(
struct xfs_mount *mp)
{
- mp->m_sync_work.w_syncer = xfs_sync_worker;
- mp->m_sync_work.w_mount = mp;
- mp->m_sync_work.w_completion = NULL;
- mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
- if (IS_ERR(mp->m_sync_task))
- return -PTR_ERR(mp->m_sync_task);
+ INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+ INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+ xfs_syncd_queue_sync(mp);
+ xfs_syncd_queue_reclaim(mp);
+
return 0;
}
@@ -582,7 +569,9 @@ void
xfs_syncd_stop(
struct xfs_mount *mp)
{
- kthread_stop(mp->m_sync_task);
+ cancel_delayed_work_sync(&mp->m_sync_work);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ cancel_work_sync(&mp->m_flush_work);
}
void
@@ -601,6 +590,10 @@ __xfs_inode_set_reclaim_tag(
XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
XFS_ICI_RECLAIM_TAG);
spin_unlock(&ip->i_mount->m_perag_lock);
+
+ /* schedule periodic background inode reclaim */
+ xfs_syncd_queue_reclaim(ip->i_mount);
+
trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-1, _RET_IP_);
}
@@ -761,8 +754,10 @@ xfs_reclaim_inode(
struct xfs_perag *pag,
int sync_mode)
{
- int error = 0;
+ int error;
+restart:
+ error = 0;
xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iflock_nowait(ip)) {
if (!(sync_mode & SYNC_WAIT))
@@ -788,9 +783,31 @@ xfs_reclaim_inode(
if (xfs_inode_clean(ip))
goto reclaim;
- /* Now we have an inode that needs flushing */
- error = xfs_iflush(ip, sync_mode);
+ /*
+ * Now we have an inode that needs flushing.
+ *
+ * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+ * reclaim as we can deadlock with inode cluster removal.
+ * xfs_ifree_cluster() can lock the inode buffer before it locks the
+ * ip->i_lock, and we are doing the exact opposite here. As a result,
+ * doing a blocking xfs_itobp() to get the cluster buffer will result
+ * in an ABBA deadlock with xfs_ifree_cluster().
+ *
+ * As xfs_ifree_cluster() must gather all inodes that are active in the
+ * cache to mark them stale, if we hit this case we don't actually want
+ * to do IO here - we want the inode marked stale so we can simply
+ * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+ * just unlock the inode, back off and try again. Hopefully the next
+ * pass through will see the stale flag set on the inode.
+ */
+ error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
if (sync_mode & SYNC_WAIT) {
+ if (error == EAGAIN) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ /* backoff longer than in xfs_ifree_cluster */
+ delay(2);
+ goto restart;
+ }
xfs_iflock(ip);
goto reclaim;
}
@@ -909,6 +926,7 @@ restart:
XFS_LOOKUP_BATCH,
XFS_ICI_RECLAIM_TAG);
if (!nr_found) {
+ done = 1;
rcu_read_unlock();
break;
}
@@ -993,7 +1011,13 @@ xfs_reclaim_inodes(
}
/*
- * Shrinker infrastructure.
+ * Inode cache shrinker.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
*/
static int
xfs_reclaim_inode_shrink(
@@ -1008,10 +1032,15 @@ xfs_reclaim_inode_shrink(
mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
if (nr_to_scan) {
+ /* kick background reclaimer and push the AIL */
+ xfs_syncd_queue_reclaim(mp);
+ xfs_ail_push_all(mp->m_ail);
+
if (!(gfp_mask & __GFP_FS))
return -1;
- xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan);
+ xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT,
+ &nr_to_scan);
/* terminate if we don't exhaust the scan */
if (nr_to_scan > 0)
return -1;
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index 32ba6628290c..e3a6ad27415f 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -32,6 +32,8 @@ typedef struct xfs_sync_work {
#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
+extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+
int xfs_syncd_init(struct xfs_mount *mp);
void xfs_syncd_stop(struct xfs_mount *mp);