diff options
47 files changed, 2498 insertions, 3289 deletions
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index daa8e7514eae..9106f42c472c 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, msg.rpc_proc = &clnt->cl_procinfo[proc]; status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + if (status == -ECONNREFUSED) { + dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n", + status); + rpc_force_rebind(clnt); + status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + } if (status < 0) dprintk("lockd: NSM upcall RPC failed, status=%d\n", status); diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile index d5815505c020..3ca14c36d08b 100644 --- a/fs/nfs/blocklayout/Makefile +++ b/fs/nfs/blocklayout/Makefile @@ -2,4 +2,5 @@ # Makefile for the pNFS block layout driver kernel module # obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o -blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o + +blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index cbb1797149d5..5228f201d3d5 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -35,7 +35,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/bio.h> /* struct bio */ -#include <linux/buffer_head.h> /* various write calls */ #include <linux/prefetch.h> #include <linux/pagevec.h> @@ -50,40 +49,16 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); -static void print_page(struct page *page) +static bool is_hole(struct pnfs_block_extent *be) { - dprintk("PRINTPAGE page %p\n", page); - dprintk(" PagePrivate %d\n", PagePrivate(page)); - dprintk(" PageUptodate %d\n", PageUptodate(page)); - dprintk(" PageError %d\n", PageError(page)); - dprintk(" PageDirty %d\n", PageDirty(page)); - dprintk(" 
PageReferenced %d\n", PageReferenced(page)); - dprintk(" PageLocked %d\n", PageLocked(page)); - dprintk(" PageWriteback %d\n", PageWriteback(page)); - dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); - dprintk("\n"); -} - -/* Given the be associated with isect, determine if page data needs to be - * initialized. - */ -static int is_hole(struct pnfs_block_extent *be, sector_t isect) -{ - if (be->be_state == PNFS_BLOCK_NONE_DATA) - return 1; - else if (be->be_state != PNFS_BLOCK_INVALID_DATA) - return 0; - else - return !bl_is_sector_init(be->be_inval, isect); -} - -/* Given the be associated with isect, determine if page data can be - * written to disk. - */ -static int is_writable(struct pnfs_block_extent *be, sector_t isect) -{ - return (be->be_state == PNFS_BLOCK_READWRITE_DATA || - be->be_state == PNFS_BLOCK_INVALID_DATA); + switch (be->be_state) { + case PNFS_BLOCK_NONE_DATA: + return true; + case PNFS_BLOCK_INVALID_DATA: + return be->be_tag ? false : true; + default: + return false; + } } /* The data we are handed might be spread across several bios. 
We need @@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - void (*pnfs_callback) (void *data, int num_se); + void (*pnfs_callback) (void *data); void *data; - int bse_count; }; static inline struct parallel_io *alloc_parallel(void *data) @@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data) if (rv) { rv->data = data; kref_init(&rv->refcnt); - rv->bse_count = 0; } return rv; } @@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref) struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); dprintk("%s enter\n", __func__); - p->pnfs_callback(p->data, p->bse_count); + p->pnfs_callback(p->data); kfree(p); } @@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio) return NULL; } -static struct bio *bl_alloc_init_bio(int npg, sector_t isect, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par) +static struct bio * +bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, + void (*end_io)(struct bio *, int err), struct parallel_io *par) { struct bio *bio; @@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, } if (bio) { - bio->bi_iter.bi_sector = isect - be->be_f_offset + - be->be_v_offset; - bio->bi_bdev = be->be_mdev; + bio->bi_iter.bi_sector = disk_sector; + bio->bi_bdev = bdev; bio->bi_end_io = end_io; bio->bi_private = par; } return bio; } -static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, - sector_t isect, struct page *page, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par, - unsigned int offset, int len) +static struct bio * +do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, + struct page *page, struct pnfs_block_dev_map *map, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par, unsigned int offset, 
int *len) { - isect = isect + (offset >> SECTOR_SHIFT); + struct pnfs_block_dev *dev = + container_of(be->be_device, struct pnfs_block_dev, node); + u64 disk_addr, end; + dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, - npg, rw, (unsigned long long)isect, offset, len); + npg, rw, (unsigned long long)isect, offset, *len); + + /* translate to device offset */ + isect += be->be_v_offset; + isect -= be->be_f_offset; + + /* translate to physical disk offset */ + disk_addr = (u64)isect << SECTOR_SHIFT; + if (disk_addr < map->start || disk_addr >= map->start + map->len) { + if (!dev->map(dev, disk_addr, map)) + return ERR_PTR(-EIO); + bio = bl_submit_bio(rw, bio); + } + disk_addr += map->disk_offset; + disk_addr -= map->start; + + /* limit length to what the device mapping allows */ + end = disk_addr + *len; + if (end >= map->start + map->len) + *len = map->start + map->len - disk_addr; + retry: if (!bio) { - bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + bio = bl_alloc_init_bio(npg, map->bdev, + disk_addr >> SECTOR_SHIFT, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); } - if (bio_add_page(bio, page, len, offset) < len) { + if (bio_add_page(bio, page, *len, offset) < *len) { bio = bl_submit_bio(rw, bio); goto retry; } return bio; } -static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, - sector_t isect, struct page *page, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par) -{ - return do_add_page_to_bio(bio, npg, rw, isect, page, be, - end_io, par, 0, PAGE_CACHE_SIZE); -} - -/* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - struct bio_vec *bvec; - int i; - - if (!err) - bio_for_each_segment_all(bvec, bio, i) - SetPageUptodate(bvec->bv_page); if (err) { struct nfs_pgio_header *header = par->data; @@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err) 
header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); } + bio_put(bio); put_parallel(par); } @@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work) } static void -bl_end_par_io_read(void *data, int unused) +bl_end_par_io_read(void *data) { struct nfs_pgio_header *hdr = data; @@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused) } static enum pnfs_try_status -bl_read_pagelist(struct nfs_pgio_header *hdr) +bl_read_pagelist(struct nfs_pgio_header *header) { - struct nfs_pgio_header *header = hdr; - int i, hole; + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; struct bio *bio = NULL; - struct pnfs_block_extent *be = NULL, *cow_read = NULL; + struct pnfs_block_extent be; sector_t isect, extent_length = 0; struct parallel_io *par; - loff_t f_offset = hdr->args.offset; - size_t bytes_left = hdr->args.count; + loff_t f_offset = header->args.offset; + size_t bytes_left = header->args.count; unsigned int pg_offset, pg_len; - struct page **pages = hdr->args.pages; - int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT; + struct page **pages = header->args.pages; + int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; const bool is_dio = (header->dreq != NULL); + struct blk_plug plug; + int i; dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, - hdr->page_array.npages, f_offset, - (unsigned int)hdr->args.count); + header->page_array.npages, f_offset, + (unsigned int)header->args.count); - par = alloc_parallel(hdr); + par = alloc_parallel(header); if (!par) - goto use_mds; + return PNFS_NOT_ATTEMPTED; par->pnfs_callback = bl_end_par_io_read; - /* At this point, we can no longer jump to use_mds */ + + blk_start_plug(&plug); isect = (sector_t) (f_offset >> SECTOR_SHIFT); /* Code assumes extents are page-aligned */ - for (i = pg_index; i < hdr->page_array.npages; i++) { - if (!extent_length) { + for (i = pg_index; i < header->page_array.npages; i++) { + if 
(extent_length <= 0) { /* We've used up the previous extent */ - bl_put_extent(be); - bl_put_extent(cow_read); bio = bl_submit_bio(READ, bio); + /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, &cow_read); - if (!be) { + if (!ext_tree_lookup(bl, isect, &be, false)) { header->pnfs_error = -EIO; goto out; } - extent_length = be->be_length - - (isect - be->be_f_offset); - if (cow_read) { - sector_t cow_length = cow_read->be_length - - (isect - cow_read->be_f_offset); - extent_length = min(extent_length, cow_length); - } + extent_length = be.be_length - (isect - be.be_f_offset); } + pg_offset = f_offset & ~PAGE_CACHE_MASK; if (is_dio) { - pg_offset = f_offset & ~PAGE_CACHE_MASK; if (pg_offset + bytes_left > PAGE_CACHE_SIZE) pg_len = PAGE_CACHE_SIZE - pg_offset; else pg_len = bytes_left; - - f_offset += pg_len; - bytes_left -= pg_len; - isect += (pg_offset >> SECTOR_SHIFT); } else { - pg_offset = 0; + BUG_ON(pg_offset != 0); pg_len = PAGE_CACHE_SIZE; } - hole = is_hole(be, isect); - if (hole && !cow_read) { + isect += (pg_offset >> SECTOR_SHIFT); + extent_length -= (pg_offset >> SECTOR_SHIFT); + + if (is_hole(&be)) { bio = bl_submit_bio(READ, bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); zero_user_segment(pages[i], pg_offset, pg_len); - print_page(pages[i]); - SetPageUptodate(pages[i]); - } else { - struct pnfs_block_extent *be_read; - be_read = (hole && cow_read) ? 
cow_read : be; + /* invalidate map */ + map.start = NFS4_MAX_UINT64; + } else { bio = do_add_page_to_bio(bio, - hdr->page_array.npages - i, + header->page_array.npages - i, READ, - isect, pages[i], be_read, + isect, pages[i], &map, &be, bl_end_io_read, par, - pg_offset, pg_len); + pg_offset, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; @@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr) } } isect += (pg_len >> SECTOR_SHIFT); - extent_length -= PAGE_CACHE_SECTORS; + extent_length -= (pg_len >> SECTOR_SHIFT); + f_offset += pg_len; + bytes_left -= pg_len; } if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { - hdr->res.eof = 1; - hdr->res.count = header->inode->i_size - hdr->args.offset; + header->res.eof = 1; + header->res.count = header->inode->i_size - header->args.offset; } else { - hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset; + header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; } out: - bl_put_extent(be); - bl_put_extent(cow_read); bl_submit_bio(READ, bio); + blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; - - use_mds: - dprintk("Giving up and using normal NFS\n"); - return PNFS_NOT_ATTEMPTED; -} - -static void mark_extents_written(struct pnfs_block_layout *bl, - __u64 offset, __u32 count) -{ - sector_t isect, end; - struct pnfs_block_extent *be; - struct pnfs_block_short_extent *se; - - dprintk("%s(%llu, %u)\n", __func__, offset, count); - if (count == 0) - return; - isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; - end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); - end >>= SECTOR_SHIFT; - while (isect < end) { - sector_t len; - be = bl_find_get_extent(bl, isect, NULL); - BUG_ON(!be); /* FIXME */ - len = min(end, be->be_f_offset + be->be_length) - isect; - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - se = bl_pop_one_short_extent(be->be_inval); - BUG_ON(!se); - bl_mark_for_commit(be, isect, len, se); - } - isect += len; - 
bl_put_extent(be); - } -} - -static void bl_end_io_write_zero(struct bio *bio, int err) -{ - struct parallel_io *par = bio->bi_private; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - /* This is the zeroing page we added */ - end_page_writeback(bvec->bv_page); - page_cache_release(bvec->bv_page); - } - - if (unlikely(err)) { - struct nfs_pgio_header *header = par->data; - - if (!header->pnfs_error) - header->pnfs_error = -EIO; - pnfs_set_lo_fail(header->lseg); - } - bio_put(bio); - put_parallel(par); } static void bl_end_io_write(struct bio *bio, int err) @@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err) */ static void bl_write_cleanup(struct work_struct *work) { - struct rpc_task *task; - struct nfs_pgio_header *hdr; + struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work); + struct nfs_pgio_header *hdr = + container_of(task, struct nfs_pgio_header, task); + dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); + if (likely(!hdr->pnfs_error)) { - /* Marks for LAYOUTCOMMIT */ - mark_extents_written(BLK_LSEG2EXT(hdr->lseg), - hdr->args.offset, hdr->args.count); + struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg); + u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK; + u64 end = (hdr->args.offset + hdr->args.count + + PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK; + + ext_tree_mark_written(bl, start >> SECTOR_SHIFT, + (end - start) >> SECTOR_SHIFT); } + pnfs_ld_write_done(hdr); } /* Called when last of bios associated with a bl_write_pagelist call finishes */ -static void bl_end_par_io_write(void *data, int num_se) +static void bl_end_par_io_write(void *data) { struct nfs_pgio_header *hdr = data; - if (unlikely(hdr->pnfs_error)) { - bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval, - num_se); - } - hdr->task.tk_status = hdr->pnfs_error; hdr->verf.committed = NFS_FILE_SYNC; 
INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); schedule_work(&hdr->task.u.tk_work); } -/* FIXME STUB - mark intersection of layout and page as bad, so is not - * used again. - */ -static void mark_bad_read(void) -{ - return; -} - -/* - * map_block: map a requested I/0 block (isect) into an offset in the LVM - * block_device - */ -static void -map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) -{ - dprintk("%s enter be=%p\n", __func__, be); - - set_buffer_mapped(bh); - bh->b_bdev = be->be_mdev; - bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> - (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); - - dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", - __func__, (unsigned long long)isect, (long)bh->b_blocknr, - bh->b_size); - return; -} - -static void -bl_read_single_end_io(struct bio *bio, int error) -{ - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct page *page = bvec->bv_page; - - /* Only one page in bvec */ - unlock_page(page); -} - -static int -bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, - unsigned int offset, unsigned int len) -{ - struct bio *bio; - struct page *shadow_page; - sector_t isect; - char *kaddr, *kshadow_addr; - int ret = 0; - - dprintk("%s: offset %u len %u\n", __func__, offset, len); - - shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (shadow_page == NULL) - return -ENOMEM; - - bio = bio_alloc(GFP_NOIO, 1); - if (bio == NULL) - return -ENOMEM; - - isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + - (offset / SECTOR_SIZE); - - bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; - bio->bi_bdev = be->be_mdev; - bio->bi_end_io = bl_read_single_end_io; - - lock_page(shadow_page); - if (bio_add_page(bio, shadow_page, - SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) { - unlock_page(shadow_page); - bio_put(bio); - return -EIO; - } - - submit_bio(READ, bio); - wait_on_page_locked(shadow_page); - if 
(unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) { - ret = -EIO; - } else { - kaddr = kmap_atomic(page); - kshadow_addr = kmap_atomic(shadow_page); - memcpy(kaddr + offset, kshadow_addr + offset, len); - kunmap_atomic(kshadow_addr); - kunmap_atomic(kaddr); - } - __free_page(shadow_page); - bio_put(bio); - - return ret; -} - -static int -bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be, - unsigned int dirty_offset, unsigned int dirty_len, - bool full_page) -{ - int ret = 0; - unsigned int start, end; - - if (full_page) { - start = 0; - end = PAGE_CACHE_SIZE; - } else { - start = round_down(dirty_offset, SECTOR_SIZE); - end = round_up(dirty_offset + dirty_len, SECTOR_SIZE); - } - - dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len); - if (!be) { - zero_user_segments(page, start, dirty_offset, - dirty_offset + dirty_len, end); - if (start == 0 && end == PAGE_CACHE_SIZE && - trylock_page(page)) { - SetPageUptodate(page); - unlock_page(page); - } - return ret; - } - - if (start != dirty_offset) - ret = bl_do_readpage_sync(page, be, start, dirty_offset - start); - - if (!ret && (dirty_offset + dirty_len < end)) - ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len, - end - dirty_offset - dirty_len); - - return ret; -} - -/* Given an unmapped page, zero it or read in page for COW, page is locked - * by caller. 
- */ -static int -init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) -{ - struct buffer_head *bh = NULL; - int ret = 0; - sector_t isect; - - dprintk("%s enter, %p\n", __func__, page); - BUG_ON(PageUptodate(page)); - if (!cow_read) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - goto cleanup; - } - - bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); - if (!bh) { - ret = -ENOMEM; - goto cleanup; - } - - isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; - map_block(bh, isect, cow_read); - if (!bh_uptodate_or_lock(bh)) - ret = bh_submit_read(bh); - if (ret) - goto cleanup; - SetPageUptodate(page); - -cleanup: - if (bh) - free_buffer_head(bh); - if (ret) { - /* Need to mark layout with bad read...should now - * just use nfs4 for reads and writes. - */ - mark_bad_read(); - } - return ret; -} - -/* Find or create a zeroing page marked being writeback. - * Return ERR_PTR on error, NULL to indicate skip this page and page itself - * to indicate write out. 
- */ -static struct page * -bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, - struct pnfs_block_extent *cow_read) -{ - struct page *page; - int locked = 0; - page = find_get_page(inode->i_mapping, index); - if (page) - goto check_page; - - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (unlikely(!page)) { - dprintk("%s oom\n", __func__); - return ERR_PTR(-ENOMEM); - } - locked = 1; - -check_page: - /* PageDirty: Other will write this out - * PageWriteback: Other is writing this out - * PageUptodate: It was read before - */ - if (PageDirty(page) || PageWriteback(page)) { - print_page(page); - if (locked) - unlock_page(page); - page_cache_release(page); - return NULL; - } - - if (!locked) { - lock_page(page); - locked = 1; - goto check_page; - } - if (!PageUptodate(page)) { - /* New page, readin or zero it */ - init_page_for_write(page, cow_read); - } - set_page_writeback(page); - unlock_page(page); - - return page; -} - static enum pnfs_try_status bl_write_pagelist(struct nfs_pgio_header *header, int sync) { - int i, ret, npg_zero, pg_index, last = 0; + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; struct bio *bio = NULL; - struct pnfs_block_extent *be = NULL, *cow_read = NULL; - sector_t isect, last_isect = 0, extent_length = 0; + struct pnfs_block_extent be; + sector_t isect, extent_length = 0; struct parallel_io *par = NULL; loff_t offset = header->args.offset; size_t count = header->args.count; - unsigned int pg_offset, pg_len, saved_len; struct page **pages = header->args.pages; - struct page *page; - pgoff_t index; - u64 temp; - int npg_per_block = - NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + unsigned int pg_len; + struct blk_plug plug; + int i; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); - if (header->dreq != NULL && - (!IS_ALIGNED(offset, 
NFS_SERVER(header->inode)->pnfs_blksize) || - !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) { - dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n"); - goto out_mds; - } /* At this point, header->page_aray is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error * to have it redone using nfs. */ par = alloc_parallel(header); if (!par) - goto out_mds; + return PNFS_NOT_ATTEMPTED; par->pnfs_callback = bl_end_par_io_write; - /* At this point, have to be more careful with error handling */ - isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read); - if (!be || !is_writable(be, isect)) { - dprintk("%s no matching extents!\n", __func__); - goto out_mds; - } + blk_start_plug(&plug); - /* First page inside INVALID extent */ - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (likely(!bl_push_one_short_extent(be->be_inval))) - par->bse_count++; - else - goto out_mds; - temp = offset >> PAGE_CACHE_SHIFT; - npg_zero = do_div(temp, npg_per_block); - isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & - (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); - extent_length = be->be_length - (isect - be->be_f_offset); - -fill_invalid_ext: - dprintk("%s need to zero %d pages\n", __func__, npg_zero); - for (;npg_zero > 0; npg_zero--) { - if (bl_is_sector_init(be->be_inval, isect)) { - dprintk("isect %llu already init\n", - (unsigned long long)isect); - goto next_page; - } - /* page ref released in bl_end_io_write_zero */ - index = isect >> PAGE_CACHE_SECTOR_SHIFT; - dprintk("%s zero %dth page: index %lu isect %llu\n", - __func__, npg_zero, index, - (unsigned long long)isect); - page = bl_find_get_zeroing_page(header->inode, index, - cow_read); - if (unlikely(IS_ERR(page))) { - header->pnfs_error = PTR_ERR(page); - goto out; - } else if (page == NULL) - goto next_page; - - ret = bl_mark_sectors_init(be->be_inval, isect, - 
PAGE_CACHE_SECTORS); - if (unlikely(ret)) { - dprintk("%s bl_mark_sectors_init fail %d\n", - __func__, ret); - end_page_writeback(page); - page_cache_release(page); - header->pnfs_error = ret; - goto out; - } - if (likely(!bl_push_one_short_extent(be->be_inval))) - par->bse_count++; - else { - end_page_writeback(page); - page_cache_release(page); - header->pnfs_error = -ENOMEM; - goto out; - } - /* FIXME: This should be done in bi_end_io */ - mark_extents_written(BLK_LSEG2EXT(header->lseg), - page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE); - - bio = bl_add_page_to_bio(bio, npg_zero, WRITE, - isect, page, be, - bl_end_io_write_zero, par); - if (IS_ERR(bio)) { - header->pnfs_error = PTR_ERR(bio); - bio = NULL; - goto out; - } -next_page: - isect += PAGE_CACHE_SECTORS; - extent_length -= PAGE_CACHE_SECTORS; - } - if (last) - goto write_done; - } - bio = bl_submit_bio(WRITE, bio); + /* we always write out the whole page */ + offset = offset & (loff_t)PAGE_CACHE_MASK; + isect = offset >> SECTOR_SHIFT; - /* Middle pages */ - pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; for (i = pg_index; i < header->page_array.npages; i++) { - if (!extent_length) { + if (extent_length <= 0) { /* We've used up the previous extent */ - bl_put_extent(be); - bl_put_extent(cow_read); bio = bl_submit_bio(WRITE, bio); /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, &cow_read); - if (!be || !is_writable(be, isect)) { + if (!ext_tree_lookup(bl, isect, &be, true)) { header->pnfs_error = -EINVAL; goto out; } - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (likely(!bl_push_one_short_extent( - be->be_inval))) - par->bse_count++; - else { - header->pnfs_error = -ENOMEM; - goto out; - } - } - extent_length = be->be_length - - (isect - be->be_f_offset); - } - - dprintk("%s offset %lld count %Zu\n", __func__, offset, count); - pg_offset = offset & ~PAGE_CACHE_MASK; - if (pg_offset + count > PAGE_CACHE_SIZE) - pg_len = PAGE_CACHE_SIZE - pg_offset; - 
else - pg_len = count; - - saved_len = pg_len; - if (be->be_state == PNFS_BLOCK_INVALID_DATA && - !bl_is_sector_init(be->be_inval, isect)) { - ret = bl_read_partial_page_sync(pages[i], cow_read, - pg_offset, pg_len, true); - if (ret) { - dprintk("%s bl_read_partial_page_sync fail %d\n", - __func__, ret); - header->pnfs_error = ret; - goto out; - } - - ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS); - if (unlikely(ret)) { - dprintk("%s bl_mark_sectors_init fail %d\n", - __func__, ret); - header->pnfs_error = ret; - goto out; - } - /* Expand to full page write */ - pg_offset = 0; - pg_len = PAGE_CACHE_SIZE; - } else if ((pg_offset & (SECTOR_SIZE - 1)) || - (pg_len & (SECTOR_SIZE - 1))){ - /* ahh, nasty case. We have to do sync full sector - * read-modify-write cycles. - */ - unsigned int saved_offset = pg_offset; - ret = bl_read_partial_page_sync(pages[i], be, pg_offset, - pg_len, false); - pg_offset = round_down(pg_offset, SECTOR_SIZE); - pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) - - pg_offset; + extent_length = be.be_length - (isect - be.be_f_offset); } - + pg_len = PAGE_CACHE_SIZE; bio = do_add_page_to_bio(bio, header->page_array.npages - i, - WRITE, - isect, pages[i], be, + WRITE, isect, pages[i], &map, &be, bl_end_io_write, par, - pg_offset, pg_len); + 0, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } - offset += saved_len; - count -= saved_len; - isect += PAGE_CACHE_SECTORS; - last_isect = isect; - extent_length -= PAGE_CACHE_SECTORS; - } - /* Last page inside INVALID extent */ - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - bio = bl_submit_bio(WRITE, bio); - temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; - npg_zero = npg_per_block - do_div(temp, npg_per_block); - if (npg_zero < npg_per_block) { - last = 1; - goto fill_invalid_ext; - } + offset += pg_len; + count -= pg_len; + isect += (pg_len >> SECTOR_SHIFT); + extent_length -= (pg_len >> SECTOR_SHIFT); } -write_done: 
header->res.count = header->args.count; out: - bl_put_extent(be); - bl_put_extent(cow_read); bl_submit_bio(WRITE, bio); + blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; -out_mds: - bl_put_extent(be); - bl_put_extent(cow_read); - kfree(par); - return PNFS_NOT_ATTEMPTED; -} - -/* FIXME - range ignored */ -static void -release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) -{ - int i; - struct pnfs_block_extent *be; - - spin_lock(&bl->bl_ext_lock); - for (i = 0; i < EXTENT_LISTS; i++) { - while (!list_empty(&bl->bl_extents[i])) { - be = list_first_entry(&bl->bl_extents[i], - struct pnfs_block_extent, - be_node); - list_del(&be->be_node); - bl_put_extent(be); - } - } - spin_unlock(&bl->bl_ext_lock); -} - -static void -release_inval_marks(struct pnfs_inval_markings *marks) -{ - struct pnfs_inval_tracking *pos, *temp; - struct pnfs_block_short_extent *se, *stemp; - - list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { - list_del(&pos->it_link); - kfree(pos); - } - - list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { - list_del(&se->bse_node); - kfree(se); - } - return; } static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) { struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int err; dprintk("%s enter\n", __func__); - release_extents(bl, NULL); - release_inval_marks(&bl->bl_inval); + + err = ext_tree_remove(bl, true, 0, LLONG_MAX); + WARN_ON(err); + kfree(bl); } @@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, bl = kzalloc(sizeof(*bl), gfp_flags); if (!bl) return NULL; + + bl->bl_ext_rw = RB_ROOT; + bl->bl_ext_ro = RB_ROOT; spin_lock_init(&bl->bl_ext_lock); - INIT_LIST_HEAD(&bl->bl_extents[0]); - INIT_LIST_HEAD(&bl->bl_extents[1]); - INIT_LIST_HEAD(&bl->bl_commit); - INIT_LIST_HEAD(&bl->bl_committing); - bl->bl_count = 0; - bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; - BL_INIT_INVAL_MARKS(&bl->bl_inval, 
bl->bl_blocksize); + return &bl->bl_layout; } @@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg) kfree(lseg); } -/* We pretty much ignore lseg, and store all data layout wide, so we - * can correctly merge. - */ -static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, - gfp_t gfp_flags) -{ - struct pnfs_layout_segment *lseg; - int status; +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; - dprintk("%s enter\n", __func__); - lseg = kzalloc(sizeof(*lseg), gfp_flags); - if (!lseg) - return ERR_PTR(-ENOMEM); - status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); - if (status) { - /* We don't want to call the full-blown bl_free_lseg, - * since on error extents were not touched. - */ - kfree(lseg); - return ERR_PTR(status); +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. 
+ */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; } - return lseg; + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. 
+ */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; } -static void -bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg) +static int decode_sector_number(__be32 **rp, sector_t *sp) { - dprintk("%s enter\n", __func__); - encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; } -static void -bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +static int +bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, + struct layout_verification *lv, struct list_head *extents, + gfp_t gfp_mask) { - struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + struct pnfs_block_extent *be; + struct nfs4_deviceid id; + int error; + __be32 *p; - dprintk("%s enter\n", __func__); - clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); -} + p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE); + if (!p) + return -EIO; -static void free_blk_mountid(struct block_mount_id *mid) -{ - if (mid) { - struct pnfs_block_dev *dev, *tmp; + be = kzalloc(sizeof(*be), GFP_NOFS); + if (!be) + return -ENOMEM; - /* No need to take bm_lock as we are last user freeing bm_devlist */ - list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { - list_del(&dev->bm_node); - bl_free_block_dev(dev); - } - kfree(mid); + memcpy(&id, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + error = -EIO; + be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + lo->plh_lc_cred, gfp_mask); + if (!be->be_device) + goto out_free_be; + + /* + * The next three values are read in as bytes, but stored in the + * extent structure in 512-byte granularity. 
+ */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_put_deviceid; + be->be_state = be32_to_cpup(p++); + + error = verify_extent(be, lv); + if (error) { + dprintk("%s: extent verification failed\n", __func__); + goto out_put_deviceid; } + + list_add_tail(&be->be_list, extents); + return 0; + +out_put_deviceid: + nfs4_put_deviceid_node(be->be_device); +out_free_be: + kfree(be); + return error; } -/* This is mostly copied from the filelayout_get_device_info function. - * It seems much of this should be at the generic pnfs level. - */ -static struct pnfs_block_dev * -nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, - struct nfs4_deviceid *d_id) +static struct pnfs_layout_segment * +bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, + gfp_t gfp_mask) { - struct pnfs_device *dev; - struct pnfs_block_dev *rv; - u32 max_resp_sz; - int max_pages; - struct page **pages = NULL; - int i, rc; + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + struct pnfs_layout_segment *lseg; + struct xdr_buf buf; + struct xdr_stream xdr; + struct page *scratch; + int status, i; + uint32_t count; + __be32 *p; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + lseg = kzalloc(sizeof(*lseg), gfp_mask); + if (!lseg) + return ERR_PTR(-ENOMEM); + + status = -ENOMEM; + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, + lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + status = -EIO; + p = xdr_inline_decode(&xdr, 4); + if (unlikely(!p)) + goto 
out_free_scratch; + + count = be32_to_cpup(p++); + dprintk("%s: number of extents %d\n", __func__, count); /* - * Use the session max response size as the basis for setting - * GETDEVICEINFO's maxcount + * Decode individual extents, putting them in temporary staging area + * until whole layout is decoded to make error recovery easier. */ - max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - max_pages = nfs_page_array_len(0, max_resp_sz); - dprintk("%s max_resp_sz %u max_pages %d\n", - __func__, max_resp_sz, max_pages); - - dev = kmalloc(sizeof(*dev), GFP_NOFS); - if (!dev) { - dprintk("%s kmalloc failed\n", __func__); - return ERR_PTR(-ENOMEM); + for (i = 0; i < count; i++) { + status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask); + if (status) + goto process_extents; } - pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - kfree(dev); - return ERR_PTR(-ENOMEM); + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + status = -EIO; + goto process_extents; } - for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) { - rv = ERR_PTR(-ENOMEM); - goto out_free; - } + + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + status = -EIO; } - memcpy(&dev->dev_id, d_id, sizeof(*d_id)); - dev->layout_type = LAYOUT_BLOCK_VOLUME; - dev->pages = pages; - dev->pgbase = 0; - dev->pglen = PAGE_SIZE * max_pages; - dev->mincount = 0; - dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - - dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); - rc = nfs4_proc_getdeviceinfo(server, dev, NULL); - dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) { - rv = ERR_PTR(rc); - goto out_free; +process_extents: + while (!list_empty(&extents)) { + struct pnfs_block_extent *be = + list_first_entry(&extents, struct pnfs_block_extent, + be_list); + list_del(&be->be_list); + + 
if (!status) + status = ext_tree_insert(bl, be); + + if (status) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } } - rv = nfs4_blk_decode_device(server, dev); - out_free: - for (i = 0; i < max_pages; i++) - __free_page(pages[i]); - kfree(pages); - kfree(dev); - return rv; +out_free_scratch: + __free_page(scratch); +out: + dprintk("%s returns %d\n", __func__, status); + if (status) { + kfree(lseg); + return ERR_PTR(status); + } + return lseg; } -static int -bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +static void +bl_return_range(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) { - struct block_mount_id *b_mt_id = NULL; - struct pnfs_devicelist *dlist = NULL; - struct pnfs_block_dev *bdev; - LIST_HEAD(block_disklist); - int status, i; - - dprintk("%s enter\n", __func__); + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + sector_t offset = range->offset >> SECTOR_SHIFT, end; - if (server->pnfs_blksize == 0) { - dprintk("%s Server did not return blksize\n", __func__); - return -EINVAL; - } - b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); - if (!b_mt_id) { - status = -ENOMEM; - goto out_error; - } - /* Initialize nfs4 block layout mount id */ - spin_lock_init(&b_mt_id->bm_lock); - INIT_LIST_HEAD(&b_mt_id->bm_devlist); - - dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); - if (!dlist) { - status = -ENOMEM; - goto out_error; + if (range->offset % 8) { + dprintk("%s: offset %lld not block size aligned\n", + __func__, range->offset); + return; } - dlist->eof = 0; - while (!dlist->eof) { - status = nfs4_proc_getdevicelist(server, fh, dlist); - if (status) - goto out_error; - dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", - __func__, dlist->num_devs, dlist->eof); - for (i = 0; i < dlist->num_devs; i++) { - bdev = nfs4_blk_get_deviceinfo(server, fh, - &dlist->dev_id[i]); - if (IS_ERR(bdev)) { - status = PTR_ERR(bdev); - goto out_error; - } - spin_lock(&b_mt_id->bm_lock); - list_add(&bdev->bm_node, 
&b_mt_id->bm_devlist); - spin_unlock(&b_mt_id->bm_lock); + + if (range->length != NFS4_MAX_UINT64) { + if (range->length % 8) { + dprintk("%s: length %lld not block size aligned\n", + __func__, range->length); + return; } - } - dprintk("%s SUCCESS\n", __func__); - server->pnfs_ld_data = b_mt_id; - out_return: - kfree(dlist); - return status; + end = offset + (range->length >> SECTOR_SHIFT); + } else { + end = round_down(NFS4_MAX_UINT64, PAGE_SIZE); + } - out_error: - free_blk_mountid(b_mt_id); - goto out_return; + ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end); } static int -bl_clear_layoutdriver(struct nfs_server *server) +bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg) +{ + return ext_tree_prepare_commit(arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) { - struct block_mount_id *b_mt_id = server->pnfs_ld_data; + ext_tree_mark_committed(&lcdata->args, lcdata->res.status); +} +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ dprintk("%s enter\n", __func__); - free_blk_mountid(b_mt_id); - dprintk("%s RETURNS\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return blksize\n", __func__); + return -EINVAL; + } + if (server->pnfs_blksize > PAGE_SIZE) { + printk(KERN_ERR "%s: pNFS blksize %d not supported.\n", + __func__, server->pnfs_blksize); + return -EINVAL; + } + return 0; } static bool -is_aligned_req(struct nfs_page *req, unsigned int alignment) +is_aligned_req(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, unsigned int alignment) { - return IS_ALIGNED(req->wb_offset, alignment) && - IS_ALIGNED(req->wb_bytes, alignment); + /* + * Always accept buffered writes, higher layers take care of the + * right alignment. 
+ */ + if (pgio->pg_dreq == NULL) + return true; + + if (!IS_ALIGNED(req->wb_offset, alignment)) + return false; + + if (IS_ALIGNED(req->wb_bytes, alignment)) + return true; + + if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) { + /* + * If the write goes up to the inode size, just write + * the full page. Data past the inode size is + * guaranteed to be zeroed by the higher level client + * code, and this behaviour is mandated by RFC 5663 + * section 2.3.2. + */ + return true; + } + + return false; } static void bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, SECTOR_SIZE)) + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) { nfs_pageio_reset_read_mds(pgio); - else - pnfs_generic_pg_init_read(pgio, req); + return; + } + + pnfs_generic_pg_init_read(pgio, req); } /* @@ -1196,10 +796,8 @@ static size_t bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, SECTOR_SIZE)) + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) return 0; - return pnfs_generic_pg_test(pgio, prev, req); } @@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) static void bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, PAGE_CACHE_SIZE)) { + u64 wb_size; + + if (!is_aligned_req(pgio, req, PAGE_SIZE)) { nfs_pageio_reset_write_mds(pgio); - } else { - u64 wb_size; - if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); - else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pnfs_generic_pg_init_write(pgio, req, wb_size); + return; } + + if (pgio->pg_dreq == NULL) + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, + req->wb_index); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); } /* @@ -1252,10 +851,8 
@@ static size_t bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, PAGE_CACHE_SIZE)) + if (!is_aligned_req(pgio, req, PAGE_SIZE)) return 0; - return pnfs_generic_pg_test(pgio, prev, req); } @@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .id = LAYOUT_BLOCK_VOLUME, .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, .alloc_layout_hdr = bl_alloc_layout_hdr, .free_layout_hdr = bl_free_layout_hdr, .alloc_lseg = bl_alloc_lseg, .free_lseg = bl_free_lseg, - .encode_layoutcommit = bl_encode_layoutcommit, + .return_range = bl_return_range, + .prepare_layoutcommit = bl_prepare_layoutcommit, .cleanup_layoutcommit = bl_cleanup_layoutcommit, .set_layoutdriver = bl_set_layoutdriver, - .clear_layoutdriver = bl_clear_layoutdriver, + .alloc_deviceid_node = bl_alloc_deviceid_node, + .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, }; -static const struct rpc_pipe_ops bl_upcall_ops = { - .upcall = rpc_pipe_generic_upcall, - .downcall = bl_pipe_downcall, - .destroy_msg = bl_pipe_destroy_msg, -}; - -static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - struct dentry *dir, *dentry; - - dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); - if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); - dput(dir); - return dentry; -} - -static void nfs4blocklayout_unregister_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); -} - -static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, - void *ptr) -{ - struct super_block *sb = ptr; - struct net *net = sb->s_fs_info; - struct nfs_net *nn = 
net_generic(net, nfs_net_id); - struct dentry *dentry; - int ret = 0; - - if (!try_module_get(THIS_MODULE)) - return 0; - - if (nn->bl_device_pipe == NULL) { - module_put(THIS_MODULE); - return 0; - } - - switch (event) { - case RPC_PIPEFS_MOUNT: - dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - nn->bl_device_pipe->dentry = dentry; - break; - case RPC_PIPEFS_UMOUNT: - if (nn->bl_device_pipe->dentry) - nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); - break; - default: - ret = -ENOTSUPP; - break; - } - module_put(THIS_MODULE); - return ret; -} - -static struct notifier_block nfs4blocklayout_block = { - .notifier_call = rpc_pipefs_event, -}; - -static struct dentry *nfs4blocklayout_register_net(struct net *net, - struct rpc_pipe *pipe) -{ - struct super_block *pipefs_sb; - struct dentry *dentry; - - pipefs_sb = rpc_get_sb_net(net); - if (!pipefs_sb) - return NULL; - dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); - rpc_put_sb_net(net); - return dentry; -} - -static void nfs4blocklayout_unregister_net(struct net *net, - struct rpc_pipe *pipe) -{ - struct super_block *pipefs_sb; - - pipefs_sb = rpc_get_sb_net(net); - if (pipefs_sb) { - nfs4blocklayout_unregister_sb(pipefs_sb, pipe); - rpc_put_sb_net(net); - } -} - -static int nfs4blocklayout_net_init(struct net *net) -{ - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; - - init_waitqueue_head(&nn->bl_wq); - nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); - if (IS_ERR(nn->bl_device_pipe)) - return PTR_ERR(nn->bl_device_pipe); - dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - rpc_destroy_pipe_data(nn->bl_device_pipe); - return PTR_ERR(dentry); - } - nn->bl_device_pipe->dentry = dentry; - return 0; -} - -static void nfs4blocklayout_net_exit(struct net *net) -{ - struct nfs_net *nn = net_generic(net, nfs_net_id); - - nfs4blocklayout_unregister_net(net, 
nn->bl_device_pipe); - rpc_destroy_pipe_data(nn->bl_device_pipe); - nn->bl_device_pipe = NULL; -} - -static struct pernet_operations nfs4blocklayout_net_ops = { - .init = nfs4blocklayout_net_init, - .exit = nfs4blocklayout_net_exit, -}; - static int __init nfs4blocklayout_init(void) { int ret; @@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void) ret = pnfs_register_layoutdriver(&blocklayout_type); if (ret) goto out; - - ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + ret = bl_init_pipefs(); if (ret) - goto out_remove; - ret = register_pernet_subsys(&nfs4blocklayout_net_ops); - if (ret) - goto out_notifier; -out: - return ret; + goto out_unregister; + return 0; -out_notifier: - rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); -out_remove: +out_unregister: pnfs_unregister_layoutdriver(&blocklayout_type); +out: return ret; } @@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void) dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", __func__); - rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); - unregister_pernet_subsys(&nfs4blocklayout_net_ops); + bl_cleanup_pipefs(); pnfs_unregister_layoutdriver(&blocklayout_type); } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 9838fb020473..92dca9e90d8d 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -44,105 +44,112 @@ #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) #define SECTOR_SIZE (1 << SECTOR_SHIFT) -struct block_mount_id { - spinlock_t bm_lock; /* protects list */ - struct list_head bm_devlist; /* holds pnfs_block_dev */ -}; +struct pnfs_block_dev; -struct pnfs_block_dev { - struct list_head bm_node; - struct nfs4_deviceid bm_mdevid; /* associated devid */ - struct block_device *bm_mdev; /* meta device itself */ - struct net *net; +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + 
PNFS_BLOCK_VOLUME_STRIPE = 3, }; -enum exstate4 { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ - PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +#define PNFS_BLOCK_MAX_UUIDS 4 +#define PNFS_BLOCK_MAX_DEVICES 64 + +/* + * Random upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol. + */ +#define PNFS_BLOCK_UUID_LEN 128 + + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + int len; + int nr_sigs; + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } sigs[PNFS_BLOCK_MAX_UUIDS]; + } simple; + struct { + u64 start; + u64 len; + u32 volume; + } slice; + struct { + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } concat; + struct { + u64 chunk_size; + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } stripe; + }; }; -#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ +struct pnfs_block_dev_map { + sector_t start; + sector_t len; -struct my_tree { - sector_t mtt_step_size; /* Internal sector alignment */ - struct list_head mtt_stub; /* Should be a radix tree */ + sector_t disk_offset; + struct block_device *bdev; }; -struct pnfs_inval_markings { - spinlock_t im_lock; - struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ - sector_t im_block_size; /* Server blocksize in sectors */ - struct list_head im_extents; /* Short extents for INVAL->RW conversion */ +struct pnfs_block_dev { + struct nfs4_deviceid_node node; + + u64 start; + u64 len; + + u32 nr_children; + struct pnfs_block_dev *children; + u64 chunk_size; + + struct block_device *bdev; + u64 disk_offset; + + bool (*map)(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map); }; -struct pnfs_inval_tracking { - struct list_head it_link; - int it_sector; - int it_tags; +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + 
PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ }; /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { - struct kref be_refcnt; - struct list_head be_node; /* link into lseg list */ - struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ - struct block_device *be_mdev; + union { + struct rb_node be_node; + struct list_head be_list; + }; + struct nfs4_deviceid_node *be_device; sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting offset in the volume */ enum exstate4 be_state; /* the state of this extent */ - struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +#define EXTENT_WRITTEN 1 +#define EXTENT_COMMITTING 2 + unsigned int be_tag; }; -/* Shortened extent used by LAYOUTCOMMIT */ -struct pnfs_block_short_extent { - struct list_head bse_node; - struct nfs4_deviceid bse_devid; - struct block_device *bse_mdev; - sector_t bse_f_offset; /* the starting offset in the file */ - sector_t bse_length; /* the size of the extent */ -}; - -static inline void -BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) -{ - spin_lock_init(&marks->im_lock); - INIT_LIST_HEAD(&marks->im_tree.mtt_stub); - INIT_LIST_HEAD(&marks->im_extents); - marks->im_block_size = blocksize; - marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, - blocksize); -} - -enum extentclass4 { - RW_EXTENT = 0, /* READWRTE and INVAL */ - RO_EXTENT = 1, /* READ and NONE */ - EXTENT_LISTS = 2, -}; - -static inline int bl_choose_list(enum exstate4 state) -{ - if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) - return RO_EXTENT; - else - return RW_EXTENT; -} +/* on the wire size of the extent */ +#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) struct pnfs_block_layout { - struct pnfs_layout_hdr bl_layout; - struct 
pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + struct pnfs_layout_hdr bl_layout; + struct rb_root bl_ext_rw; + struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ - struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ - struct list_head bl_commit; /* Needs layout commit */ - struct list_head bl_committing; /* Layout committing */ - unsigned int bl_count; /* entries in bl_commit */ - sector_t bl_blocksize; /* Server blocksize in sectors */ }; -#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) - static inline struct pnfs_block_layout * BLK_LO2EXT(struct pnfs_layout_hdr *lo) { @@ -171,41 +178,27 @@ struct bl_msg_hdr { #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ -/* blocklayoutdev.c */ -ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); -void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -void nfs4_blkdev_put(struct block_device *bdev); -struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, - struct pnfs_device *dev); -int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); - -/* blocklayoutdm.c */ -void bl_free_block_dev(struct pnfs_block_dev *bdev); - -/* extents.c */ -struct pnfs_block_extent * -bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, - struct pnfs_block_extent **cow_read); -int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length); -void bl_put_extent(struct pnfs_block_extent *be); -struct pnfs_block_extent *bl_alloc_extent(void); -int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); -int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg); -void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - const struct 
nfs4_layoutcommit_args *arg, - int status); -int bl_add_merge_extent(struct pnfs_block_layout *bl, - struct pnfs_block_extent *new); -int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length, - struct pnfs_block_short_extent *new); -int bl_push_one_short_extent(struct pnfs_inval_markings *marks); -struct pnfs_block_short_extent * -bl_pop_one_short_extent(struct pnfs_inval_markings *marks); -void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); +/* dev.c */ +struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_mask); +void bl_free_deviceid_node(struct nfs4_deviceid_node *d); + +/* extent_tree.c */ +int ext_tree_insert(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, + sector_t end); +int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len); +bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw); +int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); +void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status); + +/* rpc_pipefs.c */ +dev_t bl_resolve_deviceid(struct nfs_server *server, + struct pnfs_block_volume *b, gfp_t gfp_mask); +int __init bl_init_pipefs(void); +void __exit bl_cleanup_pipefs(void); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c deleted file mode 100644 index 04303b5c9361..000000000000 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayoutdev.c - * - * Device operations for the pnfs nfs4 file layout driver. - * - * Copyright (c) 2006 The Regents of the University of Michigan. - * All rights reserved. 
- * - * Andy Adamson <andros@citi.umich.edu> - * Fred Isaman <iisaman@umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. 
- */ -#include <linux/module.h> -#include <linux/buffer_head.h> /* __bread */ - -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include <linux/hash.h> - -#include "blocklayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -static int decode_sector_number(__be32 **rp, sector_t *sp) -{ - uint64_t s; - - *rp = xdr_decode_hyper(*rp, &s); - if (s & 0x1ff) { - printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); - return -1; - } - *sp = s >> SECTOR_SHIFT; - return 0; -} - -/* - * Release the block device - */ -void nfs4_blkdev_put(struct block_device *bdev) -{ - dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - blkdev_put(bdev, FMODE_READ); -} - -ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, - size_t mlen) -{ - struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, - nfs_net_id); - - if (mlen != sizeof (struct bl_dev_msg)) - return -EINVAL; - - if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) - return -EFAULT; - - wake_up(&nn->bl_wq); - - return mlen; -} - -void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) -{ - struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); - - if (msg->errno >= 0) - return; - wake_up(bl_pipe_msg->bl_wq); -} - -/* - * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. 
- */ -struct pnfs_block_dev * -nfs4_blk_decode_device(struct nfs_server *server, - struct pnfs_device *dev) -{ - struct pnfs_block_dev *rv; - struct block_device *bd = NULL; - struct bl_pipe_msg bl_pipe_msg; - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; - struct bl_msg_hdr bl_msg = { - .type = BL_DEVICE_MOUNT, - .totallen = dev->mincount, - }; - uint8_t *dataptr; - DECLARE_WAITQUEUE(wq, current); - int offset, len, i, rc; - struct net *net = server->nfs_client->cl_net; - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct bl_dev_msg *reply = &nn->bl_mount_reply; - - dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); - dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, - dev->mincount); - - bl_pipe_msg.bl_wq = &nn->bl_wq; - memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); - if (!msg->data) { - rv = ERR_PTR(-ENOMEM); - goto out; - } - - memcpy(msg->data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg->data; - len = dev->mincount; - offset = sizeof(bl_msg); - for (i = 0; len > 0; i++) { - memcpy(&dataptr[offset], page_address(dev->pages[i]), - len < PAGE_CACHE_SIZE ? 
len : PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; - offset += PAGE_CACHE_SIZE; - } - msg->len = sizeof(bl_msg) + dev->mincount; - - dprintk("%s CALLING USERSPACE DAEMON\n", __func__); - add_wait_queue(&nn->bl_wq, &wq); - rc = rpc_queue_upcall(nn->bl_device_pipe, msg); - if (rc < 0) { - remove_wait_queue(&nn->bl_wq, &wq); - rv = ERR_PTR(rc); - goto out; - } - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nn->bl_wq, &wq); - - if (reply->status != BL_DEVICE_REQUEST_PROC) { - dprintk("%s failed to open device: %d\n", - __func__, reply->status); - rv = ERR_PTR(-EINVAL); - goto out; - } - - bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), - FMODE_READ, NULL); - if (IS_ERR(bd)) { - dprintk("%s failed to open device : %ld\n", __func__, - PTR_ERR(bd)); - rv = ERR_CAST(bd); - goto out; - } - - rv = kzalloc(sizeof(*rv), GFP_NOFS); - if (!rv) { - rv = ERR_PTR(-ENOMEM); - goto out; - } - - rv->bm_mdev = bd; - memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); - rv->net = net; - dprintk("%s Created device %s with bd_block_size %u\n", - __func__, - bd->bd_disk->disk_name, - bd->bd_block_size); - -out: - kfree(msg->data); - return rv; -} - -/* Map deviceid returned by the server to constructed block_device */ -static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, - struct nfs4_deviceid *id) -{ - struct block_device *rv = NULL; - struct block_mount_id *mid; - struct pnfs_block_dev *dev; - - dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); - mid = BLK_ID(lo); - spin_lock(&mid->bm_lock); - list_for_each_entry(dev, &mid->bm_devlist, bm_node) { - if (memcmp(id->data, dev->bm_mdevid.data, - NFS4_DEVICEID4_SIZE) == 0) { - rv = dev->bm_mdev; - goto out; - } - } - out: - spin_unlock(&mid->bm_lock); - dprintk("%s returning %p\n", __func__, rv); - return rv; -} - -/* Tracks info needed to ensure extents in layout obey constraints of spec */ -struct layout_verification { - 
u32 mode; /* R or RW */ - u64 start; /* Expected start of next non-COW extent */ - u64 inval; /* Start of INVAL coverage */ - u64 cowread; /* End of COW read coverage */ -}; - -/* Verify the extent meets the layout requirements of the pnfs-block draft, - * section 2.3.1. - */ -static int verify_extent(struct pnfs_block_extent *be, - struct layout_verification *lv) -{ - if (lv->mode == IOMODE_READ) { - if (be->be_state == PNFS_BLOCK_READWRITE_DATA || - be->be_state == PNFS_BLOCK_INVALID_DATA) - return -EIO; - if (be->be_f_offset != lv->start) - return -EIO; - lv->start += be->be_length; - return 0; - } - /* lv->mode == IOMODE_RW */ - if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { - if (be->be_f_offset != lv->start) - return -EIO; - if (lv->cowread > lv->start) - return -EIO; - lv->start += be->be_length; - lv->inval = lv->start; - return 0; - } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (be->be_f_offset != lv->start) - return -EIO; - lv->start += be->be_length; - return 0; - } else if (be->be_state == PNFS_BLOCK_READ_DATA) { - if (be->be_f_offset > lv->start) - return -EIO; - if (be->be_f_offset < lv->inval) - return -EIO; - if (be->be_f_offset < lv->cowread) - return -EIO; - /* It looks like you might want to min this with lv->start, - * but you really don't. 
- */ - lv->inval = lv->inval + be->be_length; - lv->cowread = be->be_f_offset + be->be_length; - return 0; - } else - return -EIO; -} - -/* XDR decode pnfs_block_layout4 structure */ -int -nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) -{ - struct pnfs_block_layout *bl = BLK_LO2EXT(lo); - int i, status = -EIO; - uint32_t count; - struct pnfs_block_extent *be = NULL, *save; - struct xdr_stream stream; - struct xdr_buf buf; - struct page *scratch; - __be32 *p; - struct layout_verification lv = { - .mode = lgr->range.iomode, - .start = lgr->range.offset >> SECTOR_SHIFT, - .inval = lgr->range.offset >> SECTOR_SHIFT, - .cowread = lgr->range.offset >> SECTOR_SHIFT, - }; - LIST_HEAD(extents); - - dprintk("---> %s\n", __func__); - - scratch = alloc_page(gfp_flags); - if (!scratch) - return -ENOMEM; - - xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err; - - count = be32_to_cpup(p++); - - dprintk("%s enter, number of extents %i\n", __func__, count); - p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); - if (unlikely(!p)) - goto out_err; - - /* Decode individual extents, putting them in temporary - * staging area until whole layout is decoded to make error - * recovery easier. 
- */ - for (i = 0; i < count; i++) { - be = bl_alloc_extent(); - if (!be) { - status = -ENOMEM; - goto out_err; - } - memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); - p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - be->be_mdev = translate_devid(lo, &be->be_devid); - if (!be->be_mdev) - goto out_err; - - /* The next three values are read in as bytes, - * but stored as 512-byte sector lengths - */ - if (decode_sector_number(&p, &be->be_f_offset) < 0) - goto out_err; - if (decode_sector_number(&p, &be->be_length) < 0) - goto out_err; - if (decode_sector_number(&p, &be->be_v_offset) < 0) - goto out_err; - be->be_state = be32_to_cpup(p++); - if (be->be_state == PNFS_BLOCK_INVALID_DATA) - be->be_inval = &bl->bl_inval; - if (verify_extent(be, &lv)) { - dprintk("%s verify failed\n", __func__); - goto out_err; - } - list_add_tail(&be->be_node, &extents); - } - if (lgr->range.offset + lgr->range.length != - lv.start << SECTOR_SHIFT) { - dprintk("%s Final length mismatch\n", __func__); - be = NULL; - goto out_err; - } - if (lv.start < lv.cowread) { - dprintk("%s Final uncovered COW extent\n", __func__); - be = NULL; - goto out_err; - } - /* Extents decoded properly, now try to merge them in to - * existing layout extents. - */ - spin_lock(&bl->bl_ext_lock); - list_for_each_entry_safe(be, save, &extents, be_node) { - list_del(&be->be_node); - status = bl_add_merge_extent(bl, be); - if (status) { - spin_unlock(&bl->bl_ext_lock); - /* This is a fairly catastrophic error, as the - * entire layout extent lists are now corrupted. - * We should have some way to distinguish this. 
- */ - be = NULL; - goto out_err; - } - } - spin_unlock(&bl->bl_ext_lock); - status = 0; - out: - __free_page(scratch); - dprintk("%s returns %i\n", __func__, status); - return status; - - out_err: - bl_put_extent(be); - while (!list_empty(&extents)) { - be = list_first_entry(&extents, struct pnfs_block_extent, - be_node); - list_del(&be->be_node); - bl_put_extent(be); - } - goto out; -} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c deleted file mode 100644 index 8999cfddd866..000000000000 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayoutdm.c - * - * Module for the NFSv4.1 pNFS block layout driver. - * - * Copyright (c) 2007 The Regents of the University of Michigan. - * All rights reserved. - * - * Fred Isaman <iisaman@umich.edu> - * Andy Adamson <andros@citi.umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. 
the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. - */ - -#include <linux/genhd.h> /* gendisk - used in a dprintk*/ -#include <linux/sched.h> -#include <linux/hash.h> - -#include "blocklayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -static void dev_remove(struct net *net, dev_t dev) -{ - struct bl_pipe_msg bl_pipe_msg; - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; - struct bl_dev_msg bl_umount_request; - struct bl_msg_hdr bl_msg = { - .type = BL_DEVICE_UMOUNT, - .totallen = sizeof(bl_umount_request), - }; - uint8_t *dataptr; - DECLARE_WAITQUEUE(wq, current); - struct nfs_net *nn = net_generic(net, nfs_net_id); - - dprintk("Entering %s\n", __func__); - - bl_pipe_msg.bl_wq = &nn->bl_wq; - memset(msg, 0, sizeof(*msg)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; - msg->data = kzalloc(msg->len, GFP_NOFS); - if (!msg->data) - goto out; - - memset(&bl_umount_request, 0, sizeof(bl_umount_request)); - bl_umount_request.major = MAJOR(dev); - bl_umount_request.minor = MINOR(dev); - - memcpy(msg->data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg->data; - memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - - add_wait_queue(&nn->bl_wq, &wq); - if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { - remove_wait_queue(&nn->bl_wq, &wq); - goto out; - } - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nn->bl_wq, &wq); - -out: - kfree(msg->data); -} - -/* - * Release meta device - */ -static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) -{ - dprintk("%s Releasing\n", __func__); - nfs4_blkdev_put(bdev->bm_mdev); - dev_remove(bdev->net, bdev->bm_mdev->bd_dev); -} - -void 
bl_free_block_dev(struct pnfs_block_dev *bdev) -{ - if (bdev) { - if (bdev->bm_mdev) { - dprintk("%s Removing DM device: %d:%d\n", - __func__, - MAJOR(bdev->bm_mdev->bd_dev), - MINOR(bdev->bm_mdev->bd_dev)); - nfs4_blk_metadev_release(bdev); - } - kfree(bdev); - } -} diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c new file mode 100644 index 000000000000..5aed4f98df41 --- /dev/null +++ b/fs/nfs/blocklayout/dev.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include <linux/sunrpc/svc.h> +#include <linux/blkdev.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_xdr.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +bl_free_device(struct pnfs_block_dev *dev) +{ + if (dev->nr_children) { + int i; + + for (i = 0; i < dev->nr_children; i++) + bl_free_device(&dev->children[i]); + kfree(dev->children); + } else { + if (dev->bdev) + blkdev_put(dev->bdev, FMODE_READ); + } +} + +void +bl_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct pnfs_block_dev *dev = + container_of(d, struct pnfs_block_dev, node); + + bl_free_device(dev); + kfree(dev); +} + +static int +nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int i; + + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->type = be32_to_cpup(p++); + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->simple.nr_sigs = be32_to_cpup(p++); + if (!b->simple.nr_sigs) { + dprintk("no signature\n"); + return -EIO; + } + + b->simple.len = 4 + 4; + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); + b->simple.sigs[i].sig_len = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); + if (!p) + return -EIO; + memcpy(&b->simple.sigs[i].sig, p, + b->simple.sigs[i].sig_len); + + 
b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + } + break; + case PNFS_BLOCK_VOLUME_SLICE: + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->slice.start); + p = xdr_decode_hyper(p, &b->slice.len); + b->slice.volume = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_CONCAT: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->concat.volumes_count; i++) + b->concat.volumes[i] = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_STRIPE: + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); + b->stripe.volumes_count = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->stripe.volumes_count; i++) + b->stripe.volumes[i] = be32_to_cpup(p++); + break; + default: + dprintk("unknown volume type!\n"); + return -EIO; + } + + return 0; +} + +static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + map->start = dev->start; + map->len = dev->len; + map->disk_offset = dev->disk_offset; + map->bdev = dev->bdev; + return true; +} + +static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + int i; + + for (i = 0; i < dev->nr_children; i++) { + struct pnfs_block_dev *child = &dev->children[i]; + + if (child->start > offset || + child->start + child->len <= offset) + continue; + + child->map(child, offset - child->start, map); + return true; + } + + dprintk("%s: ran off loop!\n", __func__); + return false; +} + +static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + struct pnfs_block_dev *child; + u64 chunk; + u32 chunk_idx; + u64 disk_offset; + + chunk = div_u64(offset, dev->chunk_size); + 
div_u64_rem(chunk, dev->nr_children, &chunk_idx); + + if (chunk_idx > dev->nr_children) { + dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", + __func__, chunk_idx, offset, dev->chunk_size); + /* error, should not happen */ + return false; + } + + /* truncate offset to the beginning of the stripe */ + offset = chunk * dev->chunk_size; + + /* disk offset of the stripe */ + disk_offset = div_u64(offset, dev->nr_children); + + child = &dev->children[chunk_idx]; + child->map(child, disk_offset, map); + + map->start += offset; + map->disk_offset += disk_offset; + map->len = dev->chunk_size; + return true; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask); + + +static int +bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + dev_t dev; + + dev = bl_resolve_deviceid(server, v, gfp_mask); + if (!dev) + return -EIO; + + d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR(d->bdev)) { + printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", + MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); + return PTR_ERR(d->bdev); + } + + + d->len = i_size_read(d->bdev->bd_inode); + d->map = bl_map_simple; + + printk(KERN_INFO "pNFS: using block device %s\n", + d->bdev->bd_disk->disk_name); + return 0; +} + +static int +bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + int ret; + + ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask); + if (ret) + return ret; + + d->disk_offset = v->slice.start; + d->len = v->slice.len; + return 0; +} + +static int +bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct 
pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->concat.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->concat.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->concat.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + d->children[i].start += len; + len += d->children[i].len; + } + + d->len = len; + d->map = bl_map_concat; + return 0; +} + +static int +bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->stripe.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->stripe.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->stripe.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + len += d->children[i].len; + } + + d->len = len; + d->chunk_size = v->stripe.chunk_size; + d->map = bl_map_stripe; + return 0; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + switch (volumes[idx].type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + return bl_parse_simple(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_SLICE: + return bl_parse_slice(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_CONCAT: + return bl_parse_concat(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_STRIPE: + return bl_parse_stripe(server, d, volumes, idx, gfp_mask); + default: + dprintk("unsupported volume type: %d\n", volumes[idx].type); + return -EIO; + } +} + +struct nfs4_deviceid_node * +bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_mask) +{ + struct 
nfs4_deviceid_node *node = NULL; + struct pnfs_block_volume *volumes; + struct pnfs_block_dev *top; + struct xdr_stream xdr; + struct xdr_buf buf; + struct page *scratch; + int nr_volumes, ret, i; + __be32 *p; + + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&xdr, sizeof(__be32)); + if (!p) + goto out_free_scratch; + nr_volumes = be32_to_cpup(p++); + + volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume), + gfp_mask); + if (!volumes) + goto out_free_scratch; + + for (i = 0; i < nr_volumes; i++) { + ret = nfs4_block_decode_volume(&xdr, &volumes[i]); + if (ret < 0) + goto out_free_volumes; + } + + top = kzalloc(sizeof(*top), gfp_mask); + if (!top) + goto out_free_volumes; + + ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); + if (ret) { + bl_free_device(top); + kfree(top); + goto out_free_volumes; + } + + node = &top->node; + nfs4_init_deviceid_node(node, server, &pdev->dev_id); + +out_free_volumes: + kfree(volumes); +out_free_scratch: + __free_page(scratch); +out: + return node; +} diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c new file mode 100644 index 000000000000..31d0b5e53dfd --- /dev/null +++ b/fs/nfs/blocklayout/extent_tree.c @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ + +#include <linux/vmalloc.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static inline struct pnfs_block_extent * +ext_node(struct rb_node *node) +{ + return rb_entry(node, struct pnfs_block_extent, be_node); +} + +static struct pnfs_block_extent * +ext_tree_first(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + return node ? 
ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_prev(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_prev(&be->be_node); + return node ? ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_next(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_next(&be->be_node); + return node ? ext_node(node) : NULL; +} + +static inline sector_t +ext_f_end(struct pnfs_block_extent *be) +{ + return be->be_f_offset + be->be_length; +} + +static struct pnfs_block_extent * +__ext_tree_search(struct rb_root *root, sector_t start) +{ + struct rb_node *node = root->rb_node; + struct pnfs_block_extent *be = NULL; + + while (node) { + be = ext_node(node); + if (start < be->be_f_offset) + node = node->rb_left; + else if (start >= ext_f_end(be)) + node = node->rb_right; + else + return be; + } + + if (be) { + if (start < be->be_f_offset) + return be; + + if (start >= ext_f_end(be)) + return ext_tree_next(be); + } + + return NULL; +} + +static bool +ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2) +{ + if (be1->be_state != be2->be_state) + return false; + if (be1->be_device != be2->be_device) + return false; + + if (be1->be_f_offset + be1->be_length != be2->be_f_offset) + return false; + + if (be1->be_state != PNFS_BLOCK_NONE_DATA && + (be1->be_v_offset + be1->be_length != be2->be_v_offset)) + return false; + + if (be1->be_state == PNFS_BLOCK_INVALID_DATA && + be1->be_tag != be2->be_tag) + return false; + + return true; +} + +static struct pnfs_block_extent * +ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be) +{ + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + left->be_length += be->be_length; + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + return left; + } + + return be; +} + +static struct pnfs_block_extent * +ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) 
+{ + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + be->be_length += right->be_length; + rb_erase(&right->be_node, root); + nfs4_put_deviceid_node(right->be_device); + kfree(right); + } + + return be; +} + +static void +__ext_tree_insert(struct rb_root *root, + struct pnfs_block_extent *new, bool merge_ok) +{ + struct rb_node **p = &root->rb_node, *parent = NULL; + struct pnfs_block_extent *be; + + while (*p) { + parent = *p; + be = ext_node(parent); + + if (new->be_f_offset < be->be_f_offset) { + if (merge_ok && ext_can_merge(new, be)) { + be->be_f_offset = new->be_f_offset; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset = new->be_v_offset; + be->be_length += new->be_length; + be = ext_try_to_merge_left(root, be); + goto free_new; + } + p = &(*p)->rb_left; + } else if (new->be_f_offset >= ext_f_end(be)) { + if (merge_ok && ext_can_merge(be, new)) { + be->be_length += new->be_length; + be = ext_try_to_merge_right(root, be); + goto free_new; + } + p = &(*p)->rb_right; + } else { + BUG(); + } + } + + rb_link_node(&new->be_node, parent, p); + rb_insert_color(&new->be_node, root); + return; +free_new: + nfs4_put_deviceid_node(new->be_device); + kfree(new); +} + +static int +__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +{ + struct pnfs_block_extent *be; + sector_t len1 = 0, len2 = 0; + sector_t orig_v_offset; + sector_t orig_len; + + be = __ext_tree_search(root, start); + if (!be) + return 0; + if (be->be_f_offset >= end) + return 0; + + orig_v_offset = be->be_v_offset; + orig_len = be->be_length; + + if (start > be->be_f_offset) + len1 = start - be->be_f_offset; + if (ext_f_end(be) > end) + len2 = ext_f_end(be) - end; + + if (len2 > 0) { + if (len1 > 0) { + struct pnfs_block_extent *new; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = len1; + + new->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + new->be_v_offset 
= + orig_v_offset + orig_len - len2; + } + new->be_length = len2; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, true); + } else { + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + be->be_v_offset = + orig_v_offset + orig_len - len2; + } + be->be_length = len2; + } + } else { + if (len1 > 0) { + be->be_length = len1; + be = ext_tree_next(be); + } + + while (be && ext_f_end(be) <= end) { + struct pnfs_block_extent *next = ext_tree_next(be); + + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + be = next; + } + + if (be && be->be_f_offset < end) { + len1 = ext_f_end(be) - end; + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset += be->be_length - len1; + be->be_length = len1; + } + } + + return 0; +} + +int +ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be; + struct rb_root *root; + int err = 0; + + switch (new->be_state) { + case PNFS_BLOCK_READWRITE_DATA: + case PNFS_BLOCK_INVALID_DATA: + root = &bl->bl_ext_rw; + break; + case PNFS_BLOCK_READ_DATA: + case PNFS_BLOCK_NONE_DATA: + root = &bl->bl_ext_ro; + break; + default: + dprintk("invalid extent type\n"); + return -EINVAL; + } + + spin_lock(&bl->bl_ext_lock); +retry: + be = __ext_tree_search(root, new->be_f_offset); + if (!be || be->be_f_offset >= ext_f_end(new)) { + __ext_tree_insert(root, new, true); + } else if (new->be_f_offset >= be->be_f_offset) { + if (ext_f_end(new) <= ext_f_end(be)) { + nfs4_put_deviceid_node(new->be_device); + kfree(new); + } else { + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } + } else if (ext_f_end(new) <= ext_f_end(be)) { + new->be_length = be->be_f_offset - new->be_f_offset; + 
__ext_tree_insert(root, new, true); + } else { + struct pnfs_block_extent *split; + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + split = kmemdup(new, sizeof(*new), GFP_ATOMIC); + if (!split) { + err = -EINVAL; + goto out; + } + + split->be_length = be->be_f_offset - split->be_f_offset; + split->be_device = nfs4_get_deviceid(new->be_device); + __ext_tree_insert(root, split, true); + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + +static bool +__ext_tree_lookup(struct rb_root *root, sector_t isect, + struct pnfs_block_extent *ret) +{ + struct rb_node *node; + struct pnfs_block_extent *be; + + node = root->rb_node; + while (node) { + be = ext_node(node); + if (isect < be->be_f_offset) + node = node->rb_left; + else if (isect >= ext_f_end(be)) + node = node->rb_right; + else { + *ret = *be; + return true; + } + } + + return false; +} + +bool +ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw) +{ + bool found = false; + + spin_lock(&bl->bl_ext_lock); + if (!rw) + found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret); + if (!found) + found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret); + spin_unlock(&bl->bl_ext_lock); + + return found; +} + +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, + sector_t start, sector_t end) +{ + int err, err2; + + spin_lock(&bl->bl_ext_lock); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (rw) { + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + if (!err) + err = err2; + } + spin_unlock(&bl->bl_ext_lock); + + return err; +} + +static int +ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, + sector_t split) +{ + struct pnfs_block_extent *new; + sector_t orig_len = be->be_length; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = split 
- be->be_f_offset; + + new->be_f_offset = split; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + new->be_v_offset = be->be_v_offset + be->be_length; + new->be_length = orig_len - be->be_length; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, false); + return 0; +} + +int +ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len) +{ + struct rb_root *root = &bl->bl_ext_rw; + sector_t end = start + len; + struct pnfs_block_extent *be; + int err = 0; + + spin_lock(&bl->bl_ext_lock); + /* + * First remove all COW extents or holes from written to range. + */ + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (err) + goto out; + + /* + * Then mark all invalid extents in the range as written to. + */ + for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) { + if (be->be_f_offset >= end) + break; + + if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag) + continue; + + if (be->be_f_offset < start) { + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + sector_t diff = start - be->be_f_offset; + + left->be_length += diff; + + be->be_f_offset += diff; + be->be_v_offset += diff; + be->be_length -= diff; + } else { + err = ext_tree_split(root, be, start); + if (err) + goto out; + } + } + + if (ext_f_end(be) > end) { + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + sector_t diff = end - be->be_f_offset; + + be->be_length -= diff; + + right->be_f_offset -= diff; + right->be_v_offset -= diff; + right->be_length += diff; + } else { + err = ext_tree_split(root, be, end); + if (err) + goto out; + } + } + + if (be->be_f_offset >= start && ext_f_end(be) <= end) { + be->be_tag = EXTENT_WRITTEN; + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + 
+static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, + size_t buffer_size) +{ + if (arg->layoutupdate_pages != &arg->layoutupdate_page) { + int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; + + for (i = 0; i < nr_pages; i++) + put_page(arg->layoutupdate_pages[i]); + kfree(arg->layoutupdate_pages); + } else { + put_page(arg->layoutupdate_page); + } +} + +static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count) +{ + struct pnfs_block_extent *be; + int ret = 0; + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + (*count)++; + if (*count * BL_EXTENT_SIZE > buffer_size) { + /* keep counting.. */ + ret = -ENOSPC; + continue; + } + + p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, + NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + + be->be_tag = EXTENT_COMMITTING; + } + spin_unlock(&bl->bl_ext_lock); + + return ret; +} + +int +ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + size_t count = 0, buffer_size = PAGE_SIZE; + __be32 *start_p; + int ret; + + dprintk("%s enter\n", __func__); + + arg->layoutupdate_page = alloc_page(GFP_NOFS); + if (!arg->layoutupdate_page) + return -ENOMEM; + start_p = page_address(arg->layoutupdate_page); + arg->layoutupdate_pages = &arg->layoutupdate_page; + +retry: + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + if (unlikely(ret)) { + ext_tree_free_commitdata(arg, buffer_size); + + buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + count = 0; + + arg->layoutupdate_pages = + kcalloc(DIV_ROUND_UP(buffer_size, 
PAGE_SIZE), + sizeof(struct page *), GFP_NOFS); + if (!arg->layoutupdate_pages) + return -ENOMEM; + + start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + if (!start_p) { + kfree(arg->layoutupdate_pages); + return -ENOMEM; + } + + goto retry; + } + + *start_p = cpu_to_be32(count); + arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + + if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { + __be32 *p = start_p; + int i = 0; + + for (p = start_p; + p < start_p + arg->layoutupdate_len; + p += PAGE_SIZE) { + arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + } + } + + dprintk("%s found %zu ranges\n", __func__, count); + return 0; +} + +void +ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + struct rb_root *root = &bl->bl_ext_rw; + struct pnfs_block_extent *be; + + dprintk("%s status %d\n", __func__, status); + + ext_tree_free_commitdata(arg, arg->layoutupdate_len); + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(root); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_COMMITTING) + continue; + + if (status) { + /* + * Mark as written and try again. + * + * XXX: some real error handling here wouldn't hurt.. + */ + be->be_tag = EXTENT_WRITTEN; + } else { + be->be_state = PNFS_BLOCK_READWRITE_DATA; + be->be_tag = 0; + } + + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + spin_unlock(&bl->bl_ext_lock); +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c deleted file mode 100644 index 4d0161442565..000000000000 --- a/fs/nfs/blocklayout/extents.c +++ /dev/null @@ -1,908 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayout.h - * - * Module for the NFSv4.1 pNFS block layout driver. - * - * Copyright (c) 2006 The Regents of the University of Michigan. - * All rights reserved. 
- * - * Andy Adamson <andros@citi.umich.edu> - * Fred Isaman <iisaman@umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. - */ - -#include "blocklayout.h" -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -/* Bit numbers */ -#define EXTENT_INITIALIZED 0 -#define EXTENT_WRITTEN 1 -#define EXTENT_IN_COMMIT 2 -#define INTERNAL_EXISTS MY_MAX_TAGS -#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) - -/* Returns largest t<=s s.t. 
t%base==0 */ -static inline sector_t normalize(sector_t s, int base) -{ - sector_t tmp = s; /* Since do_div modifies its argument */ - return s - sector_div(tmp, base); -} - -static inline sector_t normalize_up(sector_t s, int base) -{ - return normalize(s + base - 1, base); -} - -/* Complete stub using list while determine API wanted */ - -/* Returns tags, or negative */ -static int32_t _find_entry(struct my_tree *tree, u64 s) -{ - struct pnfs_inval_tracking *pos; - - dprintk("%s(%llu) enter\n", __func__, s); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector > s) - continue; - else if (pos->it_sector == s) - return pos->it_tags & INTERNAL_MASK; - else - break; - } - return -ENOENT; -} - -static inline -int _has_tag(struct my_tree *tree, u64 s, int32_t tag) -{ - int32_t tags; - - dprintk("%s(%llu, %i) enter\n", __func__, s, tag); - s = normalize(s, tree->mtt_step_size); - tags = _find_entry(tree, s); - if ((tags < 0) || !(tags & (1 << tag))) - return 0; - else - return 1; -} - -/* Creates entry with tag, or if entry already exists, unions tag to it. - * If storage is not NULL, newly created entry will use it. - * Returns number of entries added, or negative on error. 
- */ -static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, - struct pnfs_inval_tracking *storage) -{ - int found = 0; - struct pnfs_inval_tracking *pos; - - dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector > s) - continue; - else if (pos->it_sector == s) { - found = 1; - break; - } else - break; - } - if (found) { - pos->it_tags |= (1 << tag); - return 0; - } else { - struct pnfs_inval_tracking *new; - new = storage; - new->it_sector = s; - new->it_tags = (1 << tag); - list_add(&new->it_link, &pos->it_link); - return 1; - } -} - -/* XXXX Really want option to not create */ -/* Over range, unions tag with existing entries, else creates entry with tag */ -static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) -{ - u64 i; - - dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); - for (i = normalize(s, tree->mtt_step_size); i < s + length; - i += tree->mtt_step_size) - if (_add_entry(tree, i, tag, NULL)) - return -ENOMEM; - return 0; -} - -/* Ensure that future operations on given range of tree will not malloc */ -static int _preload_range(struct pnfs_inval_markings *marks, - u64 offset, u64 length) -{ - u64 start, end, s; - int count, i, used = 0, status = -ENOMEM; - struct pnfs_inval_tracking **storage; - struct my_tree *tree = &marks->im_tree; - - dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); - start = normalize(offset, tree->mtt_step_size); - end = normalize_up(offset + length, tree->mtt_step_size); - count = (int)(end - start) / (int)tree->mtt_step_size; - - /* Pre-malloc what memory we might need */ - storage = kcalloc(count, sizeof(*storage), GFP_NOFS); - if (!storage) - return -ENOMEM; - for (i = 0; i < count; i++) { - storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), - GFP_NOFS); - if (!storage[i]) - goto out_cleanup; - } - - spin_lock_bh(&marks->im_lock); - for (s = start; s < end; s += 
tree->mtt_step_size) - used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); - spin_unlock_bh(&marks->im_lock); - - status = 0; - - out_cleanup: - for (i = used; i < count; i++) { - if (!storage[i]) - break; - kfree(storage[i]); - } - kfree(storage); - return status; -} - -/* We are relying on page lock to serialize this */ -int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) -{ - int rv; - - spin_lock_bh(&marks->im_lock); - rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); - spin_unlock_bh(&marks->im_lock); - return rv; -} - -/* Assume start, end already sector aligned */ -static int -_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) -{ - struct pnfs_inval_tracking *pos; - u64 expect = 0; - - dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector >= end) - continue; - if (!expect) { - if ((pos->it_sector == end - tree->mtt_step_size) && - (pos->it_tags & (1 << tag))) { - expect = pos->it_sector - tree->mtt_step_size; - if (pos->it_sector < tree->mtt_step_size || expect < start) - return 1; - continue; - } else { - return 0; - } - } - if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) - return 0; - expect -= tree->mtt_step_size; - if (expect < start) - return 1; - } - return 0; -} - -static int is_range_written(struct pnfs_inval_markings *marks, - sector_t start, sector_t end) -{ - int rv; - - spin_lock_bh(&marks->im_lock); - rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); - spin_unlock_bh(&marks->im_lock); - return rv; -} - -/* Marks sectors in [offest, offset_length) as having been initialized. - * All lengths are step-aligned, where step is min(pagesize, blocksize). 
- * Currently assumes offset is page-aligned - */ -int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length) -{ - sector_t start, end; - - dprintk("%s(offset=%llu,len=%llu) enter\n", - __func__, (u64)offset, (u64)length); - - start = normalize(offset, marks->im_block_size); - end = normalize_up(offset + length, marks->im_block_size); - if (_preload_range(marks, start, end - start)) - goto outerr; - - spin_lock_bh(&marks->im_lock); - if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) - goto out_unlock; - spin_unlock_bh(&marks->im_lock); - - return 0; - -out_unlock: - spin_unlock_bh(&marks->im_lock); -outerr: - return -ENOMEM; -} - -/* Marks sectors in [offest, offset+length) as having been written to disk. - * All lengths should be block aligned. - */ -static int mark_written_sectors(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length) -{ - int status; - - dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, - (u64)offset, (u64)length); - spin_lock_bh(&marks->im_lock); - status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); - spin_unlock_bh(&marks->im_lock); - return status; -} - -static void print_short_extent(struct pnfs_block_short_extent *be) -{ - dprintk("PRINT SHORT EXTENT extent %p\n", be); - if (be) { - dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); - dprintk(" be_length %llu\n", (u64)be->bse_length); - } -} - -static void print_clist(struct list_head *list, unsigned int count) -{ - struct pnfs_block_short_extent *be; - unsigned int i = 0; - - ifdebug(FACILITY) { - printk(KERN_DEBUG "****************\n"); - printk(KERN_DEBUG "Extent list looks like:\n"); - list_for_each_entry(be, list, bse_node) { - i++; - print_short_extent(be); - } - if (i != count) - printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); - printk(KERN_DEBUG "****************\n"); - } -} - -/* Note: In theory, we should do more checking that devid's match between - * old and new, but if 
they don't, the lists are too corrupt to salvage anyway. - */ -/* Note this is very similar to bl_add_merge_extent */ -static void add_to_commitlist(struct pnfs_block_layout *bl, - struct pnfs_block_short_extent *new) -{ - struct list_head *clist = &bl->bl_commit; - struct pnfs_block_short_extent *old, *save; - sector_t end = new->bse_f_offset + new->bse_length; - - dprintk("%s enter\n", __func__); - print_short_extent(new); - print_clist(clist, bl->bl_count); - bl->bl_count++; - /* Scan for proper place to insert, extending new to the left - * as much as possible. - */ - list_for_each_entry_safe(old, save, clist, bse_node) { - if (new->bse_f_offset < old->bse_f_offset) - break; - if (end <= old->bse_f_offset + old->bse_length) { - /* Range is already in list */ - bl->bl_count--; - kfree(new); - return; - } else if (new->bse_f_offset <= - old->bse_f_offset + old->bse_length) { - /* new overlaps or abuts existing be */ - if (new->bse_mdev == old->bse_mdev) { - /* extend new to fully replace old */ - new->bse_length += new->bse_f_offset - - old->bse_f_offset; - new->bse_f_offset = old->bse_f_offset; - list_del(&old->bse_node); - bl->bl_count--; - kfree(old); - } - } - } - /* Note that if we never hit the above break, old will not point to a - * valid extent. However, in that case &old->bse_node==list. - */ - list_add_tail(&new->bse_node, &old->bse_node); - /* Scan forward for overlaps. If we find any, extend new and - * remove the overlapped extent. 
- */ - old = list_prepare_entry(new, clist, bse_node); - list_for_each_entry_safe_continue(old, save, clist, bse_node) { - if (end < old->bse_f_offset) - break; - /* new overlaps or abuts old */ - if (new->bse_mdev == old->bse_mdev) { - if (end < old->bse_f_offset + old->bse_length) { - /* extend new to fully cover old */ - end = old->bse_f_offset + old->bse_length; - new->bse_length = end - new->bse_f_offset; - } - list_del(&old->bse_node); - bl->bl_count--; - kfree(old); - } - } - dprintk("%s: after merging\n", __func__); - print_clist(clist, bl->bl_count); -} - -/* Note the range described by offset, length is guaranteed to be contained - * within be. - * new will be freed, either by this function or add_to_commitlist if they - * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. - */ -int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length, - struct pnfs_block_short_extent *new) -{ - sector_t new_end, end = offset + length; - struct pnfs_block_layout *bl = container_of(be->be_inval, - struct pnfs_block_layout, - bl_inval); - - mark_written_sectors(be->be_inval, offset, length); - /* We want to add the range to commit list, but it must be - * block-normalized, and verified that the normalized range has - * been entirely written to disk. 
- */ - new->bse_f_offset = offset; - offset = normalize(offset, bl->bl_blocksize); - if (offset < new->bse_f_offset) { - if (is_range_written(be->be_inval, offset, new->bse_f_offset)) - new->bse_f_offset = offset; - else - new->bse_f_offset = offset + bl->bl_blocksize; - } - new_end = normalize_up(end, bl->bl_blocksize); - if (end < new_end) { - if (is_range_written(be->be_inval, end, new_end)) - end = new_end; - else - end = new_end - bl->bl_blocksize; - } - if (end <= new->bse_f_offset) { - kfree(new); - return 0; - } - new->bse_length = end - new->bse_f_offset; - new->bse_devid = be->be_devid; - new->bse_mdev = be->be_mdev; - - spin_lock(&bl->bl_ext_lock); - add_to_commitlist(bl, new); - spin_unlock(&bl->bl_ext_lock); - return 0; -} - -static void print_bl_extent(struct pnfs_block_extent *be) -{ - dprintk("PRINT EXTENT extent %p\n", be); - if (be) { - dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); - dprintk(" be_length %llu\n", (u64)be->be_length); - dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); - dprintk(" be_state %d\n", be->be_state); - } -} - -static void -destroy_extent(struct kref *kref) -{ - struct pnfs_block_extent *be; - - be = container_of(kref, struct pnfs_block_extent, be_refcnt); - dprintk("%s be=%p\n", __func__, be); - kfree(be); -} - -void -bl_put_extent(struct pnfs_block_extent *be) -{ - if (be) { - dprintk("%s enter %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_put(&be->be_refcnt, destroy_extent); - } -} - -struct pnfs_block_extent *bl_alloc_extent(void) -{ - struct pnfs_block_extent *be; - - be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); - if (!be) - return NULL; - INIT_LIST_HEAD(&be->be_node); - kref_init(&be->be_refcnt); - be->be_inval = NULL; - return be; -} - -static void print_elist(struct list_head *list) -{ - struct pnfs_block_extent *be; - dprintk("****************\n"); - dprintk("Extent list looks like:\n"); - list_for_each_entry(be, list, be_node) { - print_bl_extent(be); - } - 
dprintk("****************\n"); -} - -static inline int -extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) -{ - /* Note this assumes new->be_f_offset >= old->be_f_offset */ - return (new->be_state == old->be_state) && - ((new->be_state == PNFS_BLOCK_NONE_DATA) || - ((new->be_v_offset - old->be_v_offset == - new->be_f_offset - old->be_f_offset) && - new->be_mdev == old->be_mdev)); -} - -/* Adds new to appropriate list in bl, modifying new and removing existing - * extents as appropriate to deal with overlaps. - * - * See bl_find_get_extent for list constraints. - * - * Refcount on new is already set. If end up not using it, or error out, - * need to put the reference. - * - * bl->bl_ext_lock is held by caller. - */ -int -bl_add_merge_extent(struct pnfs_block_layout *bl, - struct pnfs_block_extent *new) -{ - struct pnfs_block_extent *be, *tmp; - sector_t end = new->be_f_offset + new->be_length; - struct list_head *list; - - dprintk("%s enter with be=%p\n", __func__, new); - print_bl_extent(new); - list = &bl->bl_extents[bl_choose_list(new->be_state)]; - print_elist(list); - - /* Scan for proper place to insert, extending new to the left - * as much as possible. 
- */ - list_for_each_entry_safe_reverse(be, tmp, list, be_node) { - if (new->be_f_offset >= be->be_f_offset + be->be_length) - break; - if (new->be_f_offset >= be->be_f_offset) { - if (end <= be->be_f_offset + be->be_length) { - /* new is a subset of existing be*/ - if (extents_consistent(be, new)) { - dprintk("%s: new is subset, ignoring\n", - __func__); - bl_put_extent(new); - return 0; - } else { - goto out_err; - } - } else { - /* |<-- be -->| - * |<-- new -->| */ - if (extents_consistent(be, new)) { - /* extend new to fully replace be */ - new->be_length += new->be_f_offset - - be->be_f_offset; - new->be_f_offset = be->be_f_offset; - new->be_v_offset = be->be_v_offset; - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } - } else if (end >= be->be_f_offset + be->be_length) { - /* new extent overlap existing be */ - if (extents_consistent(be, new)) { - /* extend new to fully replace be */ - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } else if (end > be->be_f_offset) { - /* |<-- be -->| - *|<-- new -->| */ - if (extents_consistent(new, be)) { - /* extend new to fully replace be */ - new->be_length += be->be_f_offset + be->be_length - - new->be_f_offset - new->be_length; - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } - } - /* Note that if we never hit the above break, be will not point to a - * valid extent. However, in that case &be->be_node==list. - */ - list_add(&new->be_node, &be->be_node); - dprintk("%s: inserting new\n", __func__); - print_elist(list); - /* FIXME - The per-list consistency checks have all been done, - * should now check cross-list consistency. - */ - return 0; - - out_err: - bl_put_extent(new); - return -EIO; -} - -/* Returns extent, or NULL. 
If a second READ extent exists, it is returned - * in cow_read, if given. - * - * The extents are kept in two seperate ordered lists, one for READ and NONE, - * one for READWRITE and INVALID. Within each list, we assume: - * 1. Extents are ordered by file offset. - * 2. For any given isect, there is at most one extents that matches. - */ -struct pnfs_block_extent * -bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, - struct pnfs_block_extent **cow_read) -{ - struct pnfs_block_extent *be, *cow, *ret; - int i; - - dprintk("%s enter with isect %llu\n", __func__, (u64)isect); - cow = ret = NULL; - spin_lock(&bl->bl_ext_lock); - for (i = 0; i < EXTENT_LISTS; i++) { - list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { - if (isect >= be->be_f_offset + be->be_length) - break; - if (isect >= be->be_f_offset) { - /* We have found an extent */ - dprintk("%s Get %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_get(&be->be_refcnt); - if (!ret) - ret = be; - else if (be->be_state != PNFS_BLOCK_READ_DATA) - bl_put_extent(be); - else - cow = be; - break; - } - } - if (ret && - (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) - break; - } - spin_unlock(&bl->bl_ext_lock); - if (cow_read) - *cow_read = cow; - print_bl_extent(ret); - return ret; -} - -/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ -static struct pnfs_block_extent * -bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) -{ - struct pnfs_block_extent *be, *ret = NULL; - int i; - - dprintk("%s enter with isect %llu\n", __func__, (u64)isect); - for (i = 0; i < EXTENT_LISTS; i++) { - if (ret) - break; - list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { - if (isect >= be->be_f_offset + be->be_length) - break; - if (isect >= be->be_f_offset) { - /* We have found an extent */ - dprintk("%s Get %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_get(&be->be_refcnt); - ret = be; - 
break; - } - } - } - print_bl_extent(ret); - return ret; -} - -int -encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg) -{ - struct pnfs_block_short_extent *lce, *save; - unsigned int count = 0; - __be32 *p, *xdr_start; - - dprintk("%s enter\n", __func__); - /* BUG - creation of bl_commit is buggy - need to wait for - * entire block to be marked WRITTEN before it can be added. - */ - spin_lock(&bl->bl_ext_lock); - /* Want to adjust for possible truncate */ - /* We now want to adjust argument range */ - - /* XDR encode the ranges found */ - xdr_start = xdr_reserve_space(xdr, 8); - if (!xdr_start) - goto out; - list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { - p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); - if (!p) - break; - p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); - p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); - p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); - p = xdr_encode_hyper(p, 0LL); - *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); - list_move_tail(&lce->bse_node, &bl->bl_committing); - bl->bl_count--; - count++; - } - xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); - xdr_start[1] = cpu_to_be32(count); -out: - spin_unlock(&bl->bl_ext_lock); - dprintk("%s found %i ranges\n", __func__, count); - return 0; -} - -/* Helper function to set_to_rw that initialize a new extent */ -static void -_prep_new_extent(struct pnfs_block_extent *new, - struct pnfs_block_extent *orig, - sector_t offset, sector_t length, int state) -{ - kref_init(&new->be_refcnt); - /* don't need to INIT_LIST_HEAD(&new->be_node) */ - memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); - new->be_mdev = orig->be_mdev; - new->be_f_offset = offset; - new->be_length = length; - new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; - new->be_state = state; - new->be_inval = orig->be_inval; 
-} - -/* Tries to merge be with extent in front of it in list. - * Frees storage if not used. - */ -static struct pnfs_block_extent * -_front_merge(struct pnfs_block_extent *be, struct list_head *head, - struct pnfs_block_extent *storage) -{ - struct pnfs_block_extent *prev; - - if (!storage) - goto no_merge; - if (&be->be_node == head || be->be_node.prev == head) - goto no_merge; - prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); - if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || - !extents_consistent(prev, be)) - goto no_merge; - _prep_new_extent(storage, prev, prev->be_f_offset, - prev->be_length + be->be_length, prev->be_state); - list_replace(&prev->be_node, &storage->be_node); - bl_put_extent(prev); - list_del(&be->be_node); - bl_put_extent(be); - return storage; - - no_merge: - kfree(storage); - return be; -} - -static u64 -set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) -{ - u64 rv = offset + length; - struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; - struct pnfs_block_extent *children[3]; - struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; - int i = 0, j; - - dprintk("%s(%llu, %llu)\n", __func__, offset, length); - /* Create storage for up to three new extents e1, e2, e3 */ - e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); - e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); - e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); - /* BUG - we are ignoring any failure */ - if (!e1 || !e2 || !e3) - goto out_nosplit; - - spin_lock(&bl->bl_ext_lock); - be = bl_find_get_extent_locked(bl, offset); - rv = be->be_f_offset + be->be_length; - if (be->be_state != PNFS_BLOCK_INVALID_DATA) { - spin_unlock(&bl->bl_ext_lock); - goto out_nosplit; - } - /* Add e* to children, bumping e*'s krefs */ - if (be->be_f_offset != offset) { - _prep_new_extent(e1, be, be->be_f_offset, - offset - be->be_f_offset, - PNFS_BLOCK_INVALID_DATA); - children[i++] = e1; - print_bl_extent(e1); - } else - merge1 = e1; - _prep_new_extent(e2, be, offset, - 
min(length, be->be_f_offset + be->be_length - offset), - PNFS_BLOCK_READWRITE_DATA); - children[i++] = e2; - print_bl_extent(e2); - if (offset + length < be->be_f_offset + be->be_length) { - _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, - be->be_f_offset + be->be_length - - offset - length, - PNFS_BLOCK_INVALID_DATA); - children[i++] = e3; - print_bl_extent(e3); - } else - merge2 = e3; - - /* Remove be from list, and insert the e* */ - /* We don't get refs on e*, since this list is the base reference - * set when init'ed. - */ - if (i < 3) - children[i] = NULL; - new = children[0]; - list_replace(&be->be_node, &new->be_node); - bl_put_extent(be); - new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); - for (j = 1; j < i; j++) { - old = new; - new = children[j]; - list_add(&new->be_node, &old->be_node); - } - if (merge2) { - /* This is a HACK, should just create a _back_merge function */ - new = list_entry(new->be_node.next, - struct pnfs_block_extent, be_node); - new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); - } - spin_unlock(&bl->bl_ext_lock); - - /* Since we removed the base reference above, be is now scheduled for - * destruction. 
- */ - bl_put_extent(be); - dprintk("%s returns %llu after split\n", __func__, rv); - return rv; - - out_nosplit: - kfree(e1); - kfree(e2); - kfree(e3); - dprintk("%s returns %llu without splitting\n", __func__, rv); - return rv; -} - -void -clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - const struct nfs4_layoutcommit_args *arg, - int status) -{ - struct pnfs_block_short_extent *lce, *save; - - dprintk("%s status %d\n", __func__, status); - list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { - if (likely(!status)) { - u64 offset = lce->bse_f_offset; - u64 end = offset + lce->bse_length; - - do { - offset = set_to_rw(bl, offset, end - offset); - } while (offset < end); - list_del(&lce->bse_node); - - kfree(lce); - } else { - list_del(&lce->bse_node); - spin_lock(&bl->bl_ext_lock); - add_to_commitlist(bl, lce); - spin_unlock(&bl->bl_ext_lock); - } - } -} - -int bl_push_one_short_extent(struct pnfs_inval_markings *marks) -{ - struct pnfs_block_short_extent *new; - - new = kmalloc(sizeof(*new), GFP_NOFS); - if (unlikely(!new)) - return -ENOMEM; - - spin_lock_bh(&marks->im_lock); - list_add(&new->bse_node, &marks->im_extents); - spin_unlock_bh(&marks->im_lock); - - return 0; -} - -struct pnfs_block_short_extent * -bl_pop_one_short_extent(struct pnfs_inval_markings *marks) -{ - struct pnfs_block_short_extent *rv = NULL; - - spin_lock_bh(&marks->im_lock); - if (!list_empty(&marks->im_extents)) { - rv = list_entry((&marks->im_extents)->next, - struct pnfs_block_short_extent, bse_node); - list_del_init(&rv->bse_node); - } - spin_unlock_bh(&marks->im_lock); - - return rv; -} - -void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) -{ - struct pnfs_block_short_extent *se = NULL, *tmp; - - if (num_to_free <= 0) - return; - - spin_lock(&marks->im_lock); - list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { - list_del(&se->bse_node); - kfree(se); - if (--num_to_free == 0) - break; - } - 
spin_unlock(&marks->im_lock); - - BUG_ON(num_to_free > 0); -} diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c new file mode 100644 index 000000000000..8d04bda2bd2e --- /dev/null +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2006,2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. 
+ */ + +#include <linux/module.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) +{ + int i; + + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(b->simple.nr_sigs); + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_encode_hyper(p, b->simple.sigs[i].offset); + p = xdr_encode_opaque(p, b->simple.sigs[i].sig, + b->simple.sigs[i].sig_len); + } +} + +dev_t +bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, + gfp_t gfp_mask) +{ + struct net *net = server->nfs_client->cl_net; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct bl_dev_msg *reply = &nn->bl_mount_reply; + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; + struct bl_msg_hdr *bl_msg; + DECLARE_WAITQUEUE(wq, current); + dev_t dev = 0; + int rc; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + + bl_pipe_msg.bl_wq = &nn->bl_wq; + + b->simple.len += 4; /* single volume */ + if (b->simple.len > PAGE_SIZE) + return -EIO; + + memset(msg, 0, sizeof(*msg)); + msg->len = sizeof(*bl_msg) + b->simple.len; + msg->data = kzalloc(msg->len, gfp_mask); + if (!msg->data) + goto out; + + bl_msg = msg->data; + bl_msg->type = BL_DEVICE_MOUNT, + bl_msg->totallen = b->simple.len; + nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&nn->bl_wq, &wq); + rc = rpc_queue_upcall(nn->bl_device_pipe, msg); + if (rc < 0) { + remove_wait_queue(&nn->bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nn->bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + printk(KERN_WARNING "%s failed to decode device: %d\n", + __func__, reply->status); + goto out; + } + + dev = MKDEV(reply->major, reply->minor); +out: + 
kfree(msg->data); + return dev; +} + +static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + nfs_net_id); + + if (mlen != sizeof (struct bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&nn->bl_wq); + + return mlen; +} + +static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct bl_pipe_msg *bl_pipe_msg = + container_of(msg, struct bl_pipe_msg, msg); + + if (msg->errno >= 0) + return; + wake_up(bl_pipe_msg->bl_wq); +} + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + dput(dir); + return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (nn->bl_device_pipe == NULL) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + nn->bl_device_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (nn->bl_device_pipe->dentry) + nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + break; + default: + 
ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs4blocklayout_block = { + .notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + struct dentry *dentry; + + pipefs_sb = rpc_get_sb_net(net); + if (!pipefs_sb) + return NULL; + dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + } +} + +static int nfs4blocklayout_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + + init_waitqueue_head(&nn->bl_wq); + nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); + if (IS_ERR(nn->bl_device_pipe)) + return PTR_ERR(nn->bl_device_pipe); + dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + rpc_destroy_pipe_data(nn->bl_device_pipe); + return PTR_ERR(dentry); + } + nn->bl_device_pipe->dentry = dentry; + return 0; +} + +static void nfs4blocklayout_net_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); + rpc_destroy_pipe_data(nn->bl_device_pipe); + nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { + .init = nfs4blocklayout_net_init, + .exit = nfs4blocklayout_net_exit, +}; + +int __init bl_init_pipefs(void) +{ + int ret; + + ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + if (ret) + goto out; + ret = register_pernet_subsys(&nfs4blocklayout_net_ops); + if (ret) + goto out_unregister_notifier; + return 0; + +out_unregister_notifier: + 
rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +out: + return ret; +} + +void __exit bl_cleanup_pipefs(void) +{ + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); + unregister_pernet_subsys(&nfs4blocklayout_net_ops); +} diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 41db5258e7a7..73466b934090 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto out; ino = lo->plh_inode; + + spin_lock(&ino->i_lock); + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + + pnfs_layoutcommit_inode(ino, false); + spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, - &args->cbl_range)) + &args->cbl_range)) { rv = NFS4ERR_DELAY; - else - rv = NFS4ERR_NOMATCHING_LAYOUT; - pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + goto unlock; + } + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, + &args->cbl_range); + } +unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); @@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, } found: - if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) - dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " - "deleting instead\n", __func__); nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 94088517039f..f9f4845db989 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); 
@@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); @@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); @@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 65ef6e00deee..dda4b8667c02 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) /* * nfs_direct_cmp_commit_data_verf - compare verifier for commit data * @dreq - direct request possibly spanning multiple servers @@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); } -#endif /** * nfs_direct_IO - NFS address space operation for direct I/O @@ -576,7 +574,6 @@ out: return result; } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; @@ -700,17 +697,6 @@ static void nfs_direct_write_complete(struct 
nfs_direct_req *dreq, struct inode schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } -#else -static void nfs_direct_write_schedule_work(struct work_struct *work) -{ -} - -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) -{ - nfs_direct_complete(dreq, true); -} -#endif - static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 524dd80d1898..6920127c5eb7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #include "nfstrace.h" @@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page, unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); unsigned int end = offset + len; + if (pnfs_ld_read_whole_page(file->f_mapping->host)) { + if (!PageUptodate(page)) + return 1; + return 0; + } + if ((file->f_mode & FMODE_READ) && /* open for read? */ !PageUptodate(page) && /* Uptodate? */ !PagePrivate(page) && /* i/o request already? */ @@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not - * doing this memory reclaim for a fs-related allocation. + /* Always try to initiate a 'commit' if relevant, but only + * wait for it if __GFP_WAIT is set. Even then, only wait 1 + * second and only if the 'bdi' is not congested. + * Waiting indefinitely can cause deadlocks when the NFS + * server is on this machine, when a new TCP connection is + * needed and in other rare cases. There is no particular + * need to wait extensively here. A short wait has the + * benefit that someone else can worry about the freezer. 
*/ - if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && - !(current->flags & PF_FSTRANS)) { - int how = FLUSH_SYNC; - - /* Don't let kswapd deadlock waiting for OOM RPC calls */ - if (current_is_kswapd()) - how = 0; - nfs_commit_inode(mapping->host, how); + if (mapping) { + struct nfs_server *nfss = NFS_SERVER(mapping->host); + nfs_commit_inode(mapping->host, 0); + if ((gfp & __GFP_WAIT) && + !bdi_write_congested(&nfss->backing_dev_info)) { + wait_on_page_bit_killable_timeout(page, PG_private, + HZ); + if (PagePrivate(page)) + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) @@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page) static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { + int ret; + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + *span = sis->pages; - return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); + + rcu_read_lock(); + ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); + rcu_read_unlock(); + + return ret; } static void nfs_swap_deactivate(struct file *file) { - xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + + rcu_read_lock(); + xs_swapper(rcu_dereference(clnt->cl_xprt), 0); + rcu_read_unlock(); } #endif diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 90978075f730..abc5056999d6 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - hdr->res.verf->committed == NFS_FILE_SYNC) + hdr->res.verf->committed != NFS_DATA_SYNC) return; pnfs_set_layoutcommit(hdr); @@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } + if (data->verf.committed == NFS_UNSTABLE) + 
pnfs_commit_set_layoutcommit(data); + return 0; } @@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, } /* find and reference the deviceid */ - d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, - NFS_SERVER(lo->plh_inode)->nfs_client, id); - if (d == NULL) { - dsaddr = filelayout_get_device_info(lo->plh_inode, id, - lo->plh_lc_cred, gfp_flags); - if (dsaddr == NULL) - goto out; - } else - dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id, + lo->plh_lc_cred, gfp_flags); + if (d == NULL) + goto out; + + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); /* Found deviceid is unavailable */ if (filelayout_test_devid_unavailable(&dsaddr->id_node)) - goto out_put; + goto out_put; fl->dsaddr = dsaddr; @@ -1368,6 +1368,17 @@ out: cinfo->ds->ncommitting = 0; return PNFS_ATTEMPTED; } +static struct nfs4_deviceid_node * +filelayout_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + + dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags); + if (!dsaddr) + return NULL; + return &dsaddr->id_node; +} static void filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) @@ -1420,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, + .alloc_deviceid_node = filelayout_alloc_deviceid_node, .free_deviceid_node = filelayout_free_deveiceid_node, }; diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index ffbddf2219ea..7c9f800c49d7 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); struct 
nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx); + +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags); extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); -struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, - struct rpc_cred *cred, gfp_t gfp_flags); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 8540516f4d71..9bb806a76d99 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -484,8 +484,9 @@ out_err: } /* Decode opaque device data and return the result */ -static struct nfs4_file_layout_dsaddr* -decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) +struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) { int i; u32 cnt, num; @@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) dsaddr->stripe_indices = stripe_indices; stripe_indices = NULL; dsaddr->ds_num = num; - nfs4_init_deviceid_node(&dsaddr->id_node, - NFS_SERVER(ino)->pnfs_curr_ld, - NFS_SERVER(ino)->nfs_client, - &pdev->dev_id); + nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id); INIT_LIST_HEAD(&dsaddrs); @@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, + da = decode_ds_addr(server->nfs_client->cl_net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); @@ -637,102 +635,6 @@ out_err: return NULL; } -/* - * Decode the opaque device specified in 'dev' and add it 
to the cache of - * available devices. - */ -static struct nfs4_file_layout_dsaddr * -decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) -{ - struct nfs4_deviceid_node *d; - struct nfs4_file_layout_dsaddr *n, *new; - - new = decode_device(inode, dev, gfp_flags); - if (!new) { - printk(KERN_WARNING "NFS: %s: Could not decode or add device\n", - __func__); - return NULL; - } - - d = nfs4_insert_deviceid_node(&new->id_node); - n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); - if (n != new) { - nfs4_fl_free_deviceid(new); - return n; - } - - return new; -} - -/* - * Retrieve the information for dev_id, add it to the list - * of available devices, and return it. - */ -struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, - struct nfs4_deviceid *dev_id, - struct rpc_cred *cred, - gfp_t gfp_flags) -{ - struct pnfs_device *pdev = NULL; - u32 max_resp_sz; - int max_pages; - struct page **pages = NULL; - struct nfs4_file_layout_dsaddr *dsaddr = NULL; - int rc, i; - struct nfs_server *server = NFS_SERVER(inode); - - /* - * Use the session max response size as the basis for setting - * GETDEVICEINFO's maxcount - */ - max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - max_pages = nfs_page_array_len(0, max_resp_sz); - dprintk("%s inode %p max_resp_sz %u max_pages %d\n", - __func__, inode, max_resp_sz, max_pages); - - pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); - if (pdev == NULL) - return NULL; - - pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags); - if (pages == NULL) { - kfree(pdev); - return NULL; - } - for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(gfp_flags); - if (!pages[i]) - goto out_free; - } - - memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); - pdev->layout_type = LAYOUT_NFSV4_1_FILES; - pdev->pages = pages; - pdev->pgbase = 0; - pdev->pglen = max_resp_sz; - pdev->mincount = 0; - pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - - 
rc = nfs4_proc_getdeviceinfo(server, pdev, cred); - dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) - goto out_free; - - /* - * Found new device, need to decode it and then add it to the - * list of known devices for this mountpoint. - */ - dsaddr = decode_and_add_device(inode, pdev, gfp_flags); -out_free: - for (i = 0; i < max_pages; i++) - __free_page(pages[i]); - kfree(pages); - kfree(pdev); - dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); - return dsaddr; -} - void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) { diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 577a36f0a510..141c9f4a40de 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { - if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) + BUG_ON(!S_ISREG(inode->i_mode)); + + if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9056622d2230..14ae6f20a172 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void) int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); #endif -/* nfs3client.c */ -#if IS_ENABLED(CONFIG_NFS_V3) -struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); -struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, - struct nfs_fattr *, rpc_authflavor_t); -#endif - /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h new file mode 100644 index 000000000000..333ae4068506 --- /dev/null +++ b/fs/nfs/nfs3_fs.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2014 Anna Schumaker. 
+ * + * NFSv3-specific filesystem definitions and declarations + */ +#ifndef __LINUX_FS_NFS_NFS3_FS_H +#define __LINUX_FS_NFS_NFS3_FS_H + +/* + * nfs3acl.c + */ +#ifdef CONFIG_NFS_V3_ACL +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); +extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl); +extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); +extern const struct xattr_handler *nfs3_xattr_handlers[]; +#else +static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + return 0; +} +#define nfs3_listxattr NULL +#endif /* CONFIG_NFS_V3_ACL */ + +/* nfs3client.c */ +struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); +struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); + + +#endif /* __LINUX_FS_NFS_NFS3_FS_H */ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 24c6898159cc..658e586ca438 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -7,6 +7,7 @@ #include <linux/nfsacl.h> #include "internal.h" +#include "nfs3_fs.h" #define NFSDBG_FACILITY NFSDBG_PROC diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index b3fc65ef39ca..8c1b437c5403 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -1,6 +1,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include "internal.h" +#include "nfs3_fs.h" #ifdef CONFIG_NFS_V3_ACL static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 809670eba52a..524f9f837408 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -22,6 +22,7 @@ #include "iostat.h" #include "internal.h" +#include "nfs3_fs.h" #define NFSDBG_FACILITY NFSDBG_PROC diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index d6a98949af19..6af29c2da352 100644 --- 
a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -4,6 +4,7 @@ #include <linux/module.h> #include <linux/nfs_fs.h> #include "internal.h" +#include "nfs3_fs.h" #include "nfs.h" static struct nfs_subversion nfs_v3 = { diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 0422d77b73c7..5aa55c132aa2 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -77,7 +77,7 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); @@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static long nfs4_update_delay(long *timeout) +{ + long ret; + if (!timeout) + return NFS4_POLL_RETRY_MAX; + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + ret = *timeout; + *timeout <<= 1; + return ret; +} + static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) { int res = 0; might_sleep(); - if (*timeout <= 0) - *timeout = NFS4_POLL_RETRY_MIN; - if (*timeout > NFS4_POLL_RETRY_MAX) - *timeout = NFS4_POLL_RETRY_MAX; - freezable_schedule_timeout_killable_unsafe(*timeout); + freezable_schedule_timeout_killable_unsafe( + nfs4_update_delay(timeout)); if (fatal_signal_pending(current)) res = -ERESTARTSYS; - *timeout <<= 1; return res; } @@ -1307,15 
+1317,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) int ret = -EAGAIN; for (;;) { + spin_lock(&state->owner->so_lock); if (can_open_cached(state, fmode, open_mode)) { - spin_lock(&state->owner->so_lock); - if (can_open_cached(state, fmode, open_mode)) { - update_open_stateflags(state, fmode); - spin_unlock(&state->owner->so_lock); - goto out_return_state; - } + update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); + goto out_return_state; } + spin_unlock(&state->owner->so_lock); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); if (!can_open_delegated(delegation, fmode)) { @@ -2589,7 +2597,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); goto out_release; } @@ -3217,7 +3225,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct nfs4_label *label = NULL; int status; - if (pnfs_ld_layoutret_on_setattr(inode)) + if (pnfs_ld_layoutret_on_setattr(inode) && + sattr->ia_valid & ATTR_SIZE && + sattr->ia_size < i_size_read(inode)) pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); @@ -3576,7 +3586,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (!nfs4_sequence_done(task, &res->seq_res)) return 0; - if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, res->server, NULL, + &data->timeout) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); return 1; @@ -3609,7 +3620,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, if (!nfs4_sequence_done(task, &res->seq_res)) return 0; - if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) 
return 0; update_changeattr(old_dir, &res->old_cinfo); @@ -4113,7 +4124,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) trace_nfs4_read(hdr, task->tk_status); if (nfs4_async_handle_error(task, server, - hdr->args.context->state) == -EAGAIN) { + hdr->args.context->state, + NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } @@ -4181,10 +4193,11 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; - + trace_nfs4_write(hdr, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), - hdr->args.context->state) == -EAGAIN) { + hdr->args.context->state, + NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } @@ -4264,7 +4277,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da struct inode *inode = data->inode; trace_nfs4_commit(data, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { + if (nfs4_async_handle_error(task, NFS_SERVER(inode), + NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } @@ -4817,7 +4831,8 @@ out: static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, + struct nfs4_state *state, long *timeout) { struct nfs_client *clp = server->nfs_client; @@ -4867,6 +4882,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); + rpc_delay(task, nfs4_update_delay(timeout)); + goto restart_call; case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); case -NFS4ERR_RETRY_UNCACHED_REP: @@ -5107,8 +5124,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) pnfs_roc_set_barrier(data->inode, data->roc_barrier); break; default: - if 
(nfs4_async_handle_error(task, data->res.server, NULL) == - -EAGAIN) { + if (nfs4_async_handle_error(task, data->res.server, + NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return; } @@ -5372,7 +5389,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) case -NFS4ERR_EXPIRED: break; default: - if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, calldata->server, + NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); @@ -5978,7 +5996,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) break; case -NFS4ERR_LEASE_MOVED: case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, server, + NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } } @@ -7583,14 +7602,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) } else { LIST_HEAD(head); + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); spin_unlock(&inode->i_lock); - /* Mark the bad layout state as invalid, then - * retry using the open stateid. 
*/ pnfs_free_lseg_list(&head); + + task->tk_status = 0; + rpc_restart_call_prepare(task); } } - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) rpc_restart_call_prepare(task); out: dprintk("<-- %s\n", __func__); @@ -7750,7 +7774,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) case 0: break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) + if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; rpc_restart_call_prepare(task); return; @@ -7809,54 +7833,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) return status; } -/* - * Retrieve the list of Data Server devices from the MDS. - */ -static int _nfs4_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist) -{ - struct nfs4_getdevicelist_args args = { - .fh = fh, - .layoutclass = server->pnfs_curr_ld->id, - }; - struct nfs4_getdevicelist_res res = { - .devlist = devlist, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], - .rpc_argp = &args, - .rpc_resp = &res, - }; - int status; - - dprintk("--> %s\n", __func__); - status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, - &res.seq_res, 0); - dprintk("<-- %s status=%d\n", __func__, status); - return status; -} - -int nfs4_proc_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist) -{ - struct nfs4_exception exception = { }; - int err; - - do { - err = nfs4_handle_exception(server, - _nfs4_getdevicelist(server, fh, devlist), - &exception); - } while (exception.retry); - - dprintk("%s: err=%d, num_devs=%u\n", __func__, - err, devlist->num_devs); - - return err; -} -EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); - static int _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev, @@ -7929,7 +7905,7 @@ 
nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case 0: break; default: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return; } @@ -8225,7 +8201,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 87b2d0e79797..5194933ed419 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2345,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_check_lease(clp); if (status < 0) goto out_error; + continue; } if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e13b59d8d9aa..005d03c5d274 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) -#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ - encode_verifier_maxsz) -#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ - 2 /* nfs_cookie4 gdlr_cookie */ + \ - decode_verifier_maxsz \ - /* verifier4 gdlr_verifier */ + \ - 1 /* gdlr_deviceid_list count */ + \ - XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ - NFS4_DEVICEID4_SIZE) \ - /* gdlr_deviceid_list */ + \ - 1 /* bool gdlr_eof */) -#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ - XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* layout type */ + \ + 1 /* maxcount */ + \ + 1 /* bitmap size */ + \ + 1 
/* notification bitmap length */ + \ + 1 /* notification bitmap, word 0 */) #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 1 /* layout type */ + \ 1 /* opaque devaddr4 length */ + \ /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ - 1 /* notification bitmap */) + 1 /* notification bitmap, word 0 */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ @@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int); 2 /* last byte written */ + \ 1 /* nt_timechanged (false) */ + \ 1 /* layoutupdate4 layout type */ + \ - 1 /* NULL filelayout layoutupdate4 payload */) + 1 /* layoutupdate4 opaqueue len */) + /* the actual content of layoutupdate4 should + be allocated by drivers and spliced in + using xdr_write_pages */ #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ @@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) -#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ - encode_sequence_maxsz + \ - encode_putfh_maxsz + \ - encode_getdevicelist_maxsz) -#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_putfh_maxsz + \ - decode_getdevicelist_maxsz) #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz +\ encode_getdeviceinfo_maxsz) @@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr, #ifdef CONFIG_NFS_V4_1 static void -encode_getdevicelist(struct xdr_stream *xdr, - const struct nfs4_getdevicelist_args *args, - struct compound_hdr *hdr) -{ - __be32 *p; - nfs4_verifier dummy = { - .data = "dummmmmy", - }; - - encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr); - p = reserve_space(xdr, 
16); - *p++ = cpu_to_be32(args->layoutclass); - *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); - xdr_encode_hyper(p, 0ULL); /* cookie */ - encode_nfs4_verifier(xdr, &dummy); -} - -static void encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfs4_getdeviceinfo_args *args, struct compound_hdr *hdr) @@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr, __be32 *p; encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); - p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4); p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(args->pdev->layout_type); *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ - *p++ = cpu_to_be32(0); /* bitmap length 0 */ + + p = reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); } static void @@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr, static int encode_layoutcommit(struct xdr_stream *xdr, struct inode *inode, - const struct nfs4_layoutcommit_args *args, + struct nfs4_layoutcommit_args *args, struct compound_hdr *hdr) { __be32 *p; @@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( NFS_I(inode)->layout, xdr, args); - else - encode_uint32(xdr, 0); /* no layout-type payload */ + } else { + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) { + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); + } + } return 0; } @@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, } /* - * Encode 
GETDEVICELIST request - */ -static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, - struct xdr_stream *xdr, - struct nfs4_getdevicelist_args *args) -{ - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->seq_args), - }; - - encode_compound_hdr(xdr, req, &hdr); - encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->fh, &hdr); - encode_getdevicelist(xdr, args, &hdr); - encode_nops(&hdr); -} - -/* * Encode GETDEVICEINFO request */ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, @@ -5765,54 +5726,6 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) -/* - * TODO: Need to handle case when EOF != true; - */ -static int decode_getdevicelist(struct xdr_stream *xdr, - struct pnfs_devicelist *res) -{ - __be32 *p; - int status, i; - nfs4_verifier verftemp; - - status = decode_op_hdr(xdr, OP_GETDEVICELIST); - if (status) - return status; - - p = xdr_inline_decode(xdr, 8 + 8 + 4); - if (unlikely(!p)) - goto out_overflow; - - /* TODO: Skip cookie for now */ - p += 2; - - /* Read verifier */ - p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE); - - res->num_devs = be32_to_cpup(p); - - dprintk("%s: num_dev %d\n", __func__, res->num_devs); - - if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { - printk(KERN_ERR "NFS: %s too many result dev_num %u\n", - __func__, res->num_devs); - return -EIO; - } - - p = xdr_inline_decode(xdr, - res->num_devs * NFS4_DEVICEID4_SIZE + 4); - if (unlikely(!p)) - goto out_overflow; - for (i = 0; i < res->num_devs; i++) - p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, - NFS4_DEVICEID4_SIZE); - res->eof = be32_to_cpup(p); - return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -} - static int decode_getdeviceinfo(struct xdr_stream *xdr, struct pnfs_device *pdev) { @@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) goto out_overflow; - for (i = 0; i < len; i++, p++) { - if 
(be32_to_cpup(p)) { - dprintk("%s: notifications not supported\n", + + if (be32_to_cpup(p++) & + ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { + dprintk("%s: unsupported notification\n", + __func__); + } + + for (i = 1; i < len; i++) { + if (be32_to_cpup(p++)) { + dprintk("%s: unsupported notification\n", __func__); return -EIO; } @@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, } /* - * Decode GETDEVICELIST response - */ -static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, - struct xdr_stream *xdr, - struct nfs4_getdevicelist_res *res) -{ - struct compound_hdr hdr; - int status; - - dprintk("encoding getdevicelist!\n"); - - status = decode_compound_hdr(xdr, &hdr); - if (status != 0) - goto out; - status = decode_sequence(xdr, &res->seq_res, rqstp); - if (status != 0) - goto out; - status = decode_putfh(xdr); - if (status != 0) - goto out; - status = decode_getdevicelist(xdr, res->devlist); -out: - return status; -} - -/* * Decode GETDEVINFO response */ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, @@ -7490,7 +7384,6 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), - PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), PROC(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index ae05278b3761..c6e4bda63000 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) kfree(de); } -static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, - const struct nfs4_deviceid *d_id) -{ - struct nfs4_deviceid_node *d; - struct objio_dev_ent *de; - - d = 
nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - if (!d) - return NULL; - - de = container_of(d, struct objio_dev_ent, id_node); - return de; -} - -static struct objio_dev_ent * -_dev_list_add(const struct nfs_server *nfss, - const struct nfs4_deviceid *d_id, struct osd_dev *od, - gfp_t gfp_flags) -{ - struct nfs4_deviceid_node *d; - struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); - struct objio_dev_ent *n; - - if (!de) { - dprintk("%s: -ENOMEM od=%p\n", __func__, od); - return NULL; - } - - dprintk("%s: Adding od=%p\n", __func__, od); - nfs4_init_deviceid_node(&de->id_node, - nfss->pnfs_curr_ld, - nfss->nfs_client, - d_id); - de->od.od = od; - - d = nfs4_insert_deviceid_node(&de->id_node); - n = container_of(d, struct objio_dev_ent, id_node); - if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); - objio_free_deviceid_node(&de->id_node); - de = n; - } - - return de; -} - struct objio_segment { struct pnfs_layout_segment lseg; @@ -130,29 +84,24 @@ struct objio_state { /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, - gfp_t gfp_flags) +struct nfs4_deviceid_node * +objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct objio_dev_ent *ode; + struct objio_dev_ent *ode = NULL; struct osd_dev *od; struct osd_dev_info odi; bool retry_flag = true; + __be32 *p; int err; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) { - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ - return 0; - } + deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); + if (!deviceaddr) + return NULL; - err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); - if (unlikely(err)) { - 
dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", - __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return err; - } + p = page_address(pdev->pages[0]); + pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { @@ -188,14 +137,24 @@ retry_lookup: goto out; } - ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, - gfp_flags); - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ dprintk("Adding new dev_id(%llx:%llx)\n", - _DEVID_LO(d_id), _DEVID_HI(d_id)); + _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); + + ode = kzalloc(sizeof(*ode), gfp_flags); + if (!ode) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + goto out; + } + + nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); + kfree(deviceaddr); + + ode->od.od = od; + return &ode->id_node; + out: - objlayout_put_deviceinfo(deviceaddr); - return err; + kfree(deviceaddr); + return NULL; } static void copy_single_comp(struct ore_components *oc, unsigned c, @@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct xdr_stream *xdr, gfp_t gfp_flags) { + struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; @@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, objio_seg->oc.first_dev = layout.olo_comps_index; cur_comp = 0; while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + struct nfs4_deviceid_node *d; + struct objio_dev_ent *ode; + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); - err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, - &src_comp.oc_object_id.oid_device_id, - gfp_flags); - if (err) + + d = nfs4_find_get_deviceid(server, + &src_comp.oc_object_id.oid_device_id, + pnfslay->plh_lc_cred, gfp_flags); + if (!d) { + err = -ENXIO; goto err; - ++cur_comp; + } + + ode = container_of(d, 
struct objio_dev_ent, id_node); + objio_seg->oc.ods[cur_comp++] = &ode->od; } /* pnfs_osd_xdr_decode_layout_comp returns false on error */ if (unlikely(err)) @@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .flags = PNFS_LAYOUTRET_ON_SETATTR | PNFS_LAYOUTRET_ON_ERROR, + .max_deviceinfo_size = PAGE_SIZE, .owner = THIS_MODULE, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 697a16d11fac..c89357c7a914 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -574,76 +574,6 @@ loop_done: dprintk("%s: Return\n", __func__); } - -/* - * Get Device Info API for io engines - */ -struct objlayout_deviceinfo { - struct page *page; - struct pnfs_osd_deviceaddr da; /* This must be last */ -}; - -/* Initialize and call nfs_getdeviceinfo, then decode and return a - * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() - * should be called. 
- */ -int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, - gfp_t gfp_flags) -{ - struct objlayout_deviceinfo *odi; - struct pnfs_device pd; - struct page *page, **pages; - u32 *p; - int err; - - page = alloc_page(gfp_flags); - if (!page) - return -ENOMEM; - - pages = &page; - pd.pages = pages; - - memcpy(&pd.dev_id, d_id, sizeof(*d_id)); - pd.layout_type = LAYOUT_OSD2_OBJECTS; - pd.pages = &page; - pd.pgbase = 0; - pd.pglen = PAGE_SIZE; - pd.mincount = 0; - pd.maxcount = PAGE_SIZE; - - err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd, - pnfslay->plh_lc_cred); - dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); - if (err) - goto err_out; - - p = page_address(page); - odi = kzalloc(sizeof(*odi), gfp_flags); - if (!odi) { - err = -ENOMEM; - goto err_out; - } - pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); - odi->page = page; - *deviceaddr = &odi->da; - return 0; - -err_out: - __free_page(page); - return err; -} - -void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) -{ - struct objlayout_deviceinfo *odi = container_of(deviceaddr, - struct objlayout_deviceinfo, - da); - - __free_page(odi->page); - kfree(odi); -} - enum { OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index fd13f1d2f136..3a0828d57339 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir, extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, - gfp_t gfp_flags); -extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); - /* * exported generic 
objects function vectors */ diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index be7cbce6e4c7..94e16ec88312 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -481,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, return 0; } + /* + * Limit the request size so that we can still allocate a page array + * for it without upsetting the slab allocator. + */ + if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * + sizeof(struct page) > PAGE_SIZE) + return 0; + return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a3851debf8a2..76de7f568119 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -594,6 +594,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, dprintk("%s freeing layout for inode %lu\n", __func__, lo->plh_inode->i_ino); inode = lo->plh_inode; + + pnfs_layoutcommit_inode(inode, false); + spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ @@ -682,17 +685,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) return (s32)(s1 - s2) > 0; } -static void -pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new, - struct list_head *free_me_list) -{ - if (nfs4_stateid_match_other(&lo->plh_stateid, new)) - return; - /* Layout is new! 
Kill existing layout segments */ - pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -749,7 +741,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; - } else if (list_empty(&lo->plh_segs)) { + } else if (list_empty(&lo->plh_segs) || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { int seq; do { @@ -864,6 +857,16 @@ _pnfs_return_layout(struct inode *ino) empty = list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); + } + /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); @@ -871,6 +874,8 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout segments to return\n", __func__); goto out; } + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); lo->plh_block_lgets++; spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -1358,25 +1363,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out; } + init_lseg(lo, lseg); + lseg->pls_range = res->range; + spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { dprintk("%s forget reply due to recall\n", __func__); goto out_forget_reply; } - if (pnfs_layoutgets_blocked(lo, 1) || - pnfs_layout_stateid_blocked(lo, &res->stateid)) { + if (pnfs_layoutgets_blocked(lo, 1)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } - /* Check that the new stateid matches the old stateid */ - pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); - /* Done 
processing layoutget. Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid, false); + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + /* existing state ID, make sure the sequence number matches. */ + if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + dprintk("%s forget reply due to sequence\n", __func__); + goto out_forget_reply; + } + pnfs_set_layout_stateid(lo, &res->stateid, false); + } else { + /* + * We got an entirely new state ID. Mark all segments for the + * inode invalid, and don't bother validating the stateid + * sequence number. + */ + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + + nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); + lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + } + + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - init_lseg(lo, lseg); - lseg->pls_range = res->range; pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg); @@ -1797,6 +1818,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); +void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) +{ + struct inode *inode = data->inode; + struct nfs_inode *nfsi = NFS_I(inode); + bool mark_as_dirty = false; + + spin_lock(&inode->i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + mark_as_dirty = true; + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, inode->i_ino); + } + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + pnfs_get_lseg(data->lseg); + } + if (data->lwb > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = data->lwb; + spin_unlock(&inode->i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, data->lseg, nfsi->layout->plh_lwb); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(inode); +} 
+EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); + void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { struct nfs_server *nfss = NFS_SERVER(data->args.inode); @@ -1817,6 +1867,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; @@ -1867,6 +1918,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) data->args.lastbytewritten = end_pos - 1; data->res.server = NFS_SERVER(inode); + if (ld->prepare_layoutcommit) { + status = ld->prepare_layoutcommit(&data->args); + if (status) { + spin_lock(&inode->i_lock); + if (end_pos < nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + spin_unlock(&inode->i_lock); + put_rpccred(data->cred); + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + goto clear_layoutcommitting; + } + } + + status = nfs4_proc_layoutcommit(data, sync); out: if (status) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index aca3dff5dae6..693ce42ec683 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -65,12 +65,15 @@ enum { NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_RETURN, /* Return this layout ASAP */ + NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ }; enum layoutdriver_policy_flags { - /* Should the pNFS client commit and return the layout upon a setattr */ + /* Should the pNFS client commit and return the layout upon truncate to + * a smaller size */ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, PNFS_LAYOUTRET_ON_ERROR = 1 << 1, + PNFS_READ_WHOLE_PAGE = 1 << 2, }; struct nfs4_deviceid_node; @@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type { const char *name; struct module *owner; unsigned flags; + unsigned max_deviceinfo_size; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); int 
(*clear_layoutdriver) (struct nfs_server *); @@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); + void (*return_range) (struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range); + /* test for nfs page cache coalescing */ const struct nfs_pageio_ops *pg_read_ops; const struct nfs_pageio_ops *pg_write_ops; @@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type { enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int); void (*free_deviceid_node) (struct nfs4_deviceid_node *); + struct nfs4_deviceid_node * (*alloc_deviceid_node) + (struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags); void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); - - void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, + int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); + void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); }; @@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); /* nfs4proc.c */ -extern int nfs4_proc_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); @@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); void pnfs_set_layoutcommit(struct nfs_pgio_header *); +void pnfs_commit_set_layoutcommit(struct 
nfs_commit_data *data); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); @@ -255,11 +263,12 @@ struct nfs4_deviceid_node { atomic_t ref; }; -struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); -void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, - const struct pnfs_layoutdriver_type *, - const struct nfs_client *, +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, const struct nfs4_deviceid *); struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); @@ -267,6 +276,13 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); +static inline struct nfs4_deviceid_node * +nfs4_get_deviceid(struct nfs4_deviceid_node *d) +{ + atomic_inc(&d->ref); + return d; +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -368,6 +384,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) } static inline bool +pnfs_ld_read_whole_page(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; +} + +static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); @@ -443,6 +467,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) } static inline bool 
+pnfs_ld_read_whole_page(struct inode *inode) +{ + return false; +} + +static inline bool pnfs_roc(struct inode *ino) { return false; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6da209bd9408..aa2ec0015183 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -29,6 +29,9 @@ */ #include <linux/export.h> +#include <linux/nfs_fs.h> +#include "nfs4session.h" +#include "internal.h" #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, return NULL; } +static struct nfs4_deviceid_node * +nfs4_get_device_info(struct nfs_server *server, + const struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d = NULL; + struct pnfs_device *pdev = NULL; + struct page **pages = NULL; + u32 max_resp_sz; + int max_pages; + int rc, i; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + if (server->pnfs_curr_ld->max_deviceinfo_size && + server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz) + max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size; + max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s: server %p max_resp_sz %u max_pages %d\n", + __func__, server, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(*pdev), gfp_flags); + if (!pdev) + return NULL; + + pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags); + if (!pages) + goto out_free_pdev; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_free_pages; + } + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = server->pnfs_curr_ld->id; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = max_resp_sz; + pdev->mincount = 0; + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + + rc = nfs4_proc_getdeviceinfo(server, pdev, cred); + dprintk("%s getdevice info returns 
%d\n", __func__, rc); + if (rc) + goto out_free_pages; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, + gfp_flags); + +out_free_pages: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); +out_free_pdev: + kfree(pdev); + dprintk("<-- %s d %p\n", __func__, d); + return d; +} + /* * Lookup a deviceid in cache and get a reference count on it if found * @@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, * @id deviceid to look up */ static struct nfs4_deviceid_node * -_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id, - long hash) +__nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, long hash) { struct nfs4_deviceid_node *d; rcu_read_lock(); - d = _lookup_deviceid(ld, clp, id, hash); + d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, + hash); if (d != NULL) atomic_inc(&d->ref); rcu_read_unlock(); @@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld, } struct nfs4_deviceid_node * -nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id) +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) { - return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + long hash = nfs4_deviceid_hash(id); + struct nfs4_deviceid_node *d, *new; + + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) + return d; + + new = nfs4_get_device_info(server, id, cred, gfp_mask); + if (!new) + return new; + + spin_lock(&nfs4_deviceid_lock); + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + server->pnfs_curr_ld->free_deviceid_node(new); + return d; + } + 
hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + atomic_inc(&new->ref); + spin_unlock(&nfs4_deviceid_lock); + + return new; } EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); @@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); void -nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, - const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *nfs_client, +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server, const struct nfs4_deviceid *id) { INIT_HLIST_NODE(&d->node); INIT_HLIST_NODE(&d->tmpnode); - d->ld = ld; - d->nfs_client = nfs_client; + d->ld = server->pnfs_curr_ld; + d->nfs_client = server->nfs_client; d->flags = 0; d->deviceid = *id; atomic_set(&d->ref, 1); @@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); /* - * Uniquely initialize and insert a deviceid node into cache - * - * @new new deviceid node - * Note that the caller must set up the following members: - * new->ld - * new->nfs_client - * new->deviceid - * - * @ret the inserted node, if none found, otherwise, the found entry. - */ -struct nfs4_deviceid_node * -nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) -{ - struct nfs4_deviceid_node *d; - long hash; - - spin_lock(&nfs4_deviceid_lock); - hash = nfs4_deviceid_hash(&new->deviceid); - d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); - if (d) { - spin_unlock(&nfs4_deviceid_lock); - return d; - } - - hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); - spin_unlock(&nfs4_deviceid_lock); - atomic_inc(&new->ref); - - return new; -} -EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); - -/* * Dereference a deviceid node and delete it when its reference count drops * to zero. 
* @@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) } rcu_read_unlock(); } - diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e4499d5b51e8..31a11b0e885d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options, return NFS_TEXT_DATA; } -#if !IS_ENABLED(CONFIG_NFS_V3) - if (args->version == 3) - goto out_v3_not_compiled; -#endif /* !CONFIG_NFS_V3 */ - return 0; out_no_data: @@ -2085,12 +2080,6 @@ out_no_sec: dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); return -EINVAL; -#if !IS_ENABLED(CONFIG_NFS_V3) -out_v3_not_compiled: - dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); - return -EPROTONOSUPPORT; -#endif /* !CONFIG_NFS_V3 */ - out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 175d5d073ccf..12493846a2d3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_clear_request_commit(struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) } /* - * nfs_page_search_commits_for_head_request_locked - * - * Search through commit lists on @inode for the head request for @page. - * Must be called while holding the inode (which is cinfo) lock. - * - * Returns the head request if found, or NULL if not found. 
- */ -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page) -{ - struct nfs_page *freq, *t; - struct nfs_commit_info cinfo; - struct inode *inode = &nfsi->vfs_inode; - - nfs_init_cinfo_from_inode(&cinfo, inode); - - /* search through pnfs commit lists */ - freq = pnfs_search_commit_reqs(inode, &cinfo, page); - if (freq) - return freq->wb_head; - - /* Linearly search the commit list for the correct request */ - list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - - return NULL; -} - -/* * nfs_page_find_head_request_locked - find head request associated with @page * * must be called while holding the inode lock. @@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req) static int wb_priority(struct writeback_control *wbc) { + int ret = 0; if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; + if (wbc->sync_mode == WB_SYNC_ALL) + ret = FLUSH_COND_STABLE; if (wbc->for_kupdate || wbc->for_background) - return FLUSH_LOWPRI | FLUSH_COND_STABLE; - return FLUSH_COND_STABLE; + ret |= FLUSH_LOWPRI; + return ret; } /* @@ -731,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (likely(!PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); + smp_mb__after_atomic(); + wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } nfsi->npages--; @@ -749,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req) __set_page_dirty_nobuffers(req->wb_page); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_page_search_commits_for_head_request_locked + * + * Search through commit lists on @inode for the head request for @page. + * Must be called while holding the inode (which is cinfo) lock. + * + * Returns the head request if found, or NULL if not found. 
+ */ +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page) +{ + struct nfs_page *freq, *t; + struct nfs_commit_info cinfo; + struct inode *inode = &nfsi->vfs_inode; + + nfs_init_cinfo_from_inode(&cinfo, inode); + + /* search through pnfs commit lists */ + freq = pnfs_search_commit_reqs(inode, &cinfo, page); + if (freq) + return freq->wb_head; + + /* Linearly search the commit list for the correct request */ + list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + + return NULL; +} + /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page @@ -867,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr) return hdr->verf.committed != NFS_FILE_SYNC; } -#else -static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, - struct inode *inode) -{ -} - -void nfs_init_cinfo(struct nfs_commit_info *cinfo, - struct inode *inode, - struct nfs_direct_req *dreq) -{ -} - -void -nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) -{ -} - -static void -nfs_clear_request_commit(struct nfs_page *req) -{ -} - -int nfs_write_need_commit(struct nfs_pgio_header *hdr) -{ - return 0; -} - -#endif - static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; @@ -932,7 +909,6 @@ out: hdr->release(hdr); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { @@ -989,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, return ret; } -#else -unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) -{ - return 0; -} - -int nfs_scan_commit(struct inode *inode, struct list_head *dst, - struct nfs_commit_info *cinfo) -{ - return 0; -} -#endif - /* * Search for an existing write request, and attempt to update * it 
to reflect a new dirty region on a given page. @@ -1394,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task, return status; nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) if (hdr->res.verf->committed < hdr->args.stable && task->tk_status >= 0) { /* We tried a write call, but the server did not @@ -1416,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task, complain = jiffies + 300 * HZ; } } -#endif /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) @@ -1469,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task, } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) { int ret; @@ -1538,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, } EXPORT_SYMBOL_GPL(nfs_initiate_commit); +static loff_t nfs_get_lwb(struct list_head *head) +{ + loff_t lwb = 0; + struct nfs_page *req; + + list_for_each_entry(req, head, wb_list) + if (lwb < (req_offset(req) + req->wb_bytes)) + lwb = req_offset(req) + req->wb_bytes; + + return lwb; +} + /* * Set up the argument/result storage required for the RPC call. 
*/ @@ -1557,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data, data->inode = inode; data->cred = first->wb_context->cred; data->lseg = lseg; /* reference transferred */ + /* only set lwb for pnfs commit */ + if (lseg) + data->lwb = nfs_get_lwb(&data->pages); data->mds_ops = &nfs_commit_ops; data->completion_ops = cinfo->completion_ops; data->dreq = cinfo->dreq; @@ -1636,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; struct nfs_commit_info cinfo; + struct nfs_server *nfss; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1669,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) next: nfs_unlock_and_release_request(req); } + nfss = NFS_SERVER(data->inode); + if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + nfs_init_cinfo(&cinfo, data->inode, data->dreq); if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) nfs_commit_clear_lock(NFS_I(data->inode)); @@ -1778,12 +1758,6 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } -#else -static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) -{ - return 0; -} -#endif int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 5180a7ededec..28d649054d5f 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -443,22 +443,6 @@ static inline struct rpc_cred *nfs_file_cred(struct file *file) } /* - * linux/fs/nfs/xattr.c - */ -#ifdef CONFIG_NFS_V3_ACL -extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); -extern ssize_t nfs3_getxattr(struct dentry *, const char *, void *, size_t); -extern int nfs3_setxattr(struct dentry *, const char *, - const void *, size_t, int); -extern int nfs3_removexattr (struct dentry *, const char *name); 
-#else -# define nfs3_listxattr NULL -# define nfs3_getxattr NULL -# define nfs3_setxattr NULL -# define nfs3_removexattr NULL -#endif - -/* * linux/fs/nfs/direct.c */ extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t); @@ -529,17 +513,9 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); -#else -static inline int -nfs_commit_inode(struct inode *inode, int how) -{ - return 0; -} -#endif static inline int nfs_have_writebacks(struct inode *inode) @@ -557,23 +533,6 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); /* - * linux/fs/nfs3proc.c - */ -#ifdef CONFIG_NFS_V3_ACL -extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); -extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl); -extern const struct xattr_handler *nfs3_xattr_handlers[]; -#else -static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, - struct posix_acl *dfacl) -{ - return 0; -} -#endif /* CONFIG_NFS_V3_ACL */ - -/* * inline functions */ diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 0040629894df..6951c7d9097d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -252,17 +252,6 @@ struct nfs4_layoutget { gfp_t gfp_flags; }; -struct nfs4_getdevicelist_args { - struct nfs4_sequence_args seq_args; - const struct nfs_fh *fh; - u32 layoutclass; -}; - -struct nfs4_getdevicelist_res { - struct nfs4_sequence_res 
seq_res; - struct pnfs_devicelist *devlist; -}; - struct nfs4_getdeviceinfo_args { struct nfs4_sequence_args seq_args; struct pnfs_device *pdev; @@ -279,6 +268,9 @@ struct nfs4_layoutcommit_args { __u64 lastbytewritten; struct inode *inode; const u32 *bitmask; + size_t layoutupdate_len; + struct page *layoutupdate_page; + struct page **layoutupdate_pages; }; struct nfs4_layoutcommit_res { @@ -1328,6 +1320,7 @@ struct nfs_commit_data { struct pnfs_layout_segment *lseg; struct nfs_client *ds_clp; /* pNFS data server */ int ds_commit_index; + loff_t lwb; const struct rpc_call_ops *mds_ops; const struct nfs_commit_completion_ops *completion_ops; int (*commit_done_cb) (struct rpc_task *task, struct nfs_commit_data *data); @@ -1346,6 +1339,7 @@ struct nfs_unlinkdata { struct inode *dir; struct rpc_cred *cred; struct nfs_fattr dir_attr; + long timeout; }; struct nfs_renamedata { @@ -1359,6 +1353,7 @@ struct nfs_renamedata { struct dentry *new_dentry; struct nfs_fattr new_fattr; void (*complete)(struct rpc_task *, struct nfs_renamedata *); + long timeout; }; struct nfs_access_entry; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 3df8c7db7a4e..2dca0cef3506 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -496,12 +496,14 @@ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, } /* - * This is exported only for wait_on_page_locked/wait_on_page_writeback. - * Never use this directly! + * This is exported only for wait_on_page_locked/wait_on_page_writeback, + * and for filesystems which need to wait on PG_private. 
*/ extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); +extern int wait_on_page_bit_killable_timeout(struct page *page, + int bit_nr, unsigned long timeout); static inline int wait_on_page_locked_killable(struct page *page) { @@ -510,6 +512,12 @@ static inline int wait_on_page_locked_killable(struct page *page) return 0; } +extern wait_queue_head_t *page_waitqueue(struct page *page); +static inline void wake_up_page(struct page *page, int bit) +{ + __wake_up_bit(page_waitqueue(page), &page->flags, bit); +} + /* * Wait for a page to be unlocked. * diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index fcbfe8783243..cf391eef2e6d 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -357,6 +357,7 @@ int xs_swapper(struct rpc_xprt *xprt, int enable); #define XPRT_CONNECTION_ABORT (7) #define XPRT_CONNECTION_CLOSE (8) #define XPRT_CONGESTED (9) +#define XPRT_CONNECTION_REUSE (10) static inline void xprt_set_connected(struct rpc_xprt *xprt) { diff --git a/include/linux/wait.h b/include/linux/wait.h index 6fb1ba5f9b2f..80115bf88671 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -25,7 +25,7 @@ struct wait_bit_key { void *flags; int bit_nr; #define WAIT_ATOMIC_T_BIT_NR -1 - unsigned long private; + unsigned long timeout; }; struct wait_bit_queue { @@ -154,6 +154,7 @@ int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, wait_bit_ac void wake_up_bit(void *, int); void wake_up_atomic_t(atomic_t *); int out_of_line_wait_on_bit(void *, int, wait_bit_action_f *, unsigned); +int out_of_line_wait_on_bit_timeout(void *, int, wait_bit_action_f *, unsigned, unsigned long); int out_of_line_wait_on_bit_lock(void *, int, wait_bit_action_f *, unsigned); int out_of_line_wait_on_atomic_t(atomic_t *, int (*)(atomic_t *), unsigned); wait_queue_head_t *bit_waitqueue(void *, int); @@ -859,6 +860,8 @@ int wake_bit_function(wait_queue_t 
*wait, unsigned mode, int sync, void *key); extern int bit_wait(struct wait_bit_key *); extern int bit_wait_io(struct wait_bit_key *); +extern int bit_wait_timeout(struct wait_bit_key *); +extern int bit_wait_io_timeout(struct wait_bit_key *); /** * wait_on_bit - wait for a bit to be cleared diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 15cab1a4f84e..5a62915f47a8 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit, } EXPORT_SYMBOL(out_of_line_wait_on_bit); +int __sched out_of_line_wait_on_bit_timeout( + void *word, int bit, wait_bit_action_f *action, + unsigned mode, unsigned long timeout) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + wait.key.timeout = jiffies + timeout; + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); + int __sched __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, wait_bit_action_f *action, unsigned mode) @@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word) return 0; } EXPORT_SYMBOL(bit_wait_io); + +__sched int bit_wait_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_timeout); + +__sched int bit_wait_io_timeout(struct wait_bit_key *word) +{ + unsigned long now = ACCESS_ONCE(jiffies); + if (signal_pending_state(current->state, current)) + return 1; + if (time_after_eq(now, word->timeout)) + return -EAGAIN; + io_schedule_timeout(word->timeout - now); + return 0; +} +EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/mm/filemap.c b/mm/filemap.c index 90effcdf948d..b9b1413080be 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -670,17 +670,13 @@ 
EXPORT_SYMBOL(__page_cache_alloc); * at a cost of "thundering herd" phenomena during rare hash * collisions. */ -static wait_queue_head_t *page_waitqueue(struct page *page) +wait_queue_head_t *page_waitqueue(struct page *page) { const struct zone *zone = page_zone(page); return &zone->wait_table[hash_ptr(page, zone->wait_table_bits)]; } - -static inline void wake_up_page(struct page *page, int bit) -{ - __wake_up_bit(page_waitqueue(page), &page->flags, bit); -} +EXPORT_SYMBOL(page_waitqueue); void wait_on_page_bit(struct page *page, int bit_nr) { @@ -703,6 +699,19 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr) bit_wait_io, TASK_KILLABLE); } +int wait_on_page_bit_killable_timeout(struct page *page, + int bit_nr, unsigned long timeout) +{ + DEFINE_WAIT_BIT(wait, &page->flags, bit_nr); + + wait.key.timeout = jiffies + timeout; + if (!test_bit(bit_nr, &page->flags)) + return 0; + return __wait_on_bit(page_waitqueue(page), &wait, + bit_wait_io_timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL_GPL(wait_on_page_bit_killable_timeout); + /** * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue * @page: Page defining the wait queue of interest diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index e0b94ce4c4e6..9acd6ce88db7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1916,6 +1916,7 @@ call_transmit_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -EPERM: if (RPC_IS_SOFTCONN(task)) { xprt_end_transmit(task); rpc_exit(task, task->tk_status); @@ -2021,6 +2022,7 @@ call_status(struct rpc_task *task) case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + case -EPERM: if (RPC_IS_SOFTCONN(task)) { rpc_exit(task, status); break; diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9358c79fd589..fe3441abdbe5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -821,9 +821,7 @@ void rpc_execute(struct rpc_task *task) static void rpc_async_schedule(struct work_struct *work) { - 
current->flags |= PF_FSTRANS; __rpc_execute(container_of(work, struct rpc_task, u.tk_work)); - current->flags &= ~PF_FSTRANS; } /** diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 2faac4940563..6a4615dd0261 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -205,7 +205,6 @@ xprt_rdma_connect_worker(struct work_struct *work) struct rpc_xprt *xprt = &r_xprt->xprt; int rc = 0; - current->flags |= PF_FSTRANS; xprt_clear_connected(xprt); dprintk("RPC: %s: %sconnect\n", __func__, @@ -216,7 +215,6 @@ xprt_rdma_connect_worker(struct work_struct *work) dprintk("RPC: %s: exit\n", __func__); xprt_clear_connecting(xprt); - current->flags &= ~PF_FSTRANS; } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 43cd89eacfab..3b305ab17afe 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -399,13 +399,13 @@ static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, return kernel_sendmsg(sock, &msg, NULL, 0, 0); } -static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy) +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more, bool zerocopy, int *sent_p) { ssize_t (*do_sendpage)(struct socket *sock, struct page *page, int offset, size_t size, int flags); struct page **ppage; unsigned int remainder; - int err, sent = 0; + int err; remainder = xdr->page_len - base; base += xdr->page_base; @@ -424,15 +424,15 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i err = do_sendpage(sock, *ppage, base, len, flags); if (remainder == 0 || err != len) break; - sent += err; + *sent_p += err; ppage++; base = 0; } - if (sent == 0) - return err; - if (err > 0) - sent += err; - return sent; + if (err > 0) { + *sent_p += err; + err = 0; + } + return err; } /** @@ -443,12 +443,14 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf 
*xdr, unsigned i * @xdr: buffer containing this request * @base: starting position in the buffer * @zerocopy: true if it is safe to use sendpage() + * @sent_p: return the total number of bytes successfully queued for sending * */ -static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy) +static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, bool zerocopy, int *sent_p) { unsigned int remainder = xdr->len - base; - int err, sent = 0; + int err = 0; + int sent = 0; if (unlikely(!sock)) return -ENOTSOCK; @@ -465,7 +467,7 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0); if (remainder == 0 || err != len) goto out; - sent += err; + *sent_p += err; base = 0; } else base -= xdr->head[0].iov_len; @@ -473,23 +475,23 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, if (base < xdr->page_len) { unsigned int len = xdr->page_len - base; remainder -= len; - err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy); - if (remainder == 0 || err != len) + err = xs_send_pagedata(sock, xdr, base, remainder != 0, zerocopy, &sent); + *sent_p += sent; + if (remainder == 0 || sent != len) goto out; - sent += err; base = 0; } else base -= xdr->page_len; if (base >= xdr->tail[0].iov_len) - return sent; + return 0; err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0); out: - if (sent == 0) - return err; - if (err > 0) - sent += err; - return sent; + if (err > 0) { + *sent_p += err; + err = 0; + } + return err; } static void xs_nospace_callback(struct rpc_task *task) @@ -573,19 +575,20 @@ static int xs_local_send_request(struct rpc_task *task) container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; int status; + int sent = 0; 
xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); - status = xs_sendpages(transport->sock, NULL, 0, - xdr, req->rq_bytes_sent, true); + status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, + true, &sent); dprintk("RPC: %s(%u) = %d\n", __func__, xdr->len - req->rq_bytes_sent, status); - if (likely(status >= 0)) { - req->rq_bytes_sent += status; - req->rq_xmit_bytes_sent += status; + if (likely(sent > 0) || status == 0) { + req->rq_bytes_sent += sent; + req->rq_xmit_bytes_sent += sent; if (likely(req->rq_bytes_sent >= req->rq_slen)) { req->rq_bytes_sent = 0; return 0; @@ -626,6 +629,7 @@ static int xs_udp_send_request(struct rpc_task *task) struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; + int sent = 0; int status; xs_pktdump("packet data:", @@ -634,22 +638,25 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; - status = xs_sendpages(transport->sock, - xs_addr(xprt), - xprt->addrlen, xdr, - req->rq_bytes_sent, true); + status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, + xdr, req->rq_bytes_sent, true, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", xdr->len - req->rq_bytes_sent, status); - if (status >= 0) { - req->rq_xmit_bytes_sent += status; - if (status >= req->rq_slen) + /* firewall is blocking us, don't return -EAGAIN or we end up looping */ + if (status == -EPERM) + goto process_status; + + if (sent > 0 || status == 0) { + req->rq_xmit_bytes_sent += sent; + if (sent >= req->rq_slen) return 0; /* Still some bytes left; set up for a retry later. 
*/ status = -EAGAIN; } +process_status: switch (status) { case -ENOTSOCK: status = -ENOTCONN; @@ -665,6 +672,7 @@ static int xs_udp_send_request(struct rpc_task *task) case -ENOBUFS: case -EPIPE: case -ECONNREFUSED: + case -EPERM: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags); @@ -713,6 +721,7 @@ static int xs_tcp_send_request(struct rpc_task *task) struct xdr_buf *xdr = &req->rq_snd_buf; bool zerocopy = true; int status; + int sent; xs_encode_stream_record_marker(&req->rq_snd_buf); @@ -730,26 +739,26 @@ static int xs_tcp_send_request(struct rpc_task *task) * to cope with writespace callbacks arriving _after_ we have * called sendmsg(). */ while (1) { - status = xs_sendpages(transport->sock, - NULL, 0, xdr, req->rq_bytes_sent, - zerocopy); + sent = 0; + status = xs_sendpages(transport->sock, NULL, 0, xdr, + req->rq_bytes_sent, zerocopy, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", xdr->len - req->rq_bytes_sent, status); - if (unlikely(status < 0)) + if (unlikely(sent == 0 && status < 0)) break; /* If we've sent the entire packet, immediately * reset the count of bytes sent. 
*/ - req->rq_bytes_sent += status; - req->rq_xmit_bytes_sent += status; + req->rq_bytes_sent += sent; + req->rq_xmit_bytes_sent += sent; if (likely(req->rq_bytes_sent >= req->rq_slen)) { req->rq_bytes_sent = 0; return 0; } - if (status != 0) + if (sent != 0) continue; status = -EAGAIN; break; @@ -845,6 +854,8 @@ static void xs_error_report(struct sock *sk) dprintk("RPC: xs_error_report client %p, error=%d...\n", xprt, -err); trace_rpc_socket_error(xprt, sk->sk_socket, err); + if (test_bit(XPRT_CONNECTION_REUSE, &xprt->state)) + goto out; xprt_wake_pending_tasks(xprt, err); out: read_unlock_bh(&sk->sk_callback_lock); @@ -1746,13 +1757,29 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) unsigned short port = xs_get_srcport(transport); unsigned short last; + /* + * If we are asking for any ephemeral port (i.e. port == 0 && + * transport->xprt.resvport == 0), don't bind. Let the local + * port selection happen implicitly when the socket is used + * (for example at connect time). + * + * This ensures that we can continue to establish TCP + * connections even when all local ephemeral ports are already + * a part of some TCP connection. This makes no difference + * for UDP sockets, but also doesn't harm them. + * + * If we're asking for any reserved port (i.e. port == 0 && + * transport->xprt.resvport == 1) xs_get_srcport above will + * ensure that port is non-zero and we will bind as needed. 
+ */ + if (port == 0) + return 0; + memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); err = kernel_bind(sock, (struct sockaddr *)&myaddr, transport->xprt.addrlen); - if (port == 0) - break; if (err == 0) { transport->srcport = port; break; @@ -1927,8 +1954,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport) struct socket *sock; int status = -EIO; - current->flags |= PF_FSTRANS; - clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); status = __sock_create(xprt->xprt_net, AF_LOCAL, SOCK_STREAM, 0, &sock, 1); @@ -1968,7 +1993,6 @@ static int xs_local_setup_socket(struct sock_xprt *transport) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; return status; } @@ -2071,8 +2095,6 @@ static void xs_udp_setup_socket(struct work_struct *work) struct socket *sock = transport->sock; int status = -EIO; - current->flags |= PF_FSTRANS; - /* Start by resetting any existing state */ xs_reset_transport(transport); sock = xs_create_sock(xprt, transport, @@ -2092,7 +2114,6 @@ static void xs_udp_setup_socket(struct work_struct *work) out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; } /* @@ -2229,8 +2250,6 @@ static void xs_tcp_setup_socket(struct work_struct *work) struct rpc_xprt *xprt = &transport->xprt; int status = -EIO; - current->flags |= PF_FSTRANS; - if (!sock) { clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); sock = xs_create_sock(xprt, transport, @@ -2245,7 +2264,9 @@ static void xs_tcp_setup_socket(struct work_struct *work) abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT, &xprt->state); /* "close" the socket, preserving the local port */ + set_bit(XPRT_CONNECTION_REUSE, &xprt->state); xs_tcp_reuse_connection(transport); + clear_bit(XPRT_CONNECTION_REUSE, &xprt->state); if (abort_and_exit) goto out_eagain; @@ -2276,7 +2297,6 @@ static void xs_tcp_setup_socket(struct work_struct 
*work) case -EINPROGRESS: case -EALREADY: xprt_clear_connecting(xprt); - current->flags &= ~PF_FSTRANS; return; case -EINVAL: /* Happens, for instance, if the user specified a link @@ -2294,7 +2314,6 @@ out_eagain: out: xprt_clear_connecting(xprt); xprt_wake_pending_tasks(xprt, status); - current->flags &= ~PF_FSTRANS; } /** |