111 files changed, 4684 insertions, 3661 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 4fe6df3ec28f..39a824f44e7c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -53,7 +53,7 @@ obj-$(CONFIG_FHANDLE)		+= fhandle.o
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
-obj-$(CONFIG_SYSFS)		+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/ kernfs/
 obj-$(CONFIG_CONFIGFS_FS)	+= configfs/
 obj-y				+= devpts/
 
diff --git a/fs/affs/Changes b/fs/affs/Changes
index a29409c1ffe0..b41c2c9792ff 100644
--- a/fs/affs/Changes
+++ b/fs/affs/Changes
@@ -91,7 +91,7 @@ more 2.4 fixes: [Roman Zippel]
 Version 3.11
 ------------
 
-- Converted to use 2.3.x page cache [Dave Jones <dave@powertweak.com>]
+- Converted to use 2.3.x page cache [Dave Jones]
 - Corruption in truncate() bugfix [Ken Tyler <kent@werple.net.au>]
 
 Version 3.10
diff --git a/fs/aio.c b/fs/aio.c
index 08159ed13649..062a5f6a1448 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -244,9 +244,14 @@ static void aio_free_ring(struct kioctx *ctx)
 	int i;
 
 	for (i = 0; i < ctx->nr_pages; i++) {
+		struct page *page;
 		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
 				page_count(ctx->ring_pages[i]));
-		put_page(ctx->ring_pages[i]);
+		page = ctx->ring_pages[i];
+		if (!page)
+			continue;
+		ctx->ring_pages[i] = NULL;
+		put_page(page);
 	}
 
 	put_aio_ring_file(ctx);
@@ -280,18 +285,38 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 	unsigned long flags;
 	int rc;
 
+	rc = 0;
+
+	/* Make sure the old page hasn't already been changed */
+	spin_lock(&mapping->private_lock);
+	ctx = mapping->private_data;
+	if (ctx) {
+		pgoff_t idx;
+		spin_lock_irqsave(&ctx->completion_lock, flags);
+		idx = old->index;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			if (ctx->ring_pages[idx] != old)
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
+		spin_unlock_irqrestore(&ctx->completion_lock, flags);
+	} else
+		rc = -EINVAL;
+	spin_unlock(&mapping->private_lock);
+
+	if (rc != 0)
+		return rc;
+
 	/* Writeback must be complete */
 	BUG_ON(PageWriteback(old));
-	put_page(old);
+	get_page(new);
 
-	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode, 1);
 	if (rc != MIGRATEPAGE_SUCCESS) {
-		get_page(old);
+		put_page(new);
 		return rc;
 	}
 
-	get_page(new);
-
 	/* We can potentially race against kioctx teardown here.  Use the
 	 * address_space's private data lock to protect the mapping's
 	 * private_data.
@@ -303,13 +328,24 @@ static int aio_migratepage(struct address_space *mapping, struct page *new,
 		spin_lock_irqsave(&ctx->completion_lock, flags);
 		migrate_page_copy(new, old);
 		idx = old->index;
-		if (idx < (pgoff_t)ctx->nr_pages)
-			ctx->ring_pages[idx] = new;
+		if (idx < (pgoff_t)ctx->nr_pages) {
+			/* And only do the move if things haven't changed */
+			if (ctx->ring_pages[idx] == old)
+				ctx->ring_pages[idx] = new;
+			else
+				rc = -EAGAIN;
+		} else
+			rc = -EINVAL;
 		spin_unlock_irqrestore(&ctx->completion_lock, flags);
 	} else
 		rc = -EBUSY;
 	spin_unlock(&mapping->private_lock);
 
+	if (rc == MIGRATEPAGE_SUCCESS)
+		put_page(old);
+	else
+		put_page(new);
+
 	return rc;
 }
 #endif
@@ -326,7 +362,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct aio_ring *ring;
 	unsigned nr_events = ctx->max_reqs;
 	struct mm_struct *mm = current->mm;
-	unsigned long size, populate;
+	unsigned long size, unused;
 	int nr_pages;
 	int i;
 	struct file *file;
@@ -347,6 +383,20 @@ static int aio_setup_ring(struct kioctx *ctx)
 		return -EAGAIN;
 	}
 
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
+
+	ctx->ring_pages = ctx->internal_pages;
+	if (nr_pages > AIO_RING_PAGES) {
+		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
+					  GFP_KERNEL);
+		if (!ctx->ring_pages) {
+			put_aio_ring_file(ctx);
+			return -ENOMEM;
+		}
+	}
+
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(file->f_inode->i_mapping,
@@ -358,17 +408,14 @@ static int aio_setup_ring(struct kioctx *ctx)
 		SetPageUptodate(page);
 		SetPageDirty(page);
 		unlock_page(page);
+
+		ctx->ring_pages[i] = page;
 	}
-	ctx->aio_ring_file = file;
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
-			/ sizeof(struct io_event);
+	ctx->nr_pages = i;
 
-	ctx->ring_pages = ctx->internal_pages;
-	if (nr_pages > AIO_RING_PAGES) {
-		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
-					  GFP_KERNEL);
-		if (!ctx->ring_pages)
-			return -ENOMEM;
+	if (unlikely(i != nr_pages)) {
+		aio_free_ring(ctx);
+		return -EAGAIN;
 	}
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
@@ -377,9 +424,9 @@ static int aio_setup_ring(struct kioctx *ctx)
 	down_write(&mm->mmap_sem);
 	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
 				       PROT_READ | PROT_WRITE,
-				       MAP_SHARED | MAP_POPULATE, 0, &populate);
+				       MAP_SHARED, 0, &unused);
+	up_write(&mm->mmap_sem);
 	if (IS_ERR((void *)ctx->mmap_base)) {
-		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
@@ -387,27 +434,6 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 
-	/* We must do this while still holding mmap_sem for write, as we
-	 * need to be protected against userspace attempting to mremap()
-	 * or munmap() the ring buffer.
-	 */
-	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
-				       1, 0, ctx->ring_pages, NULL);
-
-	/* Dropping the reference here is safe as the page cache will hold
-	 * onto the pages for us.  It is also required so that page migration
-	 * can unmap the pages and get the right reference count.
-	 */
-	for (i = 0; i < ctx->nr_pages; i++)
-		put_page(ctx->ring_pages[i]);
-
-	up_write(&mm->mmap_sem);
-
-	if (unlikely(ctx->nr_pages != nr_pages)) {
-		aio_free_ring(ctx);
-		return -EAGAIN;
-	}
-
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
 
@@ -645,12 +671,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
 		err = -EAGAIN;
-		goto err;
+		goto err_ctx;
 	}
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
-	percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
+	percpu_ref_get(&ctx->users);	/* io_setup() will drop this ref */
+	percpu_ref_get(&ctx->reqs);	/* free_ioctx_users() will drop this */
 
 	err = ioctx_add_table(ctx, mm);
 	if (err)
@@ -662,6 +689,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 err_cleanup:
 	aio_nr_sub(ctx->max_reqs);
+err_ctx:
+	aio_free_ring(ctx);
 err:
 	free_percpu(ctx->cpu);
 	free_percpu(ctx->reqs.pcpu_count);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b50764bef141..131d82800b3a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -333,7 +333,6 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
 			      struct btrfsic_block_data_ctx *block_ctx);
 static void btrfsic_dump_database(struct btrfsic_state *state);
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
 				     char **datav, unsigned int num_pages);
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
@@ -1687,7 +1686,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 	for (i = 0; i < num_pages;) {
 		struct bio *bio;
 		unsigned int j;
-		DECLARE_COMPLETION_ONSTACK(complete);
 
 		bio = btrfs_io_bio_alloc(GFP_NOFS, num_pages - i);
 		if (!bio) {
@@ -1698,8 +1696,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 		}
 		bio->bi_bdev = block_ctx->dev->bdev;
 		bio->bi_sector = dev_bytenr >> 9;
-		bio->bi_end_io = btrfsic_complete_bio_end_io;
-		bio->bi_private = &complete;
 
 		for (j = i; j < num_pages; j++) {
 			ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1712,12 +1708,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 			       "btrfsic: error, failed to add a single page!\n");
 			return -1;
 		}
-		submit_bio(READ, bio);
-
-		/* this will also unplug the queue */
-		wait_for_completion(&complete);
-
-		if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		if (submit_bio_wait(READ, bio)) {
 			printk(KERN_INFO
 			       "btrfsic: read error at logical %llu dev %s!\n",
 			       block_ctx->start, block_ctx->dev->name);
@@ -1740,11 +1731,6 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 	return block_ctx->len;
 }
 
-static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
 static void btrfsic_dump_database(struct btrfsic_state *state)
 {
 	struct list_head *elem_all;
@@ -3008,14 +2994,12 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
 	return submit_bh(rw, bh);
 }
 
-void btrfsic_submit_bio(int rw, struct bio *bio)
+static void __btrfsic_submit_bio(int rw, struct bio *bio)
 {
 	struct btrfsic_dev_state *dev_state;
 
-	if (!btrfsic_is_initialized) {
-		submit_bio(rw, bio);
+	if (!btrfsic_is_initialized)
 		return;
-	}
 
 	mutex_lock(&btrfsic_mutex);
 	/* since btrfsic_submit_bio() is also called before
@@ -3106,10 +3090,20 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
 	}
 leave:
 	mutex_unlock(&btrfsic_mutex);
+}
 
+void btrfsic_submit_bio(int rw, struct bio *bio)
+{
+	__btrfsic_submit_bio(rw, bio);
 	submit_bio(rw, bio);
 }
 
+int btrfsic_submit_bio_wait(int rw, struct bio *bio)
+{
+	__btrfsic_submit_bio(rw, bio);
+	return submit_bio_wait(rw, bio);
+}
+
 int btrfsic_mount(struct btrfs_root *root,
 		  struct btrfs_fs_devices *fs_devices,
 		  int including_extent_data, u32 print_mask)
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index 8b59175cc502..13b8566c97ab 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -22,9 +22,11 @@
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 int btrfsic_submit_bh(int rw, struct buffer_head *bh);
 void btrfsic_submit_bio(int rw, struct bio *bio);
+int btrfsic_submit_bio_wait(int rw, struct bio *bio);
 #else
 #define btrfsic_submit_bh submit_bh
 #define btrfsic_submit_bio submit_bio
+#define btrfsic_submit_bio_wait submit_bio_wait
 #endif
 
 int btrfsic_mount(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 45d98d01028f..9c01509dd8ab 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -767,20 +767,19 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	if (metadata) {
-		key.objectid = bytenr;
-		key.type = BTRFS_METADATA_ITEM_KEY;
-		key.offset = offset;
-	} else {
-		key.objectid = bytenr;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = offset;
-	}
-
 	if (!trans) {
 		path->skip_locking = 1;
 		path->search_commit_root = 1;
 	}
+
+search_again:
+	key.objectid = bytenr;
+	key.offset = offset;
+	if (metadata)
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+
 again:
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
 				&key, path, 0, 0);
@@ -788,7 +787,6 @@ again:
 		goto out_free;
 
 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
-		metadata = 0;
 		if (path->slots[0]) {
 			path->slots[0]--;
 			btrfs_item_key_to_cpu(path->nodes[0], &key,
@@ -855,7 +853,7 @@ again:
 			mutex_lock(&head->mutex);
 			mutex_unlock(&head->mutex);
 			btrfs_put_delayed_ref(&head->node);
-			goto again;
+			goto search_again;
 		}
 		if (head->extent_op && head->extent_op->update_flags)
 			extent_flags |= head->extent_op->flags_to_set;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8e457fca0a0b..ff43802a7c88 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1952,11 +1952,6 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
 	return err;
 }
 
-static void repair_io_failure_callback(struct bio *bio, int err)
-{
-	complete(bio->bi_private);
-}
-
 /*
  * this bypasses the standard btrfs submit functions deliberately, as
  * the standard behavior is to write all copies in a raid setup. here we only
@@ -1973,7 +1968,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 {
 	struct bio *bio;
 	struct btrfs_device *dev;
-	DECLARE_COMPLETION_ONSTACK(compl);
 	u64 map_length = 0;
 	u64 sector;
 	struct btrfs_bio *bbio = NULL;
@@ -1990,8 +1984,6 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
-	bio->bi_private = &compl;
-	bio->bi_end_io = repair_io_failure_callback;
 	bio->bi_size = 0;
 	map_length = length;
 
@@ -2012,10 +2004,8 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 	}
 	bio->bi_bdev = dev->bdev;
 	bio_add_page(bio, page, length, start - page_offset(page));
-	btrfsic_submit_bio(WRITE_SYNC, bio);
-	wait_for_completion(&compl);
 
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) {
 		/* try to remap that extent elsewhere? */
 		bio_put(bio);
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a111622598b0..21da5762b0b1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2121,7 +2121,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 
 	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
 	if (err == -EINTR)
-		goto out;
+		goto out_drop_write;
 	dentry = lookup_one_len(vol_args->name, parent, namelen);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -2284,6 +2284,7 @@ out_dput:
 	dput(dentry);
 out_unlock_dir:
 	mutex_unlock(&dir->i_mutex);
+out_drop_write:
 	mnt_drop_write_file(file);
 out:
 	kfree(vol_args);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ce459a7cb16d..429c73c374b8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -571,7 +571,9 @@ static int is_cowonly_root(u64 root_objectid)
 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
-	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
+	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
+	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
 		return 1;
 	return 0;
 }
@@ -1264,10 +1266,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
 }
 
 /*
- * helper to update/delete the 'address of tree root -> reloc tree'
+ * helper to delete the 'address of tree root -> reloc tree'
  * mapping
  */
-static int __update_reloc_root(struct btrfs_root *root, int del)
+static void __del_reloc_root(struct btrfs_root *root)
 {
 	struct rb_node *rb_node;
 	struct mapping_node *node = NULL;
@@ -1275,7 +1277,7 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
 
 	spin_lock(&rc->reloc_root_tree.lock);
 	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
-			      root->commit_root->start);
+			      root->node->start);
 	if (rb_node) {
 		node = rb_entry(rb_node, struct mapping_node, rb_node);
 		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1283,23 +1285,45 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
 	spin_unlock(&rc->reloc_root_tree.lock);
 
 	if (!node)
-		return 0;
+		return;
 	BUG_ON((struct btrfs_root *)node->data != root);
 
-	if (!del) {
-		spin_lock(&rc->reloc_root_tree.lock);
-		node->bytenr = root->node->start;
-		rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
-				      node->bytenr, &node->rb_node);
-		spin_unlock(&rc->reloc_root_tree.lock);
-		if (rb_node)
-			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
-	} else {
-		spin_lock(&root->fs_info->trans_lock);
-		list_del_init(&root->root_list);
-		spin_unlock(&root->fs_info->trans_lock);
-		kfree(node);
+	spin_lock(&root->fs_info->trans_lock);
+	list_del_init(&root->root_list);
+	spin_unlock(&root->fs_info->trans_lock);
+	kfree(node);
+}
+
+/*
+ * helper to update the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node = NULL;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+			      root->node->start);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
 	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+
+	if (!node)
+		return 0;
+	BUG_ON((struct btrfs_root *)node->data != root);
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	node->bytenr = new_bytenr;
+	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+			      node->bytenr, &node->rb_node);
+	spin_unlock(&rc->reloc_root_tree.lock);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, node->bytenr);
 	return 0;
 }
 
@@ -1420,7 +1444,6 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_root *reloc_root;
 	struct btrfs_root_item *root_item;
-	int del = 0;
 	int ret;
 
 	if (!root->reloc_root)
@@ -1432,11 +1455,9 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 	if (root->fs_info->reloc_ctl->merge_reloc_tree &&
 	    btrfs_root_refs(root_item) == 0) {
 		root->reloc_root = NULL;
-		del = 1;
+		__del_reloc_root(reloc_root);
 	}
 
-	__update_reloc_root(reloc_root, del);
-
 	if (reloc_root->commit_root != reloc_root->node) {
 		btrfs_set_root_node(root_item, reloc_root->node);
 		free_extent_buffer(reloc_root->commit_root);
@@ -2287,7 +2308,7 @@ void free_reloc_roots(struct list_head *list)
 	while (!list_empty(list)) {
 		reloc_root = list_entry(list->next, struct btrfs_root,
 					root_list);
-		__update_reloc_root(reloc_root, 1);
+		__del_reloc_root(reloc_root);
 		free_extent_buffer(reloc_root->node);
 		free_extent_buffer(reloc_root->commit_root);
 		kfree(reloc_root);
@@ -2332,7 +2353,7 @@ again:
 
 			ret = merge_reloc_root(rc, root);
 			if (ret) {
-				__update_reloc_root(reloc_root, 1);
+				__del_reloc_root(reloc_root);
 				free_extent_buffer(reloc_root->node);
 				free_extent_buffer(reloc_root->commit_root);
 				kfree(reloc_root);
@@ -2388,6 +2409,13 @@ out:
 		btrfs_std_error(root->fs_info, ret);
 		if (!list_empty(&reloc_roots))
 			free_reloc_roots(&reloc_roots);
+
+		/* new reloc root may be added */
+		mutex_lock(&root->fs_info->reloc_mutex);
+		list_splice_init(&rc->reloc_roots, &reloc_roots);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		if (!list_empty(&reloc_roots))
+			free_reloc_roots(&reloc_roots);
 	}
 
 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
@@ -4522,6 +4550,11 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
 
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (buf == root->node)
+			__update_reloc_root(root, cow->start);
+	}
+
 	level = btrfs_header_level(buf);
 	if (btrfs_header_generation(buf) <=
 	    btrfs_root_last_snapshot(&root->root_item))
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 561e2f16ba3e..1fd3f33c330a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -208,7 +208,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 int is_metadata, int have_csum,
 					 const u8 *csum, u64 generation,
 					 u16 csum_size);
-static void scrub_complete_bio_end_io(struct bio *bio, int err);
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good,
 					     int force_write);
@@ -1294,7 +1293,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
 		struct scrub_page *page = sblock->pagev[page_num];
-		DECLARE_COMPLETION_ONSTACK(complete);
 
 		if (page->dev->bdev == NULL) {
 			page->io_error = 1;
@@ -1311,18 +1309,11 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		}
 		bio->bi_bdev = page->dev->bdev;
 		bio->bi_sector = page->physical >> 9;
-		bio->bi_end_io = scrub_complete_bio_end_io;
-		bio->bi_private = &complete;
 
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
-		btrfsic_submit_bio(READ, bio);
-
-		/* this will also unplug the queue */
-		wait_for_completion(&complete);
-
-		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
-		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		if (btrfsic_submit_bio_wait(READ, bio))
 			sblock->no_io_error_seen = 0;
+
 		bio_put(bio);
 	}
 
@@ -1391,11 +1382,6 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		sblock->checksum_error = 1;
 }
 
-static void scrub_complete_bio_end_io(struct bio *bio, int err)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good,
 					     int force_write)
@@ -1430,7 +1416,6 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 	    sblock_bad->checksum_error || page_bad->io_error) {
 		struct bio *bio;
 		int ret;
-		DECLARE_COMPLETION_ONSTACK(complete);
 
 		if (!page_bad->dev->bdev) {
 			printk_ratelimited(KERN_WARNING
@@ -1443,19 +1428,14 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 			return -EIO;
 		bio->bi_bdev = page_bad->dev->bdev;
 		bio->bi_sector = page_bad->physical >> 9;
-		bio->bi_end_io = scrub_complete_bio_end_io;
-		bio->bi_private = &complete;
 
 		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
 		if (PAGE_SIZE != ret) {
 			bio_put(bio);
 			return -EIO;
 		}
-		btrfsic_submit_bio(WRITE, bio);
 
-		/* this will also unplug the queue */
-		wait_for_completion(&complete);
-		if (!bio_flagged(bio, BIO_UPTODATE)) {
+		if (btrfsic_submit_bio_wait(WRITE, bio)) {
 			btrfs_dev_stat_inc_and_print(page_bad->dev,
 				BTRFS_DEV_STAT_WRITE_ERRS);
 			btrfs_dev_replace_stats_inc(
@@ -3375,7 +3355,6 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 	struct bio *bio;
 	struct btrfs_device *dev;
 	int ret;
-	DECLARE_COMPLETION_ONSTACK(compl);
 
 	dev = sctx->wr_ctx.tgtdev;
 	if (!dev)
@@ -3392,8 +3371,6 @@ static int write_page_nocow(struct scrub_ctx *sctx,
 		spin_unlock(&sctx->stat_lock);
 		return -ENOMEM;
 	}
-	bio->bi_private = &compl;
-	bio->bi_end_io = scrub_complete_bio_end_io;
 	bio->bi_size = 0;
 	bio->bi_sector = physical_for_dev_replace >> 9;
 	bio->bi_bdev = dev->bdev;
@@ -3404,10 +3381,8 @@ leave_with_eio:
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
 		return -EIO;
 	}
-	btrfsic_submit_bio(WRITE_SYNC, bio);
-	wait_for_completion(&compl);
 
-	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+	if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
 		goto leave_with_eio;
 
 	bio_put(bio);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 6837fe87f3a6..945d1db98f26 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4723,8 +4723,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
 	}
 
 	if (!access_ok(VERIFY_READ, arg->clone_sources,
-			sizeof(*arg->clone_sources *
-			arg->clone_sources_count))) {
+			sizeof(*arg->clone_sources) *
+			arg->clone_sources_count)) {
 		ret = -EFAULT;
 		goto out;
 	}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2d8ac1bf0cf9..d71a11d13dfa 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -432,7 +432,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			} else {
 				printk(KERN_INFO "btrfs: setting nodatacow\n");
 			}
-			info->compress_type = BTRFS_COMPRESS_NONE;
 			btrfs_clear_opt(info->mount_opt, COMPRESS);
 			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
 			btrfs_set_opt(info->mount_opt, NODATACOW);
@@ -461,7 +460,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				btrfs_set_fs_incompat(info, COMPRESS_LZO);
 			} else if (strncmp(args[0].from, "no", 2) == 0) {
 				compress_type = "no";
-				info->compress_type = BTRFS_COMPRESS_NONE;
 				btrfs_clear_opt(info->mount_opt, COMPRESS);
 				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
 				compress_force = false;
@@ -474,9 +472,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
 				pr_info("btrfs: force %s compression\n",
 					compress_type);
-			} else
+			} else if (btrfs_test_opt(root, COMPRESS)) {
 				pr_info("btrfs: use %s compression\n",
 					compress_type);
+			}
 			break;
 		case Opt_ssd:
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6df8bd481425..ec3ba43b9faa 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -210,13 +210,17 @@ static int readpage_nounlock(struct file *filp, struct page *page)
 	if (err < 0) {
 		SetPageError(page);
 		goto out;
-	} else if (err < PAGE_CACHE_SIZE) {
+	} else {
+		if (err < PAGE_CACHE_SIZE) {
 		/* zero fill remainder of page */
-		zero_user_segment(page, err, PAGE_CACHE_SIZE);
+			zero_user_segment(page, err, PAGE_CACHE_SIZE);
+		} else {
+			flush_dcache_page(page);
+		}
 	}
 	SetPageUptodate(page);
 
-	if (err == 0)
+	if (err >= 0)
 		ceph_readpage_to_fscache(inode, page);
 
 out:
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 7db2e6ca4b8f..8c44fdd4e1c3 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -324,6 +324,9 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
 
+	if (!PageFsCache(page))
+		return;
+
 	fscache_wait_on_page_write(ci->fscache, page);
 	fscache_uncache_page(ci->fscache, page);
 }
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 13976c33332e..3c0a4bd74996 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -897,7 +897,7 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
  * caller should hold i_ceph_lock.
  * caller will not hold session s_mutex if called from destroy_inode.
  */
-void __ceph_remove_cap(struct ceph_cap *cap)
+void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
 {
 	struct ceph_mds_session *session = cap->session;
 	struct ceph_inode_info *ci = cap->ci;
@@ -909,6 +909,16 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 
 	/* remove from session list */
 	spin_lock(&session->s_cap_lock);
+	/*
+	 * s_cap_reconnect is protected by s_cap_lock. no one changes
+	 * s_cap_gen while session is in the reconnect state.
+	 */
+	if (queue_release &&
+	    (!session->s_cap_reconnect ||
+	     cap->cap_gen == session->s_cap_gen))
+		__queue_cap_release(session, ci->i_vino.ino, cap->cap_id,
+				    cap->mseq, cap->issue_seq);
+
 	if (session->s_cap_iterator == cap) {
 		/* not yet, we are iterating over this very cap */
 		dout("__ceph_remove_cap  delaying %p removal from session %p\n",
@@ -1023,7 +1033,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
 	struct ceph_mds_cap_release *head;
 	struct ceph_mds_cap_item *item;
 
-	spin_lock(&session->s_cap_lock);
 	BUG_ON(!session->s_num_cap_releases);
 	msg = list_first_entry(&session->s_cap_releases,
 			       struct ceph_msg, list_head);
@@ -1052,7 +1061,6 @@ void __queue_cap_release(struct ceph_mds_session *session,
 		     (int)CEPH_CAPS_PER_RELEASE,
 		     (int)msg->front.iov_len);
 	}
-	spin_unlock(&session->s_cap_lock);
 }
 
 /*
@@ -1067,12 +1075,8 @@ void ceph_queue_caps_release(struct inode *inode)
 	p = rb_first(&ci->i_caps);
 	while (p) {
 		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
-		struct ceph_mds_session *session = cap->session;
-
-		__queue_cap_release(session, ceph_ino(inode), cap->cap_id,
-				    cap->mseq, cap->issue_seq);
 		p = rb_next(p);
-		__ceph_remove_cap(cap);
+		__ceph_remove_cap(cap, true);
 	}
 }
 
@@ -2791,7 +2795,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
 			}
 			spin_unlock(&mdsc->cap_dirty_lock);
 		}
-		__ceph_remove_cap(cap);
+		__ceph_remove_cap(cap, false);
 	}
 	/* else, we already released it */
 
@@ -2931,9 +2935,12 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	if (!inode) {
 		dout(" i don't have ino %llx\n", vino.ino);
 
-		if (op == CEPH_CAP_OP_IMPORT)
+		if (op == CEPH_CAP_OP_IMPORT) {
+			spin_lock(&session->s_cap_lock);
 			__queue_cap_release(session, vino.ino, cap_id,
 					    mseq, seq);
+			spin_unlock(&session->s_cap_lock);
+		}
 		goto flush_cap_releases;
 	}
 
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 868b61d56cac..2a0bcaeb189a 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -352,8 +352,18 @@ more:
 		}
 
 		/* note next offset and last dentry name */
+		rinfo = &req->r_reply_info;
+		if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+			frag = le32_to_cpu(rinfo->dir_dir->frag);
+			if (ceph_frag_is_leftmost(frag))
+				fi->next_offset = 2;
+			else
+				fi->next_offset = 0;
+			off = fi->next_offset;
+		}
 		fi->offset = fi->next_offset;
 		fi->last_readdir = req;
+		fi->frag = frag;
 
 		if (req->r_reply_info.dir_end) {
 			kfree(fi->last_name);
@@ -363,7 +373,6 @@ more:
 			else
 				fi->next_offset = 0;
 		} else {
-			rinfo = &req->r_reply_info;
 			err = note_last_dentry(fi,
 				       rinfo->dir_dname[rinfo->dir_nr-1],
 				       rinfo->dir_dname_len[rinfo->dir_nr-1]);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 8549a48115f7..278fd2891288 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -577,6 +577,8 @@ static int fill_inode(struct inode *inode,
 	int issued = 0, implemented;
 	struct timespec mtime, atime, ctime;
 	u32 nsplits;
+	struct ceph_inode_frag *frag;
+	struct rb_node *rb_node;
 	struct ceph_buffer *xattr_blob = NULL;
 	int err = 0;
 	int queue_trunc = 0;
@@ -751,15 +753,38 @@ no_change:
 	/* FIXME: move me up, if/when version reflects fragtree changes */
 	nsplits = le32_to_cpu(info->fragtree.nsplits);
 	mutex_lock(&ci->i_fragtree_mutex);
+	rb_node = rb_first(&ci->i_fragtree);
 	for (i = 0; i < nsplits; i++) {
 		u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
-		struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
-
-		if (IS_ERR(frag))
-			continue;
+		frag = NULL;
+		while (rb_node) {
+			frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+			if (ceph_frag_compare(frag->frag, id) >= 0) {
+				if (frag->frag != id)
+					frag = NULL;
+				else
+					rb_node = rb_next(rb_node);
+				break;
+			}
+			rb_node = rb_next(rb_node);
+			rb_erase(&frag->node, &ci->i_fragtree);
+			kfree(frag);
+			frag = NULL;
+		}
+		if (!frag) {
+			frag = __get_or_create_frag(ci, id);
+			if (IS_ERR(frag))
+				continue;
+		}
 		frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
 		dout(" frag %x split by %d\n", frag->frag, frag->split_by);
 	}
+	while (rb_node) {
+		frag = rb_entry(rb_node, struct ceph_inode_frag, node);
+		rb_node = rb_next(rb_node);
+		rb_erase(&frag->node, &ci->i_fragtree);
+		kfree(frag);
+	}
 	mutex_unlock(&ci->i_fragtree_mutex);
 
 	/* were we issued a capability? */
@@ -953,7 +978,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
-	int i = 0;
 	int err = 0;
 
 	dout("fill_trace %p is_dentry %d is_target %d\n", req,
@@ -1014,6 +1038,29 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		}
 	}
 
+	if (rinfo->head->is_target) {
+		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
+		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
+
+		in = ceph_get_inode(sb, vino);
+		if (IS_ERR(in)) {
+			err = PTR_ERR(in);
+			goto done;
+		}
+		req->r_target_inode = in;
+
+		err = fill_inode(in, &rinfo->targeti, NULL,
+				session, req->r_request_started,
+				(le32_to_cpu(rinfo->head->result) == 0) ?
+				req->r_fmode : -1,
+				&req->r_caps_reservation);
+		if (err < 0) {
+			pr_err("fill_inode badness %p %llx.%llx\n",
+				in, ceph_vinop(in));
+			goto done;
+		}
+	}
+
 	/*
 	 * ignore null lease/binding on snapdir ENOENT, or else we
 	 * will have trouble splicing in the virtual snapdir later
@@ -1083,7 +1130,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 			     ceph_dentry(req->r_old_dentry)->offset);
 
 			dn = req->r_old_dentry;  /* use old_dentry */
-			in = dn->d_inode;
 		}
 
 		/* null dentry? */
@@ -1105,44 +1151,28 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		}
 
 		/* attach proper inode */
-		ininfo = rinfo->targeti.in;
-		vino.ino = le64_to_cpu(ininfo->ino);
-		vino.snap = le64_to_cpu(ininfo->snapid);
-		in = dn->d_inode;
-		if (!in) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				pr_err("fill_trace bad get_inode "
-				       "%llx.%llx\n", vino.ino, vino.snap);
-				err = PTR_ERR(in);
-				d_drop(dn);
-				goto done;
-			}
+		if (!dn->d_inode) {
+			ihold(in);
 			dn = splice_dentry(dn, in, &have_lease, true);
 			if (IS_ERR(dn)) {
 				err = PTR_ERR(dn);
 				goto done;
 			}
 			req->r_dentry = dn;  /* may have spliced */
-			ihold(in);
-		} else if (ceph_ino(in) == vino.ino &&
-			   ceph_snap(in) == vino.snap) {
-			ihold(in);
-		} else {
+		} else if (dn->d_inode && dn->d_inode != in) {
 			dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
-			     dn, in, ceph_ino(in), ceph_snap(in),
-			     vino.ino, vino.snap);
+			     dn, dn->d_inode, ceph_vinop(dn->d_inode),
+			     ceph_vinop(in));
 			have_lease = false;
-			in = NULL;
 		}
 
 		if (have_lease)
 			update_dentry_lease(dn, rinfo->dlease, session,
 					    req->r_request_started);
 		dout(" final dn %p\n", dn);
-		i++;
-	} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
-		   req->r_op == CEPH_MDS_OP_MKSNAP) && !req->r_aborted) {
+	} else if (!req->r_aborted &&
+		   (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
+		    req->r_op == CEPH_MDS_OP_MKSNAP)) {
 		struct dentry *dn = req->r_dentry;
 
 		/* fill out a snapdir LOOKUPSNAP dentry */
@@ -1152,52 +1182,15 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 		ininfo = rinfo->targeti.in;
 		vino.ino = le64_to_cpu(ininfo->ino);
 		vino.snap = le64_to_cpu(ininfo->snapid);
-		in = ceph_get_inode(sb, vino);
-		if (IS_ERR(in)) {
-			pr_err("fill_inode get_inode badness %llx.%llx\n",
-			       vino.ino, vino.snap);
-			err = PTR_ERR(in);
-			d_delete(dn);
-			goto done;
-		}
 		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		ihold(in);
 		dn = splice_dentry(dn, in, NULL, true);
 		if (IS_ERR(dn)) {
 			err = PTR_ERR(dn);
 			goto done;
 		}
 		req->r_dentry = dn;  /* may have spliced */
-		ihold(in);
-		rinfo->head->is_dentry = 1;  /* fool notrace handlers */
-	}
-
-	if (rinfo->head->is_target) {
-		vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
-		vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
-
-		if (in == NULL || ceph_ino(in) != vino.ino ||
-		    ceph_snap(in) != vino.snap) {
-			in = ceph_get_inode(sb, vino);
-			if (IS_ERR(in)) {
-				err = PTR_ERR(in);
-				goto done;
-			}
-		}
-		req->r_target_inode = in;
-
-		err = fill_inode(in,
-				 &rinfo->targeti, NULL,
-				 session, req->r_request_started,
-				 (le32_to_cpu(rinfo->head->result) == 0) ?
-				 req->r_fmode : -1,
-				 &req->r_caps_reservation);
-		if (err < 0) {
-			pr_err("fill_inode badness %p %llx.%llx\n",
-			       in, ceph_vinop(in));
-			goto done;
-		}
 	}
-
 done:
 	dout("fill_trace done err=%d\n", err);
 	return err;
@@ -1247,11 +1240,23 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 	struct qstr dname;
 	struct dentry *dn;
 	struct inode *in;
-	int err = 0, i;
+	int err = 0, ret, i;
 	struct inode *snapdir = NULL;
 	struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
-	u64 frag = le32_to_cpu(rhead->args.readdir.frag);
 	struct ceph_dentry_info *di;
+	u64 r_readdir_offset = req->r_readdir_offset;
+	u32 frag = le32_to_cpu(rhead->args.readdir.frag);
+
+	if (rinfo->dir_dir &&
+	    le32_to_cpu(rinfo->dir_dir->frag) != frag) {
+		dout("readdir_prepopulate got new frag %x -> %x\n",
+		     frag, le32_to_cpu(rinfo->dir_dir->frag));
+		frag = le32_to_cpu(rinfo->dir_dir->frag);
+		if (ceph_frag_is_leftmost(frag))
+			r_readdir_offset = 2;
+		else
+			r_readdir_offset = 0;
+	}
 
 	if (req->r_aborted)
 		return readdir_prepopulate_inodes_only(req, session);
@@ -1268,6 +1273,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
 			ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
 	}
 
+	/* FIXME: release caps/leases if error occurs */
 	for (i = 0; i < rinfo->dir_nr; i++) {
 		struct ceph_vino vino;
 
@@ -1292,9 +1298,10 @@ retry_lookup:
 				err = -ENOMEM;
 				goto out;
 			}
-			err = ceph_init_dentry(dn);
-			if (err < 0) {
+			ret = ceph_init_dentry(dn);
+			if (ret < 0) {
 				dput(dn);
+				err = ret;
 				goto out;
 			}
 		} else if (dn->d_inode &&
@@ -1314,9 +1321,6 @@ retry_lookup:
 			spin_unlock(&parent->d_lock);
 		}
 
-		di = dn->d_fsdata;
-		di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
-
 		/* inode */
 		if (dn->d_inode) {
 			in = dn->d_inode;
@@ -1329,26 +1333,39 @@ retry_lookup:
 				err = PTR_ERR(in);
 				goto out;
 			}
-			dn = splice_dentry(dn, in, NULL, false);
-			if (IS_ERR(dn))
-				dn = NULL;
 		}
 
 		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
 			       req->r_request_started, -1,
 			       &req->r_caps_reservation) < 0) {
 			pr_err("fill_inode badness on %p\n", in);
+			if (!dn->d_inode)
+				iput(in);
+			d_drop(dn);
 			goto next_item;
 		}
-		if (dn)
-			update_dentry_lease(dn, rinfo->dir_dlease[i],
-					    req->r_session,
-					    req->r_request_started);
+
+		if (!dn->d_inode) {
+			dn = splice_dentry(dn, in, NULL, false);
+			if (IS_ERR(dn)) {
+				err = PTR_ERR(dn);
+				dn = NULL;
+				goto next_item;
+			}
+		}
+
+		di = dn->d_fsdata;
+		di->offset = ceph_make_fpos(frag, i + r_readdir_offset);
+
+		update_dentry_lease(dn, rinfo->dir_dlease[i],
+				    req->r_session,
+				    req->r_request_started);
 next_item:
 		if (dn)
 			dput(dn);
 	}
-	req->r_did_prepopulate = true;
+	if (err == 0)
+		req->r_did_prepopulate = true;
 
 out:
 	if (snapdir) {
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b7bda5d9611d..d90861f45210 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -43,6 +43,7 @@
  */
 
 struct ceph_reconnect_state {
+	int nr_caps;
 	struct ceph_pagelist *pagelist;
 	bool flock;
 };
@@ -443,6 +444,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 	INIT_LIST_HEAD(&s->s_waiting);
 	INIT_LIST_HEAD(&s->s_unsafe);
 	s->s_num_cap_releases = 0;
+	s->s_cap_reconnect = 0;
 	s->s_cap_iterator = NULL;
 	INIT_LIST_HEAD(&s->s_cap_releases);
 	INIT_LIST_HEAD(&s->s_cap_releases_done);
@@ -642,6 +644,8 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
 		req->r_unsafe_dir = NULL;
 	}
 
+	complete_all(&req->r_safe_completion);
+
 	ceph_mdsc_put_request(req);
 }
 
@@ -986,7 +990,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	dout("removing cap %p, ci is %p, inode is %p\n",
 	     cap, ci, &ci->vfs_inode);
 	spin_lock(&ci->i_ceph_lock);
-	__ceph_remove_cap(cap);
+	__ceph_remove_cap(cap, false);
 	if (!__ceph_is_any_real_caps(ci)) {
 		struct ceph_mds_client *mdsc =
 			ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1231,9 +1235,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
 	session->s_trim_caps--;
 	if (oissued) {
 		/* we aren't the only cap.. just remove us */
-		__queue_cap_release(session, ceph_ino(inode), cap->cap_id,
-				    cap->mseq, cap->issue_seq);
-		__ceph_remove_cap(cap);
+		__ceph_remove_cap(cap, true);
 	} else {
 		/* try to drop referring dentries */
 		spin_unlock(&ci->i_ceph_lock);
@@ -1416,7 +1418,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
 	unsigned num;
 
 	dout("discard_cap_releases mds%d\n", session->s_mds);
-	spin_lock(&session->s_cap_lock);
 
 	/* zero out the in-progress message */
 	msg = list_first_entry(&session->s_cap_releases,
@@ -1443,8 +1444,6 @@ static void discard_cap_releases(struct ceph_mds_client *mdsc,
 		msg->front.iov_len = sizeof(*head);
 		list_add(&msg->list_head, &session->s_cap_releases);
 	}
-
-	spin_unlock(&session->s_cap_lock);
 }
 
 /*
@@ -1875,8 +1874,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
 	int mds = -1;
 	int err = -EAGAIN;
 
-	if (req->r_err || req->r_got_result)
+	if (req->r_err || req->r_got_result) {
+		if (req->r_aborted)
+			__unregister_request(mdsc, req);
 		goto out;
+	}
 
 	if (req->r_timeout &&
 	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
@@ -2186,7 +2188,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	if (head->safe) {
 		req->r_got_safe = true;
 		__unregister_request(mdsc, req);
-		complete_all(&req->r_safe_completion);
 
 		if (req->r_got_unsafe) {
 			/*
@@ -2238,8 +2239,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
 	err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
 	if (err == 0) {
 		if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
-				    req->r_op == CEPH_MDS_OP_LSSNAP) &&
-		    rinfo->dir_nr)
+				    req->r_op == CEPH_MDS_OP_LSSNAP))
 			ceph_readdir_prepopulate(req, req->r_session);
 		ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
 	}
@@ -2490,6 +2490,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	cap->seq = 0;        /* reset cap seq */
 	cap->issue_seq = 0;  /* and issue_seq */
 	cap->mseq = 0;       /* and migrate_seq */
+	cap->cap_gen = cap->session->s_cap_gen;
 
 	if (recon_state->flock) {
 		rec.v2.cap_id = cpu_to_le64(cap->cap_id);
@@ -2552,6 +2553,8 @@ encode_again:
 	} else {
 		err = ceph_pagelist_append(pagelist, &rec, reclen);
 	}
+
+	recon_state->nr_caps++;
 out_free:
 	kfree(path);
 out_dput:
@@ -2579,6 +2582,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	struct rb_node *p;
 	int mds = session->s_mds;
 	int err = -ENOMEM;
+	int s_nr_caps;
 	struct ceph_pagelist *pagelist;
 	struct ceph_reconnect_state recon_state;
 
@@ -2610,20 +2614,38 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 	dout("session %p state %s\n", session,
 	     session_state_name(session->s_state));
 
+	spin_lock(&session->s_gen_ttl_lock);
+	session->s_cap_gen++;
+	spin_unlock(&session->s_gen_ttl_lock);
+
+	spin_lock(&session->s_cap_lock);
+	/*
+	 * notify __ceph_remove_cap() that we are composing cap reconnect.
+	 * If a cap get released before being added to the cap reconnect,
+	 * __ceph_remove_cap() should skip queuing cap release.
+	 */
+	session->s_cap_reconnect = 1;
 	/* drop old cap expires; we're about to reestablish that state */
 	discard_cap_releases(mdsc, session);
+	spin_unlock(&session->s_cap_lock);
 
 	/* traverse this session's caps */
-	err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
+	s_nr_caps = session->s_nr_caps;
+	err = ceph_pagelist_encode_32(pagelist, s_nr_caps);
 	if (err)
 		goto fail;
 
+	recon_state.nr_caps = 0;
 	recon_state.pagelist = pagelist;
 	recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
 	err = iterate_session_caps(session, encode_caps_cb, &recon_state);
 	if (err < 0)
 		goto fail;
 
+	spin_lock(&session->s_cap_lock);
+	session->s_cap_reconnect = 0;
+	spin_unlock(&session->s_cap_lock);
+
 	/*
 	 * snaprealms.  we provide mds with the ino, seq (version), and
 	 * parent for all of our realms.  If the mds has any newer info,
@@ -2646,11 +2668,18 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
 
 	if (recon_state.flock)
 		reply->hdr.version = cpu_to_le16(2);
-	if (pagelist->length) {
-		/* set up outbound data if we have any */
-		reply->hdr.data_len = cpu_to_le32(pagelist->length);
-		ceph_msg_data_add_pagelist(reply, pagelist);
+
+	/* raced with cap release? */
+	if (s_nr_caps != recon_state.nr_caps) {
+		struct page *page = list_first_entry(&pagelist->head,
+						     struct page, lru);
+		__le32 *addr = kmap_atomic(page);
+		*addr = cpu_to_le32(recon_state.nr_caps);
+		kunmap_atomic(addr);
 	}
+
+	reply->hdr.data_len = cpu_to_le32(pagelist->length);
+	ceph_msg_data_add_pagelist(reply, pagelist);
 	ceph_con_send(&session->s_con, reply);
 
 	mutex_unlock(&session->s_mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c2a19fbbe517..4c053d099ae4 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -132,6 +132,7 @@ struct ceph_mds_session {
 	struct list_head  s_caps;     /* all caps issued by this session */
 	int               s_nr_caps, s_trim_caps;
 	int               s_num_cap_releases;
+	int		  s_cap_reconnect;
 	struct list_head  s_cap_releases; /* waiting cap_release messages */
 	struct list_head  s_cap_releases_done; /* ready to send */
 	struct ceph_cap  *s_cap_iterator;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6014b0a3c405..ef4ac38bb614 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -741,13 +741,7 @@ extern int ceph_add_cap(struct inode *inode,
 			int fmode, unsigned issued, unsigned wanted,
 			unsigned cap, unsigned seq, u64 realmino, int flags,
 			struct ceph_cap_reservation *caps_reservation);
-extern void __ceph_remove_cap(struct ceph_cap *cap);
-static inline void ceph_remove_cap(struct ceph_cap *cap)
-{
-	spin_lock(&cap->ci->i_ceph_lock);
-	__ceph_remove_cap(cap);
-	spin_unlock(&cap->ci->i_ceph_lock);
-}
+extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release);
 extern void ceph_put_cap(struct ceph_mds_client *mdsc,
 			 struct ceph_cap *cap);
 
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index d9ea7ada1378..f918a998a087 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -384,6 +384,7 @@ struct smb_version_operations {
 	int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file,
 			struct cifsFileInfo *target_file, u64 src_off, u64 len,
 			u64 dest_off);
+	int (*validate_negotiate)(const unsigned int, struct cifs_tcon *);
 };
 
 struct smb_version_values {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index aa3397620342..2c29db6a247e 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -477,9 +477,10 @@ extern int CIFSGetExtAttr(const unsigned int xid, struct cifs_tcon *tcon,
 			const int netfid, __u64 *pExtAttrBits, __u64 *pMask);
 extern void cifs_autodisable_serverino(struct cifs_sb_info *cifs_sb);
 extern bool CIFSCouldBeMFSymlink(const struct cifs_fattr *fattr);
-extern int CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		const unsigned char *path,
-		struct cifs_sb_info *cifs_sb, unsigned int xid);
+extern int CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+			      struct cifs_sb_info *cifs_sb,
+			      struct cifs_fattr *fattr,
+			      const unsigned char *path);
 extern int mdfour(unsigned char *, unsigned char *, int);
 extern int E_md4hash(const unsigned char *passwd, unsigned char *p16,
 			const struct nls_table *codepage);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 124aa0230c1b..d707edb6b852 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4010,7 +4010,7 @@ QFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in QFileInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4179,7 +4179,7 @@ UnixQFileInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in UnixQFileInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
@@ -4263,7 +4263,7 @@ UnixQPathInfoRetry:
 	rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
-		cifs_dbg(FYI, "Send error in QPathInfo = %d\n", rc);
+		cifs_dbg(FYI, "Send error in UnixQPathInfo = %d", rc);
 	} else {		/* decode response */
 		rc = validate_t2((struct smb_t2_rsp *)pSMBr);
 
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 11ff5f116b20..a514e0a65f69 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -193,7 +193,7 @@ check_name(struct dentry *direntry)
 static int
 cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 	       struct tcon_link *tlink, unsigned oflags, umode_t mode,
-	       __u32 *oplock, struct cifs_fid *fid, int *created)
+	       __u32 *oplock, struct cifs_fid *fid)
 {
 	int rc = -ENOENT;
 	int create_options = CREATE_NOT_DIR;
@@ -349,7 +349,6 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
 				.device	= 0,
 		};
 
-		*created |= FILE_CREATED;
 		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) {
 			args.uid = current_fsuid();
 			if (inode->i_mode & S_ISGID)
@@ -480,13 +479,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 	cifs_add_pending_open(&fid, tlink, &open);
 
 	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
-			    &oplock, &fid, opened);
+			    &oplock, &fid);
 
 	if (rc) {
 		cifs_del_pending_open(&open);
 		goto out;
 	}
 
+	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		*opened |= FILE_CREATED;
+
 	rc = finish_open(file, direntry, generic_file_open, opened);
 	if (rc) {
 		if (server->ops->close)
@@ -529,7 +531,6 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 	struct TCP_Server_Info *server;
 	struct cifs_fid fid;
 	__u32 oplock;
-	int created = FILE_CREATED;
 
 	cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n",
 		 inode, direntry->d_name.name, direntry);
@@ -546,7 +547,7 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode,
 		server->ops->new_lease_key(&fid);
 
 	rc = cifs_do_create(inode, direntry, xid, tlink, oflags, mode,
-			    &oplock, &fid, &created);
+			    &oplock, &fid);
 	if (!rc && server->ops->close)
 		server->ops->close(xid, tcon, &fid);
 
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 36f9ebb93ceb..49719b8228e5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -383,7 +383,8 @@ int cifs_get_inode_info_unix(struct inode **pinode,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		int tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		int tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+					       full_path);
 		if (tmprc)
 			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
@@ -799,7 +800,8 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 
 	/* check for Minshall+French symlinks */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
-		tmprc = CIFSCheckMFSymlink(&fattr, full_path, cifs_sb, xid);
+		tmprc = CIFSCheckMFSymlink(xid, tcon, cifs_sb, &fattr,
+					   full_path);
 		if (tmprc)
 			cifs_dbg(FYI, "CIFSCheckMFSymlink: %d\n", tmprc);
 	}
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 409b45eefe70..77492301cc2b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -26,13 +26,15 @@
 #include <linux/mount.h>
 #include <linux/mm.h>
 #include <linux/pagemap.h>
-#include <linux/btrfs.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifsfs.h"
 
+#define CIFS_IOCTL_MAGIC	0xCF
+#define CIFS_IOC_COPYCHUNK_FILE	_IOW(CIFS_IOCTL_MAGIC, 3, int)
+
 static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
 			unsigned long srcfd, u64 off, u64 len, u64 destoff)
 {
@@ -213,7 +215,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 				cifs_dbg(FYI, "set compress flag rc %d\n", rc);
 			}
 			break;
-		case BTRFS_IOC_CLONE:
+		case CIFS_IOC_COPYCHUNK_FILE:
 			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0);
 			break;
 		default:
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index cc0234710ddb..92aee08483a5 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -354,34 +354,30 @@ open_query_close_cifs_symlink(const unsigned char *path, char *pbuf,
 
 
 int
-CIFSCheckMFSymlink(struct cifs_fattr *fattr,
-		   const unsigned char *path,
-		   struct cifs_sb_info *cifs_sb, unsigned int xid)
+CIFSCheckMFSymlink(unsigned int xid, struct cifs_tcon *tcon,
+		   struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr,
+		   const unsigned char *path)
 {
-	int rc = 0;
+	int rc;
 	u8 *buf = NULL;
 	unsigned int link_len = 0;
 	unsigned int bytes_read = 0;
-	struct cifs_tcon *ptcon;
 
 	if (!CIFSCouldBeMFSymlink(fattr))
 		/* it's not a symlink */
 		return 0;
 
 	buf = kmalloc(CIFS_MF_SYMLINK_FILE_SIZE, GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
-	}
+	if (!buf)
+		return -ENOMEM;
 
-	ptcon = tlink_tcon(cifs_sb_tlink(cifs_sb));
-	if ((ptcon->ses) && (ptcon->ses->server->ops->query_mf_symlink))
-		rc = ptcon->ses->server->ops->query_mf_symlink(path, buf,
-						 &bytes_read, cifs_sb, xid);
+	if (tcon->ses->server->ops->query_mf_symlink)
+		rc = tcon->ses->server->ops->query_mf_symlink(path, buf,
+						&bytes_read, cifs_sb, xid);
 	else
-		goto out;
+		rc = -ENOSYS;
 
-	if (rc != 0)
+	if (rc)
 		goto out;
 
 	if (bytes_read == 0) /* not a symlink */
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 11dde4b24f8a..757da3e54d3d 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -532,7 +532,10 @@ smb2_clone_range(const unsigned int xid,
 	int rc;
 	unsigned int ret_data_len;
 	struct copychunk_ioctl *pcchunk;
-	char *retbuf = NULL;
+	struct copychunk_ioctl_rsp *retbuf = NULL;
+	struct cifs_tcon *tcon;
+	int chunks_copied = 0;
+	bool chunk_sizes_updated = false;
 
 	pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
 
@@ -547,27 +550,96 @@ smb2_clone_range(const unsigned int xid,
 
 	/* Note: request_res_key sets res_key null only if rc !=0 */
 	if (rc)
-		return rc;
+		goto cchunk_out;
 
 	/* For now array only one chunk long, will make more flexible later */
 	pcchunk->ChunkCount = __constant_cpu_to_le32(1);
 	pcchunk->Reserved = 0;
-	pcchunk->SourceOffset = cpu_to_le64(src_off);
-	pcchunk->TargetOffset = cpu_to_le64(dest_off);
-	pcchunk->Length = cpu_to_le32(len);
 	pcchunk->Reserved2 = 0;
 
-	/* Request that server copy to target from src file identified by key */
-	rc = SMB2_ioctl(xid, tlink_tcon(trgtfile->tlink),
-			trgtfile->fid.persistent_fid,
-			trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
-			true /* is_fsctl */, (char *)pcchunk,
-			sizeof(struct copychunk_ioctl),	&retbuf, &ret_data_len);
+	tcon = tlink_tcon(trgtfile->tlink);
 
-	/* BB need to special case rc = EINVAL to alter chunk size */
+	while (len > 0) {
+		pcchunk->SourceOffset = cpu_to_le64(src_off);
+		pcchunk->TargetOffset = cpu_to_le64(dest_off);
+		pcchunk->Length =
+			cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
 
-	cifs_dbg(FYI, "rc %d data length out %d\n", rc, ret_data_len);
+		/* Request server copy to target from src identified by key */
+		rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
+			trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
+			true /* is_fsctl */, (char *)pcchunk,
+			sizeof(struct copychunk_ioctl),	(char **)&retbuf,
+			&ret_data_len);
+		if (rc == 0) {
+			if (ret_data_len !=
+					sizeof(struct copychunk_ioctl_rsp)) {
+				cifs_dbg(VFS, "invalid cchunk response size\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			if (retbuf->TotalBytesWritten == 0) {
+				cifs_dbg(FYI, "no bytes copied\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			/*
+			 * Check if server claimed to write more than we asked
+			 */
+			if (le32_to_cpu(retbuf->TotalBytesWritten) >
+			    le32_to_cpu(pcchunk->Length)) {
+				cifs_dbg(VFS, "invalid copy chunk response\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			if (le32_to_cpu(retbuf->ChunksWritten) != 1) {
+				cifs_dbg(VFS, "invalid num chunks written\n");
+				rc = -EIO;
+				goto cchunk_out;
+			}
+			chunks_copied++;
+
+			src_off += le32_to_cpu(retbuf->TotalBytesWritten);
+			dest_off += le32_to_cpu(retbuf->TotalBytesWritten);
+			len -= le32_to_cpu(retbuf->TotalBytesWritten);
+
+			cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %d\n",
+				le32_to_cpu(retbuf->ChunksWritten),
+				le32_to_cpu(retbuf->ChunkBytesWritten),
+				le32_to_cpu(retbuf->TotalBytesWritten));
+		} else if (rc == -EINVAL) {
+			if (ret_data_len != sizeof(struct copychunk_ioctl_rsp))
+				goto cchunk_out;
+
+			cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n",
+				le32_to_cpu(retbuf->ChunksWritten),
+				le32_to_cpu(retbuf->ChunkBytesWritten),
+				le32_to_cpu(retbuf->TotalBytesWritten));
+
+			/*
+			 * Check if this is the first request using these sizes,
+			 * (ie check if copy succeed once with original sizes
+			 * and check if the server gave us different sizes after
+			 * we already updated max sizes on previous request).
+			 * if not then why is the server returning an error now
+			 */
+			if ((chunks_copied != 0) || chunk_sizes_updated)
+				goto cchunk_out;
+
+			/* Check that server is not asking us to grow size */
+			if (le32_to_cpu(retbuf->ChunkBytesWritten) <
+					tcon->max_bytes_chunk)
+				tcon->max_bytes_chunk =
+					le32_to_cpu(retbuf->ChunkBytesWritten);
+			else
+				goto cchunk_out; /* server gave us bogus size */
+
+			/* No need to change MaxChunks since already set to 1 */
+			chunk_sizes_updated = true;
+		}
+	}
 
+cchunk_out:
 	kfree(pcchunk);
 	return rc;
 }
@@ -1247,6 +1319,7 @@ struct smb_version_operations smb30_operations = {
 	.create_lease_buf = smb3_create_lease_buf,
 	.parse_lease_buf = smb3_parse_lease_buf,
 	.clone_range = smb2_clone_range,
+	.validate_negotiate = smb3_validate_negotiate,
 };
 
 struct smb_version_values smb20_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index d65270c290a1..2013234b73ad 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -454,6 +454,81 @@ neg_exit:
 	return rc;
 }
 
+int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
+{
+	int rc = 0;
+	struct validate_negotiate_info_req vneg_inbuf;
+	struct validate_negotiate_info_rsp *pneg_rsp;
+	u32 rsplen;
+
+	cifs_dbg(FYI, "validate negotiate\n");
+
+	/*
+	 * validation ioctl must be signed, so no point sending this if we
+	 * can not sign it.  We could eventually change this to selectively
+	 * sign just this, the first and only signed request on a connection.
+	 * This is good enough for now since a user who wants better security
+	 * would also enable signing on the mount. Having validation of
+	 * negotiate info for signed connections helps reduce attack vectors
+	 */
+	if (tcon->ses->server->sign == false)
+		return 0; /* validation requires signing */
+
+	vneg_inbuf.Capabilities =
+			cpu_to_le32(tcon->ses->server->vals->req_capabilities);
+	memcpy(vneg_inbuf.Guid, cifs_client_guid, SMB2_CLIENT_GUID_SIZE);
+
+	if (tcon->ses->sign)
+		vneg_inbuf.SecurityMode =
+			cpu_to_le16(SMB2_NEGOTIATE_SIGNING_REQUIRED);
+	else if (global_secflags & CIFSSEC_MAY_SIGN)
+		vneg_inbuf.SecurityMode =
+			cpu_to_le16(SMB2_NEGOTIATE_SIGNING_ENABLED);
+	else
+		vneg_inbuf.SecurityMode = 0;
+
+	vneg_inbuf.DialectCount = cpu_to_le16(1);
+	vneg_inbuf.Dialects[0] =
+		cpu_to_le16(tcon->ses->server->vals->protocol_id);
+
+	rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
+		FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+		(char *)&vneg_inbuf, sizeof(struct validate_negotiate_info_req),
+		(char **)&pneg_rsp, &rsplen);
+
+	if (rc != 0) {
+		cifs_dbg(VFS, "validate protocol negotiate failed: %d\n", rc);
+		return -EIO;
+	}
+
+	if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
+		cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
+		return -EIO;
+	}
+
+	/* check validate negotiate info response matches what we got earlier */
+	if (pneg_rsp->Dialect !=
+			cpu_to_le16(tcon->ses->server->vals->protocol_id))
+		goto vneg_out;
+
+	if (pneg_rsp->SecurityMode != cpu_to_le16(tcon->ses->server->sec_mode))
+		goto vneg_out;
+
+	/* do not validate server guid because not saved at negprot time yet */
+
+	if ((le32_to_cpu(pneg_rsp->Capabilities) | SMB2_NT_FIND |
+	      SMB2_LARGE_FILES) != tcon->ses->server->capabilities)
+		goto vneg_out;
+
+	/* validate negotiate successful */
+	cifs_dbg(FYI, "validate negotiate info successful\n");
+	return 0;
+
+vneg_out:
+	cifs_dbg(VFS, "protocol revalidation - security settings mismatch\n");
+	return -EIO;
+}
+
 int
 SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 		const struct nls_table *nls_cp)
@@ -829,6 +904,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
 	    ((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
 		cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
 	init_copy_chunk_defaults(tcon);
+	if (tcon->ses->server->ops->validate_negotiate)
+		rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
 tcon_exit:
 	free_rsp_buf(resp_buftype, rsp);
 	kfree(unc_path);
@@ -1214,10 +1291,17 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buftype, 0);
 	rsp = (struct smb2_ioctl_rsp *)iov[0].iov_base;
 
-	if (rc != 0) {
+	if ((rc != 0) && (rc != -EINVAL)) {
 		if (tcon)
 			cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
 		goto ioctl_exit;
+	} else if (rc == -EINVAL) {
+		if ((opcode != FSCTL_SRV_COPYCHUNK_WRITE) &&
+		    (opcode != FSCTL_SRV_COPYCHUNK)) {
+			if (tcon)
+				cifs_stats_fail_inc(tcon, SMB2_IOCTL_HE);
+			goto ioctl_exit;
+		}
 	}
 
 	/* check if caller wants to look at return data or just return rc */
@@ -2154,11 +2238,9 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
 	rc = SendReceive2(xid, ses, iov, num, &resp_buftype, 0);
 	rsp = (struct smb2_set_info_rsp *)iov[0].iov_base;
 
-	if (rc != 0) {
+	if (rc != 0)
 		cifs_stats_fail_inc(tcon, SMB2_SET_INFO_HE);
-		goto out;
-	}
-out:
+
 	free_rsp_buf(resp_buftype, rsp);
 	kfree(iov);
 	return rc;
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index f88320bbb477..2022c542ea3a 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -577,13 +577,19 @@ struct copychunk_ioctl_rsp {
 	__le32 TotalBytesWritten;
 } __packed;
 
-/* Response and Request are the same format */
-struct validate_negotiate_info {
+struct validate_negotiate_info_req {
 	__le32 Capabilities;
 	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
 	__le16 SecurityMode;
 	__le16 DialectCount;
-	__le16 Dialect[1];
+	__le16 Dialects[1]; /* dialect (someday maybe list) client asked for */
+} __packed;
+
+struct validate_negotiate_info_rsp {
+	__le32 Capabilities;
+	__u8   Guid[SMB2_CLIENT_GUID_SIZE];
+	__le16 SecurityMode;
+	__le16 Dialect; /* Dialect in use for the connection */
 } __packed;
 
 #define RSS_CAPABLE	0x00000001
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index b4eea105b08c..93adc64666f3 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -162,5 +162,6 @@ extern int smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
 		      struct smb2_lock_element *buf);
 extern int SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon,
 			    __u8 *lease_key, const __le32 lease_state);
+extern int smb3_validate_negotiate(const unsigned int, struct cifs_tcon *);
 
 #endif			/* _SMB2PROTO_H */
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a4b2391fe66e..0e538b5c9622 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
 #define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
 #define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
 #define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
-#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 /* BB add struct */
+#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
 /* Perform server-side data movement */
 #define FSCTL_SRV_COPYCHUNK 0x001440F2
 #define FSCTL_SRV_COPYCHUNK_WRITE 0x001480F2
diff --git a/fs/dcache.c b/fs/dcache.c
index 4bdb300b16e2..cb4a10690868 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -192,7 +192,7 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
 		if (!tcount)
 			return 0;
 	}
-	mask = ~(~0ul << tcount*8);
+	mask = bytemask_from_count(tcount);
 	return unlikely(!!((a ^ b) & mask));
 }
 
@@ -3061,8 +3061,13 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	 * thus don't need to be hashed.  They also don't need a name until a
 	 * user wants to identify the object in /proc/pid/fd/.  The little hack
 	 * below allows us to generate a name for these objects on demand:
+	 *
+	 * Some pseudo inodes are mountable.  When they are mounted
+	 * path->dentry == path->mnt->mnt_root.  In that case don't call d_dname
+	 * and instead have d_path return the mounted path.
 	 */
-	if (path->dentry->d_op && path->dentry->d_op->d_dname)
+	if (path->dentry->d_op && path->dentry->d_op->d_dname &&
+	    (!IS_ROOT(path->dentry) || path->dentry != path->mnt->mnt_root))
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
 	rcu_read_lock();
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index d90909ec6aa6..a5e34dd6a32c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -649,6 +649,7 @@ static void process_sctp_notification(struct connection *con,
 				      struct msghdr *msg, char *buf)
 {
 	union sctp_notification *sn = (union sctp_notification *)buf;
+	struct linger linger;
 
 	switch (sn->sn_header.sn_type) {
 	case SCTP_SEND_FAILED:
@@ -727,6 +728,13 @@ static void process_sctp_notification(struct connection *con,
 			}
 			add_sock(new_con->sock, new_con);
 
+			linger.l_onoff = 1;
+			linger.l_linger = 0;
+			ret = kernel_setsockopt(new_con->sock, SOL_SOCKET, SO_LINGER,
+						(char *)&linger, sizeof(linger));
+			if (ret < 0)
+				log_print("set socket option SO_LINGER failed");
+
 			log_print("connecting to %d sctp association %d",
 				 nodeid, (int)sn->sn_assoc_change.sac_assoc_id);
 
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 79b65c3b9e87..af903128891c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,8 +1852,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
-	if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
-		epds.events &= ~EPOLLWAKEUP;
+	ep_take_care_of_epollwakeup(&epds);
 
 	/*
 	 * We have to check that the file structure underneath the file descriptor
@@ -1908,10 +1907,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 			}
 		}
 	}
-	if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) {
-		tep = tf.file->private_data;
-		mutex_lock_nested(&tep->mtx, 1);
-	}
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 288534920fe5..20d6697bd638 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1493,6 +1493,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type,
 				sb->s_blocksize - offset : towrite;
 
 		tmp_bh.b_state = 0;
+		tmp_bh.b_size = sb->s_blocksize;
 		err = ext2_get_block(inode, blk, &tmp_bh, 1);
 		if (err < 0)
 			goto out;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e6185031c1cc..ece55565b9cd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -268,6 +268,16 @@ struct ext4_io_submit {
 /* Translate # of blks to # of clusters */
 #define EXT4_NUM_B2C(sbi, blks)	(((blks) + (sbi)->s_cluster_ratio - 1) >> \
 				 (sbi)->s_cluster_bits)
+/* Mask out the low bits to get the starting block of the cluster */
+#define EXT4_PBLK_CMASK(s, pblk) ((pblk) &				\
+				  ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_CMASK(s, lblk) ((lblk) &				\
+				  ~((ext4_lblk_t) (s)->s_cluster_ratio - 1))
+/* Get the cluster offset */
+#define EXT4_PBLK_COFF(s, pblk) ((pblk) &				\
+				 ((ext4_fsblk_t) (s)->s_cluster_ratio - 1))
+#define EXT4_LBLK_COFF(s, lblk) ((lblk) &				\
+				 ((ext4_lblk_t) (s)->s_cluster_ratio - 1))
 
 /*
  * Structure of a blocks group descriptor
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 17ac112ab101..3fe29de832c8 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -259,6 +259,15 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 		if (WARN_ON_ONCE(err)) {
 			ext4_journal_abort_handle(where, line, __func__, bh,
 						  handle, err);
+			ext4_error_inode(inode, where, line,
+					 bh->b_blocknr,
+					 "journal_dirty_metadata failed: "
+					 "handle type %u started at line %u, "
+					 "credits %u/%u, errcode %d",
+					 handle->h_type,
+					 handle->h_line_no,
+					 handle->h_requested_credits,
+					 handle->h_buffer_credits, err);
 		}
 	} else {
 		if (inode)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 35f65cf4f318..3384dc4bed40 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -360,8 +360,10 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
 	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
+	ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
+	ext4_lblk_t last = lblock + len - 1;
 
-	if (len == 0)
+	if (lblock > last)
 		return 0;
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
@@ -387,11 +389,26 @@ static int ext4_valid_extent_entries(struct inode *inode,
 	if (depth == 0) {
 		/* leaf entries */
 		struct ext4_extent *ext = EXT_FIRST_EXTENT(eh);
+		struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+		ext4_fsblk_t pblock = 0;
+		ext4_lblk_t lblock = 0;
+		ext4_lblk_t prev = 0;
+		int len = 0;
 		while (entries) {
 			if (!ext4_valid_extent(inode, ext))
 				return 0;
+
+			/* Check for overlapping extents */
+			lblock = le32_to_cpu(ext->ee_block);
+			len = ext4_ext_get_actual_len(ext);
+			if ((lblock <= prev) && prev) {
+				pblock = ext4_ext_pblock(ext);
+				es->s_last_error_block = cpu_to_le64(pblock);
+				return 0;
+			}
 			ext++;
 			entries--;
+			prev = lblock + len - 1;
 		}
 	} else {
 		struct ext4_extent_idx *ext_idx = EXT_FIRST_INDEX(eh);
@@ -1834,8 +1851,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 	depth = ext_depth(inode);
 	if (!path[depth].p_ext)
 		goto out;
-	b2 = le32_to_cpu(path[depth].p_ext->ee_block);
-	b2 &= ~(sbi->s_cluster_ratio - 1);
+	b2 = EXT4_LBLK_CMASK(sbi, le32_to_cpu(path[depth].p_ext->ee_block));
 
 	/*
 	 * get the next allocated block if the extent in the path
@@ -1845,7 +1861,7 @@ static unsigned int ext4_ext_check_overlap(struct ext4_sb_info *sbi,
 		b2 = ext4_ext_next_allocated_block(path);
 		if (b2 == EXT_MAX_BLOCKS)
 			goto out;
-		b2 &= ~(sbi->s_cluster_ratio - 1);
+		b2 = EXT4_LBLK_CMASK(sbi, b2);
 	}
 
 	/* check for wrap through zero on extent logical start block*/
@@ -2504,7 +2520,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 		 * extent, we have to mark the cluster as used (store negative
 		 * cluster number in partial_cluster).
 		 */
-		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		unaligned = EXT4_PBLK_COFF(sbi, pblk);
 		if (unaligned && (ee_len == num) &&
 		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
@@ -2598,7 +2614,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 			 * accidentally freeing it later on
 			 */
 			pblk = ext4_ext_pblock(ex);
-			if (pblk & (sbi->s_cluster_ratio - 1))
+			if (EXT4_PBLK_COFF(sbi, pblk))
 				*partial_cluster =
 					-((long long)EXT4_B2C(sbi, pblk));
 			ex--;
@@ -3753,7 +3769,7 @@ int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	ext4_lblk_t lblk_start, lblk_end;
-	lblk_start = lblk & (~(sbi->s_cluster_ratio - 1));
+	lblk_start = EXT4_LBLK_CMASK(sbi, lblk);
 	lblk_end = lblk_start + sbi->s_cluster_ratio - 1;
 
 	return ext4_find_delalloc_range(inode, lblk_start, lblk_end);
@@ -3812,9 +3828,9 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	trace_ext4_get_reserved_cluster_alloc(inode, lblk_start, num_blks);
 
 	/* Check towards left side */
-	c_offset = lblk_start & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start);
 	if (c_offset) {
-		lblk_from = lblk_start & (~(sbi->s_cluster_ratio - 1));
+		lblk_from = EXT4_LBLK_CMASK(sbi, lblk_start);
 		lblk_to = lblk_from + c_offset - 1;
 
 		if (ext4_find_delalloc_range(inode, lblk_from, lblk_to))
@@ -3822,7 +3838,7 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start,
 	}
 
 	/* Now check towards right. */
-	c_offset = (lblk_start + num_blks) & (sbi->s_cluster_ratio - 1);
+	c_offset = EXT4_LBLK_COFF(sbi, lblk_start + num_blks);
 	if (allocated_clusters && c_offset) {
 		lblk_from = lblk_start + num_blks;
 		lblk_to = lblk_from + (sbi->s_cluster_ratio - c_offset) - 1;
@@ -4030,7 +4046,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 				     struct ext4_ext_path *path)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_lblk_t c_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	ext4_lblk_t c_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ext4_lblk_t ex_cluster_start, ex_cluster_end;
 	ext4_lblk_t rr_cluster_start;
 	ext4_lblk_t ee_block = le32_to_cpu(ex->ee_block);
@@ -4048,8 +4064,7 @@ static int get_implied_cluster_alloc(struct super_block *sb,
 	    (rr_cluster_start == ex_cluster_start)) {
 		if (rr_cluster_start == ex_cluster_end)
 			ee_start += ee_len - 1;
-		map->m_pblk = (ee_start & ~(sbi->s_cluster_ratio - 1)) +
-			c_offset;
+		map->m_pblk = EXT4_PBLK_CMASK(sbi, ee_start) + c_offset;
 		map->m_len = min(map->m_len,
 				 (unsigned) sbi->s_cluster_ratio - c_offset);
 		/*
@@ -4203,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 */
 	map->m_flags &= ~EXT4_MAP_FROM_CLUSTER;
 	newex.ee_block = cpu_to_le32(map->m_lblk);
-	cluster_offset = map->m_lblk & (sbi->s_cluster_ratio-1);
+	cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 
 	/*
 	 * If we are doing bigalloc, check to see if the extent returned
@@ -4271,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * needed so that future calls to get_implied_cluster_alloc()
 	 * work correctly.
 	 */
-	offset = map->m_lblk & (sbi->s_cluster_ratio - 1);
+	offset = EXT4_LBLK_COFF(sbi, map->m_lblk);
 	ar.len = EXT4_NUM_B2C(sbi, offset+allocated);
 	ar.goal -= offset;
 	ar.logical -= offset;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 075763474118..61d49ff22c81 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1206,7 +1206,6 @@ static int ext4_journalled_write_end(struct file *file,
  */
 static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1218,7 +1217,6 @@ static int ext4_da_reserve_metadata(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1238,10 +1236,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		return -ENOSPC;
 	}
 	ei->i_reserved_meta_blocks += md_needed;
@@ -1255,7 +1249,6 @@ repeat:
  */
 static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 {
-	int retries = 0;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct ext4_inode_info *ei = EXT4_I(inode);
 	unsigned int md_needed;
@@ -1277,7 +1270,6 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
-repeat:
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -1297,10 +1289,6 @@ repeat:
 		ei->i_da_metadata_calc_len = save_len;
 		ei->i_da_metadata_calc_last_lblock = save_last_lblock;
 		spin_unlock(&ei->i_block_reservation_lock);
-		if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
-			cond_resched();
-			goto repeat;
-		}
 		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
 		return -ENOSPC;
 	}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 4d113efa024c..04a5c7504be9 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3442,6 +3442,9 @@ static void ext4_mb_pa_callback(struct rcu_head *head)
 {
 	struct ext4_prealloc_space *pa;
 	pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
+
+	BUG_ON(atomic_read(&pa->pa_count));
+	BUG_ON(pa->pa_deleted == 0);
 	kmem_cache_free(ext4_pspace_cachep, pa);
 }
 
@@ -3455,11 +3458,13 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	ext4_group_t grp;
 	ext4_fsblk_t grp_blk;
 
-	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
-		return;
-
 	/* in this short window concurrent discard can set pa_deleted */
 	spin_lock(&pa->pa_lock);
+	if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
+		spin_unlock(&pa->pa_lock);
+		return;
+	}
+
 	if (pa->pa_deleted == 1) {
 		spin_unlock(&pa->pa_lock);
 		return;
@@ -4121,7 +4126,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
 	ext4_get_group_no_and_offset(sb, goal, &group, &block);
 
 	/* set up allocation goals */
-	ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
+	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
 	ac->ac_status = AC_STATUS_CONTINUE;
 	ac->ac_sb = sb;
 	ac->ac_inode = ar->inode;
@@ -4663,7 +4668,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 	 * blocks at the beginning or the end unless we are explicitly
 	 * requested to avoid doing so.
 	 */
-	overflow = block & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_PBLK_COFF(sbi, block);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
 			overflow = sbi->s_cluster_ratio - overflow;
@@ -4677,7 +4682,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			count += overflow;
 		}
 	}
-	overflow = count & (sbi->s_cluster_ratio - 1);
+	overflow = EXT4_LBLK_COFF(sbi, count);
 	if (overflow) {
 		if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
 			if (count > overflow)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c977f4e4e63b..1f7784de05b6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -792,7 +792,7 @@ static void ext4_put_super(struct super_block *sb)
 	}
 
 	ext4_es_unregister_shrinker(sbi);
-	del_timer(&sbi->s_err_report);
+	del_timer_sync(&sbi->s_err_report);
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
@@ -3316,11 +3316,19 @@ int ext4_calculate_overhead(struct super_block *sb)
 }
 
 
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
 {
 	ext4_fsblk_t resv_clusters;
 
 	/*
+	 * There's no need to reserve anything when we aren't using extents.
+	 * The space estimates are exact, there are no unwritten extents,
+	 * hole punching doesn't need new metadata... This is needed especially
+	 * to keep ext2/3 backward compatibility.
+	 */
+	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+		return 0;
+	/*
 	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
 	 * This should cover the situations where we can not afford to run
 	 * out of space like for example punch hole, or converting
@@ -3328,7 +3336,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
 	 * allocation would require 1, or 2 blocks, higher numbers are
 	 * very rare.
 	 */
-	resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+	resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
+			EXT4_SB(sb)->s_cluster_bits;
 
 	do_div(resv_clusters, 50);
 	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
@@ -4071,10 +4080,10 @@ no_journal:
 			 "available");
 	}
 
-	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
-			 "reserved pool", ext4_calculate_resv_clusters(sbi));
+			 "reserved pool", ext4_calculate_resv_clusters(sb));
 		goto failed_mount4a;
 	}
 
@@ -4184,7 +4193,7 @@ failed_mount_wq:
 	}
 failed_mount3:
 	ext4_es_unregister_shrinker(sbi);
-	del_timer(&sbi->s_err_report);
+	del_timer_sync(&sbi->s_err_report);
 	if (sbi->s_flex_groups)
 		ext4_kvfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeclusters_counter);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f4a10ece2f1..e0259a163f98 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -516,13 +516,16 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	}
 	WARN_ON(inode->i_state & I_SYNC);
 	/*
-	 * Skip inode if it is clean. We don't want to mess with writeback
-	 * lists in this function since flusher thread may be doing for example
-	 * sync in parallel and if we move the inode, it could get skipped. So
-	 * here we make sure inode is on some writeback list and leave it there
-	 * unless we have completely cleaned the inode.
+	 * Skip inode if it is clean and we have no outstanding writeback in
+	 * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this
+	 * function since flusher thread may be doing for example sync in
+	 * parallel and if we move the inode, it could get skipped. So here we
+	 * make sure inode is on some writeback list and leave it there unless
+	 * we have completely cleaned the inode.
 	 */
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY) &&
+	    (wbc->sync_mode != WB_SYNC_ALL ||
+	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
 	inode->i_state |= I_SYNC;
 	spin_unlock(&inode->i_lock);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index b7fc035a6943..49436fa7cd4f 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -986,6 +986,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int rv;
@@ -1006,6 +1007,36 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb,
 	if (rv != 1)
 		goto out; /* dio not valid, fall back to buffered i/o */
 
+	/*
+	 * Now since we are holding a deferred (CW) lock at this point, you
+	 * might be wondering why this is ever needed. There is a case however
+	 * where we've granted a deferred local lock against a cached exclusive
+	 * glock. That is ok provided all granted local locks are deferred, but
+	 * it also means that it is possible to encounter pages which are
+	 * cached and possibly also mapped. So here we check for that and sort
+	 * them out ahead of the dio. The glock state machine will take care of
+	 * everything else.
+	 *
+	 * If in fact the cached glock state (gl->gl_state) is deferred (CW) in
+	 * the first place, mapping->nr_pages will always be zero.
+	 */
+	if (mapping->nrpages) {
+		loff_t lstart = offset & (PAGE_CACHE_SIZE - 1);
+		loff_t len = iov_length(iov, nr_segs);
+		loff_t end = PAGE_ALIGN(offset + len) - 1;
+
+		rv = 0;
+		if (len == 0)
+			goto out;
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, offset, len);
+		rv = filemap_write_and_wait_range(mapping, lstart, end);
+		if (rv)
+			goto out;
+		if (rw == WRITE)
+			truncate_inode_pages_range(mapping, lstart, end);
+	}
+
 	rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
 				  offset, nr_segs, gfs2_get_block_direct,
 				  NULL, NULL, 0);
@@ -1050,30 +1081,22 @@ int gfs2_releasepage(struct page *page, gfp_t gfp_mask)
 		bh = bh->b_this_page;
 	} while(bh != head);
 	spin_unlock(&sdp->sd_ail_lock);
-	gfs2_log_unlock(sdp);
 
 	head = bh = page_buffers(page);
 	do {
-		gfs2_log_lock(sdp);
 		bd = bh->b_private;
 		if (bd) {
 			gfs2_assert_warn(sdp, bd->bd_bh == bh);
-			if (!list_empty(&bd->bd_list)) {
-				if (!buffer_pinned(bh))
-					list_del_init(&bd->bd_list);
-				else
-					bd = NULL;
-			}
-			if (bd)
-				bd->bd_bh = NULL;
+			if (!list_empty(&bd->bd_list))
+				list_del_init(&bd->bd_list);
+			bd->bd_bh = NULL;
 			bh->b_private = NULL;
-		}
-		gfs2_log_unlock(sdp);
-		if (bd)
 			kmem_cache_free(gfs2_bufdata_cachep, bd);
+		}
 
 		bh = bh->b_this_page;
 	} while (bh != head);
+	gfs2_log_unlock(sdp);
 
 	return try_to_free_buffers(page);
 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 2e5fc268d324..fa32655449c8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -834,6 +834,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
 	struct gfs2_leaf *leaf;
 	struct gfs2_dirent *dent;
 	struct qstr name = { .name = "" };
+	struct timespec tv = CURRENT_TIME;
 
 	error = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
 	if (error)
@@ -850,7 +851,11 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
 	leaf->lf_entries = 0;
 	leaf->lf_dirent_format = cpu_to_be32(GFS2_FORMAT_DE);
 	leaf->lf_next = 0;
-	memset(leaf->lf_reserved, 0, sizeof(leaf->lf_reserved));
+	leaf->lf_inode = cpu_to_be64(ip->i_no_addr);
+	leaf->lf_dist = cpu_to_be32(1);
+	leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+	leaf->lf_sec = cpu_to_be64(tv.tv_sec);
+	memset(leaf->lf_reserved2, 0, sizeof(leaf->lf_reserved2));
 	dent = (struct gfs2_dirent *)(leaf+1);
 	gfs2_qstr2dirent(&name, bh->b_size - sizeof(struct gfs2_leaf), dent);
 	*pbh = bh;
@@ -1612,11 +1617,31 @@ out:
 	return ret;
 }
 
+/**
+ * dir_new_leaf - Add a new leaf onto hash chain
+ * @inode: The directory
+ * @name: The name we are adding
+ *
+ * This adds a new dir leaf onto an existing leaf when there is not
+ * enough space to add a new dir entry. This is a last resort after
+ * we've expanded the hash table to max size and also split existing
+ * leaf blocks, so it will only occur for very large directories.
+ *
+ * The dist parameter is set to 1 for leaf blocks directly attached
+ * to the hash table, 2 for one layer of indirection, 3 for two layers
+ * etc. We are thus able to tell the difference between an old leaf
+ * with dist set to zero (i.e. "don't know") and a new one where we
+ * set this information for debug/fsck purposes.
+ *
+ * Returns: 0 on success, or -ve on error
+ */
+
 static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 {
 	struct buffer_head *bh, *obh;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_leaf *leaf, *oleaf;
+	u32 dist = 1;
 	int error;
 	u32 index;
 	u64 bn;
@@ -1626,6 +1651,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 	if (error)
 		return error;
 	do {
+		dist++;
 		oleaf = (struct gfs2_leaf *)obh->b_data;
 		bn = be64_to_cpu(oleaf->lf_next);
 		if (!bn)
@@ -1643,6 +1669,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 		brelse(obh);
 		return -ENOSPC;
 	}
+	leaf->lf_dist = cpu_to_be32(dist);
 	oleaf->lf_next = cpu_to_be64(bh->b_blocknr);
 	brelse(bh);
 	brelse(obh);
@@ -1659,39 +1686,53 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
 
 /**
  * gfs2_dir_add - Add new filename into directory
- * @dip: The GFS2 inode
- * @filename: The new name
- * @inode: The inode number of the entry
- * @type: The type of the entry
+ * @inode: The directory inode
+ * @name: The new name
+ * @nip: The GFS2 inode to be linked in to the directory
+ * @da: The directory addition info
+ *
+ * If the call to gfs2_diradd_alloc_required resulted in there being
+ * no need to allocate any new directory blocks, then it will contain
+ * a pointer to the directory entry and the bh in which it resides. We
+ * can use that without having to repeat the search. If there was no
+ * free space, then we must now create more space.
  *
  * Returns: 0 on success, error code on failure
  */
 
 int gfs2_dir_add(struct inode *inode, const struct qstr *name,
-		 const struct gfs2_inode *nip)
+		 const struct gfs2_inode *nip, struct gfs2_diradd *da)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
-	struct buffer_head *bh;
-	struct gfs2_dirent *dent;
+	struct buffer_head *bh = da->bh;
+	struct gfs2_dirent *dent = da->dent;
+	struct timespec tv;
 	struct gfs2_leaf *leaf;
 	int error;
 
 	while(1) {
-		dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space,
-					  &bh);
+		if (da->bh == NULL) {
+			dent = gfs2_dirent_search(inode, name,
+						  gfs2_dirent_find_space, &bh);
+		}
 		if (dent) {
 			if (IS_ERR(dent))
 				return PTR_ERR(dent);
 			dent = gfs2_init_dirent(inode, dent, name, bh);
 			gfs2_inum_out(nip, dent);
 			dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode));
+			tv = CURRENT_TIME;
 			if (ip->i_diskflags & GFS2_DIF_EXHASH) {
 				leaf = (struct gfs2_leaf *)bh->b_data;
 				be16_add_cpu(&leaf->lf_entries, 1);
+				leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+				leaf->lf_sec = cpu_to_be64(tv.tv_sec);
 			}
+			da->dent = NULL;
+			da->bh = NULL;
 			brelse(bh);
 			ip->i_entries++;
-			ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
+			ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv;
 			if (S_ISDIR(nip->i_inode.i_mode))
 				inc_nlink(&ip->i_inode);
 			mark_inode_dirty(inode);
@@ -1742,6 +1783,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 	const struct qstr *name = &dentry->d_name;
 	struct gfs2_dirent *dent, *prev = NULL;
 	struct buffer_head *bh;
+	struct timespec tv = CURRENT_TIME;
 
 	/* Returns _either_ the entry (if its first in block) or the
 	   previous entry otherwise */
@@ -1767,13 +1809,15 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry)
 		if (!entries)
 			gfs2_consist_inode(dip);
 		leaf->lf_entries = cpu_to_be16(--entries);
+		leaf->lf_nsec = cpu_to_be32(tv.tv_nsec);
+		leaf->lf_sec = cpu_to_be64(tv.tv_sec);
 	}
 	brelse(bh);
 
 	if (!dip->i_entries)
 		gfs2_consist_inode(dip);
 	dip->i_entries--;
-	dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME;
+	dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv;
 	if (S_ISDIR(dentry->d_inode->i_mode))
 		drop_nlink(&dip->i_inode);
 	mark_inode_dirty(&dip->i_inode);
@@ -2017,22 +2061,36 @@ out:
  * gfs2_diradd_alloc_required - find if adding entry will require an allocation
  * @ip: the file being written to
  * @filname: the filename that's going to be added
+ * @da: The structure to return dir alloc info
  *
- * Returns: 1 if alloc required, 0 if not, -ve on error
+ * Returns: 0 if ok, -ve on error
  */
 
-int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name)
+int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name,
+			       struct gfs2_diradd *da)
 {
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	const unsigned int extra = sizeof(struct gfs2_dinode) - sizeof(struct gfs2_leaf);
 	struct gfs2_dirent *dent;
 	struct buffer_head *bh;
 
+	da->nr_blocks = 0;
+	da->bh = NULL;
+	da->dent = NULL;
+
 	dent = gfs2_dirent_search(inode, name, gfs2_dirent_find_space, &bh);
 	if (!dent) {
-		return 1;
+		da->nr_blocks = sdp->sd_max_dirres;
+		if (!(ip->i_diskflags & GFS2_DIF_EXHASH) &&
+		    (GFS2_DIRENT_SIZE(name->len) < extra))
+			da->nr_blocks = 1;
+		return 0;
 	}
 	if (IS_ERR(dent))
 		return PTR_ERR(dent);
-	brelse(bh);
+	da->bh = bh;
+	da->dent = dent;
 	return 0;
 }
 
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f03bbd1873f..126c65dda028 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -16,6 +16,14 @@
 struct inode;
 struct gfs2_inode;
 struct gfs2_inum;
+struct buffer_head;
+struct gfs2_dirent;
+
+struct gfs2_diradd {
+	unsigned nr_blocks;
+	struct gfs2_dirent *dent;
+	struct buffer_head *bh;
+};
 
 extern struct inode *gfs2_dir_search(struct inode *dir,
 				     const struct qstr *filename,
@@ -23,7 +31,13 @@ extern struct inode *gfs2_dir_search(struct inode *dir,
 extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
 			  const struct gfs2_inode *ip);
 extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
-			const struct gfs2_inode *ip);
+			const struct gfs2_inode *ip, struct gfs2_diradd *da);
+static inline void gfs2_dir_no_add(struct gfs2_diradd *da)
+{
+	if (da->bh)
+		brelse(da->bh);
+	da->bh = NULL;
+}
 extern int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry);
 extern int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 			 struct file_ra_state *f_ra);
@@ -33,7 +47,8 @@ extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
 extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
 
 extern int gfs2_diradd_alloc_required(struct inode *dir,
-				      const struct qstr *filename);
+				      const struct qstr *filename,
+				      struct gfs2_diradd *da);
 extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
 				   struct buffer_head **bhp);
 extern void gfs2_dir_hash_inval(struct gfs2_inode *ip);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index c8420f7e4db6..ca0be6c69a26 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1552,13 +1552,11 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
 	glock_hash_walk(thaw_glock, sdp);
 }
 
-static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
+static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
 {
-	int ret;
 	spin_lock(&gl->gl_spin);
-	ret = gfs2_dump_glock(seq, gl);
+	gfs2_dump_glock(seq, gl);
 	spin_unlock(&gl->gl_spin);
-	return ret;
 }
 
 static void dump_glock_func(struct gfs2_glock *gl)
@@ -1647,14 +1645,14 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
  * @seq: the seq_file struct
  * @gh: the glock holder
  *
- * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
+static void dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 {
 	struct task_struct *gh_owner = NULL;
 	char flags_buf[32];
 
+	rcu_read_lock();
 	if (gh->gh_owner_pid)
 		gh_owner = pid_task(gh->gh_owner_pid, PIDTYPE_PID);
 	gfs2_print_dbg(seq, " H: s:%s f:%s e:%d p:%ld [%s] %pS\n",
@@ -1664,7 +1662,7 @@ static int dump_holder(struct seq_file *seq, const struct gfs2_holder *gh)
 		       gh->gh_owner_pid ? (long)pid_nr(gh->gh_owner_pid) : -1,
 		       gh_owner ? gh_owner->comm : "(ended)",
 		       (void *)gh->gh_ip);
-	return 0;
+	rcu_read_unlock();
 }
 
 static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
@@ -1719,16 +1717,14 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl)
  * example. The field's are n = number (id of the object), f = flags,
  * t = type, s = state, r = refcount, e = error, p = pid.
  *
- * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	unsigned long long dtime;
 	const struct gfs2_holder *gh;
 	char gflags_buf[32];
-	int error = 0;
 
 	dtime = jiffies - gl->gl_demote_time;
 	dtime *= 1000000/HZ; /* demote time in uSec */
@@ -1745,15 +1741,11 @@ int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl)
 		  atomic_read(&gl->gl_revokes),
 		  (int)gl->gl_lockref.count, gl->gl_hold_time);
 
-	list_for_each_entry(gh, &gl->gl_holders, gh_list) {
-		error = dump_holder(seq, gh);
-		if (error)
-			goto out;
-	}
+	list_for_each_entry(gh, &gl->gl_holders, gh_list)
+		dump_holder(seq, gh);
+
 	if (gl->gl_state != LM_ST_UNLOCKED && glops->go_dump)
-		error = glops->go_dump(seq, gl);
-out:
-	return error;
+		glops->go_dump(seq, gl);
 }
 
 static int gfs2_glstats_seq_show(struct seq_file *seq, void *iter_ptr)
@@ -1951,7 +1943,8 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr)
 
 static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr)
 {
-	return dump_glock(seq, iter_ptr);
+	dump_glock(seq, iter_ptr);
+	return 0;
 }
 
 static void *gfs2_sbstats_seq_start(struct seq_file *seq, loff_t *pos)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 6647d77366ba..32572f71f027 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -199,7 +199,7 @@ extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
 			     struct gfs2_holder *gh);
 extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
-extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl);
 #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0)
 extern __printf(2, 3)
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index db908f697139..3bf0631b5d56 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -133,7 +133,8 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 
 static void rgrp_go_sync(struct gfs2_glock *gl)
 {
-	struct address_space *metamapping = gfs2_glock2aspace(gl);
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = &sdp->sd_aspace;
 	struct gfs2_rgrpd *rgd;
 	int error;
 
@@ -141,10 +142,10 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 		return;
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-	gfs2_log_flush(gl->gl_sbd, gl);
-	filemap_fdatawrite(metamapping);
-	error = filemap_fdatawait(metamapping);
-        mapping_set_error(metamapping, error);
+	gfs2_log_flush(sdp, gl);
+	filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+	error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
+	mapping_set_error(mapping, error);
 	gfs2_ail_empty_gl(gl);
 
 	spin_lock(&gl->gl_spin);
@@ -166,11 +167,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 
 static void rgrp_go_inval(struct gfs2_glock *gl, int flags)
 {
-	struct address_space *mapping = gfs2_glock2aspace(gl);
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct address_space *mapping = &sdp->sd_aspace;
 
 	WARN_ON_ONCE(!(flags & DIO_METADATA));
-	gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count));
-	truncate_inode_pages(mapping, 0);
+	gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count));
+	truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 
 	if (gl->gl_object) {
 		struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
@@ -192,8 +194,11 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
 	if (ip && !S_ISREG(ip->i_inode.i_mode))
 		ip = NULL;
-	if (ip && test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
-		unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+	if (ip) {
+		if (test_and_clear_bit(GIF_SW_PAGED, &ip->i_flags))
+			unmap_shared_mapping_range(ip->i_inode.i_mapping, 0, 0);
+		inode_dio_wait(&ip->i_inode);
+	}
 	if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
 		return;
 
@@ -410,6 +415,9 @@ static int inode_go_lock(struct gfs2_holder *gh)
 			return error;
 	}
 
+	if (gh->gh_state != LM_ST_DEFERRED)
+		inode_dio_wait(&ip->i_inode);
+
 	if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) &&
 	    (gl->gl_state == LM_ST_EXCLUSIVE) &&
 	    (gh->gh_state == LM_ST_EXCLUSIVE)) {
@@ -429,21 +437,19 @@ static int inode_go_lock(struct gfs2_holder *gh)
  * @seq: The iterator
  * @ip: the inode
  *
- * Returns: 0 on success, -ENOBUFS when we run out of space
  */
 
-static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 {
 	const struct gfs2_inode *ip = gl->gl_object;
 	if (ip == NULL)
-		return 0;
+		return;
 	gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
 		  (unsigned long long)ip->i_no_formal_ino,
 		  (unsigned long long)ip->i_no_addr,
 		  IF2DT(ip->i_inode.i_mode), ip->i_flags,
 		  (unsigned int)ip->i_diskflags,
 		  (unsigned long long)i_size_read(&ip->i_inode));
-	return 0;
 }
 
 /**
@@ -552,7 +558,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_unlock = gfs2_rgrp_go_unlock,
 	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
-	.go_flags = GLOF_ASPACE | GLOF_LVB,
+	.go_flags = GLOF_LVB,
 };
 
 const struct gfs2_glock_operations gfs2_trans_glops = {
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index ba1ea67f4eeb..cf0e34400f71 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -93,6 +93,7 @@ struct gfs2_rgrpd {
 	struct gfs2_rgrp_lvb *rd_rgl;
 	u32 rd_last_alloc;
 	u32 rd_flags;
+	u32 rd_extfail_pt;		/* extent failure point */
 #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
@@ -217,7 +218,7 @@ struct gfs2_glock_operations {
 	int (*go_demote_ok) (const struct gfs2_glock *gl);
 	int (*go_lock) (struct gfs2_holder *gh);
 	void (*go_unlock) (struct gfs2_holder *gh);
-	int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
+	void (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl);
 	void (*go_callback)(struct gfs2_glock *gl, bool remote);
 	const int go_type;
 	const unsigned long go_flags;
@@ -350,7 +351,15 @@ struct gfs2_glock {
 	atomic_t gl_ail_count;
 	atomic_t gl_revokes;
 	struct delayed_work gl_work;
-	struct work_struct gl_delete;
+	union {
+		/* For inode and iopen glocks only */
+		struct work_struct gl_delete;
+		/* For rgrp glocks only */
+		struct {
+			loff_t start;
+			loff_t end;
+		} gl_vm;
+	};
 	struct rcu_head gl_rcu;
 };
 
@@ -419,10 +428,13 @@ enum {
 };
 
 struct gfs2_quota_data {
+	struct hlist_bl_node qd_hlist;
 	struct list_head qd_list;
 	struct kqid qd_id;
+	struct gfs2_sbd *qd_sbd;
 	struct lockref qd_lockref;
 	struct list_head qd_lru;
+	unsigned qd_hash;
 
 	unsigned long qd_flags;		/* QDF_... */
 
@@ -441,6 +453,7 @@ struct gfs2_quota_data {
 
 	u64 qd_sync_gen;
 	unsigned long qd_last_warn;
+	struct rcu_head qd_rcu;
 };
 
 struct gfs2_trans {
@@ -720,13 +733,15 @@ struct gfs2_sbd {
 	spinlock_t sd_trunc_lock;
 
 	unsigned int sd_quota_slots;
-	unsigned int sd_quota_chunks;
-	unsigned char **sd_quota_bitmap;
+	unsigned long *sd_quota_bitmap;
+	spinlock_t sd_bitmap_lock;
 
 	u64 sd_quota_sync_gen;
 
 	/* Log stuff */
 
+	struct address_space sd_aspace;
+
 	spinlock_t sd_log_lock;
 
 	struct gfs2_trans *sd_log_tr;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 7119504159f1..890588c7fb33 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -149,7 +149,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 	ip = GFS2_I(inode);
 
 	if (!inode)
-		return ERR_PTR(-ENOBUFS);
+		return ERR_PTR(-ENOMEM);
 
 	if (inode->i_state & I_NEW) {
 		struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -469,14 +469,36 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip,
 	brelse(dibh);
 }
 
+/**
+ * gfs2_trans_da_blocks - Calculate number of blocks to link inode
+ * @dip: The directory we are linking into
+ * @da: The dir add information
+ * @nr_inodes: The number of inodes involved
+ *
+ * This calculate the number of blocks we need to reserve in a
+ * transaction to link @nr_inodes into a directory. In most cases
+ * @nr_inodes will be 2 (the directory plus the inode being linked in)
+ * but in case of rename, 4 may be required.
+ *
+ * Returns: Number of blocks
+ */
+
+static unsigned gfs2_trans_da_blks(const struct gfs2_inode *dip,
+				   const struct gfs2_diradd *da,
+				   unsigned nr_inodes)
+{
+	return da->nr_blocks + gfs2_rg_blocks(dip, da->nr_blocks) +
+	       (nr_inodes * RES_DINODE) + RES_QUOTA + RES_STATFS;
+}
+
 static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
-		       struct gfs2_inode *ip, int arq)
+		       struct gfs2_inode *ip, struct gfs2_diradd *da)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
-	struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
+	struct gfs2_alloc_parms ap = { .target = da->nr_blocks, };
 	int error;
 
-	if (arq) {
+	if (da->nr_blocks) {
 		error = gfs2_quota_lock_check(dip);
 		if (error)
 			goto fail_quota_locks;
@@ -485,10 +507,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 		if (error)
 			goto fail_quota_locks;
 
-		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 dip->i_rgd->rd_length +
-					 2 * RES_DINODE +
-					 RES_STATFS + RES_QUOTA, 0);
+		error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, da, 2), 0);
 		if (error)
 			goto fail_ipreserv;
 	} else {
@@ -497,7 +516,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
 			goto fail_quota_locks;
 	}
 
-	error = gfs2_dir_add(&dip->i_inode, name, ip);
+	error = gfs2_dir_add(&dip->i_inode, name, ip, da);
 	if (error)
 		goto fail_end_trans;
 
@@ -560,7 +579,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	struct dentry *d;
 	int error;
 	u32 aflags = 0;
-	int arq;
+	struct gfs2_diradd da = { .bh = NULL, };
 
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return -ENAMETOOLONG;
@@ -585,6 +604,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	error = PTR_ERR(inode);
 	if (!IS_ERR(inode)) {
 		d = d_splice_alias(inode, dentry);
+		error = PTR_ERR(d);
+		if (IS_ERR(d))
+			goto fail_gunlock;
 		error = 0;
 		if (file) {
 			if (S_ISREG(inode->i_mode)) {
@@ -602,7 +624,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		goto fail_gunlock;
 	}
 
-	arq = error = gfs2_diradd_alloc_required(dir, name);
+	error = gfs2_diradd_alloc_required(dir, name, &da);
 	if (error < 0)
 		goto fail_gunlock;
 
@@ -690,7 +712,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (error)
 		goto fail_gunlock3;
 
-	error = link_dinode(dip, name, ip, arq);
+	error = link_dinode(dip, name, ip, &da);
 	if (error)
 		goto fail_gunlock3;
 
@@ -719,6 +741,7 @@ fail_free_inode:
 	free_inode_nonrcu(inode);
 	inode = NULL;
 fail_gunlock:
+	gfs2_dir_no_add(&da);
 	gfs2_glock_dq_uninit(ghs);
 	if (inode && !IS_ERR(inode)) {
 		clear_nlink(inode);
@@ -779,6 +802,11 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	d = d_splice_alias(inode, dentry);
+	if (IS_ERR(d)) {
+		iput(inode);
+		gfs2_glock_dq_uninit(&gh);
+		return d;
+	}
 	if (file && S_ISREG(inode->i_mode))
 		error = finish_open(file, dentry, gfs2_open_common, opened);
 
@@ -817,7 +845,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder ghs[2];
 	struct buffer_head *dibh;
-	int alloc_required;
+	struct gfs2_diradd da = { .bh = NULL, };
 	int error;
 
 	if (S_ISDIR(inode->i_mode))
@@ -872,13 +900,12 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (ip->i_inode.i_nlink == (u32)-1)
 		goto out_gunlock;
 
-	alloc_required = error = gfs2_diradd_alloc_required(dir, &dentry->d_name);
+	error = gfs2_diradd_alloc_required(dir, &dentry->d_name, &da);
 	if (error < 0)
 		goto out_gunlock;
-	error = 0;
 
-	if (alloc_required) {
-		struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
+	if (da.nr_blocks) {
+		struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
 		error = gfs2_quota_lock_check(dip);
 		if (error)
 			goto out_gunlock;
@@ -887,10 +914,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 		if (error)
 			goto out_gunlock_q;
 
-		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 gfs2_rg_blocks(dip, sdp->sd_max_dirres) +
-					 2 * RES_DINODE + RES_STATFS +
-					 RES_QUOTA, 0);
+		error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(dip, &da, 2), 0);
 		if (error)
 			goto out_ipres;
 	} else {
@@ -903,7 +927,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (error)
 		goto out_end_trans;
 
-	error = gfs2_dir_add(dir, &dentry->d_name, ip);
+	error = gfs2_dir_add(dir, &dentry->d_name, ip, &da);
 	if (error)
 		goto out_brelse;
 
@@ -919,12 +943,13 @@ out_brelse:
 out_end_trans:
 	gfs2_trans_end(sdp);
 out_ipres:
-	if (alloc_required)
+	if (da.nr_blocks)
 		gfs2_inplace_release(dip);
 out_gunlock_q:
-	if (alloc_required)
+	if (da.nr_blocks)
 		gfs2_quota_unlock(dip);
 out_gunlock:
+	gfs2_dir_no_add(&da);
 	gfs2_glock_dq(ghs + 1);
 out_child:
 	gfs2_glock_dq(ghs);
@@ -1254,7 +1279,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	struct gfs2_rgrpd *nrgd;
 	unsigned int num_gh;
 	int dir_rename = 0;
-	int alloc_required = 0;
+	struct gfs2_diradd da = { .nr_blocks = 0, };
 	unsigned int x;
 	int error;
 
@@ -1388,14 +1413,14 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			goto out_gunlock;
 	}
 
-	if (nip == NULL)
-		alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name);
-	error = alloc_required;
-	if (error < 0)
-		goto out_gunlock;
+	if (nip == NULL) {
+		error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name, &da);
+		if (error)
+			goto out_gunlock;
+	}
 
-	if (alloc_required) {
-		struct gfs2_alloc_parms ap = { .target = sdp->sd_max_dirres, };
+	if (da.nr_blocks) {
+		struct gfs2_alloc_parms ap = { .target = da.nr_blocks, };
 		error = gfs2_quota_lock_check(ndip);
 		if (error)
 			goto out_gunlock;
@@ -1404,10 +1429,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 		if (error)
 			goto out_gunlock_q;
 
-		error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
-					 gfs2_rg_blocks(ndip, sdp->sd_max_dirres) +
-					 4 * RES_DINODE + 4 * RES_LEAF +
-					 RES_STATFS + RES_QUOTA + 4, 0);
+		error = gfs2_trans_begin(sdp, gfs2_trans_da_blks(ndip, &da, 4) +
+					 4 * RES_LEAF + 4, 0);
 		if (error)
 			goto out_ipreserv;
 	} else {
@@ -1441,19 +1464,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	if (error)
 		goto out_end_trans;
 
-	error = gfs2_dir_add(ndir, &ndentry->d_name, ip);
+	error = gfs2_dir_add(ndir, &ndentry->d_name, ip, &da);
 	if (error)
 		goto out_end_trans;
 
 out_end_trans:
 	gfs2_trans_end(sdp);
 out_ipreserv:
-	if (alloc_required)
+	if (da.nr_blocks)
 		gfs2_inplace_release(ndip);
 out_gunlock_q:
-	if (alloc_required)
+	if (da.nr_blocks)
 		gfs2_quota_unlock(ndip);
 out_gunlock:
+	gfs2_dir_no_add(&da);
 	while (x--) {
 		gfs2_glock_dq(ghs + x);
 		gfs2_holder_uninit(ghs + x);
@@ -1607,10 +1631,22 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
 		ogid = ngid = NO_GID_QUOTA_CHANGE;
 
-	error = gfs2_quota_lock(ip, nuid, ngid);
+	error = get_write_access(inode);
 	if (error)
 		return error;
 
+	error = gfs2_rs_alloc(ip);
+	if (error)
+		goto out;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		goto out;
+
+	error = gfs2_quota_lock(ip, nuid, ngid);
+	if (error)
+		goto out;
+
 	if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) ||
 	    !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) {
 		error = gfs2_quota_check(ip, nuid, ngid);
@@ -1637,6 +1673,8 @@ out_end_trans:
 	gfs2_trans_end(sdp);
 out_gunlock_q:
 	gfs2_quota_unlock(ip);
+out:
+	put_write_access(inode);
 	return error;
 }
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 610613fb65b5..9dcb9777a5f8 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -551,10 +551,10 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
 	struct buffer_head *bh = bd->bd_bh;
 	struct gfs2_glock *gl = bd->bd_gl;
 
-	gfs2_remove_from_ail(bd);
-	bd->bd_bh = NULL;
 	bh->b_private = NULL;
 	bd->bd_blkno = bh->b_blocknr;
+	gfs2_remove_from_ail(bd); /* drops ref on bh */
+	bd->bd_bh = NULL;
 	bd->bd_ops = &gfs2_revoke_lops;
 	sdp->sd_log_num_revoke++;
 	atomic_inc(&gl->gl_revokes);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 010b9fb9fec6..58f06400b7b8 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -83,6 +83,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
 	       bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
 	clear_bit(GBF_FULL, &bi->bi_flags);
 	rgd->rd_free_clone = rgd->rd_free;
+	rgd->rd_extfail_pt = rgd->rd_free;
 }
 
 /**
@@ -588,8 +589,12 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
 static void gfs2_meta_sync(struct gfs2_glock *gl)
 {
 	struct address_space *mapping = gfs2_glock2aspace(gl);
+	struct gfs2_sbd *sdp = gl->gl_sbd;
 	int error;
 
+	if (mapping == NULL)
+		mapping = &sdp->sd_aspace;
+
 	filemap_fdatawrite(mapping);
 	error = filemap_fdatawait(mapping);
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 0650db2541ef..c272e73063de 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -76,6 +76,7 @@ static int __init init_gfs2_fs(void)
 
 	gfs2_str2qstr(&gfs2_qdot, ".");
 	gfs2_str2qstr(&gfs2_qdotdot, "..");
+	gfs2_quota_hash_init();
 
 	error = gfs2_sys_init();
 	if (error)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 932415050540..c7f24690ed05 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -116,6 +116,9 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create)
 	unsigned long index;
 	unsigned int bufnum;
 
+	if (mapping == NULL)
+		mapping = &sdp->sd_aspace;
+
 	shift = PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift;
 	index = blkno >> shift;             /* convert block to page */
 	bufnum = blkno - (index << shift);  /* block buf index within page */
@@ -258,6 +261,7 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 	struct address_space *mapping = bh->b_page->mapping;
 	struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping);
 	struct gfs2_bufdata *bd = bh->b_private;
+	int was_pinned = 0;
 
 	if (test_clear_buffer_pinned(bh)) {
 		trace_gfs2_pin(bd, 0);
@@ -273,12 +277,16 @@ void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int
 			tr->tr_num_databuf_rm++;
 		}
 		tr->tr_touched = 1;
+		was_pinned = 1;
 		brelse(bh);
 	}
 	if (bd) {
 		spin_lock(&sdp->sd_ail_lock);
 		if (bd->bd_tr) {
 			gfs2_trans_add_revoke(sdp, bd);
+		} else if (was_pinned) {
+			bh->b_private = NULL;
+			kmem_cache_free(gfs2_bufdata_cachep, bd);
 		}
 		spin_unlock(&sdp->sd_ail_lock);
 	}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 82303b474958..1e712b566d76 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -36,6 +36,7 @@
 #include "log.h"
 #include "quota.h"
 #include "dir.h"
+#include "meta_io.h"
 #include "trace_gfs2.h"
 
 #define DO 0
@@ -62,6 +63,7 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp;
+	struct address_space *mapping;
 
 	sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL);
 	if (!sdp)
@@ -97,6 +99,18 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	init_waitqueue_head(&sdp->sd_quota_wait);
 	INIT_LIST_HEAD(&sdp->sd_trunc_list);
 	spin_lock_init(&sdp->sd_trunc_lock);
+	spin_lock_init(&sdp->sd_bitmap_lock);
+
+	mapping = &sdp->sd_aspace;
+
+	address_space_init_once(mapping);
+	mapping->a_ops = &gfs2_meta_aops;
+	mapping->host = sb->s_bdev->bd_inode;
+	mapping->flags = 0;
+	mapping_set_gfp_mask(mapping, GFP_NOFS);
+	mapping->private_data = NULL;
+	mapping->backing_dev_info = sb->s_bdi;
+	mapping->writeback_index = 0;
 
 	spin_lock_init(&sdp->sd_log_lock);
 	atomic_set(&sdp->sd_log_pinned, 0);
@@ -217,7 +231,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
 
 	page = alloc_page(GFP_NOFS);
 	if (unlikely(!page))
-		return -ENOBUFS;
+		return -ENOMEM;
 
 	ClearPageUptodate(page);
 	ClearPageDirty(page);
@@ -956,40 +970,6 @@ fail:
 	return error;
 }
 
-static int init_threads(struct gfs2_sbd *sdp, int undo)
-{
-	struct task_struct *p;
-	int error = 0;
-
-	if (undo)
-		goto fail_quotad;
-
-	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
-	if (IS_ERR(p)) {
-		error = PTR_ERR(p);
-		fs_err(sdp, "can't start logd thread: %d\n", error);
-		return error;
-	}
-	sdp->sd_logd_process = p;
-
-	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
-	if (IS_ERR(p)) {
-		error = PTR_ERR(p);
-		fs_err(sdp, "can't start quotad thread: %d\n", error);
-		goto fail;
-	}
-	sdp->sd_quotad_process = p;
-
-	return 0;
-
-
-fail_quotad:
-	kthread_stop(sdp->sd_quotad_process);
-fail:
-	kthread_stop(sdp->sd_logd_process);
-	return error;
-}
-
 static const match_table_t nolock_tokens = {
 	{ Opt_jid, "jid=%d\n", },
 	{ Opt_err, NULL },
@@ -1254,15 +1234,11 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 		goto fail_per_node;
 	}
 
-	error = init_threads(sdp, DO);
-	if (error)
-		goto fail_per_node;
-
 	if (!(sb->s_flags & MS_RDONLY)) {
 		error = gfs2_make_fs_rw(sdp);
 		if (error) {
 			fs_err(sdp, "can't make FS RW: %d\n", error);
-			goto fail_threads;
+			goto fail_per_node;
 		}
 	}
 
@@ -1270,8 +1246,6 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
 	gfs2_online_uevent(sdp);
 	return 0;
 
-fail_threads:
-	init_threads(sdp, UNDO);
 fail_per_node:
 	init_per_node(sdp, UNDO);
 fail_inodes:
@@ -1366,8 +1340,18 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 	if (IS_ERR(s))
 		goto error_bdev;
 
-	if (s->s_root)
+	if (s->s_root) {
+		/*
+		 * s_umount nests inside bd_mutex during
+		 * __invalidate_device().  blkdev_put() acquires
+		 * bd_mutex and can't be called under s_umount.  Drop
+		 * s_umount temporarily.  This is safe as we're
+		 * holding an active reference.
+		 */
+		up_write(&s->s_umount);
 		blkdev_put(bdev, mode);
+		down_write(&s->s_umount);
+	}
 
 	memset(&args, 0, sizeof(args));
 	args.ar_quota = GFS2_QUOTA_DEFAULT;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 98236d0df3ca..8bec0e3192dd 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -52,6 +52,11 @@
 #include <linux/dqblk_xfs.h>
 #include <linux/lockref.h>
 #include <linux/list_lru.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist_bl.h>
+#include <linux/bit_spinlock.h>
+#include <linux/jhash.h>
+#include <linux/vmalloc.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -67,16 +72,44 @@
 #include "inode.h"
 #include "util.h"
 
-struct gfs2_quota_change_host {
-	u64 qc_change;
-	u32 qc_flags; /* GFS2_QCF_... */
-	struct kqid qc_id;
-};
+#define GFS2_QD_HASH_SHIFT      12
+#define GFS2_QD_HASH_SIZE       (1 << GFS2_QD_HASH_SHIFT)
+#define GFS2_QD_HASH_MASK       (GFS2_QD_HASH_SIZE - 1)
 
-/* Lock order: qd_lock -> qd->lockref.lock -> lru lock */
+/* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */
+/*                     -> sd_bitmap_lock                              */
 static DEFINE_SPINLOCK(qd_lock);
 struct list_lru gfs2_qd_lru;
 
+static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE];
+
+static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp,
+				 const struct kqid qid)
+{
+	unsigned int h;
+
+	h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0);
+	h = jhash(&qid, sizeof(struct kqid), h);
+
+	return h & GFS2_QD_HASH_MASK;
+}
+
+static inline void spin_lock_bucket(unsigned int hash)
+{
+        hlist_bl_lock(&qd_hash_table[hash]);
+}
+
+static inline void spin_unlock_bucket(unsigned int hash)
+{
+        hlist_bl_unlock(&qd_hash_table[hash]);
+}
+
+static void gfs2_qd_dealloc(struct rcu_head *rcu)
+{
+	struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu);
+	kmem_cache_free(gfs2_quotad_cachep, qd);
+}
+
 static void gfs2_qd_dispose(struct list_head *list)
 {
 	struct gfs2_quota_data *qd;
@@ -93,6 +126,10 @@ static void gfs2_qd_dispose(struct list_head *list)
 		list_del(&qd->qd_list);
 		spin_unlock(&qd_lock);
 
+		spin_lock_bucket(qd->qd_hash);
+		hlist_bl_del_rcu(&qd->qd_hlist);
+		spin_unlock_bucket(qd->qd_hash);
+
 		gfs2_assert_warn(sdp, !qd->qd_change);
 		gfs2_assert_warn(sdp, !qd->qd_slot_count);
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -101,7 +138,7 @@ static void gfs2_qd_dispose(struct list_head *list)
 		atomic_dec(&sdp->sd_quota_count);
 
 		/* Delete it from the common reclaim list */
-		kmem_cache_free(gfs2_quotad_cachep, qd);
+		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
 	}
 }
 
@@ -171,83 +208,95 @@ static u64 qd2offset(struct gfs2_quota_data *qd)
 	return offset;
 }
 
-static int qd_alloc(struct gfs2_sbd *sdp, struct kqid qid,
-		    struct gfs2_quota_data **qdp)
+static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid)
 {
 	struct gfs2_quota_data *qd;
 	int error;
 
 	qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS);
 	if (!qd)
-		return -ENOMEM;
+		return NULL;
 
+	qd->qd_sbd = sdp;
 	qd->qd_lockref.count = 1;
 	spin_lock_init(&qd->qd_lockref.lock);
 	qd->qd_id = qid;
 	qd->qd_slot = -1;
 	INIT_LIST_HEAD(&qd->qd_lru);
+	qd->qd_hash = hash;
 
 	error = gfs2_glock_get(sdp, qd2index(qd),
 			      &gfs2_quota_glops, CREATE, &qd->qd_gl);
 	if (error)
 		goto fail;
 
-	*qdp = qd;
-
-	return 0;
+	return qd;
 
 fail:
 	kmem_cache_free(gfs2_quotad_cachep, qd);
-	return error;
+	return NULL;
 }
 
-static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
-		  struct gfs2_quota_data **qdp)
+static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
+						     const struct gfs2_sbd *sdp,
+						     struct kqid qid)
 {
-	struct gfs2_quota_data *qd = NULL, *new_qd = NULL;
-	int error, found;
-
-	*qdp = NULL;
+	struct gfs2_quota_data *qd;
+	struct hlist_bl_node *h;
 
-	for (;;) {
-		found = 0;
-		spin_lock(&qd_lock);
-		list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
-			if (qid_eq(qd->qd_id, qid) &&
-			    lockref_get_not_dead(&qd->qd_lockref)) {
-				list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
-				found = 1;
-				break;
-			}
+	hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) {
+		if (!qid_eq(qd->qd_id, qid))
+			continue;
+		if (qd->qd_sbd != sdp)
+			continue;
+		if (lockref_get_not_dead(&qd->qd_lockref)) {
+			list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+			return qd;
 		}
+	}
 
-		if (!found)
-			qd = NULL;
+	return NULL;
+}
 
-		if (!qd && new_qd) {
-			qd = new_qd;
-			list_add(&qd->qd_list, &sdp->sd_quota_list);
-			atomic_inc(&sdp->sd_quota_count);
-			new_qd = NULL;
-		}
 
-		spin_unlock(&qd_lock);
+static int qd_get(struct gfs2_sbd *sdp, struct kqid qid,
+		  struct gfs2_quota_data **qdp)
+{
+	struct gfs2_quota_data *qd, *new_qd;
+	unsigned int hash = gfs2_qd_hash(sdp, qid);
 
-		if (qd) {
-			if (new_qd) {
-				gfs2_glock_put(new_qd->qd_gl);
-				kmem_cache_free(gfs2_quotad_cachep, new_qd);
-			}
-			*qdp = qd;
-			return 0;
-		}
+	rcu_read_lock();
+	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
+	rcu_read_unlock();
 
-		error = qd_alloc(sdp, qid, &new_qd);
-		if (error)
-			return error;
+	if (qd)
+		return 0;
+
+	new_qd = qd_alloc(hash, sdp, qid);
+	if (!new_qd)
+		return -ENOMEM;
+
+	spin_lock(&qd_lock);
+	spin_lock_bucket(hash);
+	*qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid);
+	if (qd == NULL) {
+		*qdp = new_qd;
+		list_add(&new_qd->qd_list, &sdp->sd_quota_list);
+		hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]);
+		atomic_inc(&sdp->sd_quota_count);
 	}
+	spin_unlock_bucket(hash);
+	spin_unlock(&qd_lock);
+
+	if (qd) {
+		gfs2_glock_put(new_qd->qd_gl);
+		kmem_cache_free(gfs2_quotad_cachep, new_qd);
+	}
+
+	return 0;
 }
 
+
 static void qd_hold(struct gfs2_quota_data *qd)
 {
 	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
@@ -268,88 +317,48 @@ static void qd_put(struct gfs2_quota_data *qd)
 
 static int slot_get(struct gfs2_quota_data *qd)
 {
-	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
-	unsigned int c, o = 0, b;
-	unsigned char byte = 0;
+	struct gfs2_sbd *sdp = qd->qd_sbd;
+	unsigned int bit;
+	int error = 0;
 
-	spin_lock(&qd_lock);
+	spin_lock(&sdp->sd_bitmap_lock);
+	if (qd->qd_slot_count != 0)
+		goto out;
 
-	if (qd->qd_slot_count++) {
-		spin_unlock(&qd_lock);
-		return 0;
+	error = -ENOSPC;
+	bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots);
+	if (bit < sdp->sd_quota_slots) {
+		set_bit(bit, sdp->sd_quota_bitmap);
+		qd->qd_slot = bit;
+out:
+		qd->qd_slot_count++;
 	}
+	spin_unlock(&sdp->sd_bitmap_lock);
 
-	for (c = 0; c < sdp->sd_quota_chunks; c++)
-		for (o = 0; o < PAGE_SIZE; o++) {
-			byte = sdp->sd_quota_bitmap[c][o];
-			if (byte != 0xFF)
-				goto found;
-		}
-
-	goto fail;
-
-found:
-	for (b = 0; b < 8; b++)
-		if (!(byte & (1 << b)))
-			break;
-	qd->qd_slot = c * (8 * PAGE_SIZE) + o * 8 + b;
-
-	if (qd->qd_slot >= sdp->sd_quota_slots)
-		goto fail;
-
-	sdp->sd_quota_bitmap[c][o] |= 1 << b;
-
-	spin_unlock(&qd_lock);
-
-	return 0;
-
-fail:
-	qd->qd_slot_count--;
-	spin_unlock(&qd_lock);
-	return -ENOSPC;
+	return error;
 }
 
 static void slot_hold(struct gfs2_quota_data *qd)
 {
-	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_sbd *sdp = qd->qd_sbd;
 
-	spin_lock(&qd_lock);
+	spin_lock(&sdp->sd_bitmap_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	qd->qd_slot_count++;
-	spin_unlock(&qd_lock);
-}
-
-static void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
-			     unsigned int bit, int new_value)
-{
-	unsigned int c, o, b = bit;
-	int old_value;
-
-	c = b / (8 * PAGE_SIZE);
-	b %= 8 * PAGE_SIZE;
-	o = b / 8;
-	b %= 8;
-
-	old_value = (bitmap[c][o] & (1 << b));
-	gfs2_assert_withdraw(sdp, !old_value != !new_value);
-
-	if (new_value)
-		bitmap[c][o] |= 1 << b;
-	else
-		bitmap[c][o] &= ~(1 << b);
+	spin_unlock(&sdp->sd_bitmap_lock);
 }
 
 static void slot_put(struct gfs2_quota_data *qd)
 {
-	struct gfs2_sbd *sdp = qd->qd_gl->gl_sbd;
+	struct gfs2_sbd *sdp = qd->qd_sbd;
 
-	spin_lock(&qd_lock);
+	spin_lock(&sdp->sd_bitmap_lock);
 	gfs2_assert(sdp, qd->qd_slot_count);
 	if (!--qd->qd_slot_count) {
-		gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, qd->qd_slot, 0);
+		BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap));
 		qd->qd_slot = -1;
 	}
-	spin_unlock(&qd_lock);
+	spin_unlock(&sdp->sd_bitmap_lock);
 }
 
 static int bh_get(struct gfs2_quota_data *qd)
@@ -427,8 +436,7 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
 	list_move_tail(&qd->qd_list, &sdp->sd_quota_list);
 	set_bit(QDF_LOCKED, &qd->qd_flags);
 	qd->qd_change_sync = qd->qd_change;
-	gfs2_assert_warn(sdp, qd->qd_slot_count);
-	qd->qd_slot_count++;
+	slot_hold(qd);
 	return 1;
 }
 
@@ -1214,17 +1222,6 @@ int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid)
 	return error;
 }
 
-static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *buf)
-{
-	const struct gfs2_quota_change *str = buf;
-
-	qc->qc_change = be64_to_cpu(str->qc_change);
-	qc->qc_flags = be32_to_cpu(str->qc_flags);
-	qc->qc_id = make_kqid(&init_user_ns,
-			      (qc->qc_flags & GFS2_QCF_USER)?USRQUOTA:GRPQUOTA,
-			      be32_to_cpu(str->qc_id));
-}
-
 int gfs2_quota_init(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
@@ -1232,6 +1229,8 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 	unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
 	unsigned int x, slot = 0;
 	unsigned int found = 0;
+	unsigned int hash;
+	unsigned int bm_size;
 	u64 dblock;
 	u32 extlen = 0;
 	int error;
@@ -1240,23 +1239,20 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 		return -EIO;
 
 	sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
-	sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
-
+	bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long));
+	bm_size *= sizeof(unsigned long);
 	error = -ENOMEM;
-
-	sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
-				       sizeof(unsigned char *), GFP_NOFS);
+	sdp->sd_quota_bitmap = kmalloc(bm_size, GFP_NOFS|__GFP_NOWARN);
+	if (sdp->sd_quota_bitmap == NULL)
+		sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS, PAGE_KERNEL);
 	if (!sdp->sd_quota_bitmap)
 		return error;
 
-	for (x = 0; x < sdp->sd_quota_chunks; x++) {
-		sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
-		if (!sdp->sd_quota_bitmap[x])
-			goto fail;
-	}
+	memset(sdp->sd_quota_bitmap, 0, bm_size);
 
 	for (x = 0; x < blocks; x++) {
 		struct buffer_head *bh;
+		const struct gfs2_quota_change *qc;
 		unsigned int y;
 
 		if (!extlen) {
@@ -1274,34 +1270,42 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
 			goto fail;
 		}
 
+		qc = (const struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header));
 		for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots;
 		     y++, slot++) {
-			struct gfs2_quota_change_host qc;
 			struct gfs2_quota_data *qd;
-
-			gfs2_quota_change_in(&qc, bh->b_data +
-					  sizeof(struct gfs2_meta_header) +
-					  y * sizeof(struct gfs2_quota_change));
-			if (!qc.qc_change)
+			s64 qc_change = be64_to_cpu(qc->qc_change);
+			u32 qc_flags = be32_to_cpu(qc->qc_flags);
+			enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ?
+						USRQUOTA : GRPQUOTA;
+			struct kqid qc_id = make_kqid(&init_user_ns, qtype,
+						      be32_to_cpu(qc->qc_id));
+			qc++;
+			if (!qc_change)
 				continue;
 
-			error = qd_alloc(sdp, qc.qc_id, &qd);
-			if (error) {
+			hash = gfs2_qd_hash(sdp, qc_id);
+			qd = qd_alloc(hash, sdp, qc_id);
+			if (qd == NULL) {
 				brelse(bh);
 				goto fail;
 			}
 
 			set_bit(QDF_CHANGE, &qd->qd_flags);
-			qd->qd_change = qc.qc_change;
+			qd->qd_change = qc_change;
 			qd->qd_slot = slot;
 			qd->qd_slot_count = 1;
 
 			spin_lock(&qd_lock);
-			gfs2_icbit_munge(sdp, sdp->sd_quota_bitmap, slot, 1);
+			BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap));
 			list_add(&qd->qd_list, &sdp->sd_quota_list);
 			atomic_inc(&sdp->sd_quota_count);
 			spin_unlock(&qd_lock);
 
+			spin_lock_bucket(hash);
+			hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]);
+			spin_unlock_bucket(hash);
+
 			found++;
 		}
 
@@ -1324,44 +1328,28 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 {
 	struct list_head *head = &sdp->sd_quota_list;
 	struct gfs2_quota_data *qd;
-	unsigned int x;
 
 	spin_lock(&qd_lock);
 	while (!list_empty(head)) {
 		qd = list_entry(head->prev, struct gfs2_quota_data, qd_list);
 
-		/*
-		 * To be removed in due course... we should be able to
-		 * ensure that all refs to the qd have done by this point
-		 * so that this rather odd test is not required
-		 */
-		spin_lock(&qd->qd_lockref.lock);
-		if (qd->qd_lockref.count > 1 ||
-		    (qd->qd_lockref.count && !test_bit(QDF_CHANGE, &qd->qd_flags))) {
-			spin_unlock(&qd->qd_lockref.lock);
-			list_move(&qd->qd_list, head);
-			spin_unlock(&qd_lock);
-			schedule();
-			spin_lock(&qd_lock);
-			continue;
-		}
-		spin_unlock(&qd->qd_lockref.lock);
-
 		list_del(&qd->qd_list);
+
 		/* Also remove if this qd exists in the reclaim list */
 		list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
 		atomic_dec(&sdp->sd_quota_count);
 		spin_unlock(&qd_lock);
 
-		if (!qd->qd_lockref.count) {
-			gfs2_assert_warn(sdp, !qd->qd_change);
-			gfs2_assert_warn(sdp, !qd->qd_slot_count);
-		} else
-			gfs2_assert_warn(sdp, qd->qd_slot_count == 1);
+		spin_lock_bucket(qd->qd_hash);
+		hlist_bl_del_rcu(&qd->qd_hlist);
+		spin_unlock_bucket(qd->qd_hash);
+
+		gfs2_assert_warn(sdp, !qd->qd_change);
+		gfs2_assert_warn(sdp, !qd->qd_slot_count);
 		gfs2_assert_warn(sdp, !qd->qd_bh_count);
 
 		gfs2_glock_put(qd->qd_gl);
-		kmem_cache_free(gfs2_quotad_cachep, qd);
+		call_rcu(&qd->qd_rcu, gfs2_qd_dealloc);
 
 		spin_lock(&qd_lock);
 	}
@@ -1370,9 +1358,11 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
 	gfs2_assert_warn(sdp, !atomic_read(&sdp->sd_quota_count));
 
 	if (sdp->sd_quota_bitmap) {
-		for (x = 0; x < sdp->sd_quota_chunks; x++)
-			kfree(sdp->sd_quota_bitmap[x]);
-		kfree(sdp->sd_quota_bitmap);
+		if (is_vmalloc_addr(sdp->sd_quota_bitmap))
+			vfree(sdp->sd_quota_bitmap);
+		else
+			kfree(sdp->sd_quota_bitmap);
+		sdp->sd_quota_bitmap = NULL;
 	}
 }
 
@@ -1656,3 +1646,11 @@ const struct quotactl_ops gfs2_quotactl_ops = {
 	.get_dqblk	= gfs2_get_dqblk,
 	.set_dqblk	= gfs2_set_dqblk,
 };
+
+void __init gfs2_quota_hash_init(void)
+{
+	unsigned i;
+
+	for(i = 0; i < GFS2_QD_HASH_SIZE; i++)
+		INIT_HLIST_BL_HEAD(&qd_hash_table[i]);
+}
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 96e4f34a03b0..55d506eb3c4a 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -57,5 +57,6 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 extern const struct quotactl_ops gfs2_quotactl_ops;
 extern struct shrinker gfs2_qd_shrinker;
 extern struct list_lru gfs2_qd_lru;
+extern void __init gfs2_quota_hash_init(void);
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c8d6161bd682..a1da21349235 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -57,6 +57,11 @@
  * 3 = Used (metadata)
  */
 
+struct gfs2_extent {
+	struct gfs2_rbm rbm;
+	u32 len;
+};
+
 static const char valid_change[16] = {
 	        /* current */
 	/* n */ 0, 1, 1, 1,
@@ -65,8 +70,9 @@ static const char valid_change[16] = {
 	        1, 0, 0, 0
 };
 
-static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
-                         const struct gfs2_inode *ip, bool nowrap);
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+			 const struct gfs2_inode *ip, bool nowrap,
+			 const struct gfs2_alloc_parms *ap);
 
 
 /**
@@ -635,9 +641,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
 		/* return reserved blocks to the rgrp */
 		BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
 		rs->rs_rbm.rgd->rd_reserved -= rs->rs_free;
+		/* The rgrp extent failure point is likely not to increase;
+		   it will only do so if the freed blocks are somehow
+		   contiguous with a span of free blocks that follows. Still,
+		   it will force the number to be recalculated later. */
+		rgd->rd_extfail_pt += rs->rs_free;
 		rs->rs_free = 0;
 		clear_bit(GBF_FULL, &bi->bi_flags);
-		smp_mb__after_clear_bit();
 	}
 }
 
@@ -876,6 +886,7 @@ static int rgd_insert(struct gfs2_rgrpd *rgd)
 static int read_rindex_entry(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	const unsigned bsize = sdp->sd_sb.sb_bsize;
 	loff_t pos = sdp->sd_rgrps * sizeof(struct gfs2_rindex);
 	struct gfs2_rindex buf;
 	int error;
@@ -913,6 +924,8 @@ static int read_rindex_entry(struct gfs2_inode *ip)
 		goto fail;
 
 	rgd->rd_gl->gl_object = rgd;
+	rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
+	rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
 	rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
 	rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
 	if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -1126,6 +1139,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
 		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
 		rgd->rd_free_clone = rgd->rd_free;
+		/* max out the rgrp allocation failure point */
+		rgd->rd_extfail_pt = rgd->rd_free;
 	}
 	if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) {
 		rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd));
@@ -1184,7 +1199,7 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh)
 
 	if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb)
 		return 0;
-	return gfs2_rgrp_bh_get((struct gfs2_rgrpd *)gh->gh_gl->gl_object);
+	return gfs2_rgrp_bh_get(rgd);
 }
 
 /**
@@ -1455,7 +1470,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 	if (WARN_ON(gfs2_rbm_from_block(&rbm, goal)))
 		return;
 
-	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, extlen, ip, true);
+	ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true, ap);
 	if (ret == 0) {
 		rs->rs_rbm = rbm;
 		rs->rs_free = extlen;
@@ -1520,6 +1535,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
  * @rbm: The current position in the resource group
  * @ip: The inode for which we are searching for blocks
  * @minext: The minimum extent length
+ * @maxext: A pointer to the maximum extent structure
  *
  * This checks the current position in the rgrp to see whether there is
  * a reservation covering this block. If not then this function is a
@@ -1532,7 +1548,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
 
 static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
 					     const struct gfs2_inode *ip,
-					     u32 minext)
+					     u32 minext,
+					     struct gfs2_extent *maxext)
 {
 	u64 block = gfs2_rbm_to_block(rbm);
 	u32 extlen = 1;
@@ -1545,8 +1562,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
 	 */
 	if (minext) {
 		extlen = gfs2_free_extlen(rbm, minext);
-		nblock = block + extlen;
-		if (extlen < minext)
+		if (extlen <= maxext->len)
 			goto fail;
 	}
 
@@ -1555,9 +1571,17 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm,
 	 * and skip if parts of it are already reserved
 	 */
 	nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip);
-	if (nblock == block)
-		return 0;
+	if (nblock == block) {
+		if (!minext || extlen >= minext)
+			return 0;
+
+		if (extlen > maxext->len) {
+			maxext->len = extlen;
+			maxext->rbm = *rbm;
+		}
 fail:
+		nblock = block + extlen;
+	}
 	ret = gfs2_rbm_from_block(rbm, nblock);
 	if (ret < 0)
 		return ret;
@@ -1568,30 +1592,38 @@ fail:
  * gfs2_rbm_find - Look for blocks of a particular state
  * @rbm: Value/result starting position and final position
  * @state: The state which we want to find
- * @minext: The requested extent length (0 for a single block)
+ * @minext: Pointer to the requested extent length (NULL for a single block)
+ *          This is updated to be the actual reservation size.
  * @ip: If set, check for reservations
  * @nowrap: Stop looking at the end of the rgrp, rather than wrapping
  *          around until we've reached the starting point.
+ * @ap: the allocation parameters
  *
  * Side effects:
  * - If looking for free blocks, we set GBF_FULL on each bitmap which
  *   has no free blocks in it.
+ * - If looking for free blocks, we set rd_extfail_pt on each rgrp which
+ *   has come up short on a free block search.
  *
  * Returns: 0 on success, -ENOSPC if there is no block of the requested state
  */
 
-static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
-			 const struct gfs2_inode *ip, bool nowrap)
+static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
+			 const struct gfs2_inode *ip, bool nowrap,
+			 const struct gfs2_alloc_parms *ap)
 {
 	struct buffer_head *bh;
 	int initial_bii;
 	u32 initial_offset;
+	int first_bii = rbm->bii;
+	u32 first_offset = rbm->offset;
 	u32 offset;
 	u8 *buffer;
 	int n = 0;
 	int iters = rbm->rgd->rd_length;
 	int ret;
 	struct gfs2_bitmap *bi;
+	struct gfs2_extent maxext = { .rbm.rgd = rbm->rgd, };
 
 	/* If we are not starting at the beginning of a bitmap, then we
 	 * need to add one to the bitmap count to ensure that we search
@@ -1620,7 +1652,9 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 minext,
 			return 0;
 
 		initial_bii = rbm->bii;
-		ret = gfs2_reservation_check_and_update(rbm, ip, minext);
+		ret = gfs2_reservation_check_and_update(rbm, ip,
+							minext ? *minext : 0,
+							&maxext);
 		if (ret == 0)
 			return 0;
 		if (ret > 0) {
@@ -1655,6 +1689,24 @@ next_iter:
 			break;
 	}
 
+	if (minext == NULL || state != GFS2_BLKST_FREE)
+		return -ENOSPC;
+
+	/* If the extent was too small, and it's smaller than the smallest
+	   to have failed before, remember for future reference that it's
+	   useless to search this rgrp again for this amount or more. */
+	if ((first_offset == 0) && (first_bii == 0) &&
+	    (*minext < rbm->rgd->rd_extfail_pt))
+		rbm->rgd->rd_extfail_pt = *minext;
+
+	/* If the maximum extent we found is big enough to fulfill the
+	   minimum requirements, use it anyway. */
+	if (maxext.len) {
+		*rbm = maxext.rbm;
+		*minext = maxext.len;
+		return 0;
+	}
+
 	return -ENOSPC;
 }
 
@@ -1680,7 +1732,8 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 
 	while (1) {
 		down_write(&sdp->sd_log_flush_lock);
-		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, 0, NULL, true);
+		error = gfs2_rbm_find(&rbm, GFS2_BLKST_UNLINKED, NULL, NULL,
+				      true, NULL);
 		up_write(&sdp->sd_log_flush_lock);
 		if (error == -ENOSPC)
 			break;
@@ -1891,7 +1944,9 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
 		}
 
 		/* Skip unuseable resource groups */
-		if (rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
+		if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
+						 GFS2_RDF_ERROR)) ||
+		    (ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
 			goto skip_rgrp;
 
 		if (sdp->sd_args.ar_rgrplvb)
@@ -1911,15 +1966,16 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a
 			return 0;
 		}
 
-		/* Drop reservation, if we couldn't use reserved rgrp */
-		if (gfs2_rs_active(rs))
-			gfs2_rs_deltree(rs);
 check_rgrp:
 		/* Check for unlinked inodes which can be reclaimed */
 		if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK)
 			try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked,
 					ip->i_no_addr);
 skip_rgrp:
+		/* Drop reservation, if we couldn't use reserved rgrp */
+		if (gfs2_rs_active(rs))
+			gfs2_rs_deltree(rs);
+
 		/* Unlock rgrp if required */
 		if (!rg_locked)
 			gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2064,25 +2120,24 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
  *
  */
 
-int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 {
 	struct gfs2_rgrpd *rgd = gl->gl_object;
 	struct gfs2_blkreserv *trs;
 	const struct rb_node *n;
 
 	if (rgd == NULL)
-		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
+		return;
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n",
 		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
 		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
-		       rgd->rd_reserved);
+		       rgd->rd_reserved, rgd->rd_extfail_pt);
 	spin_lock(&rgd->rd_rsspin);
 	for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
 		trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
 		dump_rs(seq, trs);
 	}
 	spin_unlock(&rgd->rd_rsspin);
-	return 0;
 }
 
 static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
@@ -2184,18 +2239,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	int error;
 
 	gfs2_set_alloc_start(&rbm, ip, dinode);
-	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, ip, false);
+	error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false, NULL);
 
 	if (error == -ENOSPC) {
 		gfs2_set_alloc_start(&rbm, ip, dinode);
-		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, 0, NULL, false);
+		error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false,
+				      NULL);
 	}
 
 	/* Since all blocks are reserved in advance, this shouldn't happen */
 	if (error) {
-		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d\n",
+		fs_warn(sdp, "inum=%llu error=%d, nblocks=%u, full=%d fail_pt=%d\n",
 			(unsigned long long)ip->i_no_addr, error, *nblocks,
-			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags));
+			test_bit(GBF_FULL, &rbm.rgd->rd_bits->bi_flags),
+			rbm.rgd->rd_extfail_pt);
 		goto rgrp_error;
 	}
 
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3a10d2ffbbe7..463ab2e95d1c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -68,7 +68,7 @@ extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
 extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
 extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
 extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
-extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
+extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
 extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 				   struct buffer_head *bh,
 				   const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 35da5b19c0de..60f60f6181f3 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -369,6 +369,33 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
 	return 0;
 }
 
+static int init_threads(struct gfs2_sbd *sdp)
+{
+	struct task_struct *p;
+	int error = 0;
+
+	p = kthread_run(gfs2_logd, sdp, "gfs2_logd");
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
+		fs_err(sdp, "can't start logd thread: %d\n", error);
+		return error;
+	}
+	sdp->sd_logd_process = p;
+
+	p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad");
+	if (IS_ERR(p)) {
+		error = PTR_ERR(p);
+		fs_err(sdp, "can't start quotad thread: %d\n", error);
+		goto fail;
+	}
+	sdp->sd_quotad_process = p;
+	return 0;
+
+fail:
+	kthread_stop(sdp->sd_logd_process);
+	return error;
+}
+
 /**
  * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one
  * @sdp: the filesystem
@@ -384,10 +411,14 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 	struct gfs2_log_header_host head;
 	int error;
 
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
+	error = init_threads(sdp);
 	if (error)
 		return error;
 
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
+	if (error)
+		goto fail_threads;
+
 	j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
 
 	error = gfs2_find_jhead(sdp->sd_jdesc, &head);
@@ -417,7 +448,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 fail:
 	t_gh.gh_flags |= GL_NOCACHE;
 	gfs2_glock_dq_uninit(&t_gh);
-
+fail_threads:
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
 	return error;
 }
 
@@ -800,6 +833,9 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	struct gfs2_holder t_gh;
 	int error;
 
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+
 	flush_workqueue(gfs2_delete_workqueue);
 	gfs2_quota_sync(sdp->sd_vfs, 0);
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -857,9 +893,6 @@ restart:
 	}
 	spin_unlock(&sdp->sd_jindex_spin);
 
-	kthread_stop(sdp->sd_quotad_process);
-	kthread_stop(sdp->sd_logd_process);
-
 	if (!(sb->s_flags & MS_RDONLY)) {
 		error = gfs2_make_fs_ro(sdp);
 		if (error)
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b51a6079108d..e9a97a0d4314 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -24,13 +24,6 @@ struct hfsplus_wd {
 	u16 embed_count;
 };
 
-static void hfsplus_end_io_sync(struct bio *bio, int err)
-{
-	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	complete(bio->bi_private);
-}
-
 /*
  * hfsplus_submit_bio - Perfrom block I/O
  * @sb: super block of volume for I/O
@@ -53,7 +46,6 @@ static void hfsplus_end_io_sync(struct bio *bio, int err)
 int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
 		void *buf, void **data, int rw)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
 	struct bio *bio;
 	int ret = 0;
 	u64 io_size;
@@ -73,8 +65,6 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
 	bio = bio_alloc(GFP_NOIO, 1);
 	bio->bi_sector = sector;
 	bio->bi_bdev = sb->s_bdev;
-	bio->bi_end_io = hfsplus_end_io_sync;
-	bio->bi_private = &wait;
 
 	if (!(rw & WRITE) && data)
 		*data = (u8 *)buf + offset;
@@ -93,12 +83,7 @@ int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
 		buf = (u8 *)buf + len;
 	}
 
-	submit_bio(rw, bio);
-	wait_for_completion(&wait);
-
-	if (!bio_flagged(bio, BIO_UPTODATE))
-		ret = -EIO;
-
+	ret = submit_bio_wait(rw, bio);
 out:
 	bio_put(bio);
 	return ret < 0 ? ret : 0;
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 52032647dd4a..5fa344afb49a 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -702,7 +702,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	read_lock(&journal->j_state_lock);
 #ifdef CONFIG_JBD2_DEBUG
 	if (!tid_geq(journal->j_commit_request, tid)) {
-		printk(KERN_EMERG
+		printk(KERN_ERR
 		       "%s: error: j_commit_request=%d, tid=%d\n",
 		       __func__, journal->j_commit_request, tid);
 	}
@@ -718,10 +718,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 	}
 	read_unlock(&journal->j_state_lock);
 
-	if (unlikely(is_journal_aborted(journal))) {
-		printk(KERN_EMERG "journal commit I/O error\n");
+	if (unlikely(is_journal_aborted(journal)))
 		err = -EIO;
-	}
 	return err;
 }
 
@@ -1527,13 +1525,13 @@ static int journal_get_superblock(journal_t *journal)
 	if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
 	    JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
 		/* Can't have checksum v1 and v2 on at the same time! */
-		printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 "
+		printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 "
 		       "at the same time!\n");
 		goto out;
 	}
 
 	if (!jbd2_verify_csum_type(journal, sb)) {
-		printk(KERN_ERR "JBD: Unknown checksum type\n");
+		printk(KERN_ERR "JBD2: Unknown checksum type\n");
 		goto out;
 	}
 
@@ -1541,7 +1539,7 @@ static int journal_get_superblock(journal_t *journal)
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
 		journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
 		if (IS_ERR(journal->j_chksum_driver)) {
-			printk(KERN_ERR "JBD: Cannot load crc32c driver.\n");
+			printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
 			err = PTR_ERR(journal->j_chksum_driver);
 			journal->j_chksum_driver = NULL;
 			goto out;
@@ -1550,7 +1548,7 @@ static int journal_get_superblock(journal_t *journal)
 
 	/* Check superblock checksum */
 	if (!jbd2_superblock_csum_verify(journal, sb)) {
-		printk(KERN_ERR "JBD: journal checksum error\n");
+		printk(KERN_ERR "JBD2: journal checksum error\n");
 		goto out;
 	}
 
@@ -1836,7 +1834,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
 			journal->j_chksum_driver = crypto_alloc_shash("crc32c",
 								      0, 0);
 			if (IS_ERR(journal->j_chksum_driver)) {
-				printk(KERN_ERR "JBD: Cannot load crc32c "
+				printk(KERN_ERR "JBD2: Cannot load crc32c "
 				       "driver.\n");
 				journal->j_chksum_driver = NULL;
 				return 0;
@@ -2645,7 +2643,7 @@ static void __exit journal_exit(void)
 #ifdef CONFIG_JBD2_DEBUG
 	int n = atomic_read(&nr_journal_heads);
 	if (n)
-		printk(KERN_EMERG "JBD2: leaked %d journal_heads!\n", n);
+		printk(KERN_ERR "JBD2: leaked %d journal_heads!\n", n);
 #endif
 	jbd2_remove_jbd_stats_proc_entry();
 	jbd2_journal_destroy_caches();
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 3929c50428b1..3b6bb19d60b1 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -594,7 +594,7 @@ static int do_one_pass(journal_t *journal,
 						be32_to_cpu(tmp->h_sequence))) {
 						brelse(obh);
 						success = -EIO;
-						printk(KERN_ERR "JBD: Invalid "
+						printk(KERN_ERR "JBD2: Invalid "
 						       "checksum recovering "
 						       "block %llu in log\n",
 						       blocknr);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 7aa9a32573bb..8360674c85bc 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -932,7 +932,7 @@ repeat:
 					jbd2_alloc(jh2bh(jh)->b_size,
 							 GFP_NOFS);
 				if (!frozen_buffer) {
-					printk(KERN_EMERG
+					printk(KERN_ERR
 					       "%s: OOM for frozen_buffer\n",
 					       __func__);
 					JBUFFER_TRACE(jh, "oom!");
@@ -1166,7 +1166,7 @@ repeat:
 	if (!jh->b_committed_data) {
 		committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS);
 		if (!committed_data) {
-			printk(KERN_EMERG "%s: No memory for committed data\n",
+			printk(KERN_ERR "%s: No memory for committed data\n",
 				__func__);
 			err = -ENOMEM;
 			goto out;
@@ -1290,7 +1290,10 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 		 * once a transaction -bzzz
 		 */
 		jh->b_modified = 1;
-		J_ASSERT_JH(jh, handle->h_buffer_credits > 0);
+		if (handle->h_buffer_credits <= 0) {
+			ret = -ENOSPC;
+			goto out_unlock_bh;
+		}
 		handle->h_buffer_credits--;
 	}
 
@@ -1305,7 +1308,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 		JBUFFER_TRACE(jh, "fastpath");
 		if (unlikely(jh->b_transaction !=
 			     journal->j_running_transaction)) {
-			printk(KERN_EMERG "JBD: %s: "
+			printk(KERN_ERR "JBD2: %s: "
 			       "jh->b_transaction (%llu, %p, %u) != "
 			       "journal->j_running_transaction (%p, %u)",
 			       journal->j_devname,
@@ -1332,7 +1335,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 		JBUFFER_TRACE(jh, "already on other transaction");
 		if (unlikely(jh->b_transaction !=
 			     journal->j_committing_transaction)) {
-			printk(KERN_EMERG "JBD: %s: "
+			printk(KERN_ERR "JBD2: %s: "
 			       "jh->b_transaction (%llu, %p, %u) != "
 			       "journal->j_committing_transaction (%p, %u)",
 			       journal->j_devname,
@@ -1345,7 +1348,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
 			ret = -EINVAL;
 		}
 		if (unlikely(jh->b_next_transaction != transaction)) {
-			printk(KERN_EMERG "JBD: %s: "
+			printk(KERN_ERR "JBD2: %s: "
 			       "jh->b_next_transaction (%llu, %p, %u) != "
 			       "transaction (%p, %u)",
 			       journal->j_devname,
@@ -1373,7 +1376,6 @@ out_unlock_bh:
 	jbd2_journal_put_journal_head(jh);
 out:
 	JBUFFER_TRACE(jh, "exit");
-	WARN_ON(ret);	/* All errors are bugs, so dump the stack */
 	return ret;
 }
 
diff --git a/fs/kernfs/Makefile b/fs/kernfs/Makefile
new file mode 100644
index 000000000000..674337c76673
--- /dev/null
+++ b/fs/kernfs/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the kernfs pseudo filesystem
+#
+
+obj-y		:= mount.o inode.o dir.o file.o symlink.o
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
new file mode 100644
index 000000000000..5104cf5d25c5
--- /dev/null
+++ b/fs/kernfs/dir.c
@@ -0,0 +1,1073 @@
+/*
+ * fs/kernfs/dir.c - kernfs directory implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/idr.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include <linux/hash.h>
+
+#include "kernfs-internal.h"
+
+DEFINE_MUTEX(kernfs_mutex);
+
+#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
+
+/**
+ *	kernfs_name_hash
+ *	@name: Null terminated string to hash
+ *	@ns:   Namespace tag to hash
+ *
+ *	Returns 31 bit hash of ns + name (so it fits in an off_t )
+ */
+static unsigned int kernfs_name_hash(const char *name, const void *ns)
+{
+	unsigned long hash = init_name_hash();
+	unsigned int len = strlen(name);
+	while (len--)
+		hash = partial_name_hash(*name++, hash);
+	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
+	hash &= 0x7fffffffU;
+	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
+	if (hash < 1)
+		hash += 2;
+	if (hash >= INT_MAX)
+		hash = INT_MAX - 1;
+	return hash;
+}
+
+static int kernfs_name_compare(unsigned int hash, const char *name,
+			       const void *ns, const struct kernfs_node *kn)
+{
+	if (hash != kn->hash)
+		return hash - kn->hash;
+	if (ns != kn->ns)
+		return ns - kn->ns;
+	return strcmp(name, kn->name);
+}
+
+static int kernfs_sd_compare(const struct kernfs_node *left,
+			     const struct kernfs_node *right)
+{
+	return kernfs_name_compare(left->hash, left->name, left->ns, right);
+}
+
+/**
+ *	kernfs_link_sibling - link kernfs_node into sibling rbtree
+ *	@kn: kernfs_node of interest
+ *
+ *	Link @kn into its sibling rbtree which starts from
+ *	@kn->parent->dir.children.
+ *
+ *	Locking:
+ *	mutex_lock(kernfs_mutex)
+ *
+ *	RETURNS:
+ *	0 on susccess -EEXIST on failure.
+ */
+static int kernfs_link_sibling(struct kernfs_node *kn)
+{
+	struct rb_node **node = &kn->parent->dir.children.rb_node;
+	struct rb_node *parent = NULL;
+
+	if (kernfs_type(kn) == KERNFS_DIR)
+		kn->parent->dir.subdirs++;
+
+	while (*node) {
+		struct kernfs_node *pos;
+		int result;
+
+		pos = rb_to_kn(*node);
+		parent = *node;
+		result = kernfs_sd_compare(kn, pos);
+		if (result < 0)
+			node = &pos->rb.rb_left;
+		else if (result > 0)
+			node = &pos->rb.rb_right;
+		else
+			return -EEXIST;
+	}
+	/* add new node and rebalance the tree */
+	rb_link_node(&kn->rb, parent, node);
+	rb_insert_color(&kn->rb, &kn->parent->dir.children);
+	return 0;
+}
+
+/**
+ *	kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree
+ *	@kn: kernfs_node of interest
+ *
+ *	Unlink @kn from its sibling rbtree which starts from
+ *	kn->parent->dir.children.
+ *
+ *	Locking:
+ *	mutex_lock(kernfs_mutex)
+ */
+static void kernfs_unlink_sibling(struct kernfs_node *kn)
+{
+	if (kernfs_type(kn) == KERNFS_DIR)
+		kn->parent->dir.subdirs--;
+
+	rb_erase(&kn->rb, &kn->parent->dir.children);
+}
+
+/**
+ *	kernfs_get_active - get an active reference to kernfs_node
+ *	@kn: kernfs_node to get an active reference to
+ *
+ *	Get an active reference of @kn.  This function is noop if @kn
+ *	is NULL.
+ *
+ *	RETURNS:
+ *	Pointer to @kn on success, NULL on failure.
+ */
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn)
+{
+	if (unlikely(!kn))
+		return NULL;
+
+	if (!atomic_inc_unless_negative(&kn->active))
+		return NULL;
+
+	if (kn->flags & KERNFS_LOCKDEP)
+		rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_);
+	return kn;
+}
+
+/**
+ *	kernfs_put_active - put an active reference to kernfs_node
+ *	@kn: kernfs_node to put an active reference to
+ *
+ *	Put an active reference to @kn.  This function is noop if @kn
+ *	is NULL.
+ */
+void kernfs_put_active(struct kernfs_node *kn)
+{
+	int v;
+
+	if (unlikely(!kn))
+		return;
+
+	if (kn->flags & KERNFS_LOCKDEP)
+		rwsem_release(&kn->dep_map, 1, _RET_IP_);
+	v = atomic_dec_return(&kn->active);
+	if (likely(v != KN_DEACTIVATED_BIAS))
+		return;
+
+	/*
+	 * atomic_dec_return() is a mb(), we'll always see the updated
+	 * kn->u.completion.
+	 */
+	complete(kn->u.completion);
+}
+
+/**
+ *	kernfs_deactivate - deactivate kernfs_node
+ *	@kn: kernfs_node to deactivate
+ *
+ *	Deny new active references and drain existing ones.
+ */
+static void kernfs_deactivate(struct kernfs_node *kn)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	int v;
+
+	BUG_ON(!(kn->flags & KERNFS_REMOVED));
+
+	if (!(kernfs_type(kn) & KERNFS_ACTIVE_REF))
+		return;
+
+	kn->u.completion = (void *)&wait;
+
+	rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_);
+	/* atomic_add_return() is a mb(), put_active() will always see
+	 * the updated kn->u.completion.
+	 */
+	v = atomic_add_return(KN_DEACTIVATED_BIAS, &kn->active);
+
+	if (v != KN_DEACTIVATED_BIAS) {
+		lock_contended(&kn->dep_map, _RET_IP_);
+		wait_for_completion(&wait);
+	}
+
+	lock_acquired(&kn->dep_map, _RET_IP_);
+	rwsem_release(&kn->dep_map, 1, _RET_IP_);
+}
+
+/**
+ * kernfs_get - get a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ */
+void kernfs_get(struct kernfs_node *kn)
+{
+	if (kn) {
+		WARN_ON(!atomic_read(&kn->count));
+		atomic_inc(&kn->count);
+	}
+}
+EXPORT_SYMBOL_GPL(kernfs_get);
+
+/**
+ * kernfs_put - put a reference count on a kernfs_node
+ * @kn: the target kernfs_node
+ *
+ * Put a reference count of @kn and destroy it if it reached zero.
+ */
+void kernfs_put(struct kernfs_node *kn)
+{
+	struct kernfs_node *parent;
+	struct kernfs_root *root;
+
+	if (!kn || !atomic_dec_and_test(&kn->count))
+		return;
+	root = kernfs_root(kn);
+ repeat:
+	/* Moving/renaming is always done while holding reference.
+	 * kn->parent won't change beneath us.
+	 */
+	parent = kn->parent;
+
+	WARN(!(kn->flags & KERNFS_REMOVED), "kernfs: free using entry: %s/%s\n",
+	     parent ? parent->name : "", kn->name);
+
+	if (kernfs_type(kn) == KERNFS_LINK)
+		kernfs_put(kn->symlink.target_kn);
+	if (!(kn->flags & KERNFS_STATIC_NAME))
+		kfree(kn->name);
+	if (kn->iattr) {
+		if (kn->iattr->ia_secdata)
+			security_release_secctx(kn->iattr->ia_secdata,
+						kn->iattr->ia_secdata_len);
+		simple_xattrs_free(&kn->iattr->xattrs);
+	}
+	kfree(kn->iattr);
+	ida_simple_remove(&root->ino_ida, kn->ino);
+	kmem_cache_free(kernfs_node_cache, kn);
+
+	kn = parent;
+	if (kn) {
+		if (atomic_dec_and_test(&kn->count))
+			goto repeat;
+	} else {
+		/* just released the root kn, free @root too */
+		ida_destroy(&root->ino_ida);
+		kfree(root);
+	}
+}
+EXPORT_SYMBOL_GPL(kernfs_put);
+
+static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct kernfs_node *kn;
+
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	/* Always perform fresh lookup for negatives */
+	if (!dentry->d_inode)
+		goto out_bad_unlocked;
+
+	kn = dentry->d_fsdata;
+	mutex_lock(&kernfs_mutex);
+
+	/* The kernfs node has been deleted */
+	if (kn->flags & KERNFS_REMOVED)
+		goto out_bad;
+
+	/* The kernfs node has been moved? */
+	if (dentry->d_parent->d_fsdata != kn->parent)
+		goto out_bad;
+
+	/* The kernfs node has been renamed */
+	if (strcmp(dentry->d_name.name, kn->name) != 0)
+		goto out_bad;
+
+	/* The kernfs node has been moved to a different namespace */
+	if (kn->parent && kernfs_ns_enabled(kn->parent) &&
+	    kernfs_info(dentry->d_sb)->ns != kn->ns)
+		goto out_bad;
+
+	mutex_unlock(&kernfs_mutex);
+out_valid:
+	return 1;
+out_bad:
+	mutex_unlock(&kernfs_mutex);
+out_bad_unlocked:
+	/*
+	 * @dentry doesn't match the underlying kernfs node, drop the
+	 * dentry and force lookup.  If we have submounts we must allow the
+	 * vfs caches to lie about the state of the filesystem to prevent
+	 * leaks and other nasty things, so use check_submounts_and_drop()
+	 * instead of d_drop().
+	 */
+	if (check_submounts_and_drop(dentry) != 0)
+		goto out_valid;
+
+	return 0;
+}
+
+static void kernfs_dop_release(struct dentry *dentry)
+{
+	kernfs_put(dentry->d_fsdata);
+}
+
+const struct dentry_operations kernfs_dops = {
+	.d_revalidate	= kernfs_dop_revalidate,
+	.d_release	= kernfs_dop_release,
+};
+
+static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
+					     const char *name, umode_t mode,
+					     unsigned flags)
+{
+	char *dup_name = NULL;
+	struct kernfs_node *kn;
+	int ret;
+
+	if (!(flags & KERNFS_STATIC_NAME)) {
+		name = dup_name = kstrdup(name, GFP_KERNEL);
+		if (!name)
+			return NULL;
+	}
+
+	kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
+	if (!kn)
+		goto err_out1;
+
+	ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto err_out2;
+	kn->ino = ret;
+
+	atomic_set(&kn->count, 1);
+	atomic_set(&kn->active, 0);
+
+	kn->name = name;
+	kn->mode = mode;
+	kn->flags = flags | KERNFS_REMOVED;
+
+	return kn;
+
+ err_out2:
+	kmem_cache_free(kernfs_node_cache, kn);
+ err_out1:
+	kfree(dup_name);
+	return NULL;
+}
+
+struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
+				    const char *name, umode_t mode,
+				    unsigned flags)
+{
+	struct kernfs_node *kn;
+
+	kn = __kernfs_new_node(kernfs_root(parent), name, mode, flags);
+	if (kn) {
+		kernfs_get(parent);
+		kn->parent = parent;
+	}
+	return kn;
+}
+
+/**
+ *	kernfs_addrm_start - prepare for kernfs_node add/remove
+ *	@acxt: pointer to kernfs_addrm_cxt to be used
+ *
+ *	This function is called when the caller is about to add or remove
+ *	kernfs_node.  This function acquires kernfs_mutex.  @acxt is used
+ *	to keep and pass context to other addrm functions.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).  kernfs_mutex is locked on
+ *	return.
+ */
+void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt)
+	__acquires(kernfs_mutex)
+{
+	memset(acxt, 0, sizeof(*acxt));
+
+	mutex_lock(&kernfs_mutex);
+}
+
+/**
+ *	kernfs_add_one - add kernfs_node to parent without warning
+ *	@acxt: addrm context to use
+ *	@kn: kernfs_node to be added
+ *
+ *	The caller must already have initialized @kn->parent.  This
+ *	function increments nlink of the parent's inode if @kn is a
+ *	directory and link into the children list of the parent.
+ *
+ *	This function should be called between calls to
+ *	kernfs_addrm_start() and kernfs_addrm_finish() and should be passed
+ *	the same @acxt as passed to kernfs_addrm_start().
+ *
+ *	LOCKING:
+ *	Determined by kernfs_addrm_start().
+ *
+ *	RETURNS:
+ *	0 on success, -EEXIST if entry with the given name already
+ *	exists.
+ */
+int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn)
+{
+	struct kernfs_node *parent = kn->parent;
+	bool has_ns = kernfs_ns_enabled(parent);
+	struct kernfs_iattrs *ps_iattr;
+	int ret;
+
+	if (has_ns != (bool)kn->ns) {
+		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+		     has_ns ? "required" : "invalid", parent->name, kn->name);
+		return -EINVAL;
+	}
+
+	if (kernfs_type(parent) != KERNFS_DIR)
+		return -EINVAL;
+
+	if (parent->flags & KERNFS_REMOVED)
+		return -ENOENT;
+
+	kn->hash = kernfs_name_hash(kn->name, kn->ns);
+
+	ret = kernfs_link_sibling(kn);
+	if (ret)
+		return ret;
+
+	/* Update timestamps on the parent */
+	ps_iattr = parent->iattr;
+	if (ps_iattr) {
+		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
+		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
+	}
+
+	/* Mark the entry added into directory tree */
+	kn->flags &= ~KERNFS_REMOVED;
+
+	return 0;
+}
+
+/**
+ *	kernfs_remove_one - remove kernfs_node from parent
+ *	@acxt: addrm context to use
+ *	@kn: kernfs_node to be removed
+ *
+ *	Mark @kn removed and drop nlink of parent inode if @kn is a
+ *	directory.  @kn is unlinked from the children list.
+ *
+ *	This function should be called between calls to
+ *	kernfs_addrm_start() and kernfs_addrm_finish() and should be
+ *	passed the same @acxt as passed to kernfs_addrm_start().
+ *
+ *	LOCKING:
+ *	Determined by kernfs_addrm_start().
+ */
+static void kernfs_remove_one(struct kernfs_addrm_cxt *acxt,
+			      struct kernfs_node *kn)
+{
+	struct kernfs_iattrs *ps_iattr;
+
+	/*
+	 * Removal can be called multiple times on the same node.  Only the
+	 * first invocation is effective and puts the base ref.
+	 */
+	if (kn->flags & KERNFS_REMOVED)
+		return;
+
+	if (kn->parent) {
+		kernfs_unlink_sibling(kn);
+
+		/* Update timestamps on the parent */
+		ps_iattr = kn->parent->iattr;
+		if (ps_iattr) {
+			ps_iattr->ia_iattr.ia_ctime = CURRENT_TIME;
+			ps_iattr->ia_iattr.ia_mtime = CURRENT_TIME;
+		}
+	}
+
+	kn->flags |= KERNFS_REMOVED;
+	kn->u.removed_list = acxt->removed;
+	acxt->removed = kn;
+}
+
+/**
+ *	kernfs_addrm_finish - finish up kernfs_node add/remove
+ *	@acxt: addrm context to finish up
+ *
+ *	Finish up kernfs_node add/remove.  Resources acquired by
+ *	kernfs_addrm_start() are released and removed kernfs_nodes are
+ *	cleaned up.
+ *
+ *	LOCKING:
+ *	kernfs_mutex is released.
+ */
+void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt)
+	__releases(kernfs_mutex)
+{
+	/* release resources acquired by kernfs_addrm_start() */
+	mutex_unlock(&kernfs_mutex);
+
+	/* kill removed kernfs_nodes */
+	while (acxt->removed) {
+		struct kernfs_node *kn = acxt->removed;
+
+		acxt->removed = kn->u.removed_list;
+
+		kernfs_deactivate(kn);
+		kernfs_unmap_bin_file(kn);
+		kernfs_put(kn);
+	}
+}
+
+/**
+ * kernfs_find_ns - find kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with name @name under @parent.  Returns pointer to
+ * the found kernfs_node on success, %NULL on failure.
+ */
+static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
+					  const unsigned char *name,
+					  const void *ns)
+{
+	struct rb_node *node = parent->dir.children.rb_node;
+	bool has_ns = kernfs_ns_enabled(parent);
+	unsigned int hash;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	if (has_ns != (bool)ns) {
+		WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n",
+		     has_ns ? "required" : "invalid", parent->name, name);
+		return NULL;
+	}
+
+	hash = kernfs_name_hash(name, ns);
+	while (node) {
+		struct kernfs_node *kn;
+		int result;
+
+		kn = rb_to_kn(node);
+		result = kernfs_name_compare(hash, name, ns, kn);
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return kn;
+	}
+	return NULL;
+}
+
+/**
+ * kernfs_find_and_get_ns - find and get kernfs_node with the given name
+ * @parent: kernfs_node to search under
+ * @name: name to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with name @name under @parent and get a reference
+ * if found.  This function may sleep and returns pointer to the found
+ * kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
+					   const char *name, const void *ns)
+{
+	struct kernfs_node *kn;
+
+	mutex_lock(&kernfs_mutex);
+	kn = kernfs_find_ns(parent, name, ns);
+	kernfs_get(kn);
+	mutex_unlock(&kernfs_mutex);
+
+	return kn;
+}
+EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
+
+/**
+ * kernfs_create_root - create a new kernfs hierarchy
+ * @kdops: optional directory syscall operations for the hierarchy
+ * @priv: opaque data associated with the new directory
+ *
+ * Returns the root of the new hierarchy on success, ERR_PTR() value on
+ * failure.
+ */
+struct kernfs_root *kernfs_create_root(struct kernfs_dir_ops *kdops, void *priv)
+{
+	struct kernfs_root *root;
+	struct kernfs_node *kn;
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	ida_init(&root->ino_ida);
+
+	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
+			       KERNFS_DIR);
+	if (!kn) {
+		ida_destroy(&root->ino_ida);
+		kfree(root);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	kn->flags &= ~KERNFS_REMOVED;
+	kn->priv = priv;
+	kn->dir.root = root;
+
+	root->dir_ops = kdops;
+	root->kn = kn;
+
+	return root;
+}
+
+/**
+ * kernfs_destroy_root - destroy a kernfs hierarchy
+ * @root: root of the hierarchy to destroy
+ *
+ * Destroy the hierarchy anchored at @root by removing all existing
+ * directories and destroying @root.
+ */
+void kernfs_destroy_root(struct kernfs_root *root)
+{
+	kernfs_remove(root->kn);	/* will also free @root */
+}
+
+/**
+ * kernfs_create_dir_ns - create a directory
+ * @parent: parent in which to create a new directory
+ * @name: name of the new directory
+ * @mode: mode of the new directory
+ * @priv: opaque data associated with the new directory
+ * @ns: optional namespace tag of the directory
+ *
+ * Returns the created node on success, ERR_PTR() value on failure.
+ */
+struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent,
+					 const char *name, umode_t mode,
+					 void *priv, const void *ns)
+{
+	struct kernfs_addrm_cxt acxt;
+	struct kernfs_node *kn;
+	int rc;
+
+	/* allocate */
+	kn = kernfs_new_node(parent, name, mode | S_IFDIR, KERNFS_DIR);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	kn->dir.root = parent->dir.root;
+	kn->ns = ns;
+	kn->priv = priv;
+
+	/* link in */
+	kernfs_addrm_start(&acxt);
+	rc = kernfs_add_one(&acxt, kn);
+	kernfs_addrm_finish(&acxt);
+
+	if (!rc)
+		return kn;
+
+	kernfs_put(kn);
+	return ERR_PTR(rc);
+}
+
+static struct dentry *kernfs_iop_lookup(struct inode *dir,
+					struct dentry *dentry,
+					unsigned int flags)
+{
+	struct dentry *ret;
+	struct kernfs_node *parent = dentry->d_parent->d_fsdata;
+	struct kernfs_node *kn;
+	struct inode *inode;
+	const void *ns = NULL;
+
+	mutex_lock(&kernfs_mutex);
+
+	if (kernfs_ns_enabled(parent))
+		ns = kernfs_info(dir->i_sb)->ns;
+
+	kn = kernfs_find_ns(parent, dentry->d_name.name, ns);
+
+	/* no such entry */
+	if (!kn) {
+		ret = NULL;
+		goto out_unlock;
+	}
+	kernfs_get(kn);
+	dentry->d_fsdata = kn;
+
+	/* attach dentry and inode */
+	inode = kernfs_get_inode(dir->i_sb, kn);
+	if (!inode) {
+		ret = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	/* instantiate and hash dentry */
+	ret = d_materialise_unique(dentry, inode);
+ out_unlock:
+	mutex_unlock(&kernfs_mutex);
+	return ret;
+}
+
+static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry,
+			    umode_t mode)
+{
+	struct kernfs_node *parent = dir->i_private;
+	struct kernfs_dir_ops *kdops = kernfs_root(parent)->dir_ops;
+
+	if (!kdops || !kdops->mkdir)
+		return -EPERM;
+
+	return kdops->mkdir(parent, dentry->d_name.name, mode);
+}
+
+static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct kernfs_node *kn  = dentry->d_fsdata;
+	struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
+
+	if (!kdops || !kdops->rmdir)
+		return -EPERM;
+
+	return kdops->rmdir(kn);
+}
+
+static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry,
+			     struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct kernfs_node *kn  = old_dentry->d_fsdata;
+	struct kernfs_node *new_parent = new_dir->i_private;
+	struct kernfs_dir_ops *kdops = kernfs_root(kn)->dir_ops;
+
+	if (!kdops || !kdops->rename)
+		return -EPERM;
+
+	return kdops->rename(kn, new_parent, new_dentry->d_name.name);
+}
+
+const struct inode_operations kernfs_dir_iops = {
+	.lookup		= kernfs_iop_lookup,
+	.permission	= kernfs_iop_permission,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+
+	.mkdir		= kernfs_iop_mkdir,
+	.rmdir		= kernfs_iop_rmdir,
+	.rename		= kernfs_iop_rename,
+};
+
+static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos)
+{
+	struct kernfs_node *last;
+
+	while (true) {
+		struct rb_node *rbn;
+
+		last = pos;
+
+		if (kernfs_type(pos) != KERNFS_DIR)
+			break;
+
+		rbn = rb_first(&pos->dir.children);
+		if (!rbn)
+			break;
+
+		pos = rb_to_kn(rbn);
+	}
+
+	return last;
+}
+
+/**
+ * kernfs_next_descendant_post - find the next descendant for post-order walk
+ * @pos: the current position (%NULL to initiate traversal)
+ * @root: kernfs_node whose descendants to walk
+ *
+ * Find the next descendant to visit for post-order traversal of @root's
+ * descendants.  @root is included in the iteration and the last node to be
+ * visited.
+ */
+static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos,
+						       struct kernfs_node *root)
+{
+	struct rb_node *rbn;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	/* if first iteration, visit leftmost descendant which may be root */
+	if (!pos)
+		return kernfs_leftmost_descendant(root);
+
+	/* if we visited @root, we're done */
+	if (pos == root)
+		return NULL;
+
+	/* if there's an unvisited sibling, visit its leftmost descendant */
+	rbn = rb_next(&pos->rb);
+	if (rbn)
+		return kernfs_leftmost_descendant(rb_to_kn(rbn));
+
+	/* no sibling left, visit parent */
+	return pos->parent;
+}
+
+static void __kernfs_remove(struct kernfs_addrm_cxt *acxt,
+			    struct kernfs_node *kn)
+{
+	struct kernfs_node *pos, *next;
+
+	if (!kn)
+		return;
+
+	pr_debug("kernfs %s: removing\n", kn->name);
+
+	next = NULL;
+	do {
+		pos = next;
+		next = kernfs_next_descendant_post(pos, kn);
+		if (pos)
+			kernfs_remove_one(acxt, pos);
+	} while (next);
+}
+
+/**
+ * kernfs_remove - remove a kernfs_node recursively
+ * @kn: the kernfs_node to remove
+ *
+ * Remove @kn along with all its subdirectories and files.
+ */
+void kernfs_remove(struct kernfs_node *kn)
+{
+	struct kernfs_addrm_cxt acxt;
+
+	kernfs_addrm_start(&acxt);
+	__kernfs_remove(&acxt, kn);
+	kernfs_addrm_finish(&acxt);
+}
+
+/**
+ * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it
+ * @parent: parent of the target
+ * @name: name of the kernfs_node to remove
+ * @ns: namespace tag of the kernfs_node to remove
+ *
+ * Look for the kernfs_node with @name and @ns under @parent and remove it.
+ * Returns 0 on success, -ENOENT if such entry doesn't exist.
+ */
+int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
+			     const void *ns)
+{
+	struct kernfs_addrm_cxt acxt;
+	struct kernfs_node *kn;
+
+	if (!parent) {
+		WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n",
+			name);
+		return -ENOENT;
+	}
+
+	kernfs_addrm_start(&acxt);
+
+	kn = kernfs_find_ns(parent, name, ns);
+	if (kn)
+		__kernfs_remove(&acxt, kn);
+
+	kernfs_addrm_finish(&acxt);
+
+	if (kn)
+		return 0;
+	else
+		return -ENOENT;
+}
+
+/**
+ * kernfs_rename_ns - move and rename a kernfs_node
+ * @kn: target node
+ * @new_parent: new parent to put @sd under
+ * @new_name: new name
+ * @new_ns: new namespace tag
+ */
+int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
+		     const char *new_name, const void *new_ns)
+{
+	int error;
+
+	mutex_lock(&kernfs_mutex);
+
+	error = -ENOENT;
+	if ((kn->flags | new_parent->flags) & KERNFS_REMOVED)
+		goto out;
+
+	error = 0;
+	if ((kn->parent == new_parent) && (kn->ns == new_ns) &&
+	    (strcmp(kn->name, new_name) == 0))
+		goto out;	/* nothing to rename */
+
+	error = -EEXIST;
+	if (kernfs_find_ns(new_parent, new_name, new_ns))
+		goto out;
+
+	/* rename kernfs_node */
+	if (strcmp(kn->name, new_name) != 0) {
+		error = -ENOMEM;
+		new_name = kstrdup(new_name, GFP_KERNEL);
+		if (!new_name)
+			goto out;
+
+		if (kn->flags & KERNFS_STATIC_NAME)
+			kn->flags &= ~KERNFS_STATIC_NAME;
+		else
+			kfree(kn->name);
+
+		kn->name = new_name;
+	}
+
+	/*
+	 * Move to the appropriate place in the appropriate directories rbtree.
+	 */
+	kernfs_unlink_sibling(kn);
+	kernfs_get(new_parent);
+	kernfs_put(kn->parent);
+	kn->ns = new_ns;
+	kn->hash = kernfs_name_hash(kn->name, kn->ns);
+	kn->parent = new_parent;
+	kernfs_link_sibling(kn);
+
+	error = 0;
+ out:
+	mutex_unlock(&kernfs_mutex);
+	return error;
+}
+
+/* Relationship between s_mode and the DT_xxx types */
+static inline unsigned char dt_type(struct kernfs_node *kn)
+{
+	return (kn->mode >> 12) & 15;
+}
+
+static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
+{
+	kernfs_put(filp->private_data);
+	return 0;
+}
+
+static struct kernfs_node *kernfs_dir_pos(const void *ns,
+	struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
+{
+	if (pos) {
+		int valid = !(pos->flags & KERNFS_REMOVED) &&
+			pos->parent == parent && hash == pos->hash;
+		kernfs_put(pos);
+		if (!valid)
+			pos = NULL;
+	}
+	if (!pos && (hash > 1) && (hash < INT_MAX)) {
+		struct rb_node *node = parent->dir.children.rb_node;
+		while (node) {
+			pos = rb_to_kn(node);
+
+			if (hash < pos->hash)
+				node = node->rb_left;
+			else if (hash > pos->hash)
+				node = node->rb_right;
+			else
+				break;
+		}
+	}
+	/* Skip over entries in the wrong namespace */
+	while (pos && pos->ns != ns) {
+		struct rb_node *node = rb_next(&pos->rb);
+		if (!node)
+			pos = NULL;
+		else
+			pos = rb_to_kn(node);
+	}
+	return pos;
+}
+
+static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
+	struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
+{
+	pos = kernfs_dir_pos(ns, parent, ino, pos);
+	if (pos)
+		do {
+			struct rb_node *node = rb_next(&pos->rb);
+			if (!node)
+				pos = NULL;
+			else
+				pos = rb_to_kn(node);
+		} while (pos && pos->ns != ns);
+	return pos;
+}
+
+static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
+{
+	struct dentry *dentry = file->f_path.dentry;
+	struct kernfs_node *parent = dentry->d_fsdata;
+	struct kernfs_node *pos = file->private_data;
+	const void *ns = NULL;
+
+	if (!dir_emit_dots(file, ctx))
+		return 0;
+	mutex_lock(&kernfs_mutex);
+
+	if (kernfs_ns_enabled(parent))
+		ns = kernfs_info(dentry->d_sb)->ns;
+
+	for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
+	     pos;
+	     pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
+		const char *name = pos->name;
+		unsigned int type = dt_type(pos);
+		int len = strlen(name);
+		ino_t ino = pos->ino;
+
+		ctx->pos = pos->hash;
+		file->private_data = pos;
+		kernfs_get(pos);
+
+		mutex_unlock(&kernfs_mutex);
+		if (!dir_emit(ctx, name, len, ino, type))
+			return 0;
+		mutex_lock(&kernfs_mutex);
+	}
+	mutex_unlock(&kernfs_mutex);
+	file->private_data = NULL;
+	ctx->pos = INT_MAX;
+	return 0;
+}
+
+static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
+				    int whence)
+{
+	struct inode *inode = file_inode(file);
+	loff_t ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = generic_file_llseek(file, offset, whence);
+	mutex_unlock(&inode->i_mutex);
+
+	return ret;
+}
+
+const struct file_operations kernfs_dir_fops = {
+	.read		= generic_read_dir,
+	.iterate	= kernfs_fop_readdir,
+	.release	= kernfs_dir_fop_release,
+	.llseek		= kernfs_dir_fop_llseek,
+};
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
new file mode 100644
index 000000000000..dbf397bfdff2
--- /dev/null
+++ b/fs/kernfs/file.c
@@ -0,0 +1,867 @@
+/*
+ * fs/kernfs/file.c - kernfs file implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/poll.h>
+#include <linux/pagemap.h>
+#include <linux/sched.h>
+
+#include "kernfs-internal.h"
+
+/*
+ * There's one kernfs_open_file for each open file and one kernfs_open_node
+ * for each kernfs_node with one or more open files.
+ *
+ * kernfs_node->attr.open points to kernfs_open_node.  attr.open is
+ * protected by kernfs_open_node_lock.
+ *
+ * filp->private_data points to seq_file whose ->private points to
+ * kernfs_open_file.  kernfs_open_files are chained at
+ * kernfs_open_node->files, which is protected by kernfs_open_file_mutex.
+ */
+static DEFINE_SPINLOCK(kernfs_open_node_lock);
+static DEFINE_MUTEX(kernfs_open_file_mutex);
+
+struct kernfs_open_node {
+	atomic_t		refcnt;
+	atomic_t		event;
+	wait_queue_head_t	poll;
+	struct list_head	files; /* goes through kernfs_open_file.list */
+};
+
+static struct kernfs_open_file *kernfs_of(struct file *file)
+{
+	return ((struct seq_file *)file->private_data)->private;
+}
+
+/*
+ * Determine the kernfs_ops for the given kernfs_node.  This function must
+ * be called while holding an active reference.
+ */
+static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn)
+{
+	if (kn->flags & KERNFS_LOCKDEP)
+		lockdep_assert_held(kn);
+	return kn->attr.ops;
+}
+
+/*
+ * As kernfs_seq_stop() is also called after kernfs_seq_start() or
+ * kernfs_seq_next() failure, it needs to distinguish whether it's stopping
+ * a seq_file iteration which is fully initialized with an active reference
+ * or an aborted kernfs_seq_start() due to get_active failure.  The
+ * position pointer is the only context for each seq_file iteration and
+ * thus the stop condition should be encoded in it.  As the return value is
+ * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable
+ * choice to indicate get_active failure.
+ *
+ * Unfortunately, this is complicated due to the optional custom seq_file
+ * operations which may return ERR_PTR(-ENODEV) too.  kernfs_seq_stop()
+ * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or
+ * custom seq_file operations and thus can't decide whether put_active
+ * should be performed or not only on ERR_PTR(-ENODEV).
+ *
+ * This is worked around by factoring out the custom seq_stop() and
+ * put_active part into kernfs_seq_stop_active(), skipping it from
+ * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after
+ * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures
+ * that kernfs_seq_stop_active() is skipped only after get_active failure.
+ */
+static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+	if (ops->seq_stop)
+		ops->seq_stop(sf, v);
+	kernfs_put_active(of->kn);
+}
+
+static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops;
+
+	/*
+	 * @of->mutex nests outside active ref and is just to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn))
+		return ERR_PTR(-ENODEV);
+
+	ops = kernfs_ops(of->kn);
+	if (ops->seq_start) {
+		void *next = ops->seq_start(sf, ppos);
+		/* see the comment above kernfs_seq_stop_active() */
+		if (next == ERR_PTR(-ENODEV))
+			kernfs_seq_stop_active(sf, next);
+		return next;
+	} else {
+		/*
+		 * The same behavior and code as single_open().  Returns
+		 * !NULL if pos is at the beginning; otherwise, NULL.
+		 */
+		return NULL + !*ppos;
+	}
+}
+
+static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos)
+{
+	struct kernfs_open_file *of = sf->private;
+	const struct kernfs_ops *ops = kernfs_ops(of->kn);
+
+	if (ops->seq_next) {
+		void *next = ops->seq_next(sf, v, ppos);
+		/* see the comment above kernfs_seq_stop_active() */
+		if (next == ERR_PTR(-ENODEV))
+			kernfs_seq_stop_active(sf, next);
+		return next;
+	} else {
+		/*
+		 * The same behavior and code as single_open(), always
+		 * terminate after the initial read.
+		 */
+		++*ppos;
+		return NULL;
+	}
+}
+
+static void kernfs_seq_stop(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+
+	if (v != ERR_PTR(-ENODEV))
+		kernfs_seq_stop_active(sf, v);
+	mutex_unlock(&of->mutex);
+}
+
+static int kernfs_seq_show(struct seq_file *sf, void *v)
+{
+	struct kernfs_open_file *of = sf->private;
+
+	of->event = atomic_read(&of->kn->attr.open->event);
+
+	return of->kn->attr.ops->seq_show(sf, v);
+}
+
+static const struct seq_operations kernfs_seq_ops = {
+	.start = kernfs_seq_start,
+	.next = kernfs_seq_next,
+	.stop = kernfs_seq_stop,
+	.show = kernfs_seq_show,
+};
+
+/*
+ * As reading a bin file can have side-effects, the exact offset and bytes
+ * specified in read(2) call should be passed to the read callback making
+ * it difficult to use seq_file.  Implement simplistic custom buffering for
+ * bin files.
+ */
+static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of,
+				       char __user *user_buf, size_t count,
+				       loff_t *ppos)
+{
+	ssize_t len = min_t(size_t, count, PAGE_SIZE);
+	const struct kernfs_ops *ops;
+	char *buf;
+
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	/*
+	 * @of->mutex nests outside active ref and is just to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn)) {
+		len = -ENODEV;
+		mutex_unlock(&of->mutex);
+		goto out_free;
+	}
+
+	ops = kernfs_ops(of->kn);
+	if (ops->read)
+		len = ops->read(of, buf, len, *ppos);
+	else
+		len = -EINVAL;
+
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
+
+	if (len < 0)
+		goto out_free;
+
+	if (copy_to_user(user_buf, buf, len)) {
+		len = -EFAULT;
+		goto out_free;
+	}
+
+	*ppos += len;
+
+ out_free:
+	kfree(buf);
+	return len;
+}
+
+/**
+ * kernfs_fop_read - kernfs vfs read callback
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ */
+static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf,
+			       size_t count, loff_t *ppos)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+
+	if (of->kn->flags & KERNFS_HAS_SEQ_SHOW)
+		return seq_read(file, user_buf, count, ppos);
+	else
+		return kernfs_file_direct_read(of, user_buf, count, ppos);
+}
+
+/**
+ * kernfs_fop_write - kernfs vfs write callback
+ * @file: file pointer
+ * @user_buf: data to write
+ * @count: number of bytes
+ * @ppos: starting offset
+ *
+ * Copy data in from userland and pass it to the matching kernfs write
+ * operation.
+ *
+ * There is no easy way for us to know if userspace is only doing a partial
+ * write, so we don't support them. We expect the entire buffer to come on
+ * the first write.  Hint: if you're writing a value, first read the file,
+ * modify only the the value you're changing, then write entire buffer
+ * back.
+ */
+static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf,
+				size_t count, loff_t *ppos)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+	ssize_t len = min_t(size_t, count, PAGE_SIZE);
+	const struct kernfs_ops *ops;
+	char *buf;
+
+	buf = kmalloc(len + 1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (copy_from_user(buf, user_buf, len)) {
+		len = -EFAULT;
+		goto out_free;
+	}
+	buf[len] = '\0';	/* guarantee string termination */
+
+	/*
+	 * @of->mutex nests outside active ref and is just to ensure that
+	 * the ops aren't called concurrently for the same open file.
+	 */
+	mutex_lock(&of->mutex);
+	if (!kernfs_get_active(of->kn)) {
+		mutex_unlock(&of->mutex);
+		len = -ENODEV;
+		goto out_free;
+	}
+
+	ops = kernfs_ops(of->kn);
+	if (ops->write)
+		len = ops->write(of, buf, len, *ppos);
+	else
+		len = -EINVAL;
+
+	kernfs_put_active(of->kn);
+	mutex_unlock(&of->mutex);
+
+	if (len > 0)
+		*ppos += len;
+out_free:
+	kfree(buf);
+	return len;
+}
+
+static void kernfs_vma_open(struct vm_area_struct *vma)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+
+	if (!of->vm_ops)
+		return;
+
+	if (!kernfs_get_active(of->kn))
+		return;
+
+	if (of->vm_ops->open)
+		of->vm_ops->open(vma);
+
+	kernfs_put_active(of->kn);
+}
+
+static int kernfs_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!kernfs_get_active(of->kn))
+		return VM_FAULT_SIGBUS;
+
+	ret = VM_FAULT_SIGBUS;
+	if (of->vm_ops->fault)
+		ret = of->vm_ops->fault(vma, vmf);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+static int kernfs_vma_page_mkwrite(struct vm_area_struct *vma,
+				   struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return VM_FAULT_SIGBUS;
+
+	if (!kernfs_get_active(of->kn))
+		return VM_FAULT_SIGBUS;
+
+	ret = 0;
+	if (of->vm_ops->page_mkwrite)
+		ret = of->vm_ops->page_mkwrite(vma, vmf);
+	else
+		file_update_time(file);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr,
+			     void *buf, int len, int write)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return -EINVAL;
+
+	if (!kernfs_get_active(of->kn))
+		return -EINVAL;
+
+	ret = -EINVAL;
+	if (of->vm_ops->access)
+		ret = of->vm_ops->access(vma, addr, buf, len, write);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+static int kernfs_vma_set_policy(struct vm_area_struct *vma,
+				 struct mempolicy *new)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!kernfs_get_active(of->kn))
+		return -EINVAL;
+
+	ret = 0;
+	if (of->vm_ops->set_policy)
+		ret = of->vm_ops->set_policy(vma, new);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+
+static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma,
+					       unsigned long addr)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	struct mempolicy *pol;
+
+	if (!of->vm_ops)
+		return vma->vm_policy;
+
+	if (!kernfs_get_active(of->kn))
+		return vma->vm_policy;
+
+	pol = vma->vm_policy;
+	if (of->vm_ops->get_policy)
+		pol = of->vm_ops->get_policy(vma, addr);
+
+	kernfs_put_active(of->kn);
+	return pol;
+}
+
+static int kernfs_vma_migrate(struct vm_area_struct *vma,
+			      const nodemask_t *from, const nodemask_t *to,
+			      unsigned long flags)
+{
+	struct file *file = vma->vm_file;
+	struct kernfs_open_file *of = kernfs_of(file);
+	int ret;
+
+	if (!of->vm_ops)
+		return 0;
+
+	if (!kernfs_get_active(of->kn))
+		return 0;
+
+	ret = 0;
+	if (of->vm_ops->migrate)
+		ret = of->vm_ops->migrate(vma, from, to, flags);
+
+	kernfs_put_active(of->kn);
+	return ret;
+}
+#endif
+
+static const struct vm_operations_struct kernfs_vm_ops = {
+	.open		= kernfs_vma_open,
+	.fault		= kernfs_vma_fault,
+	.page_mkwrite	= kernfs_vma_page_mkwrite,
+	.access		= kernfs_vma_access,
+#ifdef CONFIG_NUMA
+	.set_policy	= kernfs_vma_set_policy,
+	.get_policy	= kernfs_vma_get_policy,
+	.migrate	= kernfs_vma_migrate,
+#endif
+};
+
+static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct kernfs_open_file *of = kernfs_of(file);
+	const struct kernfs_ops *ops;
+	int rc;
+
+	/*
+	 * mmap path and of->mutex are prone to triggering spurious lockdep
+	 * warnings and we don't want to add spurious locking dependency
+	 * between the two.  Check whether mmap is actually implemented
+	 * without grabbing @of->mutex by testing HAS_MMAP flag.  See the
+	 * comment in kernfs_file_open() for more details.
+	 */
+	if (!(of->kn->flags & KERNFS_HAS_MMAP))
+		return -ENODEV;
+
+	mutex_lock(&of->mutex);
+
+	rc = -ENODEV;
+	if (!kernfs_get_active(of->kn))
+		goto out_unlock;
+
+	ops = kernfs_ops(of->kn);
+	rc = ops->mmap(of, vma);
+
+	/*
+	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
+	 * to satisfy versions of X which crash if the mmap fails: that
+	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
+	 */
+	if (vma->vm_file != file)
+		goto out_put;
+
+	rc = -EINVAL;
+	if (of->mmapped && of->vm_ops != vma->vm_ops)
+		goto out_put;
+
+	/*
+	 * It is not possible to successfully wrap close.
+	 * So error if someone is trying to use close.
+	 */
+	rc = -EINVAL;
+	if (vma->vm_ops && vma->vm_ops->close)
+		goto out_put;
+
+	rc = 0;
+	of->mmapped = 1;
+	of->vm_ops = vma->vm_ops;
+	vma->vm_ops = &kernfs_vm_ops;
+out_put:
+	kernfs_put_active(of->kn);
+out_unlock:
+	mutex_unlock(&of->mutex);
+
+	return rc;
+}
+
+/**
+ *	kernfs_get_open_node - get or create kernfs_open_node
+ *	@kn: target kernfs_node
+ *	@of: kernfs_open_file for this instance of open
+ *
+ *	If @kn->attr.open exists, increment its reference count; otherwise,
+ *	create one.  @of is chained to the files list.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	0 on success, -errno on failure.
+ */
+static int kernfs_get_open_node(struct kernfs_node *kn,
+				struct kernfs_open_file *of)
+{
+	struct kernfs_open_node *on, *new_on = NULL;
+
+ retry:
+	mutex_lock(&kernfs_open_file_mutex);
+	spin_lock_irq(&kernfs_open_node_lock);
+
+	if (!kn->attr.open && new_on) {
+		kn->attr.open = new_on;
+		new_on = NULL;
+	}
+
+	on = kn->attr.open;
+	if (on) {
+		atomic_inc(&on->refcnt);
+		list_add_tail(&of->list, &on->files);
+	}
+
+	spin_unlock_irq(&kernfs_open_node_lock);
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	if (on) {
+		kfree(new_on);
+		return 0;
+	}
+
+	/* not there, initialize a new one and retry */
+	new_on = kmalloc(sizeof(*new_on), GFP_KERNEL);
+	if (!new_on)
+		return -ENOMEM;
+
+	atomic_set(&new_on->refcnt, 0);
+	atomic_set(&new_on->event, 1);
+	init_waitqueue_head(&new_on->poll);
+	INIT_LIST_HEAD(&new_on->files);
+	goto retry;
+}
+
+/**
+ *	kernfs_put_open_node - put kernfs_open_node
+ *	@kn: target kernfs_nodet
+ *	@of: associated kernfs_open_file
+ *
+ *	Put @kn->attr.open and unlink @of from the files list.  If
+ *	reference count reaches zero, disassociate and free it.
+ *
+ *	LOCKING:
+ *	None.
+ */
+static void kernfs_put_open_node(struct kernfs_node *kn,
+				 struct kernfs_open_file *of)
+{
+	struct kernfs_open_node *on = kn->attr.open;
+	unsigned long flags;
+
+	mutex_lock(&kernfs_open_file_mutex);
+	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+	if (of)
+		list_del(&of->list);
+
+	if (atomic_dec_and_test(&on->refcnt))
+		kn->attr.open = NULL;
+	else
+		on = NULL;
+
+	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	kfree(on);
+}
+
+static int kernfs_fop_open(struct inode *inode, struct file *file)
+{
+	struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+	const struct kernfs_ops *ops;
+	struct kernfs_open_file *of;
+	bool has_read, has_write, has_mmap;
+	int error = -EACCES;
+
+	if (!kernfs_get_active(kn))
+		return -ENODEV;
+
+	ops = kernfs_ops(kn);
+
+	has_read = ops->seq_show || ops->read || ops->mmap;
+	has_write = ops->write || ops->mmap;
+	has_mmap = ops->mmap;
+
+	/* check perms and supported operations */
+	if ((file->f_mode & FMODE_WRITE) &&
+	    (!(inode->i_mode & S_IWUGO) || !has_write))
+		goto err_out;
+
+	if ((file->f_mode & FMODE_READ) &&
+	    (!(inode->i_mode & S_IRUGO) || !has_read))
+		goto err_out;
+
+	/* allocate a kernfs_open_file for the file */
+	error = -ENOMEM;
+	of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
+	if (!of)
+		goto err_out;
+
+	/*
+	 * The following is done to give a different lockdep key to
+	 * @of->mutex for files which implement mmap.  This is a rather
+	 * crude way to avoid false positive lockdep warning around
+	 * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and
+	 * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under
+	 * which mm->mmap_sem nests, while holding @of->mutex.  As each
+	 * open file has a separate mutex, it's okay as long as those don't
+	 * happen on the same file.  At this point, we can't easily give
+	 * each file a separate locking class.  Let's differentiate on
+	 * whether the file has mmap or not for now.
+	 *
+	 * Both paths of the branch look the same.  They're supposed to
+	 * look that way and give @of->mutex different static lockdep keys.
+	 */
+	if (has_mmap)
+		mutex_init(&of->mutex);
+	else
+		mutex_init(&of->mutex);
+
+	of->kn = kn;
+	of->file = file;
+
+	/*
+	 * Always instantiate seq_file even if read access doesn't use
+	 * seq_file or is not requested.  This unifies private data access
+	 * and readable regular files are the vast majority anyway.
+	 */
+	if (ops->seq_show)
+		error = seq_open(file, &kernfs_seq_ops);
+	else
+		error = seq_open(file, NULL);
+	if (error)
+		goto err_free;
+
+	((struct seq_file *)file->private_data)->private = of;
+
+	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
+	if (file->f_mode & FMODE_WRITE)
+		file->f_mode |= FMODE_PWRITE;
+
+	/* make sure we have open node struct */
+	error = kernfs_get_open_node(kn, of);
+	if (error)
+		goto err_close;
+
+	/* open succeeded, put active references */
+	kernfs_put_active(kn);
+	return 0;
+
+err_close:
+	seq_release(inode, file);
+err_free:
+	kfree(of);
+err_out:
+	kernfs_put_active(kn);
+	return error;
+}
+
+static int kernfs_fop_release(struct inode *inode, struct file *filp)
+{
+	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+	struct kernfs_open_file *of = kernfs_of(filp);
+
+	kernfs_put_open_node(kn, of);
+	seq_release(inode, filp);
+	kfree(of);
+
+	return 0;
+}
+
+void kernfs_unmap_bin_file(struct kernfs_node *kn)
+{
+	struct kernfs_open_node *on;
+	struct kernfs_open_file *of;
+
+	if (!(kn->flags & KERNFS_HAS_MMAP))
+		return;
+
+	spin_lock_irq(&kernfs_open_node_lock);
+	on = kn->attr.open;
+	if (on)
+		atomic_inc(&on->refcnt);
+	spin_unlock_irq(&kernfs_open_node_lock);
+	if (!on)
+		return;
+
+	mutex_lock(&kernfs_open_file_mutex);
+	list_for_each_entry(of, &on->files, list) {
+		struct inode *inode = file_inode(of->file);
+		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+	}
+	mutex_unlock(&kernfs_open_file_mutex);
+
+	kernfs_put_open_node(kn, NULL);
+}
+
+/*
+ * Kernfs attribute files are pollable.  The idea is that you read
+ * the content and then you use 'poll' or 'select' to wait for
+ * the content to change.  When the content changes (assuming the
+ * manager for the kobject supports notification), poll will
+ * return POLLERR|POLLPRI, and select will return the fd whether
+ * it is waiting for read, write, or exceptions.
+ * Once poll/select indicates that the value has changed, you
+ * need to close and re-open the file, or seek to 0 and read again.
+ * Reminder: this only works for attributes which actively support
+ * it, and it is not possible to test an attribute from userspace
+ * to see if it supports poll (Neither 'poll' nor 'select' return
+ * an appropriate error code).  When in doubt, set a suitable timeout value.
+ */
+static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
+{
+	struct kernfs_open_file *of = kernfs_of(filp);
+	struct kernfs_node *kn = filp->f_path.dentry->d_fsdata;
+	struct kernfs_open_node *on = kn->attr.open;
+
+	/* need parent for the kobj, grab both */
+	if (!kernfs_get_active(kn))
+		goto trigger;
+
+	poll_wait(filp, &on->poll, wait);
+
+	kernfs_put_active(kn);
+
+	if (of->event != atomic_read(&on->event))
+		goto trigger;
+
+	return DEFAULT_POLLMASK;
+
+ trigger:
+	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
+}
+
+/**
+ * kernfs_notify - notify a kernfs file
+ * @kn: file to notify
+ *
+ * Notify @kn such that poll(2) on @kn wakes up.
+ */
+void kernfs_notify(struct kernfs_node *kn)
+{
+	struct kernfs_open_node *on;
+	unsigned long flags;
+
+	spin_lock_irqsave(&kernfs_open_node_lock, flags);
+
+	if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) {
+		on = kn->attr.open;
+		if (on) {
+			atomic_inc(&on->event);
+			wake_up_interruptible(&on->poll);
+		}
+	}
+
+	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+}
+EXPORT_SYMBOL_GPL(kernfs_notify);
+
+const struct file_operations kernfs_file_fops = {
+	.read		= kernfs_fop_read,
+	.write		= kernfs_fop_write,
+	.llseek		= generic_file_llseek,
+	.mmap		= kernfs_fop_mmap,
+	.open		= kernfs_fop_open,
+	.release	= kernfs_fop_release,
+	.poll		= kernfs_fop_poll,
+};
+
+/**
+ * __kernfs_create_file - kernfs internal function to create a file
+ * @parent: directory to create the file in
+ * @name: name of the file
+ * @mode: mode of the file
+ * @size: size of the file
+ * @ops: kernfs operations for the file
+ * @priv: private data for the file
+ * @ns: optional namespace tag of the file
+ * @static_name: don't copy file name
+ * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
+					 const char *name,
+					 umode_t mode, loff_t size,
+					 const struct kernfs_ops *ops,
+					 void *priv, const void *ns,
+					 bool name_is_static,
+					 struct lock_class_key *key)
+{
+	struct kernfs_addrm_cxt acxt;
+	struct kernfs_node *kn;
+	unsigned flags;
+	int rc;
+
+	flags = KERNFS_FILE;
+	if (name_is_static)
+		flags |= KERNFS_STATIC_NAME;
+
+	kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	kn->attr.ops = ops;
+	kn->attr.size = size;
+	kn->ns = ns;
+	kn->priv = priv;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (key) {
+		lockdep_init_map(&kn->dep_map, "s_active", key, 0);
+		kn->flags |= KERNFS_LOCKDEP;
+	}
+#endif
+
+	/*
+	 * kn->attr.ops is accesible only while holding active ref.  We
+	 * need to know whether some ops are implemented outside active
+	 * ref.  Cache their existence in flags.
+	 */
+	if (ops->seq_show)
+		kn->flags |= KERNFS_HAS_SEQ_SHOW;
+	if (ops->mmap)
+		kn->flags |= KERNFS_HAS_MMAP;
+
+	kernfs_addrm_start(&acxt);
+	rc = kernfs_add_one(&acxt, kn);
+	kernfs_addrm_finish(&acxt);
+
+	if (rc) {
+		kernfs_put(kn);
+		return ERR_PTR(rc);
+	}
+	return kn;
+}
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
new file mode 100644
index 000000000000..e55126f85bd2
--- /dev/null
+++ b/fs/kernfs/inode.c
@@ -0,0 +1,377 @@
+/*
+ * fs/kernfs/inode.c - kernfs inode implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/security.h>
+
+#include "kernfs-internal.h"
+
+static const struct address_space_operations kernfs_aops = {
+	.readpage	= simple_readpage,
+	.write_begin	= simple_write_begin,
+	.write_end	= simple_write_end,
+};
+
+static struct backing_dev_info kernfs_bdi = {
+	.name		= "kernfs",
+	.ra_pages	= 0,	/* No readahead */
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+static const struct inode_operations kernfs_iops = {
+	.permission	= kernfs_iop_permission,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+};
+
+void __init kernfs_inode_init(void)
+{
+	if (bdi_init(&kernfs_bdi))
+		panic("failed to init kernfs_bdi");
+}
+
+static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
+{
+	struct iattr *iattrs;
+
+	if (kn->iattr)
+		return kn->iattr;
+
+	kn->iattr = kzalloc(sizeof(struct kernfs_iattrs), GFP_KERNEL);
+	if (!kn->iattr)
+		return NULL;
+	iattrs = &kn->iattr->ia_iattr;
+
+	/* assign default attributes */
+	iattrs->ia_mode = kn->mode;
+	iattrs->ia_uid = GLOBAL_ROOT_UID;
+	iattrs->ia_gid = GLOBAL_ROOT_GID;
+	iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
+
+	simple_xattrs_init(&kn->iattr->xattrs);
+
+	return kn->iattr;
+}
+
+static int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+	struct kernfs_iattrs *attrs;
+	struct iattr *iattrs;
+	unsigned int ia_valid = iattr->ia_valid;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	iattrs = &attrs->ia_iattr;
+
+	if (ia_valid & ATTR_UID)
+		iattrs->ia_uid = iattr->ia_uid;
+	if (ia_valid & ATTR_GID)
+		iattrs->ia_gid = iattr->ia_gid;
+	if (ia_valid & ATTR_ATIME)
+		iattrs->ia_atime = iattr->ia_atime;
+	if (ia_valid & ATTR_MTIME)
+		iattrs->ia_mtime = iattr->ia_mtime;
+	if (ia_valid & ATTR_CTIME)
+		iattrs->ia_ctime = iattr->ia_ctime;
+	if (ia_valid & ATTR_MODE) {
+		umode_t mode = iattr->ia_mode;
+		iattrs->ia_mode = kn->mode = mode;
+	}
+	return 0;
+}
+
+/**
+ * kernfs_setattr - set iattr on a node
+ * @kn: target node
+ * @iattr: iattr to set
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+{
+	int ret;
+
+	mutex_lock(&kernfs_mutex);
+	ret = __kernfs_setattr(kn, iattr);
+	mutex_unlock(&kernfs_mutex);
+	return ret;
+}
+
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	struct kernfs_node *kn = dentry->d_fsdata;
+	int error;
+
+	if (!kn)
+		return -EINVAL;
+
+	mutex_lock(&kernfs_mutex);
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		goto out;
+
+	error = __kernfs_setattr(kn, iattr);
+	if (error)
+		goto out;
+
+	/* this ignores size changes */
+	setattr_copy(inode, iattr);
+
+out:
+	mutex_unlock(&kernfs_mutex);
+	return error;
+}
+
+static int kernfs_node_setsecdata(struct kernfs_node *kn, void **secdata,
+				  u32 *secdata_len)
+{
+	struct kernfs_iattrs *attrs;
+	void *old_secdata;
+	size_t old_secdata_len;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	old_secdata = attrs->ia_secdata;
+	old_secdata_len = attrs->ia_secdata_len;
+
+	attrs->ia_secdata = *secdata;
+	attrs->ia_secdata_len = *secdata_len;
+
+	*secdata = old_secdata;
+	*secdata_len = old_secdata_len;
+	return 0;
+}
+
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+	void *secdata;
+	int error;
+	u32 secdata_len = 0;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
+		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
+		error = security_inode_setsecurity(dentry->d_inode, suffix,
+						value, size, flags);
+		if (error)
+			return error;
+		error = security_inode_getsecctx(dentry->d_inode,
+						&secdata, &secdata_len);
+		if (error)
+			return error;
+
+		mutex_lock(&kernfs_mutex);
+		error = kernfs_node_setsecdata(kn, &secdata, &secdata_len);
+		mutex_unlock(&kernfs_mutex);
+
+		if (secdata)
+			security_release_secctx(secdata, secdata_len);
+		return error;
+	} else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
+		return simple_xattr_set(&attrs->xattrs, name, value, size,
+					flags);
+	}
+
+	return -EINVAL;
+}
+
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_remove(&attrs->xattrs, name);
+}
+
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+			    size_t size)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_get(&attrs->xattrs, name, buf, size);
+}
+
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_iattrs *attrs;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	return simple_xattr_list(&attrs->xattrs, buf, size);
+}
+
+static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
+{
+	inode->i_mode = mode;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+}
+
+static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
+{
+	inode->i_uid = iattr->ia_uid;
+	inode->i_gid = iattr->ia_gid;
+	inode->i_atime = iattr->ia_atime;
+	inode->i_mtime = iattr->ia_mtime;
+	inode->i_ctime = iattr->ia_ctime;
+}
+
+static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode)
+{
+	struct kernfs_iattrs *attrs = kn->iattr;
+
+	inode->i_mode = kn->mode;
+	if (attrs) {
+		/*
+		 * kernfs_node has non-default attributes get them from
+		 * persistent copy in kernfs_node.
+		 */
+		set_inode_attr(inode, &attrs->ia_iattr);
+		security_inode_notifysecctx(inode, attrs->ia_secdata,
+					    attrs->ia_secdata_len);
+	}
+
+	if (kernfs_type(kn) == KERNFS_DIR)
+		set_nlink(inode, kn->dir.subdirs + 2);
+}
+
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		   struct kstat *stat)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct inode *inode = dentry->d_inode;
+
+	mutex_lock(&kernfs_mutex);
+	kernfs_refresh_inode(kn, inode);
+	mutex_unlock(&kernfs_mutex);
+
+	generic_fillattr(inode, stat);
+	return 0;
+}
+
+static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
+{
+	kernfs_get(kn);
+	inode->i_private = kn;
+	inode->i_mapping->a_ops = &kernfs_aops;
+	inode->i_mapping->backing_dev_info = &kernfs_bdi;
+	inode->i_op = &kernfs_iops;
+
+	set_default_inode_attr(inode, kn->mode);
+	kernfs_refresh_inode(kn, inode);
+
+	/* initialize inode according to type */
+	switch (kernfs_type(kn)) {
+	case KERNFS_DIR:
+		inode->i_op = &kernfs_dir_iops;
+		inode->i_fop = &kernfs_dir_fops;
+		break;
+	case KERNFS_FILE:
+		inode->i_size = kn->attr.size;
+		inode->i_fop = &kernfs_file_fops;
+		break;
+	case KERNFS_LINK:
+		inode->i_op = &kernfs_symlink_iops;
+		break;
+	default:
+		BUG();
+	}
+
+	unlock_new_inode(inode);
+}
+
+/**
+ *	kernfs_get_inode - get inode for kernfs_node
+ *	@sb: super block
+ *	@kn: kernfs_node to allocate inode for
+ *
+ *	Get inode for @kn.  If such inode doesn't exist, a new inode is
+ *	allocated and basics are initialized.  New inode is returned
+ *	locked.
+ *
+ *	LOCKING:
+ *	Kernel thread context (may sleep).
+ *
+ *	RETURNS:
+ *	Pointer to allocated inode on success, NULL on failure.
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn)
+{
+	struct inode *inode;
+
+	inode = iget_locked(sb, kn->ino);
+	if (inode && (inode->i_state & I_NEW))
+		kernfs_init_inode(kn, inode);
+
+	return inode;
+}
+
+/*
+ * The kernfs_node serves as both an inode and a directory entry for
+ * kernfs.  To prevent the kernfs inode numbers from being freed
+ * prematurely we take a reference to kernfs_node from the kernfs inode.  A
+ * super_operations.evict_inode() implementation is needed to drop that
+ * reference upon inode destruction.
+ */
+void kernfs_evict_inode(struct inode *inode)
+{
+	struct kernfs_node *kn = inode->i_private;
+
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+	kernfs_put(kn);
+}
+
+int kernfs_iop_permission(struct inode *inode, int mask)
+{
+	struct kernfs_node *kn;
+
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	kn = inode->i_private;
+
+	mutex_lock(&kernfs_mutex);
+	kernfs_refresh_inode(kn, inode);
+	mutex_unlock(&kernfs_mutex);
+
+	return generic_permission(inode, mask);
+}
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
new file mode 100644
index 000000000000..eb536b76374a
--- /dev/null
+++ b/fs/kernfs/kernfs-internal.h
@@ -0,0 +1,122 @@
+/*
+ * fs/kernfs/kernfs-internal.h - kernfs internal header file
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#ifndef __KERNFS_INTERNAL_H
+#define __KERNFS_INTERNAL_H
+
+#include <linux/lockdep.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/xattr.h>
+
+#include <linux/kernfs.h>
+
+struct kernfs_iattrs {
+	struct iattr		ia_iattr;
+	void			*ia_secdata;
+	u32			ia_secdata_len;
+
+	struct simple_xattrs	xattrs;
+};
+
+#define KN_DEACTIVATED_BIAS		INT_MIN
+
+/* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */
+
+/**
+ * kernfs_root - find out the kernfs_root a kernfs_node belongs to
+ * @kn: kernfs_node of interest
+ *
+ * Return the kernfs_root @kn belongs to.
+ */
+static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
+{
+	/* if parent exists, it's always a dir; otherwise, @sd is a dir */
+	if (kn->parent)
+		kn = kn->parent;
+	return kn->dir.root;
+}
+
+/*
+ * Context structure to be used while adding/removing nodes.
+ */
+struct kernfs_addrm_cxt {
+	struct kernfs_node	*removed;
+};
+
+/*
+ * mount.c
+ */
+struct kernfs_super_info {
+	/*
+	 * The root associated with this super_block.  Each super_block is
+	 * identified by the root and ns it's associated with.
+	 */
+	struct kernfs_root	*root;
+
+	/*
+	 * Each sb is associated with one namespace tag, currently the
+	 * network namespace of the task which mounted this kernfs
+	 * instance.  If multiple tags become necessary, make the following
+	 * an array and compare kernfs_node tag against every entry.
+	 */
+	const void		*ns;
+};
+#define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
+
+extern struct kmem_cache *kernfs_node_cache;
+
+/*
+ * inode.c
+ */
+struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
+void kernfs_evict_inode(struct inode *inode);
+int kernfs_iop_permission(struct inode *inode, int mask);
+int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr);
+int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		       struct kstat *stat);
+int kernfs_iop_setxattr(struct dentry *dentry, const char *name, const void *value,
+			size_t size, int flags);
+int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
+ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
+			    size_t size);
+ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
+void kernfs_inode_init(void);
+
+/*
+ * dir.c
+ */
+extern struct mutex kernfs_mutex;
+extern const struct dentry_operations kernfs_dops;
+extern const struct file_operations kernfs_dir_fops;
+extern const struct inode_operations kernfs_dir_iops;
+
+struct kernfs_node *kernfs_get_active(struct kernfs_node *kn);
+void kernfs_put_active(struct kernfs_node *kn);
+void kernfs_addrm_start(struct kernfs_addrm_cxt *acxt);
+int kernfs_add_one(struct kernfs_addrm_cxt *acxt, struct kernfs_node *kn);
+void kernfs_addrm_finish(struct kernfs_addrm_cxt *acxt);
+struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
+				    const char *name, umode_t mode,
+				    unsigned flags);
+
+/*
+ * file.c
+ */
+extern const struct file_operations kernfs_file_fops;
+
+void kernfs_unmap_bin_file(struct kernfs_node *kn);
+
+/*
+ * symlink.c
+ */
+extern const struct inode_operations kernfs_symlink_iops;
+
+#endif	/* __KERNFS_INTERNAL_H */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
new file mode 100644
index 000000000000..0d6ce895a9ee
--- /dev/null
+++ b/fs/kernfs/mount.c
@@ -0,0 +1,165 @@
+/*
+ * fs/kernfs/mount.c - kernfs mount implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/init.h>
+#include <linux/magic.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+
+#include "kernfs-internal.h"
+
+struct kmem_cache *kernfs_node_cache;
+
+static const struct super_operations kernfs_sops = {
+	.statfs		= simple_statfs,
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= kernfs_evict_inode,
+};
+
+static int kernfs_fill_super(struct super_block *sb)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+	struct inode *inode;
+	struct dentry *root;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = SYSFS_MAGIC;
+	sb->s_op = &kernfs_sops;
+	sb->s_time_gran = 1;
+
+	/* get root inode, initialize and unlock it */
+	mutex_lock(&kernfs_mutex);
+	inode = kernfs_get_inode(sb, info->root->kn);
+	mutex_unlock(&kernfs_mutex);
+	if (!inode) {
+		pr_debug("kernfs: could not get root inode\n");
+		return -ENOMEM;
+	}
+
+	/* instantiate and link root dentry */
+	root = d_make_root(inode);
+	if (!root) {
+		pr_debug("%s: could not get root dentry!\n", __func__);
+		return -ENOMEM;
+	}
+	kernfs_get(info->root->kn);
+	root->d_fsdata = info->root->kn;
+	sb->s_root = root;
+	sb->s_d_op = &kernfs_dops;
+	return 0;
+}
+
+static int kernfs_test_super(struct super_block *sb, void *data)
+{
+	struct kernfs_super_info *sb_info = kernfs_info(sb);
+	struct kernfs_super_info *info = data;
+
+	return sb_info->root == info->root && sb_info->ns == info->ns;
+}
+
+static int kernfs_set_super(struct super_block *sb, void *data)
+{
+	int error;
+	error = set_anon_super(sb, data);
+	if (!error)
+		sb->s_fs_info = data;
+	return error;
+}
+
+/**
+ * kernfs_super_ns - determine the namespace tag of a kernfs super_block
+ * @sb: super_block of interest
+ *
+ * Return the namespace tag associated with kernfs super_block @sb.
+ */
+const void *kernfs_super_ns(struct super_block *sb)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+
+	return info->ns;
+}
+
+/**
+ * kernfs_mount_ns - kernfs mount helper
+ * @fs_type: file_system_type of the fs being mounted
+ * @flags: mount flags specified for the mount
+ * @root: kernfs_root of the hierarchy being mounted
+ * @ns: optional namespace tag of the mount
+ *
+ * This is to be called from each kernfs user's file_system_type->mount()
+ * implementation, which should pass through the specified @fs_type and
+ * @flags, and specify the hierarchy and namespace tag to mount via @root
+ * and @ns, respectively.
+ *
+ * The return value can be passed to the vfs layer verbatim.
+ */
+struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
+			       struct kernfs_root *root, const void *ns)
+{
+	struct super_block *sb;
+	struct kernfs_super_info *info;
+	int error;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return ERR_PTR(-ENOMEM);
+
+	info->root = root;
+	info->ns = ns;
+
+	sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+	if (IS_ERR(sb) || sb->s_fs_info != info)
+		kfree(info);
+	if (IS_ERR(sb))
+		return ERR_CAST(sb);
+	if (!sb->s_root) {
+		error = kernfs_fill_super(sb);
+		if (error) {
+			deactivate_locked_super(sb);
+			return ERR_PTR(error);
+		}
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	return dget(sb->s_root);
+}
+
+/**
+ * kernfs_kill_sb - kill_sb for kernfs
+ * @sb: super_block being killed
+ *
+ * This can be used directly for file_system_type->kill_sb().  If a kernfs
+ * user needs extra cleanup, it can implement its own kill_sb() and call
+ * this function at the end.
+ */
+void kernfs_kill_sb(struct super_block *sb)
+{
+	struct kernfs_super_info *info = kernfs_info(sb);
+	struct kernfs_node *root_kn = sb->s_root->d_fsdata;
+
+	/*
+	 * Remove the superblock from fs_supers/s_instances
+	 * so we can't find it, before freeing kernfs_super_info.
+	 */
+	kill_anon_super(sb);
+	kfree(info);
+	kernfs_put(root_kn);
+}
+
+void __init kernfs_init(void)
+{
+	kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
+					      sizeof(struct kernfs_node),
+					      0, SLAB_PANIC, NULL);
+	kernfs_inode_init();
+}
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
new file mode 100644
index 000000000000..4d457055acb9
--- /dev/null
+++ b/fs/kernfs/symlink.c
@@ -0,0 +1,151 @@
+/*
+ * fs/kernfs/symlink.c - kernfs symlink implementation
+ *
+ * Copyright (c) 2001-3 Patrick Mochel
+ * Copyright (c) 2007 SUSE Linux Products GmbH
+ * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/namei.h>
+
+#include "kernfs-internal.h"
+
+/**
+ * kernfs_create_link - create a symlink
+ * @parent: directory to create the symlink in
+ * @name: name of the symlink
+ * @target: target node for the symlink to point to
+ *
+ * Returns the created node on success, ERR_PTR() value on error.
+ */
+struct kernfs_node *kernfs_create_link(struct kernfs_node *parent,
+				       const char *name,
+				       struct kernfs_node *target)
+{
+	struct kernfs_node *kn;
+	struct kernfs_addrm_cxt acxt;
+	int error;
+
+	kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, KERNFS_LINK);
+	if (!kn)
+		return ERR_PTR(-ENOMEM);
+
+	if (kernfs_ns_enabled(parent))
+		kn->ns = target->ns;
+	kn->symlink.target_kn = target;
+	kernfs_get(target);	/* ref owned by symlink */
+
+	kernfs_addrm_start(&acxt);
+	error = kernfs_add_one(&acxt, kn);
+	kernfs_addrm_finish(&acxt);
+
+	if (!error)
+		return kn;
+
+	kernfs_put(kn);
+	return ERR_PTR(error);
+}
+
+static int kernfs_get_target_path(struct kernfs_node *parent,
+				  struct kernfs_node *target, char *path)
+{
+	struct kernfs_node *base, *kn;
+	char *s = path;
+	int len = 0;
+
+	/* go up to the root, stop at the base */
+	base = parent;
+	while (base->parent) {
+		kn = target->parent;
+		while (kn->parent && base != kn)
+			kn = kn->parent;
+
+		if (base == kn)
+			break;
+
+		strcpy(s, "../");
+		s += 3;
+		base = base->parent;
+	}
+
+	/* determine end of target string for reverse fillup */
+	kn = target;
+	while (kn->parent && kn != base) {
+		len += strlen(kn->name) + 1;
+		kn = kn->parent;
+	}
+
+	/* check limits */
+	if (len < 2)
+		return -EINVAL;
+	len--;
+	if ((s - path) + len > PATH_MAX)
+		return -ENAMETOOLONG;
+
+	/* reverse fillup of target string from target to base */
+	kn = target;
+	while (kn->parent && kn != base) {
+		int slen = strlen(kn->name);
+
+		len -= slen;
+		strncpy(s + len, kn->name, slen);
+		if (len)
+			s[--len] = '/';
+
+		kn = kn->parent;
+	}
+
+	return 0;
+}
+
+static int kernfs_getlink(struct dentry *dentry, char *path)
+{
+	struct kernfs_node *kn = dentry->d_fsdata;
+	struct kernfs_node *parent = kn->parent;
+	struct kernfs_node *target = kn->symlink.target_kn;
+	int error;
+
+	mutex_lock(&kernfs_mutex);
+	error = kernfs_get_target_path(parent, target, path);
+	mutex_unlock(&kernfs_mutex);
+
+	return error;
+}
+
+static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	int error = -ENOMEM;
+	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	if (page) {
+		error = kernfs_getlink(dentry, (char *) page);
+		if (error < 0)
+			free_page((unsigned long)page);
+	}
+	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
+	return NULL;
+}
+
+static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd,
+				void *cookie)
+{
+	char *page = nd_get_link(nd);
+	if (!IS_ERR(page))
+		free_page((unsigned long)page);
+}
+
+const struct inode_operations kernfs_symlink_iops = {
+	.setxattr	= kernfs_iop_setxattr,
+	.removexattr	= kernfs_iop_removexattr,
+	.getxattr	= kernfs_iop_getxattr,
+	.listxattr	= kernfs_iop_listxattr,
+	.readlink	= generic_readlink,
+	.follow_link	= kernfs_iop_follow_link,
+	.put_link	= kernfs_iop_put_link,
+	.setattr	= kernfs_iop_setattr,
+	.getattr	= kernfs_iop_getattr,
+	.permission	= kernfs_iop_permission,
+};
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index 550475ca6a0e..0f95f0d0b313 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -14,16 +14,10 @@
 
 #define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
 
-static void request_complete(struct bio *bio, int err)
-{
-	complete((struct completion *)bio->bi_private);
-}
-
 static int sync_request(struct page *page, struct block_device *bdev, int rw)
 {
 	struct bio bio;
 	struct bio_vec bio_vec;
-	struct completion complete;
 
 	bio_init(&bio);
 	bio.bi_max_vecs = 1;
@@ -35,13 +29,8 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
 	bio.bi_size = PAGE_SIZE;
 	bio.bi_bdev = bdev;
 	bio.bi_sector = page->index * (PAGE_SIZE >> 9);
-	init_completion(&complete);
-	bio.bi_private = &complete;
-	bio.bi_end_io = request_complete;
 
-	submit_bio(rw, &bio);
-	wait_for_completion(&complete);
-	return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
+	return submit_bio_wait(rw, &bio);
 }
 
 static int bdev_readpage(void *_sb, struct page *page)
diff --git a/fs/namei.c b/fs/namei.c
index 8f77a8cea289..3531deebad30 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -513,8 +513,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
 
 	if (!lockref_get_not_dead(&parent->d_lockref)) {
 		nd->path.dentry = NULL;	
-		rcu_read_unlock();
-		return -ECHILD;
+		goto out;
 	}
 
 	/*
@@ -1599,11 +1598,6 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
  *   do a "get_unaligned()" if this helps and is sufficiently
  *   fast.
  *
- * - Little-endian machines (so that we can generate the mask
- *   of low bytes efficiently). Again, we *could* do a byte
- *   swapping load on big-endian architectures if that is not
- *   expensive enough to make the optimization worthless.
- *
  * - non-CONFIG_DEBUG_PAGEALLOC configurations (so that we
  *   do not trap on the (extremely unlikely) case of a page
  *   crossing operation.
@@ -1647,7 +1641,7 @@ unsigned int full_name_hash(const unsigned char *name, unsigned int len)
 		if (!len)
 			goto done;
 	}
-	mask = ~(~0ul << len*8);
+	mask = bytemask_from_count(len);
 	hash += mask & a;
 done:
 	return fold_hash(hash);
diff --git a/fs/namespace.c b/fs/namespace.c
index ac2ce8a766e1..22e536705c45 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2790,6 +2790,8 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mountpoint_hashtable[u]);
 
+	kernfs_init();
+
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2886,7 +2888,7 @@ bool fs_fully_visible(struct file_system_type *type)
 			struct inode *inode = child->mnt_mountpoint->d_inode;
 			if (!S_ISDIR(inode->i_mode))
 				goto next;
-			if (inode->i_nlink != 2)
+			if (inode->i_nlink > 2)
 				goto next;
 		}
 		visible = true;
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 8485978993e8..9838fb020473 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -36,6 +36,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 
+#include "../nfs4_fs.h"
 #include "../pnfs.h"
 #include "../netns.h"
 
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index 9c3e117c3ed1..4d0161442565 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -44,7 +44,7 @@
 static inline sector_t normalize(sector_t s, int base)
 {
 	sector_t tmp = s; /* Since do_div modifies its argument */
-	return s - do_div(tmp, base);
+	return s - sector_div(tmp, base);
 }
 
 static inline sector_t normalize_up(sector_t s, int base)
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index fc0f95ec7358..d25f10fb4926 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -46,7 +46,9 @@ ssize_t nfs_dns_resolve_name(struct net *net, char *name, size_t namelen,
 #include <linux/sunrpc/cache.h>
 #include <linux/sunrpc/svcauth.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
+#include <linux/nfs_fs.h>
 
+#include "nfs4_fs.h"
 #include "dns_resolve.h"
 #include "cache_lib.h"
 #include "netns.h"
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 18ab2da4eeb6..00ad1c2b217d 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -312,7 +312,7 @@ struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags)
 }
 EXPORT_SYMBOL_GPL(nfs4_label_alloc);
 #else
-void inline nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
+void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr,
 					struct nfs4_label *label)
 {
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index bca6a3e3c49c..8b5cc04a8611 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -269,6 +269,21 @@ extern const u32 nfs41_maxgetdevinfo_overhead;
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
 
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags);
+static inline void nfs4_label_free(struct nfs4_label *label)
+{
+	if (label) {
+		kfree(label->label);
+		kfree(label);
+	}
+	return;
+}
+#else
+static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; }
+static inline void nfs4_label_free(void *label) {}
+#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
+
 /* proc.c */
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 3ce79b04522e..5609edc742a0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -9,6 +9,14 @@
 #ifndef __LINUX_FS_NFS_NFS4_FS_H
 #define __LINUX_FS_NFS_NFS4_FS_H
 
+#if defined(CONFIG_NFS_V4_2)
+#define NFS4_MAX_MINOR_VERSION 2
+#elif defined(CONFIG_NFS_V4_1)
+#define NFS4_MAX_MINOR_VERSION 1
+#else
+#define NFS4_MAX_MINOR_VERSION 0
+#endif
+
 #if IS_ENABLED(CONFIG_NFS_V4)
 
 #define NFS4_MAX_LOOP_ON_RECOVER (10)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 659990c0109e..15052b81df42 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2518,9 +2518,8 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 						     calldata->roc_barrier);
 			nfs_set_open_stateid(state, &calldata->res.stateid, 0);
 			renew_lease(server, calldata->timestamp);
-			nfs4_close_clear_stateid_flags(state,
-					calldata->arg.fmode);
 			break;
+		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_BAD_STATEID:
@@ -2528,9 +2527,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			if (calldata->arg.fmode == 0)
 				break;
 		default:
-			if (nfs4_async_handle_error(task, server, state) == -EAGAIN)
+			if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
 				rpc_restart_call_prepare(task);
+				goto out_release;
+			}
 	}
+	nfs4_close_clear_stateid_flags(state, calldata->arg.fmode);
+out_release:
 	nfs_release_seqid(calldata->arg.seqid);
 	nfs_refresh_inode(calldata->inode, calldata->res.fattr);
 	dprintk("%s: done, ret = %d!\n", __func__, task->tk_status);
@@ -4802,7 +4805,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
 			nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
-			goto restart_call;
+			goto wait_on_recovery;
 #endif /* CONFIG_NFS_V4_1 */
 		case -NFS4ERR_DELAY:
 			nfs_inc_server_stats(server, NFSIOS_DELAY);
@@ -4987,11 +4990,17 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 
 	trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status);
 	switch (task->tk_status) {
-	case -NFS4ERR_STALE_STATEID:
-	case -NFS4ERR_EXPIRED:
 	case 0:
 		renew_lease(data->res.server, data->timestamp);
 		break;
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+		task->tk_status = 0;
+		break;
 	default:
 		if (nfs4_async_handle_error(task, data->res.server, NULL) ==
 				-EAGAIN) {
@@ -7589,7 +7598,14 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata)
 		return;
 
 	server = NFS_SERVER(lrp->args.inode);
-	if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) {
+	switch (task->tk_status) {
+	default:
+		task->tk_status = 0;
+	case 0:
+		break;
+	case -NFS4ERR_DELAY:
+		if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN)
+			break;
 		rpc_restart_call_prepare(task);
 		return;
 	}
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 9186c7ce0b14..b6af150c96b8 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -132,6 +132,13 @@ nfsd_reply_cache_alloc(void)
 }
 
 static void
+nfsd_reply_cache_unhash(struct svc_cacherep *rp)
+{
+	hlist_del_init(&rp->c_hash);
+	list_del_init(&rp->c_lru);
+}
+
+static void
 nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
 {
 	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
@@ -417,7 +424,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
 		rp = list_first_entry(&lru_head, struct svc_cacherep, c_lru);
 		if (nfsd_cache_entry_expired(rp) ||
 		    num_drc_entries >= max_drc_entries) {
-			lru_put_end(rp);
+			nfsd_reply_cache_unhash(rp);
 			prune_cache_entries();
 			goto search_cache;
 		}
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 9f6b486b6c01..a1a191634abc 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1440,17 +1440,19 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
 
 		nilfs_clear_logs(&sci->sc_segbufs);
 
-		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
-		if (unlikely(err))
-			return err;
-
 		if (sci->sc_stage.flags & NILFS_CF_SUFREED) {
 			err = nilfs_sufile_cancel_freev(nilfs->ns_sufile,
 							sci->sc_freesegs,
 							sci->sc_nfreesegs,
 							NULL);
 			WARN_ON(err); /* do not happen */
+			sci->sc_stage.flags &= ~NILFS_CF_SUFREED;
 		}
+
+		err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
+		if (unlikely(err))
+			return err;
+
 		nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
 		sci->sc_stage = prev_stage;
 	}
diff --git a/fs/pipe.c b/fs/pipe.c
index d2c45e14e6d8..0e0752ef2715 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -726,11 +726,25 @@ pipe_poll(struct file *filp, poll_table *wait)
 	return mask;
 }
 
+static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
+{
+	int kill = 0;
+
+	spin_lock(&inode->i_lock);
+	if (!--pipe->files) {
+		inode->i_pipe = NULL;
+		kill = 1;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (kill)
+		free_pipe_info(pipe);
+}
+
 static int
 pipe_release(struct inode *inode, struct file *file)
 {
-	struct pipe_inode_info *pipe = inode->i_pipe;
-	int kill = 0;
+	struct pipe_inode_info *pipe = file->private_data;
 
 	__pipe_lock(pipe);
 	if (file->f_mode & FMODE_READ)
@@ -743,17 +757,9 @@ pipe_release(struct inode *inode, struct file *file)
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
-	spin_lock(&inode->i_lock);
-	if (!--pipe->files) {
-		inode->i_pipe = NULL;
-		kill = 1;
-	}
-	spin_unlock(&inode->i_lock);
 	__pipe_unlock(pipe);
 
-	if (kill)
-		free_pipe_info(pipe);
-
+	put_pipe_info(inode, pipe);
 	return 0;
 }
 
@@ -1014,7 +1020,6 @@ static int fifo_open(struct inode *inode, struct file *filp)
 {
 	struct pipe_inode_info *pipe;
 	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
-	int kill = 0;
 	int ret;
 
 	filp->f_version = 0;
@@ -1130,15 +1135,9 @@ err_wr:
 	goto err;
 
 err:
-	spin_lock(&inode->i_lock);
-	if (!--pipe->files) {
-		inode->i_pipe = NULL;
-		kill = 1;
-	}
-	spin_unlock(&inode->i_lock);
 	__pipe_unlock(pipe);
-	if (kill)
-		free_pipe_info(pipe);
+
+	put_pipe_info(inode, pipe);
 	return ret;
 }
 
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 28955d4b7218..124fc43c7090 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -292,16 +292,20 @@ proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr,
 {
 	struct proc_dir_entry *pde = PDE(file_inode(file));
 	unsigned long rv = -EIO;
-	unsigned long (*get_area)(struct file *, unsigned long, unsigned long,
-				  unsigned long, unsigned long) = NULL;
+
 	if (use_pde(pde)) {
+		typeof(proc_reg_get_unmapped_area) *get_area;
+
+		get_area = pde->proc_fops->get_unmapped_area;
 #ifdef CONFIG_MMU
-		get_area = current->mm->get_unmapped_area;
+		if (!get_area)
+			get_area = current->mm->get_unmapped_area;
 #endif
-		if (pde->proc_fops->get_unmapped_area)
-			get_area = pde->proc_fops->get_unmapped_area;
+
 		if (get_area)
 			rv = get_area(file, orig_addr, len, pgoff, flags);
+		else
+			rv = orig_addr;
 		unuse_pde(pde);
 	}
 	return rv;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b8e93a40a5d3..78c3c2097787 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -443,8 +443,11 @@ int pstore_register(struct pstore_info *psi)
 		pstore_get_records(0);
 
 	kmsg_dump_register(&pstore_dumper);
-	pstore_register_console();
-	pstore_register_ftrace();
+
+	if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
+		pstore_register_console();
+		pstore_register_ftrace();
+	}
 
 	if (pstore_update_ms >= 0) {
 		pstore_timer.expires = jiffies +
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 2943b2bfae48..62a0de6632e1 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -84,6 +84,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
 		 */
 		res = squashfs_read_cache(target_page, block, bsize, pages,
 								page);
+		if (res < 0)
+			goto mark_errored;
+
 		goto out;
 	}
 
@@ -119,7 +122,7 @@ mark_errored:
 	 * dealt with by the caller
 	 */
 	for (i = 0; i < pages; i++) {
-		if (page[i] == target_page)
+		if (page[i] == NULL || page[i] == target_page)
 			continue;
 		flush_dcache_page(page[i]);
 		SetPageError(page[i]);
diff --git a/fs/sysfs/Makefile b/fs/sysfs/Makefile
index 8876ac183373..6eff6e1205a5 100644
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -2,4 +2,4 @@
 # Makefile for the sysfs virtual filesystem
 #
 
-obj-y		:= inode.o file.o dir.o symlink.o mount.o group.o
+obj-y		:= file.o dir.o symlink.o mount.o group.o
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 5e73d6626e50..ee0d761c3179 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -13,465 +13,31 @@
 #undef DEBUG
 
 #include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/module.h>
 #include <linux/kobject.h>
-#include <linux/namei.h>
-#include <linux/idr.h>
-#include <linux/completion.h>
-#include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/security.h>
-#include <linux/hash.h>
 #include "sysfs.h"
 
-DEFINE_MUTEX(sysfs_mutex);
 DEFINE_SPINLOCK(sysfs_symlink_target_lock);
 
-#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
-
-static DEFINE_SPINLOCK(sysfs_ino_lock);
-static DEFINE_IDA(sysfs_ino_ida);
-
-/**
- *	sysfs_name_hash
- *	@name: Null terminated string to hash
- *	@ns:   Namespace tag to hash
- *
- *	Returns 31 bit hash of ns + name (so it fits in an off_t )
- */
-static unsigned int sysfs_name_hash(const char *name, const void *ns)
-{
-	unsigned long hash = init_name_hash();
-	unsigned int len = strlen(name);
-	while (len--)
-		hash = partial_name_hash(*name++, hash);
-	hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
-	hash &= 0x7fffffffU;
-	/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
-	if (hash < 1)
-		hash += 2;
-	if (hash >= INT_MAX)
-		hash = INT_MAX - 1;
-	return hash;
-}
-
-static int sysfs_name_compare(unsigned int hash, const char *name,
-			      const void *ns, const struct sysfs_dirent *sd)
-{
-	if (hash != sd->s_hash)
-		return hash - sd->s_hash;
-	if (ns != sd->s_ns)
-		return ns - sd->s_ns;
-	return strcmp(name, sd->s_name);
-}
-
-static int sysfs_sd_compare(const struct sysfs_dirent *left,
-			    const struct sysfs_dirent *right)
-{
-	return sysfs_name_compare(left->s_hash, left->s_name, left->s_ns,
-				  right);
-}
-
-/**
- *	sysfs_link_sibling - link sysfs_dirent into sibling rbtree
- *	@sd: sysfs_dirent of interest
- *
- *	Link @sd into its sibling rbtree which starts from
- *	sd->s_parent->s_dir.children.
- *
- *	Locking:
- *	mutex_lock(sysfs_mutex)
- *
- *	RETURNS:
- *	0 on susccess -EEXIST on failure.
- */
-static int sysfs_link_sibling(struct sysfs_dirent *sd)
-{
-	struct rb_node **node = &sd->s_parent->s_dir.children.rb_node;
-	struct rb_node *parent = NULL;
-
-	if (sysfs_type(sd) == SYSFS_DIR)
-		sd->s_parent->s_dir.subdirs++;
-
-	while (*node) {
-		struct sysfs_dirent *pos;
-		int result;
-
-		pos = to_sysfs_dirent(*node);
-		parent = *node;
-		result = sysfs_sd_compare(sd, pos);
-		if (result < 0)
-			node = &pos->s_rb.rb_left;
-		else if (result > 0)
-			node = &pos->s_rb.rb_right;
-		else
-			return -EEXIST;
-	}
-	/* add new node and rebalance the tree */
-	rb_link_node(&sd->s_rb, parent, node);
-	rb_insert_color(&sd->s_rb, &sd->s_parent->s_dir.children);
-	return 0;
-}
-
-/**
- *	sysfs_unlink_sibling - unlink sysfs_dirent from sibling rbtree
- *	@sd: sysfs_dirent of interest
- *
- *	Unlink @sd from its sibling rbtree which starts from
- *	sd->s_parent->s_dir.children.
- *
- *	Locking:
- *	mutex_lock(sysfs_mutex)
- */
-static void sysfs_unlink_sibling(struct sysfs_dirent *sd)
-{
-	if (sysfs_type(sd) == SYSFS_DIR)
-		sd->s_parent->s_dir.subdirs--;
-
-	rb_erase(&sd->s_rb, &sd->s_parent->s_dir.children);
-}
-
-/**
- *	sysfs_get_active - get an active reference to sysfs_dirent
- *	@sd: sysfs_dirent to get an active reference to
- *
- *	Get an active reference of @sd.  This function is noop if @sd
- *	is NULL.
- *
- *	RETURNS:
- *	Pointer to @sd on success, NULL on failure.
- */
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd)
-{
-	if (unlikely(!sd))
-		return NULL;
-
-	if (!atomic_inc_unless_negative(&sd->s_active))
-		return NULL;
-
-	if (likely(!sysfs_ignore_lockdep(sd)))
-		rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_);
-	return sd;
-}
-
-/**
- *	sysfs_put_active - put an active reference to sysfs_dirent
- *	@sd: sysfs_dirent to put an active reference to
- *
- *	Put an active reference to @sd.  This function is noop if @sd
- *	is NULL.
- */
-void sysfs_put_active(struct sysfs_dirent *sd)
-{
-	int v;
-
-	if (unlikely(!sd))
-		return;
-
-	if (likely(!sysfs_ignore_lockdep(sd)))
-		rwsem_release(&sd->dep_map, 1, _RET_IP_);
-	v = atomic_dec_return(&sd->s_active);
-	if (likely(v != SD_DEACTIVATED_BIAS))
-		return;
-
-	/* atomic_dec_return() is a mb(), we'll always see the updated
-	 * sd->u.completion.
-	 */
-	complete(sd->u.completion);
-}
-
-/**
- *	sysfs_deactivate - deactivate sysfs_dirent
- *	@sd: sysfs_dirent to deactivate
- *
- *	Deny new active references and drain existing ones.
- */
-static void sysfs_deactivate(struct sysfs_dirent *sd)
-{
-	DECLARE_COMPLETION_ONSTACK(wait);
-	int v;
-
-	BUG_ON(!(sd->s_flags & SYSFS_FLAG_REMOVED));
-
-	if (!(sysfs_type(sd) & SYSFS_ACTIVE_REF))
-		return;
-
-	sd->u.completion = (void *)&wait;
-
-	rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_);
-	/* atomic_add_return() is a mb(), put_active() will always see
-	 * the updated sd->u.completion.
-	 */
-	v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active);
-
-	if (v != SD_DEACTIVATED_BIAS) {
-		lock_contended(&sd->dep_map, _RET_IP_);
-		wait_for_completion(&wait);
-	}
-
-	lock_acquired(&sd->dep_map, _RET_IP_);
-	rwsem_release(&sd->dep_map, 1, _RET_IP_);
-}
-
-static int sysfs_alloc_ino(unsigned int *pino)
-{
-	int ino, rc;
-
- retry:
-	spin_lock(&sysfs_ino_lock);
-	rc = ida_get_new_above(&sysfs_ino_ida, 2, &ino);
-	spin_unlock(&sysfs_ino_lock);
-
-	if (rc == -EAGAIN) {
-		if (ida_pre_get(&sysfs_ino_ida, GFP_KERNEL))
-			goto retry;
-		rc = -ENOMEM;
-	}
-
-	*pino = ino;
-	return rc;
-}
-
-static void sysfs_free_ino(unsigned int ino)
-{
-	spin_lock(&sysfs_ino_lock);
-	ida_remove(&sysfs_ino_ida, ino);
-	spin_unlock(&sysfs_ino_lock);
-}
-
-void release_sysfs_dirent(struct sysfs_dirent *sd)
-{
-	struct sysfs_dirent *parent_sd;
-
- repeat:
-	/* Moving/renaming is always done while holding reference.
-	 * sd->s_parent won't change beneath us.
-	 */
-	parent_sd = sd->s_parent;
-
-	WARN(!(sd->s_flags & SYSFS_FLAG_REMOVED),
-		"sysfs: free using entry: %s/%s\n",
-		parent_sd ? parent_sd->s_name : "", sd->s_name);
-
-	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
-		sysfs_put(sd->s_symlink.target_sd);
-	if (sysfs_type(sd) & SYSFS_COPY_NAME)
-		kfree(sd->s_name);
-	if (sd->s_iattr && sd->s_iattr->ia_secdata)
-		security_release_secctx(sd->s_iattr->ia_secdata,
-					sd->s_iattr->ia_secdata_len);
-	kfree(sd->s_iattr);
-	sysfs_free_ino(sd->s_ino);
-	kmem_cache_free(sysfs_dir_cachep, sd);
-
-	sd = parent_sd;
-	if (sd && atomic_dec_and_test(&sd->s_count))
-		goto repeat;
-}
-
-static int sysfs_dentry_delete(const struct dentry *dentry)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	return !(sd && !(sd->s_flags & SYSFS_FLAG_REMOVED));
-}
-
-static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	struct sysfs_dirent *sd;
-	int type;
-
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	sd = dentry->d_fsdata;
-	mutex_lock(&sysfs_mutex);
-
-	/* The sysfs dirent has been deleted */
-	if (sd->s_flags & SYSFS_FLAG_REMOVED)
-		goto out_bad;
-
-	/* The sysfs dirent has been moved? */
-	if (dentry->d_parent->d_fsdata != sd->s_parent)
-		goto out_bad;
-
-	/* The sysfs dirent has been renamed */
-	if (strcmp(dentry->d_name.name, sd->s_name) != 0)
-		goto out_bad;
-
-	/* The sysfs dirent has been moved to a different namespace */
-	type = KOBJ_NS_TYPE_NONE;
-	if (sd->s_parent) {
-		type = sysfs_ns_type(sd->s_parent);
-		if (type != KOBJ_NS_TYPE_NONE &&
-				sysfs_info(dentry->d_sb)->ns[type] != sd->s_ns)
-			goto out_bad;
-	}
-
-	mutex_unlock(&sysfs_mutex);
-out_valid:
-	return 1;
-out_bad:
-	/* Remove the dentry from the dcache hashes.
-	 * If this is a deleted dentry we use d_drop instead of d_delete
-	 * so sysfs doesn't need to cope with negative dentries.
-	 *
-	 * If this is a dentry that has simply been renamed we
-	 * use d_drop to remove it from the dcache lookup on its
-	 * old parent.  If this dentry persists later when a lookup
-	 * is performed at its new name the dentry will be readded
-	 * to the dcache hashes.
-	 */
-	mutex_unlock(&sysfs_mutex);
-
-	/* If we have submounts we must allow the vfs caches
-	 * to lie about the state of the filesystem to prevent
-	 * leaks and other nasty things.
-	 */
-	if (check_submounts_and_drop(dentry) != 0)
-		goto out_valid;
-
-	return 0;
-}
-
-static void sysfs_dentry_release(struct dentry *dentry)
-{
-	sysfs_put(dentry->d_fsdata);
-}
-
-const struct dentry_operations sysfs_dentry_ops = {
-	.d_revalidate	= sysfs_dentry_revalidate,
-	.d_delete	= sysfs_dentry_delete,
-	.d_release	= sysfs_dentry_release,
-};
-
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type)
-{
-	char *dup_name = NULL;
-	struct sysfs_dirent *sd;
-
-	if (type & SYSFS_COPY_NAME) {
-		name = dup_name = kstrdup(name, GFP_KERNEL);
-		if (!name)
-			return NULL;
-	}
-
-	sd = kmem_cache_zalloc(sysfs_dir_cachep, GFP_KERNEL);
-	if (!sd)
-		goto err_out1;
-
-	if (sysfs_alloc_ino(&sd->s_ino))
-		goto err_out2;
-
-	atomic_set(&sd->s_count, 1);
-	atomic_set(&sd->s_active, 0);
-
-	sd->s_name = name;
-	sd->s_mode = mode;
-	sd->s_flags = type | SYSFS_FLAG_REMOVED;
-
-	return sd;
-
- err_out2:
-	kmem_cache_free(sysfs_dir_cachep, sd);
- err_out1:
-	kfree(dup_name);
-	return NULL;
-}
-
-/**
- *	sysfs_addrm_start - prepare for sysfs_dirent add/remove
- *	@acxt: pointer to sysfs_addrm_cxt to be used
- *
- *	This function is called when the caller is about to add or remove
- *	sysfs_dirent.  This function acquires sysfs_mutex.  @acxt is used
- *	to keep and pass context to other addrm functions.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).  sysfs_mutex is locked on
- *	return.
- */
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt)
-	__acquires(sysfs_mutex)
-{
-	memset(acxt, 0, sizeof(*acxt));
-
-	mutex_lock(&sysfs_mutex);
-}
-
-/**
- *	__sysfs_add_one - add sysfs_dirent to parent without warning
- *	@acxt: addrm context to use
- *	@sd: sysfs_dirent to be added
- *	@parent_sd: the parent sysfs_dirent to add @sd to
- *
- *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
- *	the parent inode if @sd is a directory and link into the children
- *	list of the parent.
- *
- *	This function should be called between calls to
- *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
- *	passed the same @acxt as passed to sysfs_addrm_start().
- *
- *	LOCKING:
- *	Determined by sysfs_addrm_start().
- *
- *	RETURNS:
- *	0 on success, -EEXIST if entry with the given name already
- *	exists.
- */
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		    struct sysfs_dirent *parent_sd)
-{
-	struct sysfs_inode_attrs *ps_iattr;
-	int ret;
-
-	if (!!sysfs_ns_type(parent_sd) != !!sd->s_ns) {
-		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-			sysfs_ns_type(parent_sd) ? "required" : "invalid",
-			parent_sd->s_name, sd->s_name);
-		return -EINVAL;
-	}
-
-	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
-	sd->s_parent = sysfs_get(parent_sd);
-
-	ret = sysfs_link_sibling(sd);
-	if (ret)
-		return ret;
-
-	/* Update timestamps on the parent */
-	ps_iattr = parent_sd->s_iattr;
-	if (ps_iattr) {
-		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
-		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
-	}
-
-	/* Mark the entry added into directory tree */
-	sd->s_flags &= ~SYSFS_FLAG_REMOVED;
-
-	return 0;
-}
-
 /**
  *	sysfs_pathname - return full path to sysfs dirent
- *	@sd: sysfs_dirent whose path we want
+ *	@kn: kernfs_node whose path we want
  *	@path: caller allocated buffer of size PATH_MAX
  *
  *	Gives the name "/" to the sysfs_root entry; any path returned
  *	is relative to wherever sysfs is mounted.
  */
-static char *sysfs_pathname(struct sysfs_dirent *sd, char *path)
+static char *sysfs_pathname(struct kernfs_node *kn, char *path)
 {
-	if (sd->s_parent) {
-		sysfs_pathname(sd->s_parent, path);
+	if (kn->parent) {
+		sysfs_pathname(kn->parent, path);
 		strlcat(path, "/", PATH_MAX);
 	}
-	strlcat(path, sd->s_name, PATH_MAX);
+	strlcat(path, kn->name, PATH_MAX);
 	return path;
 }
 
-void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name)
 {
 	char *path;
 
@@ -489,445 +55,34 @@ void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name)
 }
 
 /**
- *	sysfs_add_one - add sysfs_dirent to parent
- *	@acxt: addrm context to use
- *	@sd: sysfs_dirent to be added
- *	@parent_sd: the parent sysfs_dirent to add @sd to
- *
- *	Get @parent_sd and set @sd->s_parent to it and increment nlink of
- *	the parent inode if @sd is a directory and link into the children
- *	list of the parent.
- *
- *	This function should be called between calls to
- *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
- *	passed the same @acxt as passed to sysfs_addrm_start().
- *
- *	LOCKING:
- *	Determined by sysfs_addrm_start().
- *
- *	RETURNS:
- *	0 on success, -EEXIST if entry with the given name already
- *	exists.
- */
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		  struct sysfs_dirent *parent_sd)
-{
-	int ret;
-
-	ret = __sysfs_add_one(acxt, sd, parent_sd);
-
-	if (ret == -EEXIST)
-		sysfs_warn_dup(parent_sd, sd->s_name);
-	return ret;
-}
-
-/**
- *	sysfs_remove_one - remove sysfs_dirent from parent
- *	@acxt: addrm context to use
- *	@sd: sysfs_dirent to be removed
- *
- *	Mark @sd removed and drop nlink of parent inode if @sd is a
- *	directory.  @sd is unlinked from the children list.
- *
- *	This function should be called between calls to
- *	sysfs_addrm_start() and sysfs_addrm_finish() and should be
- *	passed the same @acxt as passed to sysfs_addrm_start().
- *
- *	LOCKING:
- *	Determined by sysfs_addrm_start().
- */
-static void sysfs_remove_one(struct sysfs_addrm_cxt *acxt,
-			     struct sysfs_dirent *sd)
-{
-	struct sysfs_inode_attrs *ps_iattr;
-
-	/*
-	 * Removal can be called multiple times on the same node.  Only the
-	 * first invocation is effective and puts the base ref.
-	 */
-	if (sd->s_flags & SYSFS_FLAG_REMOVED)
-		return;
-
-	sysfs_unlink_sibling(sd);
-
-	/* Update timestamps on the parent */
-	ps_iattr = sd->s_parent->s_iattr;
-	if (ps_iattr) {
-		struct iattr *ps_iattrs = &ps_iattr->ia_iattr;
-		ps_iattrs->ia_ctime = ps_iattrs->ia_mtime = CURRENT_TIME;
-	}
-
-	sd->s_flags |= SYSFS_FLAG_REMOVED;
-	sd->u.removed_list = acxt->removed;
-	acxt->removed = sd;
-}
-
-/**
- *	sysfs_addrm_finish - finish up sysfs_dirent add/remove
- *	@acxt: addrm context to finish up
- *
- *	Finish up sysfs_dirent add/remove.  Resources acquired by
- *	sysfs_addrm_start() are released and removed sysfs_dirents are
- *	cleaned up.
- *
- *	LOCKING:
- *	sysfs_mutex is released.
- */
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt)
-	__releases(sysfs_mutex)
-{
-	/* release resources acquired by sysfs_addrm_start() */
-	mutex_unlock(&sysfs_mutex);
-
-	/* kill removed sysfs_dirents */
-	while (acxt->removed) {
-		struct sysfs_dirent *sd = acxt->removed;
-
-		acxt->removed = sd->u.removed_list;
-
-		sysfs_deactivate(sd);
-		sysfs_unmap_bin_file(sd);
-		sysfs_put(sd);
-	}
-}
-
-/**
- *	sysfs_find_dirent - find sysfs_dirent with the given name
- *	@parent_sd: sysfs_dirent to search under
- *	@name: name to look for
- *	@ns: the namespace tag to use
- *
- *	Look for sysfs_dirent with name @name under @parent_sd.
- *
- *	LOCKING:
- *	mutex_lock(sysfs_mutex)
- *
- *	RETURNS:
- *	Pointer to sysfs_dirent if found, NULL if not.
- */
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const unsigned char *name,
-				       const void *ns)
-{
-	struct rb_node *node = parent_sd->s_dir.children.rb_node;
-	unsigned int hash;
-
-	if (!!sysfs_ns_type(parent_sd) != !!ns) {
-		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
-			sysfs_ns_type(parent_sd) ? "required" : "invalid",
-			parent_sd->s_name, name);
-		return NULL;
-	}
-
-	hash = sysfs_name_hash(name, ns);
-	while (node) {
-		struct sysfs_dirent *sd;
-		int result;
-
-		sd = to_sysfs_dirent(node);
-		result = sysfs_name_compare(hash, name, ns, sd);
-		if (result < 0)
-			node = node->rb_left;
-		else if (result > 0)
-			node = node->rb_right;
-		else
-			return sd;
-	}
-	return NULL;
-}
-
-/**
- *	sysfs_get_dirent_ns - find and get sysfs_dirent with the given name
- *	@parent_sd: sysfs_dirent to search under
- *	@name: name to look for
- *	@ns: the namespace tag to use
- *
- *	Look for sysfs_dirent with name @name under @parent_sd and get
- *	it if found.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).  Grabs sysfs_mutex.
- *
- *	RETURNS:
- *	Pointer to sysfs_dirent if found, NULL if not.
- */
-struct sysfs_dirent *sysfs_get_dirent_ns(struct sysfs_dirent *parent_sd,
-					 const unsigned char *name,
-					 const void *ns)
-{
-	struct sysfs_dirent *sd;
-
-	mutex_lock(&sysfs_mutex);
-	sd = sysfs_find_dirent(parent_sd, name, ns);
-	sysfs_get(sd);
-	mutex_unlock(&sysfs_mutex);
-
-	return sd;
-}
-EXPORT_SYMBOL_GPL(sysfs_get_dirent_ns);
-
-static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd,
-		      enum kobj_ns_type type,
-		      const char *name, const void *ns,
-		      struct sysfs_dirent **p_sd)
-{
-	umode_t mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-	int rc;
-
-	/* allocate */
-	sd = sysfs_new_dirent(name, mode, SYSFS_DIR);
-	if (!sd)
-		return -ENOMEM;
-
-	sd->s_flags |= (type << SYSFS_NS_TYPE_SHIFT);
-	sd->s_ns = ns;
-	sd->s_dir.kobj = kobj;
-
-	/* link in */
-	sysfs_addrm_start(&acxt);
-	rc = sysfs_add_one(&acxt, sd, parent_sd);
-	sysfs_addrm_finish(&acxt);
-
-	if (rc == 0)
-		*p_sd = sd;
-	else
-		sysfs_put(sd);
-
-	return rc;
-}
-
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
-			struct sysfs_dirent **p_sd)
-{
-	return create_dir(kobj, kobj->sd,
-			  KOBJ_NS_TYPE_NONE, name, NULL, p_sd);
-}
-
-/**
- *	sysfs_read_ns_type: return associated ns_type
- *	@kobj: the kobject being queried
- *
- *	Each kobject can be tagged with exactly one namespace type
- *	(i.e. network or user).  Return the ns_type associated with
- *	this object if any
- */
-static enum kobj_ns_type sysfs_read_ns_type(struct kobject *kobj)
-{
-	const struct kobj_ns_type_operations *ops;
-	enum kobj_ns_type type;
-
-	ops = kobj_child_ns_ops(kobj);
-	if (!ops)
-		return KOBJ_NS_TYPE_NONE;
-
-	type = ops->type;
-	BUG_ON(type <= KOBJ_NS_TYPE_NONE);
-	BUG_ON(type >= KOBJ_NS_TYPES);
-	BUG_ON(!kobj_ns_type_registered(type));
-
-	return type;
-}
-
-/**
  * sysfs_create_dir_ns - create a directory for an object with a namespace tag
  * @kobj: object we're creating directory for
  * @ns: the namespace tag to use
  */
 int sysfs_create_dir_ns(struct kobject *kobj, const void *ns)
 {
-	enum kobj_ns_type type;
-	struct sysfs_dirent *parent_sd, *sd;
-	int error = 0;
+	struct kernfs_node *parent, *kn;
 
 	BUG_ON(!kobj);
 
 	if (kobj->parent)
-		parent_sd = kobj->parent->sd;
+		parent = kobj->parent->sd;
 	else
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 
-	if (!parent_sd)
+	if (!parent)
 		return -ENOENT;
 
-	type = sysfs_read_ns_type(kobj);
-
-	error = create_dir(kobj, parent_sd, type, kobject_name(kobj), ns, &sd);
-	if (!error)
-		kobj->sd = sd;
-	return error;
-}
-
-static struct dentry *sysfs_lookup(struct inode *dir, struct dentry *dentry,
-				   unsigned int flags)
-{
-	struct dentry *ret = NULL;
-	struct dentry *parent = dentry->d_parent;
-	struct sysfs_dirent *parent_sd = parent->d_fsdata;
-	struct sysfs_dirent *sd;
-	struct inode *inode;
-	enum kobj_ns_type type;
-	const void *ns;
-
-	mutex_lock(&sysfs_mutex);
-
-	type = sysfs_ns_type(parent_sd);
-	ns = sysfs_info(dir->i_sb)->ns[type];
-
-	sd = sysfs_find_dirent(parent_sd, dentry->d_name.name, ns);
-
-	/* no such entry */
-	if (!sd) {
-		ret = ERR_PTR(-ENOENT);
-		goto out_unlock;
-	}
-	dentry->d_fsdata = sysfs_get(sd);
-
-	/* attach dentry and inode */
-	inode = sysfs_get_inode(dir->i_sb, sd);
-	if (!inode) {
-		ret = ERR_PTR(-ENOMEM);
-		goto out_unlock;
-	}
-
-	/* instantiate and hash dentry */
-	ret = d_materialise_unique(dentry, inode);
- out_unlock:
-	mutex_unlock(&sysfs_mutex);
-	return ret;
-}
-
-const struct inode_operations sysfs_dir_inode_operations = {
-	.lookup		= sysfs_lookup,
-	.permission	= sysfs_permission,
-	.setattr	= sysfs_setattr,
-	.getattr	= sysfs_getattr,
-	.setxattr	= sysfs_setxattr,
-};
-
-static struct sysfs_dirent *sysfs_leftmost_descendant(struct sysfs_dirent *pos)
-{
-	struct sysfs_dirent *last;
-
-	while (true) {
-		struct rb_node *rbn;
-
-		last = pos;
-
-		if (sysfs_type(pos) != SYSFS_DIR)
-			break;
-
-		rbn = rb_first(&pos->s_dir.children);
-		if (!rbn)
-			break;
-
-		pos = to_sysfs_dirent(rbn);
-	}
-
-	return last;
-}
-
-/**
- * sysfs_next_descendant_post - find the next descendant for post-order walk
- * @pos: the current position (%NULL to initiate traversal)
- * @root: sysfs_dirent whose descendants to walk
- *
- * Find the next descendant to visit for post-order traversal of @root's
- * descendants.  @root is included in the iteration and the last node to be
- * visited.
- */
-static struct sysfs_dirent *sysfs_next_descendant_post(struct sysfs_dirent *pos,
-						       struct sysfs_dirent *root)
-{
-	struct rb_node *rbn;
-
-	lockdep_assert_held(&sysfs_mutex);
-
-	/* if first iteration, visit leftmost descendant which may be root */
-	if (!pos)
-		return sysfs_leftmost_descendant(root);
-
-	/* if we visited @root, we're done */
-	if (pos == root)
-		return NULL;
-
-	/* if there's an unvisited sibling, visit its leftmost descendant */
-	rbn = rb_next(&pos->s_rb);
-	if (rbn)
-		return sysfs_leftmost_descendant(to_sysfs_dirent(rbn));
-
-	/* no sibling left, visit parent */
-	return pos->s_parent;
-}
-
-static void __sysfs_remove(struct sysfs_addrm_cxt *acxt,
-			   struct sysfs_dirent *sd)
-{
-	struct sysfs_dirent *pos, *next;
-
-	if (!sd)
-		return;
-
-	pr_debug("sysfs %s: removing\n", sd->s_name);
-
-	next = NULL;
-	do {
-		pos = next;
-		next = sysfs_next_descendant_post(pos, sd);
-		if (pos)
-			sysfs_remove_one(acxt, pos);
-	} while (next);
-}
-
-/**
- * sysfs_remove - remove a sysfs_dirent recursively
- * @sd: the sysfs_dirent to remove
- *
- * Remove @sd along with all its subdirectories and files.
- */
-void sysfs_remove(struct sysfs_dirent *sd)
-{
-	struct sysfs_addrm_cxt acxt;
-
-	sysfs_addrm_start(&acxt);
-	__sysfs_remove(&acxt, sd);
-	sysfs_addrm_finish(&acxt);
-}
-
-/**
- * sysfs_hash_and_remove - find a sysfs_dirent by name and remove it
- * @dir_sd: parent of the target
- * @name: name of the sysfs_dirent to remove
- * @ns: namespace tag of the sysfs_dirent to remove
- *
- * Look for the sysfs_dirent with @name and @ns under @dir_sd and remove
- * it.  Returns 0 on success, -ENOENT if such entry doesn't exist.
- */
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns)
-{
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-
-	if (!dir_sd) {
-		WARN(1, KERN_WARNING "sysfs: can not remove '%s', no directory\n",
-			name);
-		return -ENOENT;
+	kn = kernfs_create_dir_ns(parent, kobject_name(kobj),
+				  S_IRWXU | S_IRUGO | S_IXUGO, kobj, ns);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, kobject_name(kobj));
+		return PTR_ERR(kn);
 	}
 
-	sysfs_addrm_start(&acxt);
-
-	sd = sysfs_find_dirent(dir_sd, name, ns);
-	if (sd)
-		__sysfs_remove(&acxt, sd);
-
-	sysfs_addrm_finish(&acxt);
-
-	if (sd)
-		return 0;
-	else
-		return -ENOENT;
+	kobj->sd = kn;
+	return 0;
 }
 
 /**
@@ -940,207 +95,47 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
  */
 void sysfs_remove_dir(struct kobject *kobj)
 {
-	struct sysfs_dirent *sd = kobj->sd;
+	struct kernfs_node *kn = kobj->sd;
 
 	/*
 	 * In general, kboject owner is responsible for ensuring removal
 	 * doesn't race with other operations and sysfs doesn't provide any
 	 * protection; however, when @kobj is used as a symlink target, the
 	 * symlinking entity usually doesn't own @kobj and thus has no
-	 * control over removal.  @kobj->sd may be removed anytime and
-	 * symlink code may end up dereferencing an already freed sd.
+	 * control over removal.  @kobj->sd may be removed anytime
+	 * and symlink code may end up dereferencing an already freed node.
 	 *
-	 * sysfs_symlink_target_lock synchronizes @kobj->sd disassociation
-	 * against symlink operations so that symlink code can safely
-	 * dereference @kobj->sd.
+	 * sysfs_symlink_target_lock synchronizes @kobj->sd
+	 * disassociation against symlink operations so that symlink code
+	 * can safely dereference @kobj->sd.
 	 */
 	spin_lock(&sysfs_symlink_target_lock);
 	kobj->sd = NULL;
 	spin_unlock(&sysfs_symlink_target_lock);
 
-	if (sd) {
-		WARN_ON_ONCE(sysfs_type(sd) != SYSFS_DIR);
-		sysfs_remove(sd);
+	if (kn) {
+		WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR);
+		kernfs_remove(kn);
 	}
 }
 
-int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const char *new_name, const void *new_ns)
-{
-	int error;
-
-	mutex_lock(&sysfs_mutex);
-
-	error = 0;
-	if ((sd->s_parent == new_parent_sd) && (sd->s_ns == new_ns) &&
-	    (strcmp(sd->s_name, new_name) == 0))
-		goto out;	/* nothing to rename */
-
-	error = -EEXIST;
-	if (sysfs_find_dirent(new_parent_sd, new_name, new_ns))
-		goto out;
-
-	/* rename sysfs_dirent */
-	if (strcmp(sd->s_name, new_name) != 0) {
-		error = -ENOMEM;
-		new_name = kstrdup(new_name, GFP_KERNEL);
-		if (!new_name)
-			goto out;
-
-		kfree(sd->s_name);
-		sd->s_name = new_name;
-	}
-
-	/*
-	 * Move to the appropriate place in the appropriate directories rbtree.
-	 */
-	sysfs_unlink_sibling(sd);
-	sysfs_get(new_parent_sd);
-	sysfs_put(sd->s_parent);
-	sd->s_ns = new_ns;
-	sd->s_hash = sysfs_name_hash(sd->s_name, sd->s_ns);
-	sd->s_parent = new_parent_sd;
-	sysfs_link_sibling(sd);
-
-	error = 0;
- out:
-	mutex_unlock(&sysfs_mutex);
-	return error;
-}
-
 int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name,
 			const void *new_ns)
 {
-	struct sysfs_dirent *parent_sd = kobj->sd->s_parent;
+	struct kernfs_node *parent = kobj->sd->parent;
 
-	return sysfs_rename(kobj->sd, parent_sd, new_name, new_ns);
+	return kernfs_rename_ns(kobj->sd, parent, new_name, new_ns);
 }
 
 int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj,
 		      const void *new_ns)
 {
-	struct sysfs_dirent *sd = kobj->sd;
-	struct sysfs_dirent *new_parent_sd;
+	struct kernfs_node *kn = kobj->sd;
+	struct kernfs_node *new_parent;
 
-	BUG_ON(!sd->s_parent);
-	new_parent_sd = new_parent_kobj && new_parent_kobj->sd ?
-		new_parent_kobj->sd : &sysfs_root;
+	BUG_ON(!kn->parent);
+	new_parent = new_parent_kobj && new_parent_kobj->sd ?
+		new_parent_kobj->sd : sysfs_root_kn;
 
-	return sysfs_rename(sd, new_parent_sd, sd->s_name, new_ns);
+	return kernfs_rename_ns(kn, new_parent, kn->name, new_ns);
 }
-
-/* Relationship between s_mode and the DT_xxx types */
-static inline unsigned char dt_type(struct sysfs_dirent *sd)
-{
-	return (sd->s_mode >> 12) & 15;
-}
-
-static int sysfs_dir_release(struct inode *inode, struct file *filp)
-{
-	sysfs_put(filp->private_data);
-	return 0;
-}
-
-static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	loff_t hash, struct sysfs_dirent *pos)
-{
-	if (pos) {
-		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
-			pos->s_parent == parent_sd &&
-			hash == pos->s_hash;
-		sysfs_put(pos);
-		if (!valid)
-			pos = NULL;
-	}
-	if (!pos && (hash > 1) && (hash < INT_MAX)) {
-		struct rb_node *node = parent_sd->s_dir.children.rb_node;
-		while (node) {
-			pos = to_sysfs_dirent(node);
-
-			if (hash < pos->s_hash)
-				node = node->rb_left;
-			else if (hash > pos->s_hash)
-				node = node->rb_right;
-			else
-				break;
-		}
-	}
-	/* Skip over entries in the wrong namespace */
-	while (pos && pos->s_ns != ns) {
-		struct rb_node *node = rb_next(&pos->s_rb);
-		if (!node)
-			pos = NULL;
-		else
-			pos = to_sysfs_dirent(node);
-	}
-	return pos;
-}
-
-static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
-{
-	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
-	if (pos)
-		do {
-			struct rb_node *node = rb_next(&pos->s_rb);
-			if (!node)
-				pos = NULL;
-			else
-				pos = to_sysfs_dirent(node);
-		} while (pos && pos->s_ns != ns);
-	return pos;
-}
-
-static int sysfs_readdir(struct file *file, struct dir_context *ctx)
-{
-	struct dentry *dentry = file->f_path.dentry;
-	struct sysfs_dirent *parent_sd = dentry->d_fsdata;
-	struct sysfs_dirent *pos = file->private_data;
-	enum kobj_ns_type type;
-	const void *ns;
-
-	type = sysfs_ns_type(parent_sd);
-	ns = sysfs_info(dentry->d_sb)->ns[type];
-
-	if (!dir_emit_dots(file, ctx))
-		return 0;
-	mutex_lock(&sysfs_mutex);
-	for (pos = sysfs_dir_pos(ns, parent_sd, ctx->pos, pos);
-	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, ctx->pos, pos)) {
-		const char *name = pos->s_name;
-		unsigned int type = dt_type(pos);
-		int len = strlen(name);
-		ino_t ino = pos->s_ino;
-		ctx->pos = pos->s_hash;
-		file->private_data = sysfs_get(pos);
-
-		mutex_unlock(&sysfs_mutex);
-		if (!dir_emit(ctx, name, len, ino, type))
-			return 0;
-		mutex_lock(&sysfs_mutex);
-	}
-	mutex_unlock(&sysfs_mutex);
-	file->private_data = NULL;
-	ctx->pos = INT_MAX;
-	return 0;
-}
-
-static loff_t sysfs_dir_llseek(struct file *file, loff_t offset, int whence)
-{
-	struct inode *inode = file_inode(file);
-	loff_t ret;
-
-	mutex_lock(&inode->i_mutex);
-	ret = generic_file_llseek(file, offset, whence);
-	mutex_unlock(&inode->i_mutex);
-
-	return ret;
-}
-
-const struct file_operations sysfs_dir_operations = {
-	.read		= generic_read_dir,
-	.iterate	= sysfs_readdir,
-	.release	= sysfs_dir_release,
-	.llseek		= sysfs_dir_llseek,
-};
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 79b5da2acbe1..810cf6e613e5 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -14,70 +14,23 @@
 #include <linux/kobject.h>
 #include <linux/kallsyms.h>
 #include <linux/slab.h>
-#include <linux/fsnotify.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
 #include <linux/list.h>
 #include <linux/mutex.h>
-#include <linux/limits.h>
-#include <linux/uaccess.h>
 #include <linux/seq_file.h>
-#include <linux/mm.h>
 
 #include "sysfs.h"
+#include "../kernfs/kernfs-internal.h"
 
 /*
- * There's one sysfs_open_file for each open file and one sysfs_open_dirent
- * for each sysfs_dirent with one or more open files.
- *
- * sysfs_dirent->s_attr.open points to sysfs_open_dirent.  s_attr.open is
- * protected by sysfs_open_dirent_lock.
- *
- * filp->private_data points to seq_file whose ->private points to
- * sysfs_open_file.  sysfs_open_files are chained at
- * sysfs_open_dirent->files, which is protected by sysfs_open_file_mutex.
- */
-static DEFINE_SPINLOCK(sysfs_open_dirent_lock);
-static DEFINE_MUTEX(sysfs_open_file_mutex);
-
-struct sysfs_open_dirent {
-	atomic_t		refcnt;
-	atomic_t		event;
-	wait_queue_head_t	poll;
-	struct list_head	files; /* goes through sysfs_open_file.list */
-};
-
-struct sysfs_open_file {
-	struct sysfs_dirent	*sd;
-	struct file		*file;
-	struct mutex		mutex;
-	int			event;
-	struct list_head	list;
-
-	bool			mmapped;
-	const struct vm_operations_struct *vm_ops;
-};
-
-static bool sysfs_is_bin(struct sysfs_dirent *sd)
-{
-	return sysfs_type(sd) == SYSFS_KOBJ_BIN_ATTR;
-}
-
-static struct sysfs_open_file *sysfs_of(struct file *file)
-{
-	return ((struct seq_file *)file->private_data)->private;
-}
-
-/*
- * Determine ktype->sysfs_ops for the given sysfs_dirent.  This function
+ * Determine ktype->sysfs_ops for the given kernfs_node.  This function
  * must be called while holding an active reference.
  */
-static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
+static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn)
 {
-	struct kobject *kobj = sd->s_parent->s_dir.kobj;
+	struct kobject *kobj = kn->parent->priv;
 
-	if (!sysfs_ignore_lockdep(sd))
-		lockdep_assert_held(sd);
+	if (kn->flags & KERNFS_LOCKDEP)
+		lockdep_assert_held(kn);
 	return kobj->ktype ? kobj->ktype->sysfs_ops : NULL;
 }
 
@@ -86,13 +39,13 @@ static const struct sysfs_ops *sysfs_file_ops(struct sysfs_dirent *sd)
  * details like buffering and seeking.  The following function pipes
  * sysfs_ops->show() result through seq_file.
  */
-static int sysfs_seq_show(struct seq_file *sf, void *v)
+static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
 {
-	struct sysfs_open_file *of = sf->private;
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	const struct sysfs_ops *ops;
-	char *buf;
+	struct kernfs_open_file *of = sf->private;
+	struct kobject *kobj = of->kn->parent->priv;
+	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
 	ssize_t count;
+	char *buf;
 
 	/* acquire buffer and ensure that it's >= PAGE_SIZE */
 	count = seq_get_buf(sf, &buf);
@@ -102,34 +55,15 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
 	}
 
 	/*
-	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
-	 * nests outside active ref and is just to ensure that the ops
-	 * aren't called concurrently for the same open file.
+	 * Invoke show().  Control may reach here via seq file lseek even
+	 * if @ops->show() isn't implemented.
 	 */
-	mutex_lock(&of->mutex);
-	if (!sysfs_get_active(of->sd)) {
-		mutex_unlock(&of->mutex);
-		return -ENODEV;
+	if (ops->show) {
+		count = ops->show(kobj, of->kn->priv, buf);
+		if (count < 0)
+			return count;
 	}
 
-	of->event = atomic_read(&of->sd->s_attr.open->event);
-
-	/*
-	 * Lookup @ops and invoke show().  Control may reach here via seq
-	 * file lseek even if @ops->show() isn't implemented.
-	 */
-	ops = sysfs_file_ops(of->sd);
-	if (ops->show)
-		count = ops->show(kobj, of->sd->s_attr.attr, buf);
-	else
-		count = 0;
-
-	sysfs_put_active(of->sd);
-	mutex_unlock(&of->mutex);
-
-	if (count < 0)
-		return count;
-
 	/*
 	 * The code works fine with PAGE_SIZE return but it's likely to
 	 * indicate truncated result or overflow in normal use cases.
@@ -144,710 +78,194 @@ static int sysfs_seq_show(struct seq_file *sf, void *v)
 	return 0;
 }
 
-/*
- * Read method for bin files.  As reading a bin file can have side-effects,
- * the exact offset and bytes specified in read(2) call should be passed to
- * the read callback making it difficult to use seq_file.  Implement
- * simplistic custom buffering for bin files.
- */
-static ssize_t sysfs_bin_read(struct file *file, char __user *userbuf,
-			      size_t bytes, loff_t *off)
+static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf,
+				 size_t count, loff_t pos)
 {
-	struct sysfs_open_file *of = sysfs_of(file);
-	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	loff_t size = file_inode(file)->i_size;
-	int count = min_t(size_t, bytes, PAGE_SIZE);
-	loff_t offs = *off;
-	char *buf;
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+	loff_t size = file_inode(of->file)->i_size;
 
-	if (!bytes)
+	if (!count)
 		return 0;
 
 	if (size) {
-		if (offs > size)
+		if (pos > size)
 			return 0;
-		if (offs + count > size)
-			count = size - offs;
+		if (pos + count > size)
+			count = size - pos;
 	}
 
-	buf = kmalloc(count, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	/* need of->sd for battr, its parent for kobj */
-	mutex_lock(&of->mutex);
-	if (!sysfs_get_active(of->sd)) {
-		count = -ENODEV;
-		mutex_unlock(&of->mutex);
-		goto out_free;
-	}
+	if (!battr->read)
+		return -EIO;
 
-	if (battr->read)
-		count = battr->read(file, kobj, battr, buf, offs, count);
-	else
-		count = -EIO;
-
-	sysfs_put_active(of->sd);
-	mutex_unlock(&of->mutex);
-
-	if (count < 0)
-		goto out_free;
-
-	if (copy_to_user(userbuf, buf, count)) {
-		count = -EFAULT;
-		goto out_free;
-	}
-
-	pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count);
-
-	*off = offs + count;
-
- out_free:
-	kfree(buf);
-	return count;
+	return battr->read(of->file, kobj, battr, buf, pos, count);
 }
 
-/**
- * flush_write_buffer - push buffer to kobject
- * @of: open file
- * @buf: data buffer for file
- * @off: file offset to write to
- * @count: number of bytes
- *
- * Get the correct pointers for the kobject and the attribute we're dealing
- * with, then call the store() method for it with @buf.
- */
-static int flush_write_buffer(struct sysfs_open_file *of, char *buf, loff_t off,
-			      size_t count)
+/* kernfs write callback for regular sysfs files */
+static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf,
+			      size_t count, loff_t pos)
 {
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	int rc = 0;
-
-	/*
-	 * Need @of->sd for attr and ops, its parent for kobj.  @of->mutex
-	 * nests outside active ref and is just to ensure that the ops
-	 * aren't called concurrently for the same open file.
-	 */
-	mutex_lock(&of->mutex);
-	if (!sysfs_get_active(of->sd)) {
-		mutex_unlock(&of->mutex);
-		return -ENODEV;
-	}
-
-	if (sysfs_is_bin(of->sd)) {
-		struct bin_attribute *battr = of->sd->s_attr.bin_attr;
-
-		rc = -EIO;
-		if (battr->write)
-			rc = battr->write(of->file, kobj, battr, buf, off,
-					  count);
-	} else {
-		const struct sysfs_ops *ops = sysfs_file_ops(of->sd);
+	const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
+	struct kobject *kobj = of->kn->parent->priv;
 
-		rc = ops->store(kobj, of->sd->s_attr.attr, buf, count);
-	}
-
-	sysfs_put_active(of->sd);
-	mutex_unlock(&of->mutex);
+	if (!count)
+		return 0;
 
-	return rc;
+	return ops->store(kobj, of->kn->priv, buf, count);
 }
 
-/**
- * sysfs_write_file - write an attribute
- * @file: file pointer
- * @user_buf: data to write
- * @count: number of bytes
- * @ppos: starting offset
- *
- * Copy data in from userland and pass it to the matching
- * sysfs_ops->store() by invoking flush_write_buffer().
- *
- * There is no easy way for us to know if userspace is only doing a partial
- * write, so we don't support them. We expect the entire buffer to come on
- * the first write.  Hint: if you're writing a value, first read the file,
- * modify only the the value you're changing, then write entire buffer
- * back.
- */
-static ssize_t sysfs_write_file(struct file *file, const char __user *user_buf,
-				size_t count, loff_t *ppos)
+/* kernfs write callback for bin sysfs files */
+static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf,
+				  size_t count, loff_t pos)
 {
-	struct sysfs_open_file *of = sysfs_of(file);
-	ssize_t len = min_t(size_t, count, PAGE_SIZE);
-	loff_t size = file_inode(file)->i_size;
-	char *buf;
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
+	loff_t size = file_inode(of->file)->i_size;
 
-	if (sysfs_is_bin(of->sd) && size) {
-		if (size <= *ppos)
+	if (size) {
+		if (size <= pos)
 			return 0;
-		len = min_t(ssize_t, len, size - *ppos);
+		count = min_t(ssize_t, count, size - pos);
 	}
-
-	if (!len)
+	if (!count)
 		return 0;
 
-	buf = kmalloc(len + 1, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
+	if (!battr->write)
+		return -EIO;
 
-	if (copy_from_user(buf, user_buf, len)) {
-		len = -EFAULT;
-		goto out_free;
-	}
-	buf[len] = '\0';	/* guarantee string termination */
-
-	len = flush_write_buffer(of, buf, *ppos, len);
-	if (len > 0)
-		*ppos += len;
-out_free:
-	kfree(buf);
-	return len;
+	return battr->write(of->file, kobj, battr, buf, pos, count);
 }
 
-static void sysfs_bin_vma_open(struct vm_area_struct *vma)
+static int sysfs_kf_bin_mmap(struct kernfs_open_file *of,
+			     struct vm_area_struct *vma)
 {
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-
-	if (!of->vm_ops)
-		return;
-
-	if (!sysfs_get_active(of->sd))
-		return;
+	struct bin_attribute *battr = of->kn->priv;
+	struct kobject *kobj = of->kn->parent->priv;
 
-	if (of->vm_ops->open)
-		of->vm_ops->open(vma);
-
-	sysfs_put_active(of->sd);
+	return battr->mmap(of->file, kobj, battr, vma);
 }
 
-static int sysfs_bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
 {
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return VM_FAULT_SIGBUS;
-
-	if (!sysfs_get_active(of->sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = VM_FAULT_SIGBUS;
-	if (of->vm_ops->fault)
-		ret = of->vm_ops->fault(vma, vmf);
+	struct kernfs_node *kn = kobj->sd, *tmp;
 
-	sysfs_put_active(of->sd);
-	return ret;
-}
-
-static int sysfs_bin_page_mkwrite(struct vm_area_struct *vma,
-				  struct vm_fault *vmf)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return VM_FAULT_SIGBUS;
-
-	if (!sysfs_get_active(of->sd))
-		return VM_FAULT_SIGBUS;
-
-	ret = 0;
-	if (of->vm_ops->page_mkwrite)
-		ret = of->vm_ops->page_mkwrite(vma, vmf);
+	if (kn && dir)
+		kn = kernfs_find_and_get(kn, dir);
 	else
-		file_update_time(file);
-
-	sysfs_put_active(of->sd);
-	return ret;
-}
-
-static int sysfs_bin_access(struct vm_area_struct *vma, unsigned long addr,
-			    void *buf, int len, int write)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return -EINVAL;
-
-	if (!sysfs_get_active(of->sd))
-		return -EINVAL;
-
-	ret = -EINVAL;
-	if (of->vm_ops->access)
-		ret = of->vm_ops->access(vma, addr, buf, len, write);
-
-	sysfs_put_active(of->sd);
-	return ret;
-}
-
-#ifdef CONFIG_NUMA
-static int sysfs_bin_set_policy(struct vm_area_struct *vma,
-				struct mempolicy *new)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(of->sd))
-		return -EINVAL;
-
-	ret = 0;
-	if (of->vm_ops->set_policy)
-		ret = of->vm_ops->set_policy(vma, new);
-
-	sysfs_put_active(of->sd);
-	return ret;
-}
-
-static struct mempolicy *sysfs_bin_get_policy(struct vm_area_struct *vma,
-					      unsigned long addr)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	struct mempolicy *pol;
-
-	if (!of->vm_ops)
-		return vma->vm_policy;
-
-	if (!sysfs_get_active(of->sd))
-		return vma->vm_policy;
-
-	pol = vma->vm_policy;
-	if (of->vm_ops->get_policy)
-		pol = of->vm_ops->get_policy(vma, addr);
-
-	sysfs_put_active(of->sd);
-	return pol;
-}
-
-static int sysfs_bin_migrate(struct vm_area_struct *vma, const nodemask_t *from,
-			     const nodemask_t *to, unsigned long flags)
-{
-	struct file *file = vma->vm_file;
-	struct sysfs_open_file *of = sysfs_of(file);
-	int ret;
-
-	if (!of->vm_ops)
-		return 0;
-
-	if (!sysfs_get_active(of->sd))
-		return 0;
-
-	ret = 0;
-	if (of->vm_ops->migrate)
-		ret = of->vm_ops->migrate(vma, from, to, flags);
+		kernfs_get(kn);
 
-	sysfs_put_active(of->sd);
-	return ret;
-}
-#endif
-
-static const struct vm_operations_struct sysfs_bin_vm_ops = {
-	.open		= sysfs_bin_vma_open,
-	.fault		= sysfs_bin_fault,
-	.page_mkwrite	= sysfs_bin_page_mkwrite,
-	.access		= sysfs_bin_access,
-#ifdef CONFIG_NUMA
-	.set_policy	= sysfs_bin_set_policy,
-	.get_policy	= sysfs_bin_get_policy,
-	.migrate	= sysfs_bin_migrate,
-#endif
-};
-
-static int sysfs_bin_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	struct sysfs_open_file *of = sysfs_of(file);
-	struct bin_attribute *battr = of->sd->s_attr.bin_attr;
-	struct kobject *kobj = of->sd->s_parent->s_dir.kobj;
-	int rc;
-
-	mutex_lock(&of->mutex);
-
-	/* need of->sd for battr, its parent for kobj */
-	rc = -ENODEV;
-	if (!sysfs_get_active(of->sd))
-		goto out_unlock;
-
-	if (!battr->mmap)
-		goto out_put;
-
-	rc = battr->mmap(file, kobj, battr, vma);
-	if (rc)
-		goto out_put;
-
-	/*
-	 * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup()
-	 * to satisfy versions of X which crash if the mmap fails: that
-	 * substitutes a new vm_file, and we don't then want bin_vm_ops.
-	 */
-	if (vma->vm_file != file)
-		goto out_put;
-
-	rc = -EINVAL;
-	if (of->mmapped && of->vm_ops != vma->vm_ops)
-		goto out_put;
-
-	/*
-	 * It is not possible to successfully wrap close.
-	 * So error if someone is trying to use close.
-	 */
-	rc = -EINVAL;
-	if (vma->vm_ops && vma->vm_ops->close)
-		goto out_put;
-
-	rc = 0;
-	of->mmapped = 1;
-	of->vm_ops = vma->vm_ops;
-	vma->vm_ops = &sysfs_bin_vm_ops;
-out_put:
-	sysfs_put_active(of->sd);
-out_unlock:
-	mutex_unlock(&of->mutex);
-
-	return rc;
-}
-
-/**
- *	sysfs_get_open_dirent - get or create sysfs_open_dirent
- *	@sd: target sysfs_dirent
- *	@of: sysfs_open_file for this instance of open
- *
- *	If @sd->s_attr.open exists, increment its reference count;
- *	otherwise, create one.  @of is chained to the files list.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).
- *
- *	RETURNS:
- *	0 on success, -errno on failure.
- */
-static int sysfs_get_open_dirent(struct sysfs_dirent *sd,
-				 struct sysfs_open_file *of)
-{
-	struct sysfs_open_dirent *od, *new_od = NULL;
-
- retry:
-	mutex_lock(&sysfs_open_file_mutex);
-	spin_lock_irq(&sysfs_open_dirent_lock);
-
-	if (!sd->s_attr.open && new_od) {
-		sd->s_attr.open = new_od;
-		new_od = NULL;
-	}
-
-	od = sd->s_attr.open;
-	if (od) {
-		atomic_inc(&od->refcnt);
-		list_add_tail(&of->list, &od->files);
+	if (kn && attr) {
+		tmp = kernfs_find_and_get(kn, attr);
+		kernfs_put(kn);
+		kn = tmp;
 	}
 
-	spin_unlock_irq(&sysfs_open_dirent_lock);
-	mutex_unlock(&sysfs_open_file_mutex);
-
-	if (od) {
-		kfree(new_od);
-		return 0;
+	if (kn) {
+		kernfs_notify(kn);
+		kernfs_put(kn);
 	}
+}
+EXPORT_SYMBOL_GPL(sysfs_notify);
 
-	/* not there, initialize a new one and retry */
-	new_od = kmalloc(sizeof(*new_od), GFP_KERNEL);
-	if (!new_od)
-		return -ENOMEM;
+static const struct kernfs_ops sysfs_file_kfops_empty = {
+};
 
-	atomic_set(&new_od->refcnt, 0);
-	atomic_set(&new_od->event, 1);
-	init_waitqueue_head(&new_od->poll);
-	INIT_LIST_HEAD(&new_od->files);
-	goto retry;
-}
+static const struct kernfs_ops sysfs_file_kfops_ro = {
+	.seq_show	= sysfs_kf_seq_show,
+};
 
-/**
- *	sysfs_put_open_dirent - put sysfs_open_dirent
- *	@sd: target sysfs_dirent
- *	@of: associated sysfs_open_file
- *
- *	Put @sd->s_attr.open and unlink @of from the files list.  If
- *	reference count reaches zero, disassociate and free it.
- *
- *	LOCKING:
- *	None.
- */
-static void sysfs_put_open_dirent(struct sysfs_dirent *sd,
-				  struct sysfs_open_file *of)
-{
-	struct sysfs_open_dirent *od = sd->s_attr.open;
-	unsigned long flags;
+static const struct kernfs_ops sysfs_file_kfops_wo = {
+	.write		= sysfs_kf_write,
+};
 
-	mutex_lock(&sysfs_open_file_mutex);
-	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
+static const struct kernfs_ops sysfs_file_kfops_rw = {
+	.seq_show	= sysfs_kf_seq_show,
+	.write		= sysfs_kf_write,
+};
 
-	if (of)
-		list_del(&of->list);
+static const struct kernfs_ops sysfs_bin_kfops_ro = {
+	.read		= sysfs_kf_bin_read,
+};
 
-	if (atomic_dec_and_test(&od->refcnt))
-		sd->s_attr.open = NULL;
-	else
-		od = NULL;
+static const struct kernfs_ops sysfs_bin_kfops_wo = {
+	.write		= sysfs_kf_bin_write,
+};
 
-	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
-	mutex_unlock(&sysfs_open_file_mutex);
+static const struct kernfs_ops sysfs_bin_kfops_rw = {
+	.read		= sysfs_kf_bin_read,
+	.write		= sysfs_kf_bin_write,
+};
 
-	kfree(od);
-}
+static const struct kernfs_ops sysfs_bin_kfops_mmap = {
+	.read		= sysfs_kf_bin_read,
+	.write		= sysfs_kf_bin_write,
+	.mmap		= sysfs_kf_bin_mmap,
+};
 
-static int sysfs_open_file(struct inode *inode, struct file *file)
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
+			   umode_t mode, const void *ns)
 {
-	struct sysfs_dirent *attr_sd = file->f_path.dentry->d_fsdata;
-	struct kobject *kobj = attr_sd->s_parent->s_dir.kobj;
-	struct sysfs_open_file *of;
-	bool has_read, has_write;
-	int error = -EACCES;
-
-	/* need attr_sd for attr and ops, its parent for kobj */
-	if (!sysfs_get_active(attr_sd))
-		return -ENODEV;
+	struct lock_class_key *key = NULL;
+	const struct kernfs_ops *ops;
+	struct kernfs_node *kn;
+	loff_t size;
 
-	if (sysfs_is_bin(attr_sd)) {
-		struct bin_attribute *battr = attr_sd->s_attr.bin_attr;
-
-		has_read = battr->read || battr->mmap;
-		has_write = battr->write || battr->mmap;
-	} else {
-		const struct sysfs_ops *ops = sysfs_file_ops(attr_sd);
+	if (!is_bin) {
+		struct kobject *kobj = parent->priv;
+		const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops;
 
 		/* every kobject with an attribute needs a ktype assigned */
-		if (WARN(!ops, KERN_ERR
+		if (WARN(!sysfs_ops, KERN_ERR
 			 "missing sysfs attribute operations for kobject: %s\n",
 			 kobject_name(kobj)))
-			goto err_out;
-
-		has_read = ops->show;
-		has_write = ops->store;
-	}
-
-	/* check perms and supported operations */
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (!(inode->i_mode & S_IWUGO) || !has_write))
-		goto err_out;
-
-	if ((file->f_mode & FMODE_READ) &&
-	    (!(inode->i_mode & S_IRUGO) || !has_read))
-		goto err_out;
-
-	/* allocate a sysfs_open_file for the file */
-	error = -ENOMEM;
-	of = kzalloc(sizeof(struct sysfs_open_file), GFP_KERNEL);
-	if (!of)
-		goto err_out;
-
-	mutex_init(&of->mutex);
-	of->sd = attr_sd;
-	of->file = file;
-
-	/*
-	 * Always instantiate seq_file even if read access doesn't use
-	 * seq_file or is not requested.  This unifies private data access
-	 * and readable regular files are the vast majority anyway.
-	 */
-	if (sysfs_is_bin(attr_sd))
-		error = single_open(file, NULL, of);
-	else
-		error = single_open(file, sysfs_seq_show, of);
-	if (error)
-		goto err_free;
-
-	/* seq_file clears PWRITE unconditionally, restore it if WRITE */
-	if (file->f_mode & FMODE_WRITE)
-		file->f_mode |= FMODE_PWRITE;
-
-	/* make sure we have open dirent struct */
-	error = sysfs_get_open_dirent(attr_sd, of);
-	if (error)
-		goto err_close;
-
-	/* open succeeded, put active references */
-	sysfs_put_active(attr_sd);
-	return 0;
-
-err_close:
-	single_release(inode, file);
-err_free:
-	kfree(of);
-err_out:
-	sysfs_put_active(attr_sd);
-	return error;
-}
-
-static int sysfs_release(struct inode *inode, struct file *filp)
-{
-	struct sysfs_dirent *sd = filp->f_path.dentry->d_fsdata;
-	struct sysfs_open_file *of = sysfs_of(filp);
-
-	sysfs_put_open_dirent(sd, of);
-	single_release(inode, filp);
-	kfree(of);
-
-	return 0;
-}
-
-void sysfs_unmap_bin_file(struct sysfs_dirent *sd)
-{
-	struct sysfs_open_dirent *od;
-	struct sysfs_open_file *of;
-
-	if (!sysfs_is_bin(sd))
-		return;
-
-	spin_lock_irq(&sysfs_open_dirent_lock);
-	od = sd->s_attr.open;
-	if (od)
-		atomic_inc(&od->refcnt);
-	spin_unlock_irq(&sysfs_open_dirent_lock);
-	if (!od)
-		return;
-
-	mutex_lock(&sysfs_open_file_mutex);
-	list_for_each_entry(of, &od->files, list) {
-		struct inode *inode = file_inode(of->file);
-		unmap_mapping_range(inode->i_mapping, 0, 0, 1);
+			return -EINVAL;
+
+		if (sysfs_ops->show && sysfs_ops->store)
+			ops = &sysfs_file_kfops_rw;
+		else if (sysfs_ops->show)
+			ops = &sysfs_file_kfops_ro;
+		else if (sysfs_ops->store)
+			ops = &sysfs_file_kfops_wo;
+		else
+			ops = &sysfs_file_kfops_empty;
+
+		size = PAGE_SIZE;
+	} else {
+		struct bin_attribute *battr = (void *)attr;
+
+		if (battr->mmap)
+			ops = &sysfs_bin_kfops_mmap;
+		else if (battr->read && battr->write)
+			ops = &sysfs_bin_kfops_rw;
+		else if (battr->read)
+			ops = &sysfs_bin_kfops_ro;
+		else if (battr->write)
+			ops = &sysfs_bin_kfops_wo;
+		else
+			ops = &sysfs_file_kfops_empty;
+
+		size = battr->size;
 	}
-	mutex_unlock(&sysfs_open_file_mutex);
-
-	sysfs_put_open_dirent(sd, NULL);
-}
-
-/* Sysfs attribute files are pollable.  The idea is that you read
- * the content and then you use 'poll' or 'select' to wait for
- * the content to change.  When the content changes (assuming the
- * manager for the kobject supports notification), poll will
- * return POLLERR|POLLPRI, and select will return the fd whether
- * it is waiting for read, write, or exceptions.
- * Once poll/select indicates that the value has changed, you
- * need to close and re-open the file, or seek to 0 and read again.
- * Reminder: this only works for attributes which actively support
- * it, and it is not possible to test an attribute from userspace
- * to see if it supports poll (Neither 'poll' nor 'select' return
- * an appropriate error code).  When in doubt, set a suitable timeout value.
- */
-static unsigned int sysfs_poll(struct file *filp, poll_table *wait)
-{
-	struct sysfs_open_file *of = sysfs_of(filp);
-	struct sysfs_dirent *attr_sd = filp->f_path.dentry->d_fsdata;
-	struct sysfs_open_dirent *od = attr_sd->s_attr.open;
-
-	/* need parent for the kobj, grab both */
-	if (!sysfs_get_active(attr_sd))
-		goto trigger;
-
-	poll_wait(filp, &od->poll, wait);
 
-	sysfs_put_active(attr_sd);
-
-	if (of->event != atomic_read(&od->event))
-		goto trigger;
-
-	return DEFAULT_POLLMASK;
-
- trigger:
-	return DEFAULT_POLLMASK|POLLERR|POLLPRI;
-}
-
-void sysfs_notify_dirent(struct sysfs_dirent *sd)
-{
-	struct sysfs_open_dirent *od;
-	unsigned long flags;
-
-	spin_lock_irqsave(&sysfs_open_dirent_lock, flags);
-
-	if (!WARN_ON(sysfs_type(sd) != SYSFS_KOBJ_ATTR)) {
-		od = sd->s_attr.open;
-		if (od) {
-			atomic_inc(&od->event);
-			wake_up_interruptible(&od->poll);
-		}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	if (!attr->ignore_lockdep)
+		key = attr->key ?: (struct lock_class_key *)&attr->skey;
+#endif
+	kn = __kernfs_create_file(parent, attr->name, mode, size, ops,
+				  (void *)attr, ns, true, key);
+	if (IS_ERR(kn)) {
+		if (PTR_ERR(kn) == -EEXIST)
+			sysfs_warn_dup(parent, attr->name);
+		return PTR_ERR(kn);
 	}
-
-	spin_unlock_irqrestore(&sysfs_open_dirent_lock, flags);
-}
-EXPORT_SYMBOL_GPL(sysfs_notify_dirent);
-
-void sysfs_notify(struct kobject *k, const char *dir, const char *attr)
-{
-	struct sysfs_dirent *sd = k->sd;
-
-	mutex_lock(&sysfs_mutex);
-
-	if (sd && dir)
-		sd = sysfs_find_dirent(sd, dir, NULL);
-	if (sd && attr)
-		sd = sysfs_find_dirent(sd, attr, NULL);
-	if (sd)
-		sysfs_notify_dirent(sd);
-
-	mutex_unlock(&sysfs_mutex);
-}
-EXPORT_SYMBOL_GPL(sysfs_notify);
-
-const struct file_operations sysfs_file_operations = {
-	.read		= seq_read,
-	.write		= sysfs_write_file,
-	.llseek		= generic_file_llseek,
-	.open		= sysfs_open_file,
-	.release	= sysfs_release,
-	.poll		= sysfs_poll,
-};
-
-const struct file_operations sysfs_bin_operations = {
-	.read		= sysfs_bin_read,
-	.write		= sysfs_write_file,
-	.llseek		= generic_file_llseek,
-	.mmap		= sysfs_bin_mmap,
-	.open		= sysfs_open_file,
-	.release	= sysfs_release,
-	.poll		= sysfs_poll,
-};
-
-int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-			   const struct attribute *attr, int type,
-			   umode_t amode, const void *ns)
-{
-	umode_t mode = (amode & S_IALLUGO) | S_IFREG;
-	struct sysfs_addrm_cxt acxt;
-	struct sysfs_dirent *sd;
-	int rc;
-
-	sd = sysfs_new_dirent(attr->name, mode, type);
-	if (!sd)
-		return -ENOMEM;
-
-	sd->s_ns = ns;
-	sd->s_attr.attr = (void *)attr;
-	sysfs_dirent_init_lockdep(sd);
-
-	sysfs_addrm_start(&acxt);
-	rc = sysfs_add_one(&acxt, sd, dir_sd);
-	sysfs_addrm_finish(&acxt);
-
-	if (rc)
-		sysfs_put(sd);
-
-	return rc;
+	return 0;
 }
 
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr,
-		   int type)
+int sysfs_add_file(struct kernfs_node *parent, const struct attribute *attr,
+		   bool is_bin)
 {
-	return sysfs_add_file_mode_ns(dir_sd, attr, type, attr->mode, NULL);
+	return sysfs_add_file_mode_ns(parent, attr, is_bin, attr->mode, NULL);
 }
 
 /**
@@ -861,8 +279,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr,
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file_mode_ns(kobj->sd, attr, SYSFS_KOBJ_ATTR,
-				      attr->mode, ns);
+	return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, ns);
 
 }
 EXPORT_SYMBOL_GPL(sysfs_create_file_ns);
@@ -890,19 +307,21 @@ EXPORT_SYMBOL_GPL(sysfs_create_files);
 int sysfs_add_file_to_group(struct kobject *kobj,
 		const struct attribute *attr, const char *group)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	int error;
 
-	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
-	else
-		dir_sd = sysfs_get(kobj->sd);
+	if (group) {
+		parent = kernfs_find_and_get(kobj->sd, group);
+	} else {
+		parent = kobj->sd;
+		kernfs_get(parent);
+	}
 
-	if (!dir_sd)
+	if (!parent)
 		return -ENOENT;
 
-	error = sysfs_add_file(dir_sd, attr, SYSFS_KOBJ_ATTR);
-	sysfs_put(dir_sd);
+	error = sysfs_add_file(parent, attr, false);
+	kernfs_put(parent);
 
 	return error;
 }
@@ -918,23 +337,20 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group);
 int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr,
 		     umode_t mode)
 {
-	struct sysfs_dirent *sd;
+	struct kernfs_node *kn;
 	struct iattr newattrs;
 	int rc;
 
-	mutex_lock(&sysfs_mutex);
-
-	rc = -ENOENT;
-	sd = sysfs_find_dirent(kobj->sd, attr->name, NULL);
-	if (!sd)
-		goto out;
+	kn = kernfs_find_and_get(kobj->sd, attr->name);
+	if (!kn)
+		return -ENOENT;
 
-	newattrs.ia_mode = (mode & S_IALLUGO) | (sd->s_mode & ~S_IALLUGO);
+	newattrs.ia_mode = (mode & S_IALLUGO) | (kn->mode & ~S_IALLUGO);
 	newattrs.ia_valid = ATTR_MODE;
-	rc = sysfs_sd_setattr(sd, &newattrs);
 
- out:
-	mutex_unlock(&sysfs_mutex);
+	rc = kernfs_setattr(kn, &newattrs);
+
+	kernfs_put(kn);
 	return rc;
 }
 EXPORT_SYMBOL_GPL(sysfs_chmod_file);
@@ -950,9 +366,9 @@ EXPORT_SYMBOL_GPL(sysfs_chmod_file);
 void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr,
 			  const void *ns)
 {
-	struct sysfs_dirent *dir_sd = kobj->sd;
+	struct kernfs_node *parent = kobj->sd;
 
-	sysfs_hash_and_remove(dir_sd, attr->name, ns);
+	kernfs_remove_by_name_ns(parent, attr->name, ns);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_ns);
 
@@ -973,15 +389,18 @@ EXPORT_SYMBOL_GPL(sysfs_remove_files);
 void sysfs_remove_file_from_group(struct kobject *kobj,
 		const struct attribute *attr, const char *group)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 
-	if (group)
-		dir_sd = sysfs_get_dirent(kobj->sd, group);
-	else
-		dir_sd = sysfs_get(kobj->sd);
-	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, attr->name, NULL);
-		sysfs_put(dir_sd);
+	if (group) {
+		parent = kernfs_find_and_get(kobj->sd, group);
+	} else {
+		parent = kobj->sd;
+		kernfs_get(parent);
+	}
+
+	if (parent) {
+		kernfs_remove_by_name(parent, attr->name);
+		kernfs_put(parent);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_file_from_group);
@@ -996,7 +415,7 @@ int sysfs_create_bin_file(struct kobject *kobj,
 {
 	BUG_ON(!kobj || !kobj->sd || !attr);
 
-	return sysfs_add_file(kobj->sd, &attr->attr, SYSFS_KOBJ_BIN_ATTR);
+	return sysfs_add_file(kobj->sd, &attr->attr, true);
 }
 EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
 
@@ -1008,7 +427,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_bin_file);
 void sysfs_remove_bin_file(struct kobject *kobj,
 			   const struct bin_attribute *attr)
 {
-	sysfs_hash_and_remove(kobj->sd, attr->attr.name, NULL);
+	kernfs_remove_by_name(kobj->sd, attr->attr.name);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_bin_file);
 
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 1898a10e38ce..6b579387c67a 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
 #include "sysfs.h"
 
 
-static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
+static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
 			 const struct attribute_group *grp)
 {
 	struct attribute *const *attr;
@@ -26,13 +26,13 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 
 	if (grp->attrs)
 		for (attr = grp->attrs; *attr; attr++)
-			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
+			kernfs_remove_by_name(parent, (*attr)->name);
 	if (grp->bin_attrs)
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
 			sysfs_remove_bin_file(kobj, *bin_attr);
 }
 
-static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
+static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 			const struct attribute_group *grp, int update)
 {
 	struct attribute *const *attr;
@@ -49,22 +49,20 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			 * re-adding (if required) the file.
 			 */
 			if (update)
-				sysfs_hash_and_remove(dir_sd, (*attr)->name,
-						      NULL);
+				kernfs_remove_by_name(parent, (*attr)->name);
 			if (grp->is_visible) {
 				mode = grp->is_visible(kobj, *attr, i);
 				if (!mode)
 					continue;
 			}
-			error = sysfs_add_file_mode_ns(dir_sd, *attr,
-						       SYSFS_KOBJ_ATTR,
+			error = sysfs_add_file_mode_ns(parent, *attr, false,
 						       (*attr)->mode | mode,
 						       NULL);
 			if (unlikely(error))
 				break;
 		}
 		if (error) {
-			remove_files(dir_sd, kobj, grp);
+			remove_files(parent, kobj, grp);
 			goto exit;
 		}
 	}
@@ -78,7 +76,7 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 				break;
 		}
 		if (error)
-			remove_files(dir_sd, kobj, grp);
+			remove_files(parent, kobj, grp);
 	}
 exit:
 	return error;
@@ -88,7 +86,7 @@ exit:
 static int internal_create_group(struct kobject *kobj, int update,
 				 const struct attribute_group *grp)
 {
-	struct sysfs_dirent *sd;
+	struct kernfs_node *kn;
 	int error;
 
 	BUG_ON(!kobj || (!update && !kobj->sd));
@@ -102,18 +100,22 @@ static int internal_create_group(struct kobject *kobj, int update,
 		return -EINVAL;
 	}
 	if (grp->name) {
-		error = sysfs_create_subdir(kobj, grp->name, &sd);
-		if (error)
-			return error;
+		kn = kernfs_create_dir(kobj->sd, grp->name,
+				       S_IRWXU | S_IRUGO | S_IXUGO, kobj);
+		if (IS_ERR(kn)) {
+			if (PTR_ERR(kn) == -EEXIST)
+				sysfs_warn_dup(kobj->sd, grp->name);
+			return PTR_ERR(kn);
+		}
 	} else
-		sd = kobj->sd;
-	sysfs_get(sd);
-	error = create_files(sd, kobj, grp, update);
+		kn = kobj->sd;
+	kernfs_get(kn);
+	error = create_files(kn, kobj, grp, update);
 	if (error) {
 		if (grp->name)
-			sysfs_remove(sd);
+			kernfs_remove(kn);
 	}
-	sysfs_put(sd);
+	kernfs_put(kn);
 	return error;
 }
 
@@ -203,25 +205,27 @@ EXPORT_SYMBOL_GPL(sysfs_update_group);
 void sysfs_remove_group(struct kobject *kobj,
 			const struct attribute_group *grp)
 {
-	struct sysfs_dirent *dir_sd = kobj->sd;
-	struct sysfs_dirent *sd;
+	struct kernfs_node *parent = kobj->sd;
+	struct kernfs_node *kn;
 
 	if (grp->name) {
-		sd = sysfs_get_dirent(dir_sd, grp->name);
-		if (!sd) {
-			WARN(!sd, KERN_WARNING
+		kn = kernfs_find_and_get(parent, grp->name);
+		if (!kn) {
+			WARN(!kn, KERN_WARNING
 			     "sysfs group %p not found for kobject '%s'\n",
 			     grp, kobject_name(kobj));
 			return;
 		}
-	} else
-		sd = sysfs_get(dir_sd);
+	} else {
+		kn = parent;
+		kernfs_get(kn);
+	}
 
-	remove_files(sd, kobj, grp);
+	remove_files(kn, kobj, grp);
 	if (grp->name)
-		sysfs_remove(sd);
+		kernfs_remove(kn);
 
-	sysfs_put(sd);
+	kernfs_put(kn);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_group);
 
@@ -257,22 +261,22 @@ EXPORT_SYMBOL_GPL(sysfs_remove_groups);
 int sysfs_merge_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	int error = 0;
 	struct attribute *const *attr;
 	int i;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
-	if (!dir_sd)
+	parent = kernfs_find_and_get(kobj->sd, grp->name);
+	if (!parent)
 		return -ENOENT;
 
 	for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr))
-		error = sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR);
+		error = sysfs_add_file(parent, *attr, false);
 	if (error) {
 		while (--i >= 0)
-			sysfs_hash_and_remove(dir_sd, (*--attr)->name, NULL);
+			kernfs_remove_by_name(parent, (*--attr)->name);
 	}
-	sysfs_put(dir_sd);
+	kernfs_put(parent);
 
 	return error;
 }
@@ -286,14 +290,14 @@ EXPORT_SYMBOL_GPL(sysfs_merge_group);
 void sysfs_unmerge_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	struct attribute *const *attr;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, grp->name);
-	if (dir_sd) {
+	parent = kernfs_find_and_get(kobj->sd, grp->name);
+	if (parent) {
 		for (attr = grp->attrs; *attr; ++attr)
-			sysfs_hash_and_remove(dir_sd, (*attr)->name, NULL);
-		sysfs_put(dir_sd);
+			kernfs_remove_by_name(parent, (*attr)->name);
+		kernfs_put(parent);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
@@ -308,15 +312,15 @@ EXPORT_SYMBOL_GPL(sysfs_unmerge_group);
 int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name,
 			    struct kobject *target, const char *link_name)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 	int error = 0;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
-	if (!dir_sd)
+	parent = kernfs_find_and_get(kobj->sd, group_name);
+	if (!parent)
 		return -ENOENT;
 
-	error = sysfs_create_link_sd(dir_sd, target, link_name);
-	sysfs_put(dir_sd);
+	error = sysfs_create_link_sd(parent, target, link_name);
+	kernfs_put(parent);
 
 	return error;
 }
@@ -331,12 +335,12 @@ EXPORT_SYMBOL_GPL(sysfs_add_link_to_group);
 void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
 				  const char *link_name)
 {
-	struct sysfs_dirent *dir_sd;
+	struct kernfs_node *parent;
 
-	dir_sd = sysfs_get_dirent(kobj->sd, group_name);
-	if (dir_sd) {
-		sysfs_hash_and_remove(dir_sd, link_name, NULL);
-		sysfs_put(dir_sd);
+	parent = kernfs_find_and_get(kobj->sd, group_name);
+	if (parent) {
+		kernfs_remove_by_name(parent, link_name);
+		kernfs_put(parent);
 	}
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
deleted file mode 100644
index 1750f790af3b..000000000000
--- a/fs/sysfs/inode.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * fs/sysfs/inode.c - basic sysfs inode and dentry operations
- *
- * Copyright (c) 2001-3 Patrick Mochel
- * Copyright (c) 2007 SUSE Linux Products GmbH
- * Copyright (c) 2007 Tejun Heo <teheo@suse.de>
- *
- * This file is released under the GPLv2.
- *
- * Please see Documentation/filesystems/sysfs.txt for more information.
- */
-
-#undef DEBUG
-
-#include <linux/pagemap.h>
-#include <linux/namei.h>
-#include <linux/backing-dev.h>
-#include <linux/capability.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/sysfs.h>
-#include <linux/xattr.h>
-#include <linux/security.h>
-#include "sysfs.h"
-
-static const struct address_space_operations sysfs_aops = {
-	.readpage	= simple_readpage,
-	.write_begin	= simple_write_begin,
-	.write_end	= simple_write_end,
-};
-
-static struct backing_dev_info sysfs_backing_dev_info = {
-	.name		= "sysfs",
-	.ra_pages	= 0,	/* No readahead */
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
-static const struct inode_operations sysfs_inode_operations = {
-	.permission	= sysfs_permission,
-	.setattr	= sysfs_setattr,
-	.getattr	= sysfs_getattr,
-	.setxattr	= sysfs_setxattr,
-};
-
-int __init sysfs_inode_init(void)
-{
-	return bdi_init(&sysfs_backing_dev_info);
-}
-
-static struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd)
-{
-	struct sysfs_inode_attrs *attrs;
-	struct iattr *iattrs;
-
-	attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL);
-	if (!attrs)
-		return NULL;
-	iattrs = &attrs->ia_iattr;
-
-	/* assign default attributes */
-	iattrs->ia_mode = sd->s_mode;
-	iattrs->ia_uid = GLOBAL_ROOT_UID;
-	iattrs->ia_gid = GLOBAL_ROOT_GID;
-	iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME;
-
-	return attrs;
-}
-
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr)
-{
-	struct sysfs_inode_attrs *sd_attrs;
-	struct iattr *iattrs;
-	unsigned int ia_valid = iattr->ia_valid;
-
-	sd_attrs = sd->s_iattr;
-
-	if (!sd_attrs) {
-		/* setting attributes for the first time, allocate now */
-		sd_attrs = sysfs_init_inode_attrs(sd);
-		if (!sd_attrs)
-			return -ENOMEM;
-		sd->s_iattr = sd_attrs;
-	}
-	/* attributes were changed at least once in past */
-	iattrs = &sd_attrs->ia_iattr;
-
-	if (ia_valid & ATTR_UID)
-		iattrs->ia_uid = iattr->ia_uid;
-	if (ia_valid & ATTR_GID)
-		iattrs->ia_gid = iattr->ia_gid;
-	if (ia_valid & ATTR_ATIME)
-		iattrs->ia_atime = iattr->ia_atime;
-	if (ia_valid & ATTR_MTIME)
-		iattrs->ia_mtime = iattr->ia_mtime;
-	if (ia_valid & ATTR_CTIME)
-		iattrs->ia_ctime = iattr->ia_ctime;
-	if (ia_valid & ATTR_MODE) {
-		umode_t mode = iattr->ia_mode;
-		iattrs->ia_mode = sd->s_mode = mode;
-	}
-	return 0;
-}
-
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-	struct inode *inode = dentry->d_inode;
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	int error;
-
-	if (!sd)
-		return -EINVAL;
-
-	mutex_lock(&sysfs_mutex);
-	error = inode_change_ok(inode, iattr);
-	if (error)
-		goto out;
-
-	error = sysfs_sd_setattr(sd, iattr);
-	if (error)
-		goto out;
-
-	/* this ignores size changes */
-	setattr_copy(inode, iattr);
-
-out:
-	mutex_unlock(&sysfs_mutex);
-	return error;
-}
-
-static int sysfs_sd_setsecdata(struct sysfs_dirent *sd, void **secdata,
-			       u32 *secdata_len)
-{
-	struct sysfs_inode_attrs *iattrs;
-	void *old_secdata;
-	size_t old_secdata_len;
-
-	if (!sd->s_iattr) {
-		sd->s_iattr = sysfs_init_inode_attrs(sd);
-		if (!sd->s_iattr)
-			return -ENOMEM;
-	}
-
-	iattrs = sd->s_iattr;
-	old_secdata = iattrs->ia_secdata;
-	old_secdata_len = iattrs->ia_secdata_len;
-
-	iattrs->ia_secdata = *secdata;
-	iattrs->ia_secdata_len = *secdata_len;
-
-	*secdata = old_secdata;
-	*secdata_len = old_secdata_len;
-	return 0;
-}
-
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		size_t size, int flags)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	void *secdata;
-	int error;
-	u32 secdata_len = 0;
-
-	if (!sd)
-		return -EINVAL;
-
-	if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) {
-		const char *suffix = name + XATTR_SECURITY_PREFIX_LEN;
-		error = security_inode_setsecurity(dentry->d_inode, suffix,
-						value, size, flags);
-		if (error)
-			goto out;
-		error = security_inode_getsecctx(dentry->d_inode,
-						&secdata, &secdata_len);
-		if (error)
-			goto out;
-
-		mutex_lock(&sysfs_mutex);
-		error = sysfs_sd_setsecdata(sd, &secdata, &secdata_len);
-		mutex_unlock(&sysfs_mutex);
-
-		if (secdata)
-			security_release_secctx(secdata, secdata_len);
-	} else
-		return -EINVAL;
-out:
-	return error;
-}
-
-static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
-{
-	inode->i_mode = mode;
-	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-}
-
-static inline void set_inode_attr(struct inode *inode, struct iattr *iattr)
-{
-	inode->i_uid = iattr->ia_uid;
-	inode->i_gid = iattr->ia_gid;
-	inode->i_atime = iattr->ia_atime;
-	inode->i_mtime = iattr->ia_mtime;
-	inode->i_ctime = iattr->ia_ctime;
-}
-
-static void sysfs_refresh_inode(struct sysfs_dirent *sd, struct inode *inode)
-{
-	struct sysfs_inode_attrs *iattrs = sd->s_iattr;
-
-	inode->i_mode = sd->s_mode;
-	if (iattrs) {
-		/* sysfs_dirent has non-default attributes
-		 * get them from persistent copy in sysfs_dirent
-		 */
-		set_inode_attr(inode, &iattrs->ia_iattr);
-		security_inode_notifysecctx(inode,
-					    iattrs->ia_secdata,
-					    iattrs->ia_secdata_len);
-	}
-
-	if (sysfs_type(sd) == SYSFS_DIR)
-		set_nlink(inode, sd->s_dir.subdirs + 2);
-}
-
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	struct inode *inode = dentry->d_inode;
-
-	mutex_lock(&sysfs_mutex);
-	sysfs_refresh_inode(sd, inode);
-	mutex_unlock(&sysfs_mutex);
-
-	generic_fillattr(inode, stat);
-	return 0;
-}
-
-static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
-{
-	struct bin_attribute *bin_attr;
-
-	inode->i_private = sysfs_get(sd);
-	inode->i_mapping->a_ops = &sysfs_aops;
-	inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
-	inode->i_op = &sysfs_inode_operations;
-
-	set_default_inode_attr(inode, sd->s_mode);
-	sysfs_refresh_inode(sd, inode);
-
-	/* initialize inode according to type */
-	switch (sysfs_type(sd)) {
-	case SYSFS_DIR:
-		inode->i_op = &sysfs_dir_inode_operations;
-		inode->i_fop = &sysfs_dir_operations;
-		break;
-	case SYSFS_KOBJ_ATTR:
-		inode->i_size = PAGE_SIZE;
-		inode->i_fop = &sysfs_file_operations;
-		break;
-	case SYSFS_KOBJ_BIN_ATTR:
-		bin_attr = sd->s_attr.bin_attr;
-		inode->i_size = bin_attr->size;
-		inode->i_fop = &sysfs_bin_operations;
-		break;
-	case SYSFS_KOBJ_LINK:
-		inode->i_op = &sysfs_symlink_inode_operations;
-		break;
-	default:
-		BUG();
-	}
-
-	unlock_new_inode(inode);
-}
-
-/**
- *	sysfs_get_inode - get inode for sysfs_dirent
- *	@sb: super block
- *	@sd: sysfs_dirent to allocate inode for
- *
- *	Get inode for @sd.  If such inode doesn't exist, a new inode
- *	is allocated and basics are initialized.  New inode is
- *	returned locked.
- *
- *	LOCKING:
- *	Kernel thread context (may sleep).
- *
- *	RETURNS:
- *	Pointer to allocated inode on success, NULL on failure.
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd)
-{
-	struct inode *inode;
-
-	inode = iget_locked(sb, sd->s_ino);
-	if (inode && (inode->i_state & I_NEW))
-		sysfs_init_inode(sd, inode);
-
-	return inode;
-}
-
-/*
- * The sysfs_dirent serves as both an inode and a directory entry for sysfs.
- * To prevent the sysfs inode numbers from being freed prematurely we take a
- * reference to sysfs_dirent from the sysfs inode.  A
- * super_operations.evict_inode() implementation is needed to drop that
- * reference upon inode destruction.
- */
-void sysfs_evict_inode(struct inode *inode)
-{
-	struct sysfs_dirent *sd  = inode->i_private;
-
-	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
-	sysfs_put(sd);
-}
-
-int sysfs_permission(struct inode *inode, int mask)
-{
-	struct sysfs_dirent *sd;
-
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
-	sd = inode->i_private;
-
-	mutex_lock(&sysfs_mutex);
-	sysfs_refresh_inode(sd, inode);
-	mutex_unlock(&sysfs_mutex);
-
-	return generic_permission(inode, mask);
-}
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index 834ec2cdb7a3..6211230814fd 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -14,146 +14,41 @@
 
 #include <linux/fs.h>
 #include <linux/mount.h>
-#include <linux/pagemap.h>
 #include <linux/init.h>
-#include <linux/module.h>
-#include <linux/magic.h>
-#include <linux/slab.h>
 #include <linux/user_namespace.h>
 
 #include "sysfs.h"
 
-
-static struct vfsmount *sysfs_mnt;
-struct kmem_cache *sysfs_dir_cachep;
-
-static const struct super_operations sysfs_ops = {
-	.statfs		= simple_statfs,
-	.drop_inode	= generic_delete_inode,
-	.evict_inode	= sysfs_evict_inode,
-};
-
-struct sysfs_dirent sysfs_root = {
-	.s_name		= "",
-	.s_count	= ATOMIC_INIT(1),
-	.s_flags	= SYSFS_DIR | (KOBJ_NS_TYPE_NONE << SYSFS_NS_TYPE_SHIFT),
-	.s_mode		= S_IFDIR | S_IRUGO | S_IXUGO,
-	.s_ino		= 1,
-};
-
-static int sysfs_fill_super(struct super_block *sb, void *data, int silent)
-{
-	struct inode *inode;
-	struct dentry *root;
-
-	sb->s_blocksize = PAGE_CACHE_SIZE;
-	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = SYSFS_MAGIC;
-	sb->s_op = &sysfs_ops;
-	sb->s_time_gran = 1;
-
-	/* get root inode, initialize and unlock it */
-	mutex_lock(&sysfs_mutex);
-	inode = sysfs_get_inode(sb, &sysfs_root);
-	mutex_unlock(&sysfs_mutex);
-	if (!inode) {
-		pr_debug("sysfs: could not get root inode\n");
-		return -ENOMEM;
-	}
-
-	/* instantiate and link root dentry */
-	root = d_make_root(inode);
-	if (!root) {
-		pr_debug("%s: could not get root dentry!\n", __func__);
-		return -ENOMEM;
-	}
-	root->d_fsdata = &sysfs_root;
-	sb->s_root = root;
-	sb->s_d_op = &sysfs_dentry_ops;
-	return 0;
-}
-
-static int sysfs_test_super(struct super_block *sb, void *data)
-{
-	struct sysfs_super_info *sb_info = sysfs_info(sb);
-	struct sysfs_super_info *info = data;
-	enum kobj_ns_type type;
-	int found = 1;
-
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
-		if (sb_info->ns[type] != info->ns[type])
-			found = 0;
-	}
-	return found;
-}
-
-static int sysfs_set_super(struct super_block *sb, void *data)
-{
-	int error;
-	error = set_anon_super(sb, data);
-	if (!error)
-		sb->s_fs_info = data;
-	return error;
-}
-
-static void free_sysfs_super_info(struct sysfs_super_info *info)
-{
-	int type;
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-		kobj_ns_drop(type, info->ns[type]);
-	kfree(info);
-}
+static struct kernfs_root *sysfs_root;
+struct kernfs_node *sysfs_root_kn;
 
 static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	struct sysfs_super_info *info;
-	enum kobj_ns_type type;
-	struct super_block *sb;
-	int error;
+	struct dentry *root;
+	void *ns;
 
 	if (!(flags & MS_KERNMOUNT)) {
 		if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type))
 			return ERR_PTR(-EPERM);
 
-		for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++) {
-			if (!kobj_ns_current_may_mount(type))
-				return ERR_PTR(-EPERM);
-		}
-	}
-
-	info = kzalloc(sizeof(*info), GFP_KERNEL);
-	if (!info)
-		return ERR_PTR(-ENOMEM);
-
-	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
-		info->ns[type] = kobj_ns_grab_current(type);
-
-	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
-	if (IS_ERR(sb) || sb->s_fs_info != info)
-		free_sysfs_super_info(info);
-	if (IS_ERR(sb))
-		return ERR_CAST(sb);
-	if (!sb->s_root) {
-		error = sysfs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(sb);
-			return ERR_PTR(error);
-		}
-		sb->s_flags |= MS_ACTIVE;
+		if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET))
+			return ERR_PTR(-EPERM);
 	}
 
-	return dget(sb->s_root);
+	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
+	root = kernfs_mount_ns(fs_type, flags, sysfs_root, ns);
+	if (IS_ERR(root))
+		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
+	return root;
 }
 
 static void sysfs_kill_sb(struct super_block *sb)
 {
-	struct sysfs_super_info *info = sysfs_info(sb);
-	/* Remove the superblock from fs_supers/s_instances
-	 * so we can't find it, before freeing sysfs_super_info.
-	 */
-	kill_anon_super(sb);
-	free_sysfs_super_info(info);
+	void *ns = (void *)kernfs_super_ns(sb);
+
+	kernfs_kill_sb(sb);
+	kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 }
 
 static struct file_system_type sysfs_fs_type = {
@@ -165,48 +60,19 @@ static struct file_system_type sysfs_fs_type = {
 
 int __init sysfs_init(void)
 {
-	int err = -ENOMEM;
+	int err;
 
-	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
-					      sizeof(struct sysfs_dirent),
-					      0, 0, NULL);
-	if (!sysfs_dir_cachep)
-		goto out;
+	sysfs_root = kernfs_create_root(NULL, NULL);
+	if (IS_ERR(sysfs_root))
+		return PTR_ERR(sysfs_root);
 
-	err = sysfs_inode_init();
-	if (err)
-		goto out_err;
+	sysfs_root_kn = sysfs_root->kn;
 
 	err = register_filesystem(&sysfs_fs_type);
-	if (!err) {
-		sysfs_mnt = kern_mount(&sysfs_fs_type);
-		if (IS_ERR(sysfs_mnt)) {
-			printk(KERN_ERR "sysfs: could not mount!\n");
-			err = PTR_ERR(sysfs_mnt);
-			sysfs_mnt = NULL;
-			unregister_filesystem(&sysfs_fs_type);
-			goto out_err;
-		}
-	} else
-		goto out_err;
-out:
-	return err;
-out_err:
-	kmem_cache_destroy(sysfs_dir_cachep);
-	sysfs_dir_cachep = NULL;
-	goto out;
-}
-
-#undef sysfs_get
-struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd)
-{
-	return __sysfs_get(sd);
-}
-EXPORT_SYMBOL_GPL(sysfs_get);
+	if (err) {
+		kernfs_destroy_root(sysfs_root);
+		return err;
+	}
 
-#undef sysfs_put
-void sysfs_put(struct sysfs_dirent *sd)
-{
-	__sysfs_put(sd);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(sysfs_put);
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 3ae3f1bf1a09..aecb15f84557 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -11,109 +11,73 @@
  */
 
 #include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/mount.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
-#include <linux/namei.h>
 #include <linux/mutex.h>
 #include <linux/security.h>
 
 #include "sysfs.h"
 
-static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
-				   struct kobject *target,
+static int sysfs_do_create_link_sd(struct kernfs_node *parent,
+				   struct kobject *target_kobj,
 				   const char *name, int warn)
 {
-	struct sysfs_dirent *target_sd = NULL;
-	struct sysfs_dirent *sd = NULL;
-	struct sysfs_addrm_cxt acxt;
-	enum kobj_ns_type ns_type;
-	int error;
+	struct kernfs_node *kn, *target = NULL;
 
-	BUG_ON(!name || !parent_sd);
+	BUG_ON(!name || !parent);
 
 	/*
-	 * We don't own @target and it may be removed at any time.
+	 * We don't own @target_kobj and it may be removed at any time.
 	 * Synchronize using sysfs_symlink_target_lock.  See
 	 * sysfs_remove_dir() for details.
 	 */
 	spin_lock(&sysfs_symlink_target_lock);
-	if (target->sd)
-		target_sd = sysfs_get(target->sd);
+	if (target_kobj->sd) {
+		target = target_kobj->sd;
+		kernfs_get(target);
+	}
 	spin_unlock(&sysfs_symlink_target_lock);
 
-	error = -ENOENT;
-	if (!target_sd)
-		goto out_put;
-
-	error = -ENOMEM;
-	sd = sysfs_new_dirent(name, S_IFLNK|S_IRWXUGO, SYSFS_KOBJ_LINK);
-	if (!sd)
-		goto out_put;
+	if (!target)
+		return -ENOENT;
 
-	ns_type = sysfs_ns_type(parent_sd);
-	if (ns_type)
-		sd->s_ns = target_sd->s_ns;
-	sd->s_symlink.target_sd = target_sd;
-	target_sd = NULL;	/* reference is now owned by the symlink */
-
-	sysfs_addrm_start(&acxt);
-	/* Symlinks must be between directories with the same ns_type */
-	if (!ns_type ||
-	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
-		if (warn)
-			error = sysfs_add_one(&acxt, sd, parent_sd);
-		else
-			error = __sysfs_add_one(&acxt, sd, parent_sd);
-	} else {
-		error = -EINVAL;
-		WARN(1, KERN_WARNING
-			"sysfs: symlink across ns_types %s/%s -> %s/%s\n",
-			parent_sd->s_name,
-			sd->s_name,
-			sd->s_symlink.target_sd->s_parent->s_name,
-			sd->s_symlink.target_sd->s_name);
-	}
-	sysfs_addrm_finish(&acxt);
+	kn = kernfs_create_link(parent, name, target);
+	kernfs_put(target);
 
-	if (error)
-		goto out_put;
+	if (!IS_ERR(kn))
+		return 0;
 
-	return 0;
-
- out_put:
-	sysfs_put(target_sd);
-	sysfs_put(sd);
-	return error;
+	if (warn && PTR_ERR(kn) == -EEXIST)
+		sysfs_warn_dup(parent, name);
+	return PTR_ERR(kn);
 }
 
 /**
  *	sysfs_create_link_sd - create symlink to a given object.
- *	@sd:		directory we're creating the link in.
+ *	@kn:		directory we're creating the link in.
  *	@target:	object we're pointing to.
  *	@name:		name of the symlink.
  */
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
 			 const char *name)
 {
-	return sysfs_do_create_link_sd(sd, target, name, 1);
+	return sysfs_do_create_link_sd(kn, target, name, 1);
 }
 
 static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target,
 				const char *name, int warn)
 {
-	struct sysfs_dirent *parent_sd = NULL;
+	struct kernfs_node *parent = NULL;
 
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 	else
-		parent_sd = kobj->sd;
+		parent = kobj->sd;
 
-	if (!parent_sd)
+	if (!parent)
 		return -EFAULT;
 
-	return sysfs_do_create_link_sd(parent_sd, target, name, warn);
+	return sysfs_do_create_link_sd(parent, target, name, warn);
 }
 
 /**
@@ -164,10 +128,10 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
 	 * sysfs_remove_dir() for details.
 	 */
 	spin_lock(&sysfs_symlink_target_lock);
-	if (targ->sd && sysfs_ns_type(kobj->sd))
-		ns = targ->sd->s_ns;
+	if (targ->sd && kernfs_ns_enabled(kobj->sd))
+		ns = targ->sd->ns;
 	spin_unlock(&sysfs_symlink_target_lock);
-	sysfs_hash_and_remove(kobj->sd, name, ns);
+	kernfs_remove_by_name_ns(kobj->sd, name, ns);
 }
 
 /**
@@ -177,14 +141,14 @@ void sysfs_delete_link(struct kobject *kobj, struct kobject *targ,
  */
 void sysfs_remove_link(struct kobject *kobj, const char *name)
 {
-	struct sysfs_dirent *parent_sd = NULL;
+	struct kernfs_node *parent = NULL;
 
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 	else
-		parent_sd = kobj->sd;
+		parent = kobj->sd;
 
-	sysfs_hash_and_remove(parent_sd, name, NULL);
+	kernfs_remove_by_name(parent, name);
 }
 EXPORT_SYMBOL_GPL(sysfs_remove_link);
 
@@ -201,130 +165,33 @@ EXPORT_SYMBOL_GPL(sysfs_remove_link);
 int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *targ,
 			 const char *old, const char *new, const void *new_ns)
 {
-	struct sysfs_dirent *parent_sd, *sd = NULL;
+	struct kernfs_node *parent, *kn = NULL;
 	const void *old_ns = NULL;
 	int result;
 
 	if (!kobj)
-		parent_sd = &sysfs_root;
+		parent = sysfs_root_kn;
 	else
-		parent_sd = kobj->sd;
+		parent = kobj->sd;
 
 	if (targ->sd)
-		old_ns = targ->sd->s_ns;
+		old_ns = targ->sd->ns;
 
 	result = -ENOENT;
-	sd = sysfs_get_dirent_ns(parent_sd, old, old_ns);
-	if (!sd)
+	kn = kernfs_find_and_get_ns(parent, old, old_ns);
+	if (!kn)
 		goto out;
 
 	result = -EINVAL;
-	if (sysfs_type(sd) != SYSFS_KOBJ_LINK)
+	if (kernfs_type(kn) != KERNFS_LINK)
 		goto out;
-	if (sd->s_symlink.target_sd->s_dir.kobj != targ)
+	if (kn->symlink.target_kn->priv != targ)
 		goto out;
 
-	result = sysfs_rename(sd, parent_sd, new, new_ns);
+	result = kernfs_rename_ns(kn, parent, new, new_ns);
 
 out:
-	sysfs_put(sd);
+	kernfs_put(kn);
 	return result;
 }
 EXPORT_SYMBOL_GPL(sysfs_rename_link_ns);
-
-static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
-				 struct sysfs_dirent *target_sd, char *path)
-{
-	struct sysfs_dirent *base, *sd;
-	char *s = path;
-	int len = 0;
-
-	/* go up to the root, stop at the base */
-	base = parent_sd;
-	while (base->s_parent) {
-		sd = target_sd->s_parent;
-		while (sd->s_parent && base != sd)
-			sd = sd->s_parent;
-
-		if (base == sd)
-			break;
-
-		strcpy(s, "../");
-		s += 3;
-		base = base->s_parent;
-	}
-
-	/* determine end of target string for reverse fillup */
-	sd = target_sd;
-	while (sd->s_parent && sd != base) {
-		len += strlen(sd->s_name) + 1;
-		sd = sd->s_parent;
-	}
-
-	/* check limits */
-	if (len < 2)
-		return -EINVAL;
-	len--;
-	if ((s - path) + len > PATH_MAX)
-		return -ENAMETOOLONG;
-
-	/* reverse fillup of target string from target to base */
-	sd = target_sd;
-	while (sd->s_parent && sd != base) {
-		int slen = strlen(sd->s_name);
-
-		len -= slen;
-		strncpy(s + len, sd->s_name, slen);
-		if (len)
-			s[--len] = '/';
-
-		sd = sd->s_parent;
-	}
-
-	return 0;
-}
-
-static int sysfs_getlink(struct dentry *dentry, char *path)
-{
-	struct sysfs_dirent *sd = dentry->d_fsdata;
-	struct sysfs_dirent *parent_sd = sd->s_parent;
-	struct sysfs_dirent *target_sd = sd->s_symlink.target_sd;
-	int error;
-
-	mutex_lock(&sysfs_mutex);
-	error = sysfs_get_target_path(parent_sd, target_sd, path);
-	mutex_unlock(&sysfs_mutex);
-
-	return error;
-}
-
-static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-	int error = -ENOMEM;
-	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (page) {
-		error = sysfs_getlink(dentry, (char *) page);
-		if (error < 0)
-			free_page((unsigned long)page);
-	}
-	nd_set_link(nd, error ? ERR_PTR(error) : (char *)page);
-	return NULL;
-}
-
-static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd,
-			   void *cookie)
-{
-	char *page = nd_get_link(nd);
-	if (!IS_ERR(page))
-		free_page((unsigned long)page);
-}
-
-const struct inode_operations sysfs_symlink_inode_operations = {
-	.setxattr	= sysfs_setxattr,
-	.readlink	= generic_readlink,
-	.follow_link	= sysfs_follow_link,
-	.put_link	= sysfs_put_link,
-	.setattr	= sysfs_setattr,
-	.getattr	= sysfs_getattr,
-	.permission	= sysfs_permission,
-};
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 0af09fbfb3f6..0e2f1cccb812 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -8,248 +8,36 @@
  * This file is released under the GPLv2.
  */
 
-#include <linux/lockdep.h>
-#include <linux/kobject_ns.h>
-#include <linux/fs.h>
-#include <linux/rbtree.h>
+#ifndef __SYSFS_INTERNAL_H
+#define __SYSFS_INTERNAL_H
 
-struct sysfs_open_dirent;
-
-/* type-specific structures for sysfs_dirent->s_* union members */
-struct sysfs_elem_dir {
-	struct kobject		*kobj;
-
-	unsigned long		subdirs;
-	/* children rbtree starts here and goes through sd->s_rb */
-	struct rb_root		children;
-};
-
-struct sysfs_elem_symlink {
-	struct sysfs_dirent	*target_sd;
-};
-
-struct sysfs_elem_attr {
-	union {
-		struct attribute	*attr;
-		struct bin_attribute	*bin_attr;
-	};
-	struct sysfs_open_dirent *open;
-};
-
-struct sysfs_inode_attrs {
-	struct iattr	ia_iattr;
-	void		*ia_secdata;
-	u32		ia_secdata_len;
-};
-
-/*
- * sysfs_dirent - the building block of sysfs hierarchy.  Each and
- * every sysfs node is represented by single sysfs_dirent.
- *
- * As long as s_count reference is held, the sysfs_dirent itself is
- * accessible.  Dereferencing s_elem or any other outer entity
- * requires s_active reference.
- */
-struct sysfs_dirent {
-	atomic_t		s_count;
-	atomic_t		s_active;
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	struct lockdep_map	dep_map;
-#endif
-	struct sysfs_dirent	*s_parent;
-	const char		*s_name;
-
-	struct rb_node		s_rb;
-
-	union {
-		struct completion	*completion;
-		struct sysfs_dirent	*removed_list;
-	} u;
-
-	const void		*s_ns; /* namespace tag */
-	unsigned int		s_hash; /* ns + name hash */
-	union {
-		struct sysfs_elem_dir		s_dir;
-		struct sysfs_elem_symlink	s_symlink;
-		struct sysfs_elem_attr		s_attr;
-	};
-
-	unsigned short		s_flags;
-	umode_t			s_mode;
-	unsigned int		s_ino;
-	struct sysfs_inode_attrs *s_iattr;
-};
-
-#define SD_DEACTIVATED_BIAS		INT_MIN
-
-#define SYSFS_TYPE_MASK			0x00ff
-#define SYSFS_DIR			0x0001
-#define SYSFS_KOBJ_ATTR			0x0002
-#define SYSFS_KOBJ_BIN_ATTR		0x0004
-#define SYSFS_KOBJ_LINK			0x0008
-#define SYSFS_COPY_NAME			(SYSFS_DIR | SYSFS_KOBJ_LINK)
-#define SYSFS_ACTIVE_REF		(SYSFS_KOBJ_ATTR | SYSFS_KOBJ_BIN_ATTR)
-
-/* identify any namespace tag on sysfs_dirents */
-#define SYSFS_NS_TYPE_MASK		0xf00
-#define SYSFS_NS_TYPE_SHIFT		8
-
-#define SYSFS_FLAG_MASK			~(SYSFS_NS_TYPE_MASK|SYSFS_TYPE_MASK)
-#define SYSFS_FLAG_REMOVED		0x02000
-
-static inline unsigned int sysfs_type(struct sysfs_dirent *sd)
-{
-	return sd->s_flags & SYSFS_TYPE_MASK;
-}
-
-/*
- * Return any namespace tags on this dirent.
- * enum kobj_ns_type is defined in linux/kobject.h
- */
-static inline enum kobj_ns_type sysfs_ns_type(struct sysfs_dirent *sd)
-{
-	return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;
-}
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-
-#define sysfs_dirent_init_lockdep(sd)				\
-do {								\
-	struct attribute *attr = sd->s_attr.attr;		\
-	struct lock_class_key *key = attr->key;			\
-	if (!key)						\
-		key = &attr->skey;				\
-								\
-	lockdep_init_map(&sd->dep_map, "s_active", key, 0);	\
-} while (0)
-
-/* Test for attributes that want to ignore lockdep for read-locking */
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	int type = sysfs_type(sd);
-
-	return (type == SYSFS_KOBJ_ATTR || type == SYSFS_KOBJ_BIN_ATTR) &&
-		sd->s_attr.attr->ignore_lockdep;
-}
-
-#else
-
-#define sysfs_dirent_init_lockdep(sd) do {} while (0)
-
-static inline bool sysfs_ignore_lockdep(struct sysfs_dirent *sd)
-{
-	return true;
-}
-
-#endif
-
-/*
- * Context structure to be used while adding/removing nodes.
- */
-struct sysfs_addrm_cxt {
-	struct sysfs_dirent	*removed;
-};
+#include <linux/sysfs.h>
 
 /*
  * mount.c
  */
-
-/*
- * Each sb is associated with a set of namespace tags (i.e.
- * the network namespace of the task which mounted this sysfs
- * instance).
- */
-struct sysfs_super_info {
-	void *ns[KOBJ_NS_TYPES];
-};
-#define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
-extern struct sysfs_dirent sysfs_root;
-extern struct kmem_cache *sysfs_dir_cachep;
+extern struct kernfs_node *sysfs_root_kn;
 
 /*
  * dir.c
  */
-extern struct mutex sysfs_mutex;
 extern spinlock_t sysfs_symlink_target_lock;
-extern const struct dentry_operations sysfs_dentry_ops;
-
-extern const struct file_operations sysfs_dir_operations;
-extern const struct inode_operations sysfs_dir_inode_operations;
 
-struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd);
-void sysfs_put_active(struct sysfs_dirent *sd);
-void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt);
-void sysfs_warn_dup(struct sysfs_dirent *parent, const char *name);
-int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		    struct sysfs_dirent *parent_sd);
-int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd,
-		  struct sysfs_dirent *parent_sd);
-void sysfs_remove(struct sysfs_dirent *sd);
-int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name,
-			  const void *ns);
-void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
-
-struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
-				       const unsigned char *name,
-				       const void *ns);
-struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type);
-
-void release_sysfs_dirent(struct sysfs_dirent *sd);
-
-int sysfs_create_subdir(struct kobject *kobj, const char *name,
-			struct sysfs_dirent **p_sd);
-
-int sysfs_rename(struct sysfs_dirent *sd, struct sysfs_dirent *new_parent_sd,
-		 const char *new_name, const void *new_ns);
-
-static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd)
-{
-	if (sd) {
-		WARN_ON(!atomic_read(&sd->s_count));
-		atomic_inc(&sd->s_count);
-	}
-	return sd;
-}
-#define sysfs_get(sd) __sysfs_get(sd)
-
-static inline void __sysfs_put(struct sysfs_dirent *sd)
-{
-	if (sd && atomic_dec_and_test(&sd->s_count))
-		release_sysfs_dirent(sd);
-}
-#define sysfs_put(sd) __sysfs_put(sd)
-
-/*
- * inode.c
- */
-struct inode *sysfs_get_inode(struct super_block *sb, struct sysfs_dirent *sd);
-void sysfs_evict_inode(struct inode *inode);
-int sysfs_sd_setattr(struct sysfs_dirent *sd, struct iattr *iattr);
-int sysfs_permission(struct inode *inode, int mask);
-int sysfs_setattr(struct dentry *dentry, struct iattr *iattr);
-int sysfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
-		  struct kstat *stat);
-int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		   size_t size, int flags);
-int sysfs_inode_init(void);
+void sysfs_warn_dup(struct kernfs_node *parent, const char *name);
 
 /*
  * file.c
  */
-extern const struct file_operations sysfs_file_operations;
-extern const struct file_operations sysfs_bin_operations;
-
-int sysfs_add_file(struct sysfs_dirent *dir_sd,
-		   const struct attribute *attr, int type);
-
-int sysfs_add_file_mode_ns(struct sysfs_dirent *dir_sd,
-			   const struct attribute *attr, int type,
+int sysfs_add_file(struct kernfs_node *parent,
+		   const struct attribute *attr, bool is_bin);
+int sysfs_add_file_mode_ns(struct kernfs_node *parent,
+			   const struct attribute *attr, bool is_bin,
 			   umode_t amode, const void *ns);
-void sysfs_unmap_bin_file(struct sysfs_dirent *sd);
 
 /*
  * symlink.c
  */
-extern const struct inode_operations sysfs_symlink_inode_operations;
-int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target,
+int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
 			 const char *name);
+
+#endif	/* __SYSFS_INTERNAL_H */
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 739e0a52deda..5549d69ddb45 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -110,7 +110,7 @@ xfs_attr3_rmt_verify(
 	if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
 		return false;
 	if (be32_to_cpu(rmt->rm_offset) +
-				be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX)
+				be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
 		return false;
 	if (rmt->rm_owner == 0)
 		return false;
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 3ef11b22e750..3b2c14b6f0fb 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -1635,7 +1635,7 @@ xfs_bmap_last_extent(
  * blocks at the end of the file which do not start at the previous data block,
  * we will try to align the new blocks at stripe unit boundaries.
  *
- * Returns 0 in bma->aeof if the file (fork) is empty as any new write will be
+ * Returns 1 in bma->aeof if the file (fork) is empty as any new write will be
  * at, or past the EOF.
  */
 STATIC int
@@ -1650,9 +1650,14 @@ xfs_bmap_isaeof(
 	bma->aeof = 0;
 	error = xfs_bmap_last_extent(NULL, bma->ip, whichfork, &rec,
 				     &is_empty);
-	if (error || is_empty)
+	if (error)
 		return error;
 
+	if (is_empty) {
+		bma->aeof = 1;
+		return 0;
+	}
+
 	/*
 	 * Check if we are allocation or past the last extent, or at least into
 	 * the last delayed allocated extent.
@@ -3643,10 +3648,19 @@ xfs_bmap_btalloc(
 	int		isaligned;
 	int		tryagain;
 	int		error;
+	int		stripe_align;
 
 	ASSERT(ap->length);
 
 	mp = ap->ip->i_mount;
+
+	/* stripe alignment for allocation is determined by mount parameters */
+	stripe_align = 0;
+	if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC))
+		stripe_align = mp->m_swidth;
+	else if (mp->m_dalign)
+		stripe_align = mp->m_dalign;
+
 	align = ap->userdata ? xfs_get_extsz_hint(ap->ip) : 0;
 	if (unlikely(align)) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3655,6 +3669,8 @@ xfs_bmap_btalloc(
 		ASSERT(!error);
 		ASSERT(ap->length);
 	}
+
+
 	nullfb = *ap->firstblock == NULLFSBLOCK;
 	fb_agno = nullfb ? NULLAGNUMBER : XFS_FSB_TO_AGNO(mp, *ap->firstblock);
 	if (nullfb) {
@@ -3730,7 +3746,7 @@ xfs_bmap_btalloc(
 	 */
 	if (!ap->flist->xbf_low && ap->aeof) {
 		if (!ap->offset) {
-			args.alignment = mp->m_dalign;
+			args.alignment = stripe_align;
 			atype = args.type;
 			isaligned = 1;
 			/*
@@ -3755,13 +3771,13 @@ xfs_bmap_btalloc(
 			 * of minlen+alignment+slop doesn't go up
 			 * between the calls.
 			 */
-			if (blen > mp->m_dalign && blen <= args.maxlen)
-				nextminlen = blen - mp->m_dalign;
+			if (blen > stripe_align && blen <= args.maxlen)
+				nextminlen = blen - stripe_align;
 			else
 				nextminlen = args.minlen;
-			if (nextminlen + mp->m_dalign > args.minlen + 1)
+			if (nextminlen + stripe_align > args.minlen + 1)
 				args.minalignslop =
-					nextminlen + mp->m_dalign -
+					nextminlen + stripe_align -
 					args.minlen - 1;
 			else
 				args.minalignslop = 0;
@@ -3783,7 +3799,7 @@ xfs_bmap_btalloc(
 		 */
 		args.type = atype;
 		args.fsbno = ap->blkno;
-		args.alignment = mp->m_dalign;
+		args.alignment = stripe_align;
 		args.minlen = nextminlen;
 		args.minalignslop = 0;
 		isaligned = 1;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 5887e41c0323..82e0dab46ee5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -287,6 +287,7 @@ xfs_bmapi_allocate(
 	INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
 	queue_work(xfs_alloc_wq, &args->work);
 	wait_for_completion(&done);
+	destroy_work_on_stack(&args->work);
 	return args->result;
 }
 
@@ -1187,7 +1188,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNWRITE(bp);
 		XFS_BUF_READ(bp);
 		XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock));
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
@@ -1200,7 +1206,12 @@ xfs_zero_remaining_bytes(
 		XFS_BUF_UNDONE(bp);
 		XFS_BUF_UNREAD(bp);
 		XFS_BUF_WRITE(bp);
-		xfsbdstrat(mp, bp);
+
+		if (XFS_FORCED_SHUTDOWN(mp)) {
+			error = XFS_ERROR(EIO);
+			break;
+		}
+		xfs_buf_iorequest(bp);
 		error = xfs_buf_iowait(bp);
 		if (error) {
 			xfs_buf_ioerror_alert(bp,
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c7f0b77dcb00..afe7645e4b2b 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -698,7 +698,11 @@ xfs_buf_read_uncached(
 	bp->b_flags |= XBF_READ;
 	bp->b_ops = ops;
 
-	xfsbdstrat(target->bt_mount, bp);
+	if (XFS_FORCED_SHUTDOWN(target->bt_mount)) {
+		xfs_buf_relse(bp);
+		return NULL;
+	}
+	xfs_buf_iorequest(bp);
 	xfs_buf_iowait(bp);
 	return bp;
 }
@@ -1089,7 +1093,7 @@ xfs_bioerror(
  * This is meant for userdata errors; metadata bufs come with
  * iodone functions attached, so that we can track down errors.
  */
-STATIC int
+int
 xfs_bioerror_relse(
 	struct xfs_buf	*bp)
 {
@@ -1152,7 +1156,7 @@ xfs_bwrite(
 	ASSERT(xfs_buf_islocked(bp));
 
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL);
 
 	xfs_bdstrat_cb(bp);
 
@@ -1164,25 +1168,6 @@ xfs_bwrite(
 	return error;
 }
 
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
-{
-	if (XFS_FORCED_SHUTDOWN(mp)) {
-		trace_xfs_bdstrat_shut(bp, _RET_IP_);
-		xfs_bioerror_relse(bp);
-		return;
-	}
-
-	xfs_buf_iorequest(bp);
-}
-
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
@@ -1516,6 +1501,12 @@ xfs_wait_buftarg(
 			struct xfs_buf *bp;
 			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 			list_del_init(&bp->b_lru);
+			if (bp->b_flags & XBF_WRITE_FAIL) {
+				xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+					(long long)bp->b_bn);
+			}
 			xfs_buf_rele(bp);
 		}
 		if (loop++ != 0)
@@ -1799,7 +1790,7 @@ __xfs_buf_delwri_submit(
 
 	blk_start_plug(&plug);
 	list_for_each_entry_safe(bp, n, io_list, b_list) {
-		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL);
 		bp->b_flags |= XBF_WRITE;
 
 		if (!wait) {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e65683361017..1cf21a4a9f22 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -45,6 +45,7 @@ typedef enum {
 #define XBF_ASYNC	 (1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE	 (1 << 5) /* all pages in the buffer uptodate */
 #define XBF_STALE	 (1 << 6) /* buffer has been staled, do not find it */
+#define XBF_WRITE_FAIL	 (1 << 24)/* async writes have failed on this buffer */
 
 /* I/O hints for the BIO layer */
 #define XBF_SYNCIO	 (1 << 10)/* treat this buffer as synchronous I/O */
@@ -70,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_ASYNC,		"ASYNC" }, \
 	{ XBF_DONE,		"DONE" }, \
 	{ XBF_STALE,		"STALE" }, \
+	{ XBF_WRITE_FAIL,	"WRITE_FAIL" }, \
 	{ XBF_SYNCIO,		"SYNCIO" }, \
 	{ XBF_FUA,		"FUA" }, \
 	{ XBF_FLUSH,		"FLUSH" }, \
@@ -80,6 +82,7 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
 	{ _XBF_COMPOUND,	"COMPOUND" }
 
+
 /*
  * Internal state flags.
  */
@@ -269,9 +272,6 @@ extern void xfs_buf_unlock(xfs_buf_t *);
 
 /* Buffer Read and Write Routines */
 extern int xfs_bwrite(struct xfs_buf *bp);
-
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-
 extern void xfs_buf_ioend(xfs_buf_t *,	int);
 extern void xfs_buf_ioerror(xfs_buf_t *, int);
 extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func);
@@ -282,6 +282,8 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
 #define xfs_buf_zero(bp, off, len) \
 	    xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
 
+extern int xfs_bioerror_relse(struct xfs_buf *);
+
 static inline int xfs_buf_geterror(xfs_buf_t *bp)
 {
 	return bp ? bp->b_error : ENOMEM;
@@ -301,7 +303,8 @@ extern void xfs_buf_terminate(void);
 
 #define XFS_BUF_ZEROFLAGS(bp) \
 	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
-			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
+			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH| \
+			    XBF_WRITE_FAIL))
 
 void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a64f67ba25d3..2227b9b050bb 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -496,6 +496,14 @@ xfs_buf_item_unpin(
 	}
 }
 
+/*
+ * Buffer IO error rate limiting. Limit it to no more than 10 messages per 30
+ * seconds so as to not spam logs too much on repeated detection of the same
+ * buffer being bad..
+ */
+
+DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10);
+
 STATIC uint
 xfs_buf_item_push(
 	struct xfs_log_item	*lip,
@@ -524,6 +532,14 @@ xfs_buf_item_push(
 
 	trace_xfs_buf_item_push(bip);
 
+	/* has a previous flush failed due to IO errors? */
+	if ((bp->b_flags & XBF_WRITE_FAIL) &&
+	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
+		xfs_warn(bp->b_target->bt_mount,
+"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+			 (long long)bp->b_bn);
+	}
+
 	if (!xfs_buf_delwri_queue(bp, buffer_list))
 		rval = XFS_ITEM_FLUSHING;
 	xfs_buf_unlock(bp);
@@ -1096,8 +1112,9 @@ xfs_buf_iodone_callbacks(
 
 		xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
 
-		if (!XFS_BUF_ISSTALE(bp)) {
-			bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+		if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) {
+			bp->b_flags |= XBF_WRITE | XBF_ASYNC |
+				       XBF_DONE | XBF_WRITE_FAIL;
 			xfs_buf_iorequest(bp);
 		} else {
 			xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 56369d4509d5..48c7d18f68c3 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -2067,12 +2067,12 @@ xfs_dir2_node_lookup(
  */
 int						/* error */
 xfs_dir2_node_removename(
-	xfs_da_args_t		*args)		/* operation arguments */
+	struct xfs_da_args	*args)		/* operation arguments */
 {
-	xfs_da_state_blk_t	*blk;		/* leaf block */
+	struct xfs_da_state_blk	*blk;		/* leaf block */
 	int			error;		/* error return value */
 	int			rval;		/* operation return value */
-	xfs_da_state_t		*state;		/* btree cursor */
+	struct xfs_da_state	*state;		/* btree cursor */
 
 	trace_xfs_dir2_node_removename(args);
 
@@ -2084,19 +2084,18 @@ xfs_dir2_node_removename(
 	state->mp = args->dp->i_mount;
 	state->blocksize = state->mp->m_dirblksize;
 	state->node_ents = state->mp->m_dir_node_ents;
-	/*
-	 * Look up the entry we're deleting, set up the cursor.
-	 */
+
+	/* Look up the entry we're deleting, set up the cursor. */
 	error = xfs_da3_node_lookup_int(state, &rval);
 	if (error)
-		rval = error;
-	/*
-	 * Didn't find it, upper layer screwed up.
-	 */
+		goto out_free;
+
+	/* Didn't find it, upper layer screwed up. */
 	if (rval != EEXIST) {
-		xfs_da_state_free(state);
-		return rval;
+		error = rval;
+		goto out_free;
 	}
+
 	blk = &state->path.blk[state->path.active - 1];
 	ASSERT(blk->magic == XFS_DIR2_LEAFN_MAGIC);
 	ASSERT(state->extravalid);
@@ -2107,7 +2106,7 @@ xfs_dir2_node_removename(
 	error = xfs_dir2_leafn_remove(args, blk->bp, blk->index,
 		&state->extrablk, &rval);
 	if (error)
-		return error;
+		goto out_free;
 	/*
 	 * Fix the hash values up the btree.
 	 */
@@ -2122,6 +2121,7 @@ xfs_dir2_node_removename(
 	 */
 	if (!error)
 		error = xfs_dir2_node_to_leaf(state);
+out_free:
 	xfs_da_state_free(state);
 	return error;
 }
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index 8367d6dc18c9..4f11ef011139 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -157,7 +157,7 @@ xfs_ioc_trim(
 	struct xfs_mount		*mp,
 	struct fstrim_range __user	*urange)
 {
-	struct request_queue	*q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+	struct request_queue	*q = bdev_get_queue(mp->m_ddev_targp->bt_bdev);
 	unsigned int		granularity = q->limits.discard_granularity;
 	struct fstrim_range	range;
 	xfs_daddr_t		start, end, minlen;
@@ -180,7 +180,8 @@ xfs_ioc_trim(
 	 * matter as trimming blocks is an advisory interface.
 	 */
 	if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
-	    range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)))
+	    range.minlen > XFS_FSB_TO_B(mp, XFS_ALLOC_AG_MAX_USABLE(mp)) ||
+	    range.len < mp->m_sb.sb_blocksize)
 		return -XFS_ERROR(EINVAL);
 
 	start = BTOBB(range.start);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index a6e54b3319bd..02fb943cbf22 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -220,6 +220,8 @@ xfs_growfs_data_private(
 	 */
 	nfree = 0;
 	for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
+		__be32	*agfl_bno;
+
 		/*
 		 * AG freespace header block
 		 */
@@ -279,8 +281,10 @@ xfs_growfs_data_private(
 			agfl->agfl_seqno = cpu_to_be32(agno);
 			uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_uuid);
 		}
+
+		agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp);
 		for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
-			agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+			agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
 
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 4d613401a5e0..33ad9a77791f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -442,7 +442,8 @@ xfs_attrlist_by_handle(
 		return -XFS_ERROR(EPERM);
 	if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
-	if (al_hreq.buflen > XATTR_LIST_MAX)
+	if (al_hreq.buflen < sizeof(struct attrlist) ||
+	    al_hreq.buflen > XATTR_LIST_MAX)
 		return -XFS_ERROR(EINVAL);
 
 	/*
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index e8fb1231db81..a7992f8de9d3 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,8 @@ xfs_compat_attrlist_by_handle(
 	if (copy_from_user(&al_hreq, arg,
 			   sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
 		return -XFS_ERROR(EFAULT);
-	if (al_hreq.buflen > XATTR_LIST_MAX)
+	if (al_hreq.buflen < sizeof(struct attrlist) ||
+	    al_hreq.buflen > XATTR_LIST_MAX)
 		return -XFS_ERROR(EINVAL);
 
 	/*
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 27e0e544e963..104455b8046c 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -618,7 +618,8 @@ xfs_setattr_nonsize(
 		}
 		if (!gid_eq(igid, gid)) {
 			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
-				ASSERT(!XFS_IS_PQUOTA_ON(mp));
+				ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) ||
+				       !XFS_IS_PQUOTA_ON(mp));
 				ASSERT(mask & ATTR_GID);
 				ASSERT(gdqp);
 				olddquot2 = xfs_qm_vop_chown(tp, ip,
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b6b669df40f3..eae16920655b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -193,7 +193,10 @@ xlog_bread_noalign(
 	bp->b_io_length = nbblks;
 	bp->b_error = 0;
 
-	xfsbdstrat(log->l_mp, bp);
+	if (XFS_FORCED_SHUTDOWN(log->l_mp))
+		return XFS_ERROR(EIO);
+
+	xfs_buf_iorequest(bp);
 	error = xfs_buf_iowait(bp);
 	if (error)
 		xfs_buf_ioerror_alert(bp, __func__);
@@ -4397,7 +4400,13 @@ xlog_do_recover(
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
 	bp->b_ops = &xfs_sb_buf_ops;
-	xfsbdstrat(log->l_mp, bp);
+
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		xfs_buf_relse(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	xfs_buf_iorequest(bp);
 	error = xfs_buf_iowait(bp);
 	if (error) {
 		xfs_buf_ioerror_alert(bp, __func__);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 14a4996cfec6..dd88f0e27bd8 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -134,8 +134,6 @@ xfs_qm_dqpurge(
 {
 	struct xfs_mount	*mp = dqp->q_mount;
 	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	struct xfs_dquot	*gdqp = NULL;
-	struct xfs_dquot	*pdqp = NULL;
 
 	xfs_dqlock(dqp);
 	if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
@@ -143,21 +141,6 @@ xfs_qm_dqpurge(
 		return EAGAIN;
 	}
 
-	/*
-	 * If this quota has a hint attached, prepare for releasing it now.
-	 */
-	gdqp = dqp->q_gdquot;
-	if (gdqp) {
-		xfs_dqlock(gdqp);
-		dqp->q_gdquot = NULL;
-	}
-
-	pdqp = dqp->q_pdquot;
-	if (pdqp) {
-		xfs_dqlock(pdqp);
-		dqp->q_pdquot = NULL;
-	}
-
 	dqp->dq_flags |= XFS_DQ_FREEING;
 
 	xfs_dqflock(dqp);
@@ -206,11 +189,47 @@ xfs_qm_dqpurge(
 	XFS_STATS_DEC(xs_qm_dquot_unused);
 
 	xfs_qm_dqdestroy(dqp);
+	return 0;
+}
+
+/*
+ * Release the group or project dquot pointers the user dquots maybe carrying
+ * around as a hint, and proceed to purge the user dquot cache if requested.
+*/
+STATIC int
+xfs_qm_dqpurge_hints(
+	struct xfs_dquot	*dqp,
+	void			*data)
+{
+	struct xfs_dquot	*gdqp = NULL;
+	struct xfs_dquot	*pdqp = NULL;
+	uint			flags = *((uint *)data);
+
+	xfs_dqlock(dqp);
+	if (dqp->dq_flags & XFS_DQ_FREEING) {
+		xfs_dqunlock(dqp);
+		return EAGAIN;
+	}
+
+	/* If this quota has a hint attached, prepare for releasing it now */
+	gdqp = dqp->q_gdquot;
+	if (gdqp)
+		dqp->q_gdquot = NULL;
+
+	pdqp = dqp->q_pdquot;
+	if (pdqp)
+		dqp->q_pdquot = NULL;
+
+	xfs_dqunlock(dqp);
 
 	if (gdqp)
-		xfs_qm_dqput(gdqp);
+		xfs_qm_dqrele(gdqp);
 	if (pdqp)
-		xfs_qm_dqput(pdqp);
+		xfs_qm_dqrele(pdqp);
+
+	if (flags & XFS_QMOPT_UQUOTA)
+		return xfs_qm_dqpurge(dqp, NULL);
+
 	return 0;
 }
 
@@ -222,8 +241,18 @@ xfs_qm_dqpurge_all(
 	struct xfs_mount	*mp,
 	uint			flags)
 {
-	if (flags & XFS_QMOPT_UQUOTA)
-		xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
+	/*
+	 * We have to release group/project dquot hint(s) from the user dquot
+	 * at first if they are there, otherwise we would run into an infinite
+	 * loop while walking through radix tree to purge other type of dquots
+	 * since their refcount is not zero if the user dquot refers to them
+	 * as hint.
+	 *
+	 * Call the special xfs_qm_dqpurge_hints() will end up go through the
+	 * general xfs_qm_dqpurge() against user dquot cache if requested.
+	 */
+	xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge_hints, &flags);
+
 	if (flags & XFS_QMOPT_GQUOTA)
 		xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
 	if (flags & XFS_QMOPT_PQUOTA)
@@ -2082,24 +2111,21 @@ xfs_qm_vop_create_dqattach(
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
 	ASSERT(XFS_IS_QUOTA_RUNNING(mp));
 
-	if (udqp) {
+	if (udqp && XFS_IS_UQUOTA_ON(mp)) {
 		ASSERT(ip->i_udquot == NULL);
-		ASSERT(XFS_IS_UQUOTA_ON(mp));
 		ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
 
 		ip->i_udquot = xfs_qm_dqhold(udqp);
 		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
-	if (gdqp) {
+	if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
 		ASSERT(ip->i_gdquot == NULL);
-		ASSERT(XFS_IS_GQUOTA_ON(mp));
 		ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
 		ip->i_gdquot = xfs_qm_dqhold(gdqp);
 		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
-	if (pdqp) {
+	if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
 		ASSERT(ip->i_pdquot == NULL);
-		ASSERT(XFS_IS_PQUOTA_ON(mp));
 		ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
 
 		ip->i_pdquot = xfs_qm_dqhold(pdqp);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index c035d11b7734..647b6f1d8923 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -314,7 +314,18 @@ xfs_trans_read_buf_map(
 			ASSERT(bp->b_iodone == NULL);
 			XFS_BUF_READ(bp);
 			bp->b_ops = ops;
-			xfsbdstrat(tp->t_mountp, bp);
+
+			/*
+			 * XXX(hch): clean up the error handling here to be less
+			 * of a mess..
+			 */
+			if (XFS_FORCED_SHUTDOWN(mp)) {
+				trace_xfs_bdstrat_shut(bp, _RET_IP_);
+				xfs_bioerror_relse(bp);
+			} else {
+				xfs_buf_iorequest(bp);
+			}
+
 			error = xfs_buf_iowait(bp);
 			if (error) {
 				xfs_buf_ioerror_alert(bp, __func__);