From b296821a7c42fa58baa17513b2b7b30ae66f3336 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 10 Apr 2016 20:48:24 -0400
Subject: xattr_handler: pass dentry and inode as separate arguments of ->get()

... and do not assume they are already attached to each other

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/shmem.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index 9428c51ab2d6..00d5d025eece 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2646,10 +2646,10 @@ static int shmem_initxattrs(struct inode *inode,
 }
 
 static int shmem_xattr_handler_get(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   void *buffer, size_t size)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, void *buffer, size_t size)
 {
-	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	name = xattr_full_name(handler, name);
 	return simple_xattr_get(&info->xattrs, name, buffer, size);
-- 
cgit v1.2.3


From 8efd755ac2fe262d4c8d5c9bbe054bb67dae93da Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Thu, 28 Apr 2016 11:39:12 +0200
Subject: mm/mmu_context, sched/core: Fix mmu_context.h assumption

Some architectures (such as Alpha) rely on include/linux/sched.h definitions
in their mmu_context.h files.

So include sched.h before mmu_context.h.

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 mm/mmu_context.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index f802c2d216a7..6f4d27c5bb32 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -4,9 +4,9 @@
  */
 
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/mmu_context.h>
 #include <linux/export.h>
-#include <linux/sched.h>
 
 #include <asm/mmu_context.h>
 
-- 
cgit v1.2.3


From c64fb5c7448c1a0cfa163f126df3c112b6ca3e97 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:51:55 -0700
Subject: filemap: remove pos variables in generic_file_read_iter

Just use ki_pos directly to make everyone's life easier.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 mm/filemap.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index f2479af09da9..5885925cdb5b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1838,8 +1838,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	ssize_t retval = 0;
-	loff_t *ppos = &iocb->ki_pos;
-	loff_t pos = *ppos;
 	size_t count = iov_iter_count(iter);
 
 	if (!count)
@@ -1851,15 +1849,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		loff_t size;
 
 		size = i_size_read(inode);
-		retval = filemap_write_and_wait_range(mapping, pos,
-					pos + count - 1);
+		retval = filemap_write_and_wait_range(mapping, iocb->ki_pos,
+					iocb->ki_pos + count - 1);
 		if (!retval) {
 			struct iov_iter data = *iter;
-			retval = mapping->a_ops->direct_IO(iocb, &data, pos);
+			retval = mapping->a_ops->direct_IO(iocb, &data,
+					iocb->ki_pos);
 		}
 
 		if (retval > 0) {
-			*ppos = pos + retval;
+			iocb->ki_pos += retval;
 			iov_iter_advance(iter, retval);
 		}
 
@@ -1872,14 +1871,14 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		 * the rest of the read.  Buffered reads will not work for
 		 * DAX files, so don't bother trying.
 		 */
-		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
+		if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size ||
 		    IS_DAX(inode)) {
 			file_accessed(file);
 			goto out;
 		}
 	}
 
-	retval = do_generic_file_read(file, ppos, iter, retval);
+	retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval);
 out:
 	return retval;
 }
-- 
cgit v1.2.3


From 1af5bb491fbb41c8dab9d728a92758dd6a28afd4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:51:56 -0700
Subject: filemap: remove the pos argument to generic_file_direct_write

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/file.c    | 9 ++++-----
 fs/fuse/file.c     | 2 +-
 include/linux/fs.h | 2 +-
 mm/filemap.c       | 5 +++--
 4 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8d7b5a45c005..6c376311a9d7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1703,18 +1703,17 @@ again:
 	return num_written ? num_written : ret;
 }
 
-static ssize_t __btrfs_direct_write(struct kiocb *iocb,
-				    struct iov_iter *from,
-				    loff_t pos)
+static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
+	loff_t pos = iocb->ki_pos;
 	ssize_t written;
 	ssize_t written_buffered;
 	loff_t endbyte;
 	int err;
 
-	written = generic_file_direct_write(iocb, from, pos);
+	written = generic_file_direct_write(iocb, from);
 
 	if (written < 0 || !iov_iter_count(from))
 		return written;
@@ -1832,7 +1831,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 		atomic_inc(&BTRFS_I(inode)->sync_writers);
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
-		num_written = __btrfs_direct_write(iocb, from, pos);
+		num_written = __btrfs_direct_write(iocb, from);
 	} else {
 		num_written = __btrfs_buffered_write(file, from, pos);
 		if (num_written > 0)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 719924d6c706..7e8c4603d43a 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1186,7 +1186,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		loff_t pos = iocb->ki_pos;
-		written = generic_file_direct_write(iocb, from, pos);
+		written = generic_file_direct_write(iocb, from);
 		if (written < 0 || !iov_iter_count(from))
 			goto out;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 70e61b58baaf..e9eaa2074061 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2703,7 +2703,7 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
-extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t);
+extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t);
 
 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos);
diff --git a/mm/filemap.c b/mm/filemap.c
index 5885925cdb5b..e7108c31346d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2499,11 +2499,12 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 EXPORT_SYMBOL(pagecache_write_end);
 
 ssize_t
-generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos)
+generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file	*file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode	*inode = mapping->host;
+	loff_t		pos = iocb->ki_pos;
 	ssize_t		written;
 	size_t		write_len;
 	pgoff_t		end;
@@ -2717,7 +2718,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		loff_t pos, endbyte;
 
-		written = generic_file_direct_write(iocb, from, iocb->ki_pos);
+		written = generic_file_direct_write(iocb, from);
 		/*
 		 * If the write stopped short of completing, fall back to
 		 * buffered writes.  Some filesystems do this for writes to
-- 
cgit v1.2.3


From c8b8e32d700fe943a935e435ae251364d016c497 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:51:58 -0700
Subject: direct-io: eliminate the offset argument to ->direct_IO

Including blkdev_direct_IO and dax_do_io.  It has to be ki_pos to actually
work, so eliminate the superfluous argument.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/Locking          |  2 +-
 Documentation/filesystems/vfs.txt          |  2 +-
 drivers/staging/lustre/lustre/llite/rw26.c |  4 ++--
 fs/9p/vfs_addr.c                           |  3 ++-
 fs/affs/file.c                             |  5 +++--
 fs/block_dev.c                             |  6 +++---
 fs/btrfs/inode.c                           |  6 +++---
 fs/ceph/addr.c                             |  3 +--
 fs/cifs/file.c                             |  2 +-
 fs/dax.c                                   |  4 ++--
 fs/direct-io.c                             |  7 ++++---
 fs/exofs/inode.c                           |  3 +--
 fs/ext2/inode.c                            |  8 ++++----
 fs/ext4/ext4.h                             |  3 +--
 fs/ext4/indirect.c                         | 12 ++++++------
 fs/ext4/inode.c                            | 18 +++++++++---------
 fs/f2fs/data.c                             |  6 +++---
 fs/fat/inode.c                             |  6 +++---
 fs/fuse/file.c                             |  3 ++-
 fs/gfs2/aops.c                             |  6 +++---
 fs/hfs/inode.c                             |  7 +++----
 fs/hfsplus/inode.c                         |  7 +++----
 fs/jfs/inode.c                             |  7 +++----
 fs/nfs/direct.c                            | 17 +++++++----------
 fs/nfs/file.c                              |  2 +-
 fs/nilfs2/inode.c                          |  4 ++--
 fs/ocfs2/aops.c                            |  9 ++++-----
 fs/reiserfs/inode.c                        |  7 +++----
 fs/udf/file.c                              |  3 +--
 fs/udf/inode.c                             |  7 +++----
 fs/xfs/xfs_aops.c                          |  7 +++----
 fs/xfs/xfs_file.c                          |  2 +-
 include/linux/dax.h                        |  2 +-
 include/linux/fs.h                         |  9 ++++-----
 include/linux/nfs_fs.h                     |  5 ++---
 mm/filemap.c                               |  5 ++---
 mm/page_io.c                               |  2 +-
 37 files changed, 99 insertions(+), 112 deletions(-)

(limited to 'mm')

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 619af9bfdcb3..75eea7ce3d7c 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -194,7 +194,7 @@ prototypes:
 	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
-	int (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset);
+	int (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	int (*migratepage)(struct address_space *, struct page *, struct page *);
 	int (*launder_page)(struct page *);
 	int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4164bd6397a2..c61a223ef3ff 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -591,7 +591,7 @@ struct address_space_operations {
 	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, int);
 	void (*freepage)(struct page *);
-	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset);
+	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	/* migrate the contents of a page to the specified target */
 	int (*migratepage) (struct page *, struct page *);
 	int (*launder_page) (struct page *);
diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c
index 69aa15e8e3ef..0c3459c1a518 100644
--- a/drivers/staging/lustre/lustre/llite/rw26.c
+++ b/drivers/staging/lustre/lustre/llite/rw26.c
@@ -358,14 +358,14 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io,
  */
 #define MAX_DIO_SIZE ((KMALLOC_MAX_SIZE / sizeof(struct brw_page) *	  \
 		       PAGE_SIZE) & ~(DT_MAX_BRW_SIZE - 1))
-static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t file_offset)
+static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct lu_env *env;
 	struct cl_io *io;
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ccc_object *obj = cl_inode2ccc(inode);
+	loff_t file_offset = iocb->ki_pos;
 	ssize_t count = iov_iter_count(iter);
 	ssize_t tot_bytes = 0, result = 0;
 	struct ll_inode_info *lli = ll_i2info(inode);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index ac9225e86bf3..c37fb9c08970 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -245,9 +245,10 @@ static int v9fs_launder_page(struct page *page)
  *
  */
 static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
+	loff_t pos = iocb->ki_pos;
 	ssize_t n;
 	int err = 0;
 	if (iov_iter_rw(iter) == WRITE) {
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 0cde550050e8..0deec9cc2362 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -389,12 +389,13 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
 }
 
 static ssize_t
-affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 	if (iov_iter_rw(iter) == WRITE) {
@@ -404,7 +405,7 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 			return 0;
 	}
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, affs_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		affs_write_failed(mapping, offset + count);
 	return ret;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02b77c4..9e1f3fe25753 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -162,15 +162,15 @@ static struct inode *bdev_file_inode(struct file *file)
 }
 
 static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = bdev_file_inode(file);
 
 	if (IS_DAX(inode))
-		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
+		return dax_do_io(iocb, inode, iter, blkdev_get_block,
 				NULL, DIO_SKIP_DIO_COUNT);
-	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset,
+	return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter,
 				    blkdev_get_block, NULL, NULL,
 				    DIO_SKIP_DIO_COUNT);
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2aaba58b4856..352d4e1dc985 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -8541,13 +8541,13 @@ out:
 	return retval;
 }
 
-static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
+static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_data dio_data = { 0 };
+	loff_t offset = iocb->ki_pos;
 	size_t count = 0;
 	int flags = 0;
 	bool wakeup = true;
@@ -8607,7 +8607,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	ret = __blockdev_direct_IO(iocb, inode,
 				   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-				   iter, offset, btrfs_get_blocks_direct, NULL,
+				   iter, btrfs_get_blocks_direct, NULL,
 				   btrfs_submit_direct, flags);
 	if (iov_iter_rw(iter) == WRITE) {
 		current->journal_info = NULL;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 4801571f51cb..43098cd9602b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1292,8 +1292,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
  * intercept O_DIRECT reads and writes early, this function should
  * never get called.
  */
-static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t pos)
+static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
 {
 	WARN_ON(1);
 	return -EINVAL;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index c03d0744648b..cb070aa88e57 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -3854,7 +3854,7 @@ void cifs_oplock_break(struct work_struct *work)
  * Direct IO is not yet supported in the cached mode. 
  */
 static ssize_t
-cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter)
 {
         /*
          * FIXME
diff --git a/fs/dax.c b/fs/dax.c
index 75ba46d82a76..0dbe4e0f16fe 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -244,7 +244,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  * @iocb: The control block for this I/O
  * @inode: The file which the I/O is directed at
  * @iter: The addresses to do I/O from or to
- * @pos: The file offset where the I/O starts
  * @get_block: The filesystem method used to translate file offsets to blocks
  * @end_io: A filesystem callback for I/O completion
  * @flags: See below
@@ -257,11 +256,12 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
  * is in progress.
  */
 ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
-		  struct iov_iter *iter, loff_t pos, get_block_t get_block,
+		  struct iov_iter *iter, get_block_t get_block,
 		  dio_iodone_t end_io, int flags)
 {
 	struct buffer_head bh;
 	ssize_t retval = -EINVAL;
+	loff_t pos = iocb->ki_pos;
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 472037732daf..8949d3e35756 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1113,7 +1113,7 @@ static inline int drop_refcount(struct dio *dio)
 static inline ssize_t
 do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 		      struct block_device *bdev, struct iov_iter *iter,
-		      loff_t offset, get_block_t get_block, dio_iodone_t end_io,
+		      get_block_t get_block, dio_iodone_t end_io,
 		      dio_submit_t submit_io, int flags)
 {
 	unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
@@ -1121,6 +1121,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	loff_t end = offset + count;
 	struct dio *dio;
 	struct dio_submit sdio = { 0, };
@@ -1328,7 +1329,7 @@ out:
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,
-			     loff_t offset, get_block_t get_block,
+			     get_block_t get_block,
 			     dio_iodone_t end_io, dio_submit_t submit_io,
 			     int flags)
 {
@@ -1344,7 +1345,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	prefetch(bdev->bd_queue);
 	prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES);
 
-	return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block,
+	return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block,
 				     end_io, submit_io, flags);
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 49e1bd00b4ec..9dc4c6dbf3c9 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -960,8 +960,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset,
 
 
  /* TODO: Should be easy enough to do proprly */
-static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
+static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	return 0;
 }
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 6bd58e6ff038..b675610391b8 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -854,20 +854,20 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block)
 }
 
 static ssize_t
-ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL,
+		ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL,
 				DIO_LOCKING);
 	else
-		ret = blockdev_direct_IO(iocb, inode, iter, offset,
-					 ext2_get_block);
+		ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
 	return ret;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 349afebe21ee..72f4c9e00e97 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -2581,8 +2581,7 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 				struct ext4_map_blocks *map, int flags);
-extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset);
+extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
 extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 3027fa681de5..627b7e8f9ef3 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -659,12 +659,12 @@ out:
  * crashes then stale disk data _may_ be exposed inside the file. But current
  * VFS code falls back into buffered path in that case so we are safe.
  */
-ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			   loff_t offset)
+ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct ext4_inode_info *ei = EXT4_I(inode);
+	loff_t offset = iocb->ki_pos;
 	handle_t *handle;
 	ssize_t ret;
 	int orphan = 0;
@@ -707,21 +707,21 @@ retry:
 			goto locked;
 		}
 		if (IS_DAX(inode))
-			ret = dax_do_io(iocb, inode, iter, offset,
+			ret = dax_do_io(iocb, inode, iter,
 					ext4_dio_get_block, NULL, 0);
 		else
 			ret = __blockdev_direct_IO(iocb, inode,
 						   inode->i_sb->s_bdev, iter,
-						   offset, ext4_dio_get_block,
+						   ext4_dio_get_block,
 						   NULL, NULL, 0);
 		inode_dio_end(inode);
 	} else {
 locked:
 		if (IS_DAX(inode))
-			ret = dax_do_io(iocb, inode, iter, offset,
+			ret = dax_do_io(iocb, inode, iter,
 					ext4_dio_get_block, NULL, DIO_LOCKING);
 		else
-			ret = blockdev_direct_IO(iocb, inode, iter, offset,
+			ret = blockdev_direct_IO(iocb, inode, iter,
 						 ext4_dio_get_block);
 
 		if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 981a1fc30eaa..79b298d397b4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3334,12 +3334,12 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
  * if the machine crashes during the write.
  *
  */
-static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset)
+static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	ssize_t ret;
+	loff_t offset = iocb->ki_pos;
 	size_t count = iov_iter_count(iter);
 	int overwrite = 0;
 	get_block_t *get_block_func = NULL;
@@ -3348,7 +3348,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	/* Use the old path for reads and writes beyond i_size. */
 	if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
-		return ext4_ind_direct_IO(iocb, iter, offset);
+		return ext4_ind_direct_IO(iocb, iter);
 
 	BUG_ON(iocb->private == NULL);
 
@@ -3400,11 +3400,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode));
 #endif
 	if (IS_DAX(inode))
-		ret = dax_do_io(iocb, inode, iter, offset, get_block_func,
+		ret = dax_do_io(iocb, inode, iter, get_block_func,
 				ext4_end_io_dio, dio_flags);
 	else
 		ret = __blockdev_direct_IO(iocb, inode,
-					   inode->i_sb->s_bdev, iter, offset,
+					   inode->i_sb->s_bdev, iter,
 					   get_block_func,
 					   ext4_end_io_dio, NULL, dio_flags);
 
@@ -3431,12 +3431,12 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	return ret;
 }
 
-static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
+static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3456,9 +3456,9 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
-		ret = ext4_ext_direct_IO(iocb, iter, offset);
+		ret = ext4_ext_direct_IO(iocb, iter);
 	else
-		ret = ext4_ind_direct_IO(iocb, iter, offset);
+		ret = ext4_ind_direct_IO(iocb, iter);
 	trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
 	return ret;
 }
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 53fec0872e60..a4c5da5bfe1e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1655,12 +1655,12 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
 	return 0;
 }
 
-static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
+static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct address_space *mapping = iocb->ki_filp->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	int err;
 
 	err = check_direct_IO(inode, iter, offset);
@@ -1672,7 +1672,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 
-	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
+	err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
 	if (err < 0 && iov_iter_rw(iter) == WRITE)
 		f2fs_write_failed(mapping, offset + count);
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 226281068a46..3bcf57925dca 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -244,13 +244,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
 	return err;
 }
 
-static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
 	if (iov_iter_rw(iter) == WRITE) {
@@ -272,7 +272,7 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 * FAT need to use the DIO_LOCKING for avoiding the race
 	 * condition of fat_get_block() and ->truncate().
 	 */
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		fat_write_failed(mapping, offset + count);
 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 7e8c4603d43a..02279073bf64 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2837,7 +2837,7 @@ static inline loff_t fuse_round_up(loff_t off)
 }
 
 static ssize_t
-fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
 	ssize_t ret = 0;
@@ -2848,6 +2848,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 	struct inode *inode;
 	loff_t i_size;
 	size_t count = iov_iter_count(iter);
+	loff_t offset = iocb->ki_pos;
 	struct fuse_io_priv *io;
 	bool is_sync = is_sync_kiocb(iocb);
 
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1bbbee945f46..8524c0e322fc 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -1042,13 +1042,13 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset)
 
 
-static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
+static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	struct address_space *mapping = inode->i_mapping;
 	struct gfs2_inode *ip = GFS2_I(inode);
+	loff_t offset = iocb->ki_pos;
 	struct gfs2_holder gh;
 	int rv;
 
@@ -1099,7 +1099,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	}
 
 	rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				  offset, gfs2_get_block_direct, NULL, NULL, 0);
+				  gfs2_get_block_direct, NULL, NULL, 0);
 out:
 	gfs2_glock_dq(&gh);
 	gfs2_holder_uninit(&gh);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index cb1e5faa2fb7..c82331a9cf9b 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -124,8 +124,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask)
 	return res ? try_to_free_buffers(page) : 0;
 }
 
-static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -133,7 +132,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
@@ -141,7 +140,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if (end > isize)
 			hfs_write_failed(mapping, end);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index b28f39865c3a..2ad34a5eb5ad 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -122,8 +122,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
 	return res ? try_to_free_buffers(page) : 0;
 }
 
-static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				 loff_t offset)
+static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -131,7 +130,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, hfsplus_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
@@ -139,7 +138,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if (end > isize)
 			hfsplus_write_failed(mapping, end);
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 9d9bae63ae2a..f6a2a78121b0 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -332,8 +332,7 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, jfs_get_block);
 }
 
-static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -341,7 +340,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, jfs_get_block);
 
 	/*
 	 * In case of error extending write may have instantiated a few
@@ -349,7 +348,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if (end > isize)
 			jfs_write_failed(mapping, end);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c93826e4a8c6..346b5d85ce92 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -250,7 +250,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
  * shunt off direct read and write requests before the VFS gets them,
  * so this method is only ever called for swap.
  */
-ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
+ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
 
@@ -261,7 +261,7 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
 
 	if (iov_iter_rw(iter) == READ)
-		return nfs_file_direct_read(iocb, iter, pos);
+		return nfs_file_direct_read(iocb, iter);
 	return nfs_file_direct_write(iocb, iter);
 }
 
@@ -545,7 +545,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_read - file direct read operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers into which to read data
- * @pos: byte offset in file where reading starts
  *
  * We use this function for direct reads instead of calling
  * generic_file_aio_read() in order to avoid gfar's check to see if
@@ -561,8 +560,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
  * client must read the updated atime from the server back into its
  * cache.
  */
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
-				loff_t pos)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -574,7 +572,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 
 	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
-		file, count, (long long) pos);
+		file, count, (long long) iocb->ki_pos);
 
 	result = 0;
 	if (!count)
@@ -594,7 +592,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 
 	dreq->inode = inode;
 	dreq->bytes_left = count;
-	dreq->io_start = pos;
+	dreq->io_start = iocb->ki_pos;
 	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
 	l_ctx = nfs_get_lock_context(dreq->ctx);
 	if (IS_ERR(l_ctx)) {
@@ -606,14 +604,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 		dreq->iocb = iocb;
 
 	NFS_I(inode)->read_io += count;
-	result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
+	result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
 
 	inode_unlock(inode);
 
 	if (!result) {
 		result = nfs_direct_wait(dreq);
 		if (result > 0)
-			iocb->ki_pos = pos + result;
+			iocb->ki_pos += result;
 	}
 
 	nfs_direct_req_release(dreq);
@@ -969,7 +967,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
  * nfs_file_direct_write - file direct write operation for NFS files
  * @iocb: target I/O control block
  * @iter: vector of user buffers from which to write data
- * @pos: byte offset in file where writing starts
  *
  * We use this function for direct writes instead of calling
  * generic_file_aio_write() in order to avoid taking the inode
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index be01095b97ae..717a8d6af52d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -164,7 +164,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
 	ssize_t result;
 
 	if (iocb->ki_flags & IOCB_DIRECT)
-		return nfs_file_direct_read(iocb, to, iocb->ki_pos);
+		return nfs_file_direct_read(iocb, to);
 
 	dprintk("NFS: read(%pD2, %zu@%lu)\n",
 		iocb->ki_filp,
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 534631358b13..cfebcd2fc7f3 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -305,7 +305,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping,
 }
 
 static ssize_t
-nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
 
@@ -313,7 +313,7 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 		return 0;
 
 	/* Needs synchronization with the cleaner */
-	return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
+	return blockdev_direct_IO(iocb, inode, iter, nilfs_get_block);
 }
 
 const struct address_space_operations nilfs_aops = {
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ad1577348a92..6c66c62d4a7e 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2423,13 +2423,11 @@ static int ocfs2_dio_end_io(struct kiocb *iocb,
 	return 0;
 }
 
-static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			       loff_t offset)
+static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file)->i_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	loff_t end = offset + iter->count;
 	get_block_t *get_block;
 
 	/*
@@ -2440,7 +2438,8 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		return 0;
 
 	/* Fallback to buffered I/O if we do not support append dio. */
-	if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb))
+	if (iocb->ki_pos + iter->count > i_size_read(inode) &&
+	    !ocfs2_supports_append_dio(osb))
 		return 0;
 
 	if (iov_iter_rw(iter) == READ)
@@ -2449,7 +2448,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		get_block = ocfs2_dio_get_block;
 
 	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-				    iter, offset, get_block,
+				    iter, get_block,
 				    ocfs2_dio_end_io, NULL, 0);
 }
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d5c2e9c865de..825455d3e4ba 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -3279,15 +3279,14 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags)
  * We thank Mingming Cao for helping us understand in great detail what
  * to do in this section of the code.
  */
-static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				  loff_t offset)
+static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset,
+	ret = blockdev_direct_IO(iocb, inode, iter,
 				 reiserfs_get_blocks_direct_io);
 
 	/*
@@ -3296,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 */
 	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
 		loff_t isize = i_size_read(inode);
-		loff_t end = offset + count;
+		loff_t end = iocb->ki_pos + count;
 
 		if ((end > isize) && inode_newsize_ok(inode, isize) == 0) {
 			truncate_setsize(inode, isize);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 877ba1c9b461..7ab8d8196e90 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -99,8 +99,7 @@ static int udf_adinicb_write_begin(struct file *file,
 	return 0;
 }
 
-static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-				     loff_t offset)
+static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	/* Fallback to buffered I/O. */
 	return 0;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 2dc461eeb415..f323aff740ef 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -214,8 +214,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping,
 	return ret;
 }
 
-static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			     loff_t offset)
+static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -223,9 +222,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block);
+	ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block);
 	if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE))
-		udf_write_failed(mapping, offset + count);
+		udf_write_failed(mapping, iocb->ki_pos + count);
 	return ret;
 }
 
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e49b2406d15d..c535887c60a8 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1406,8 +1406,7 @@ xfs_end_io_direct_write(
 STATIC ssize_t
 xfs_vm_direct_IO(
 	struct kiocb		*iocb,
-	struct iov_iter		*iter,
-	loff_t			offset)
+	struct iov_iter		*iter)
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	dio_iodone_t		*endio = NULL;
@@ -1420,12 +1419,12 @@ xfs_vm_direct_IO(
 	}
 
 	if (IS_DAX(inode)) {
-		return dax_do_io(iocb, inode, iter, offset,
+		return dax_do_io(iocb, inode, iter,
 				 xfs_get_blocks_direct, endio, 0);
 	}
 
 	bdev = xfs_find_bdev_for_inode(inode);
-	return  __blockdev_direct_IO(iocb, inode, bdev, iter, offset,
+	return  __blockdev_direct_IO(iocb, inode, bdev, iter,
 			xfs_get_blocks_direct, endio, NULL, flags);
 }
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 92f72fb05497..5de047ab2411 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -794,7 +794,7 @@ xfs_file_dio_aio_write(
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
 
 	data = *from;
-	ret = mapping->a_ops->direct_IO(iocb, &data, iocb->ki_pos);
+	ret = mapping->a_ops->direct_IO(iocb, &data);
 
 	/* see generic_file_direct_write() for why this is necessary */
 	if (mapping->nrpages) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 636dd59ab505..982a6c4a62f3 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -5,7 +5,7 @@
 #include <linux/mm.h>
 #include <asm/pgtable.h>
 
-ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
+ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *,
 		  get_block_t, dio_iodone_t, int flags);
 int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size);
 int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e9eaa2074061..e6b2de159736 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -394,7 +394,7 @@ struct address_space_operations {
 	void (*invalidatepage) (struct page *, unsigned int, unsigned int);
 	int (*releasepage) (struct page *, gfp_t);
 	void (*freepage)(struct page *);
-	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset);
+	ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter);
 	/*
 	 * migrate the contents of a page to the specified target. If
 	 * migrate_mode is MIGRATE_ASYNC, it must not block.
@@ -2766,18 +2766,17 @@ void dio_end_io(struct bio *bio, int error);
 
 ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			     struct block_device *bdev, struct iov_iter *iter,
-			     loff_t offset, get_block_t get_block,
+			     get_block_t get_block,
 			     dio_iodone_t end_io, dio_submit_t submit_io,
 			     int flags);
 
 static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
 					 struct inode *inode,
-					 struct iov_iter *iter, loff_t offset,
+					 struct iov_iter *iter,
 					 get_block_t get_block)
 {
 	return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
-				    offset, get_block, NULL, NULL,
-				    DIO_LOCKING | DIO_SKIP_HOLES);
+			get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES);
 }
 #endif
 
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 67300f8e5f2f..cede8f6a7e2d 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -445,10 +445,9 @@ static inline struct rpc_cred *nfs_file_cred(struct file *file)
 /*
  * linux/fs/nfs/direct.c
  */
-extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *, loff_t);
+extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *);
 extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
-			struct iov_iter *iter,
-			loff_t pos);
+			struct iov_iter *iter);
 extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
 			struct iov_iter *iter);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index e7108c31346d..cb36db9f4107 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1853,8 +1853,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 					iocb->ki_pos + count - 1);
 		if (!retval) {
 			struct iov_iter data = *iter;
-			retval = mapping->a_ops->direct_IO(iocb, &data,
-					iocb->ki_pos);
+			retval = mapping->a_ops->direct_IO(iocb, &data);
 		}
 
 		if (retval > 0) {
@@ -2538,7 +2537,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	data = *from;
-	written = mapping->a_ops->direct_IO(iocb, &data, pos);
+	written = mapping->a_ops->direct_IO(iocb, &data);
 
 	/*
 	 * Finally, try again to invalidate clean pages which might have been
diff --git a/mm/page_io.c b/mm/page_io.c
index cd92e3d67a32..89275601d399 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -279,7 +279,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
 
 		set_page_writeback(page);
 		unlock_page(page);
-		ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos);
+		ret = mapping->a_ops->direct_IO(&kiocb, &from);
 		if (ret == PAGE_SIZE) {
 			count_vm_event(PSWPOUT);
 			ret = 0;
-- 
cgit v1.2.3


From dde0c2e79848298cc25621ad080d47f94dbd7cce Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:52:00 -0700
Subject: fs: add IOCB_SYNC and IOCB_DSYNC

This will allow us to do per-I/O sync file writes, as required by a lot
of fileservers or storage targets.

XXX: Will need a few additional audits for O_DSYNC

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     |  2 +-
 fs/btrfs/file.c    |  2 +-
 fs/cifs/file.c     |  2 +-
 fs/direct-io.c     |  2 +-
 fs/ext4/file.c     |  2 +-
 fs/f2fs/file.c     |  2 +-
 fs/gfs2/file.c     |  5 ++++-
 fs/nfs/direct.c    |  2 +-
 fs/ntfs/file.c     |  2 +-
 fs/udf/file.c      |  2 +-
 fs/xfs/xfs_file.c  |  2 +-
 include/linux/fs.h | 14 ++++++++++----
 mm/filemap.c       |  2 +-
 13 files changed, 25 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 9e1f3fe25753..d8dc3512e927 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1662,7 +1662,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = __generic_file_write_iter(iocb, from);
 	if (ret > 0) {
 		ssize_t err;
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6c376311a9d7..35ce146cceec 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1852,7 +1852,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	spin_unlock(&BTRFS_I(inode)->lock);
 	if (num_written > 0) {
-		err = generic_write_sync(file, pos, num_written);
+		err = generic_write_sync(iocb, pos, num_written);
 		if (err < 0)
 			num_written = err;
 	}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index cb070aa88e57..b22b68ccfbe5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2688,7 +2688,7 @@ out:
 	inode_unlock(inode);
 
 	if (rc > 0) {
-		ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
+		ssize_t err = generic_write_sync(iocb, iocb->ki_pos - rc, rc);
 		if (err < 0)
 			rc = err;
 	}
diff --git a/fs/direct-io.c b/fs/direct-io.c
index c61314b84b01..f7bcc0193dee 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -268,7 +268,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 		if (dio->rw & WRITE) {
 			int err;
 
-			err = generic_write_sync(dio->iocb->ki_filp, offset,
+			err = generic_write_sync(dio->iocb, offset,
 						 transferred);
 			if (err < 0 && ret > 0)
 				ret = err;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index fa2208bae2e1..1417e129be51 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -172,7 +172,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 443e07705c2a..51ed8388e66c 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1885,7 +1885,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 208efc70ad49..5a7d69609309 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -895,7 +895,10 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
 		mark_inode_dirty(inode);
 	}
 
-	return generic_write_sync(file, pos, count);
+	if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
+		return vfs_fsync_range(file, pos, pos + count - 1,
+			       (file->f_flags & __O_SYNC) ? 0 : 1);
+	return 0;
 
 out_trans_fail:
 	gfs2_inplace_release(ip);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 346b5d85ce92..be86de9a77d7 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -1054,7 +1054,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 			if (i_size_read(inode) < iocb->ki_pos)
 				i_size_write(inode, iocb->ki_pos);
 			spin_unlock(&inode->i_lock);
-			generic_write_sync(file, pos, result);
+			generic_write_sync(iocb, pos, result);
 		}
 	}
 	nfs_direct_req_release(dreq);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 91117ada8528..10dc38cc02bb 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1953,7 +1953,7 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	current->backing_dev_info = NULL;
 	inode_unlock(vi);
 	if (likely(written > 0)) {
-		err = generic_write_sync(file, iocb->ki_pos, written);
+		err = generic_write_sync(iocb, iocb->ki_pos, written);
 		if (err < 0)
 			written = 0;
 	}
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 7ab8d8196e90..8e3d1ae53b11 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -152,7 +152,7 @@ out:
 
 	if (retval > 0) {
 		mark_inode_dirty(inode);
-		err = generic_write_sync(file, iocb->ki_pos - retval, retval);
+		err = generic_write_sync(iocb, iocb->ki_pos - retval, retval);
 		if (err < 0)
 			retval = err;
 	}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5de047ab2411..b5d70e77195d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -908,7 +908,7 @@ xfs_file_write_iter(
 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
 		/* Handle various SYNC-type writes */
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e6b2de159736..310ca1ed9293 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -323,6 +323,8 @@ struct writeback_control;
 #define IOCB_APPEND		(1 << 1)
 #define IOCB_DIRECT		(1 << 2)
 #define IOCB_HIPRI		(1 << 3)
+#define IOCB_DSYNC		(1 << 4)
+#define IOCB_SYNC		(1 << 5)
 
 struct kiocb {
 	struct file		*ki_filp;
@@ -2485,12 +2487,12 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 			   int datasync);
 extern int vfs_fsync(struct file *file, int datasync);
-static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count)
+static inline int generic_write_sync(struct kiocb *iocb, loff_t pos, loff_t count)
 {
-	if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host))
+	if (!(iocb->ki_flags & IOCB_DSYNC))
 		return 0;
-	return vfs_fsync_range(file, pos, pos + count - 1,
-			       (file->f_flags & __O_SYNC) ? 0 : 1);
+	return vfs_fsync_range(iocb->ki_filp, pos, pos + count - 1,
+			       (iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
 }
 extern void emergency_sync(void);
 extern void emergency_remount(void);
@@ -2942,6 +2944,10 @@ static inline int iocb_flags(struct file *file)
 		res |= IOCB_APPEND;
 	if (io_is_direct(file))
 		res |= IOCB_DIRECT;
+	if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
+		res |= IOCB_DSYNC;
+	if (file->f_flags & __O_SYNC)
+		res |= IOCB_SYNC;
 	return res;
 }
 
diff --git a/mm/filemap.c b/mm/filemap.c
index cb36db9f4107..8345d6d3436a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2794,7 +2794,7 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret > 0) {
 		ssize_t err;
 
-		err = generic_write_sync(file, iocb->ki_pos - ret, ret);
+		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
 		if (err < 0)
 			ret = err;
 	}
-- 
cgit v1.2.3


From e259221763a40403d5bb232209998e8c45804ab8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 7 Apr 2016 08:52:01 -0700
Subject: fs: simplify the generic_write_sync prototype

The kiocb already has the new position, so use that.  The only interesting
case is AIO, where we currently don't bother updating ki_pos.  We're about
to free the kiocb after we're done, so we might as well update it to make
everyone's life simpler.

While we're at it, also return the byte count passed in on success, so
that the boilerplate error-handling code in the callers can go away.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     |  8 ++------
 fs/btrfs/file.c    |  7 ++-----
 fs/cifs/file.c     |  7 ++-----
 fs/direct-io.c     | 17 +++++++++--------
 fs/ext4/file.c     |  9 ++-------
 fs/f2fs/file.c     |  9 ++-------
 fs/nfs/direct.c    |  4 +++-
 fs/ntfs/file.c     |  7 ++-----
 fs/udf/file.c      |  4 +---
 fs/xfs/xfs_file.c  |  6 +-----
 include/linux/fs.h | 24 ++++++++++++++++++------
 mm/filemap.c       |  9 ++-------
 12 files changed, 46 insertions(+), 65 deletions(-)

(limited to 'mm')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index d8dc3512e927..a063d4d8ac39 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1660,12 +1660,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	blk_start_plug(&plug);
 	ret = __generic_file_write_iter(iocb, from);
-	if (ret > 0) {
-		ssize_t err;
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	blk_finish_plug(&plug);
 	return ret;
 }
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 35ce146cceec..ea9f10bb089c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1851,11 +1851,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	spin_lock(&BTRFS_I(inode)->lock);
 	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	spin_unlock(&BTRFS_I(inode)->lock);
-	if (num_written > 0) {
-		err = generic_write_sync(iocb, pos, num_written);
-		if (err < 0)
-			num_written = err;
-	}
+	if (num_written > 0)
+		num_written = generic_write_sync(iocb, num_written);
 
 	if (sync)
 		atomic_dec(&BTRFS_I(inode)->sync_writers);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b22b68ccfbe5..9b51d4936a29 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2687,11 +2687,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
 out:
 	inode_unlock(inode);
 
-	if (rc > 0) {
-		ssize_t err = generic_write_sync(iocb, iocb->ki_pos - rc, rc);
-		if (err < 0)
-			rc = err;
-	}
+	if (rc > 0)
+		rc = generic_write_sync(iocb, rc);
 	up_read(&cinode->lock_sem);
 	return rc;
 }
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f7bcc0193dee..3bf3f20f8ecc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -256,6 +256,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (dio->end_io) {
 		int err;
 
+		// XXX: ki_pos??
 		err = dio->end_io(dio->iocb, offset, ret, dio->private);
 		if (err)
 			ret = err;
@@ -265,15 +266,15 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 		inode_dio_end(dio->inode);
 
 	if (is_async) {
-		if (dio->rw & WRITE) {
-			int err;
-
-			err = generic_write_sync(dio->iocb, offset,
-						 transferred);
-			if (err < 0 && ret > 0)
-				ret = err;
-		}
+		/*
+		 * generic_write_sync expects ki_pos to have been updated
+		 * already, but the submission path only does this for
+		 * synchronous I/O.
+		 */
+		dio->iocb->ki_pos += transferred;
 
+		if (dio->rw & WRITE)
+			ret = generic_write_sync(dio->iocb,  transferred);
 		dio->iocb->ki_complete(dio->iocb, ret, 0);
 	}
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1417e129be51..00ff6912adb3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -169,13 +169,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ret = __generic_file_write_iter(iocb, from);
 	inode_unlock(inode);
 
-	if (ret > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	if (o_direct)
 		blk_finish_plug(&plug);
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 51ed8388e66c..28f75a1fe4a7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1882,13 +1882,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 	inode_unlock(inode);
 
-	if (ret > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	return ret;
 }
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index be86de9a77d7..0b9fca040b0c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -1054,7 +1054,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 			if (i_size_read(inode) < iocb->ki_pos)
 				i_size_write(inode, iocb->ki_pos);
 			spin_unlock(&inode->i_lock);
-			generic_write_sync(iocb, pos, result);
+
+			/* XXX: should check the generic_write_sync retval */
+			generic_write_sync(iocb, result);
 		}
 	}
 	nfs_direct_req_release(dreq);
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 10dc38cc02bb..5622ed5a201e 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1952,12 +1952,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		written = ntfs_perform_write(file, from, iocb->ki_pos);
 	current->backing_dev_info = NULL;
 	inode_unlock(vi);
-	if (likely(written > 0)) {
-		err = generic_write_sync(iocb, iocb->ki_pos, written);
-		if (err < 0)
-			written = 0;
-	}
 	iocb->ki_pos += written;
+	if (likely(written > 0))
+		written = generic_write_sync(iocb, written);
 	return written ? written : err;
 }
 
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 8e3d1ae53b11..632570617327 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -152,9 +152,7 @@ out:
 
 	if (retval > 0) {
 		mark_inode_dirty(inode);
-		err = generic_write_sync(iocb, iocb->ki_pos - retval, retval);
-		if (err < 0)
-			retval = err;
+		retval = generic_write_sync(iocb, retval);
 	}
 
 	return retval;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index b5d70e77195d..cd3540997d65 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -903,14 +903,10 @@ xfs_file_write_iter(
 		ret = xfs_file_buffered_aio_write(iocb, from);
 
 	if (ret > 0) {
-		ssize_t err;
-
 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
 		/* Handle various SYNC-type writes */
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
+		ret = generic_write_sync(iocb, ret);
 	}
 	return ret;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 310ca1ed9293..f6a8ed864651 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2487,13 +2487,25 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end,
 			   int datasync);
 extern int vfs_fsync(struct file *file, int datasync);
-static inline int generic_write_sync(struct kiocb *iocb, loff_t pos, loff_t count)
-{
-	if (!(iocb->ki_flags & IOCB_DSYNC))
-		return 0;
-	return vfs_fsync_range(iocb->ki_filp, pos, pos + count - 1,
-			       (iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
+
+/*
+ * Sync the bytes written if this was a synchronous write.  Expects ki_pos
+ * to have already been updated for the write, and returns either the number
+ * of bytes passed in, or an error if syncing the file failed.
+ */
+static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
+{
+	if (iocb->ki_flags & IOCB_DSYNC) {
+		int ret = vfs_fsync_range(iocb->ki_filp,
+				iocb->ki_pos - count, iocb->ki_pos - 1,
+				(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
+		if (ret)
+			return ret;
+	}
+
+	return count;
 }
+
 extern void emergency_sync(void);
 extern void emergency_remount(void);
 #ifdef CONFIG_BLOCK
diff --git a/mm/filemap.c b/mm/filemap.c
index 8345d6d3436a..182b21825255 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2791,13 +2791,8 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ret = __generic_file_write_iter(iocb, from);
 	inode_unlock(inode);
 
-	if (ret > 0) {
-		ssize_t err;
-
-		err = generic_write_sync(iocb, iocb->ki_pos - ret, ret);
-		if (err < 0)
-			ret = err;
-	}
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_write_iter);
-- 
cgit v1.2.3


From 84e710da2a1dfacfc87f604869a4d22df91ce6cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Apr 2016 00:58:55 -0400
Subject: parallel lookups machinery, part 2

We'll need to verify that there's neither a hashed nor in-lookup
dentry with desired parent/name before adding to in-lookup set.

One possible solution would be to hold the parent's ->d_lock through
both checks, but while the in-lookup set is relatively small at any
time, dcache is not.  And holding the parent's ->d_lock through
something like __d_lookup_rcu() would suck too badly.

So we leave the parent's ->d_lock alone, which means that we watch
out for the following scenario:
	* we verify that there's no hashed match
	* existing in-lookup match gets hashed by another process
	* we verify that there's no in-lookup matches and decide
that everything's fine.

Solution: per-directory kinda-sorta seqlock, bumped around the times
we hash something that used to be in-lookup or move (and hash)
something in place of in-lookup.  Then the above would turn into
	* read the counter
	* do dcache lookup
	* if no matches found, check for in-lookup matches
	* if there had been none of those either, check if the
counter has changed; repeat if it has.

The "kinda-sorta" part is due to the fact that we don't have much spare
space in inode.  There is a spare word (shared with i_bdev/i_cdev/i_pipe),
so the counter part is not a problem, but spinlock is a different story.

We could use the parent's ->d_lock, and it would be less painful in
terms of contention, but for __d_add() it would be rather inconvenient to
grab; we could do that (using lock_parent()), but...

Fortunately, we can get serialization on the counter itself, and it
might be a good idea in general; we can use cmpxchg() in a loop to
get from even to odd and smp_store_release() from odd to even.

This commit adds the counter and updating logics; the readers will be
added in the next commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting |  8 ++++++++
 fs/dcache.c                       | 34 ++++++++++++++++++++++++++++++++--
 fs/inode.c                        |  1 +
 include/linux/fs.h                |  1 +
 mm/shmem.c                        |  3 ++-
 5 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 57bb3754a027..8810e2367fe6 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -531,3 +531,11 @@ in your dentry operations instead.
 	dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
 	in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
 	called before we attach dentry to inode.
+--
+[mandatory]
+	symlinks are no longer the only inodes that do *not* have i_bdev/i_cdev/
+	i_pipe/i_link union zeroed out at inode eviction.  As the result, you can't
+	assume that non-NULL value in ->i_link at ->destroy_inode() implies that
+	it's a symlink.  Checking ->i_mode is really needed now.  In-tree we had
+	to fix shmem_destroy_callback() that used to take that kind of shortcut;
+	watch out, since that shortcut is no longer valid.
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f1d93866e69..10988f7e5a23 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2364,6 +2364,22 @@ void d_rehash(struct dentry * entry)
 }
 EXPORT_SYMBOL(d_rehash);
 
+static inline unsigned start_dir_add(struct inode *dir)
+{
+
+	for (;;) {
+		unsigned n = dir->i_dir_seq;
+		if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+			return n;
+		cpu_relax();
+	}
+}
+
+static inline void end_dir_add(struct inode *dir, unsigned n)
+{
+	smp_store_release(&dir->i_dir_seq, n + 2);
+}
+
 void __d_lookup_done(struct dentry *dentry)
 {
 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
@@ -2375,9 +2391,14 @@ EXPORT_SYMBOL(__d_lookup_done);
 
 static inline void __d_add(struct dentry *dentry, struct inode *inode)
 {
+	struct inode *dir = NULL;
+	unsigned n;
 	spin_lock(&dentry->d_lock);
-	if (unlikely(d_in_lookup(dentry)))
+	if (unlikely(d_in_lookup(dentry))) {
+		dir = dentry->d_parent->d_inode;
+		n = start_dir_add(dir);
 		__d_lookup_done(dentry);
+	}
 	if (inode) {
 		unsigned add_flags = d_flags_for_inode(inode);
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2387,6 +2408,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 		__fsnotify_d_instantiate(dentry);
 	}
 	_d_rehash(dentry);
+	if (dir)
+		end_dir_add(dir, n);
 	spin_unlock(&dentry->d_lock);
 	if (inode)
 		spin_unlock(&inode->i_lock);
@@ -2616,6 +2639,8 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
 static void __d_move(struct dentry *dentry, struct dentry *target,
 		     bool exchange)
 {
+	struct inode *dir = NULL;
+	unsigned n;
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
 
@@ -2623,8 +2648,11 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	BUG_ON(d_ancestor(target, dentry));
 
 	dentry_lock_for_move(dentry, target);
-	if (unlikely(d_in_lookup(target)))
+	if (unlikely(d_in_lookup(target))) {
+		dir = target->d_parent->d_inode;
+		n = start_dir_add(dir);
 		__d_lookup_done(target);
+	}
 
 	write_seqcount_begin(&dentry->d_seq);
 	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
@@ -2674,6 +2702,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	write_seqcount_end(&target->d_seq);
 	write_seqcount_end(&dentry->d_seq);
 
+	if (dir)
+		end_dir_add(dir, n);
 	dentry_unlock_for_move(dentry, target);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 4202aac99464..4b884f73214e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -151,6 +151,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_bdev = NULL;
 	inode->i_cdev = NULL;
 	inode->i_link = NULL;
+	inode->i_dir_seq = 0;
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6d0fa9174a24..00cecc5a2f75 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -684,6 +684,7 @@ struct inode {
 		struct block_device	*i_bdev;
 		struct cdev		*i_cdev;
 		char			*i_link;
+		unsigned		i_dir_seq;
 	};
 
 	__u32			i_generation;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4640699b209b..e684a9140228 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3123,7 +3123,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 static void shmem_destroy_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kfree(inode->i_link);
+	if (S_ISLNK(inode->i_mode))
+		kfree(inode->i_link);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-- 
cgit v1.2.3


From 44f43e99fe70833058482d183e99fdfd11220996 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Mon, 9 May 2016 16:28:49 -0700
Subject: zsmalloc: fix zs_can_compact() integer overflow

zs_can_compact() has two race conditions in its core calculation:

unsigned long obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
				zs_stat_get(class, OBJ_USED);

1) classes are not locked, so the numbers of allocated and used
   objects can change by the concurrent ops happening on other CPUs
2) shrinker invokes it from preemptible context

Depending on the circumstances, thus, OBJ_ALLOCATED can become
less than OBJ_USED, which can result in either very high or
negative `total_scan' value calculated later in do_shrink_slab().

do_shrink_slab() has some logic to prevent those cases:

 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62
 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62
 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-64
 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62
 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62
 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62

However, due to the way `total_scan' is calculated, not every
shrinker->count_objects() overflow can be spotted and handled.
To demonstrate the latter, I added some debugging code to do_shrink_slab()
(x86_64) and the results were:

 vmscan: OVERFLOW: shrinker->count_objects() == -1 [18446744073709551615]
 vmscan: but total_scan > 0: 92679974445502
 vmscan: resulting total_scan: 92679974445502
[..]
 vmscan: OVERFLOW: shrinker->count_objects() == -1 [18446744073709551615]
 vmscan: but total_scan > 0: 22634041808232578
 vmscan: resulting total_scan: 22634041808232578

Even though shrinker->count_objects() has returned an overflowed value,
the resulting `total_scan' is positive, and, what is more worrisome, it
is insanely huge. This value is getting used later on in
shrinker->scan_objects() loop:

        while (total_scan >= batch_size ||
               total_scan >= freeable) {
                unsigned long ret;
                unsigned long nr_to_scan = min(batch_size, total_scan);

                shrinkctl->nr_to_scan = nr_to_scan;
                ret = shrinker->scan_objects(shrinker, shrinkctl);
                if (ret == SHRINK_STOP)
                        break;
                freed += ret;

                count_vm_events(SLABS_SCANNED, nr_to_scan);
                total_scan -= nr_to_scan;

                cond_resched();
        }

`total_scan >= batch_size' is true for a very-very long time and
'total_scan >= freeable' is also true for quite some time, because
`freeable < 0' and `total_scan' is large enough, for example,
22634041808232578. The only break condition, in the given scheme of
things, is shrinker->scan_objects() == SHRINK_STOP test, which is a
bit too weak to rely on, especially in heavy zsmalloc-usage scenarios.

To fix the issue, take a pool stat snapshot and use it instead of
racy zs_stat_get() calls.

Link: http://lkml.kernel.org/r/20160509140052.3389-1-sergey.senozhatsky@gmail.com
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: <stable@vger.kernel.org>        [4.3+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index e72efb109fde..fe47fbba995a 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1735,10 +1735,13 @@ static struct page *isolate_source_page(struct size_class *class)
 static unsigned long zs_can_compact(struct size_class *class)
 {
 	unsigned long obj_wasted;
+	unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+	unsigned long obj_used = zs_stat_get(class, OBJ_USED);
 
-	obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) -
-		zs_stat_get(class, OBJ_USED);
+	if (obj_allocated <= obj_used)
+		return 0;
 
+	obj_wasted = obj_allocated - obj_used;
 	obj_wasted /= get_maxobj_per_zspage(class->size,
 			class->pages_per_zspage);
 
-- 
cgit v1.2.3


From 7496fea9a6bf644afe360af795b121a77635b37d Mon Sep 17 00:00:00 2001
From: Zhou Chengming <zhouchengming1@huawei.com>
Date: Thu, 12 May 2016 15:42:21 -0700
Subject: ksm: fix conflict between mmput and scan_get_next_rmap_item

A concurrency issue about KSM in the function scan_get_next_rmap_item.

task A (ksmd):				|task B (the mm's task):
					|
mm = slot->mm;				|
down_read(&mm->mmap_sem);		|
					|
...					|
					|
spin_lock(&ksm_mmlist_lock);		|
					|
ksm_scan.mm_slot go to the next slot;	|
					|
spin_unlock(&ksm_mmlist_lock);		|
					|mmput() ->
					|	ksm_exit():
					|
					|spin_lock(&ksm_mmlist_lock);
					|if (mm_slot && ksm_scan.mm_slot != mm_slot) {
					|	if (!mm_slot->rmap_list) {
					|		easy_to_free = 1;
					|		...
					|
					|if (easy_to_free) {
					|	mmdrop(mm);
					|	...
					|
					|So this mm_struct may be freed in the mmput().
					|
up_read(&mm->mmap_sem);			|

As we can see above, the ksmd thread may access an mm_struct that has
already been freed to the kmem_cache.  If a fork then gets this mm_struct
from the kmem_cache, the ksmd thread's later up_read(&mm->mmap_sem) will
cause mmap_sem.count to become -1.

As suggested by Andrea Arcangeli, unmerge_and_remove_all_rmap_items has
the same SMP race condition, so fix it too.  My prev fix in function
scan_get_next_rmap_item will introduce a different SMP race condition, so
just invert the up_read/spin_unlock order as Andrea Arcangeli said.

Link: http://lkml.kernel.org/r/1462708815-31301-1-git-send-email-zhouchengming1@huawei.com
Signed-off-by: Zhou Chengming <zhouchengming1@huawei.com>
Suggested-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Geliang Tang <geliangtang@163.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Hanjun Guo <guohanjun@huawei.com>
Cc: Ding Tianhong <dingtianhong@huawei.com>
Cc: Li Bin <huawei.libin@huawei.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/ksm.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/ksm.c b/mm/ksm.c
index b99e828172f6..4786b4150f62 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void)
 		}
 
 		remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
+		up_read(&mm->mmap_sem);
 
 		spin_lock(&ksm_mmlist_lock);
 		ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void)
 
 			free_mm_slot(mm_slot);
 			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
-			up_read(&mm->mmap_sem);
 			mmdrop(mm);
-		} else {
+		} else
 			spin_unlock(&ksm_mmlist_lock);
-			up_read(&mm->mmap_sem);
-		}
 	}
 
 	/* Clean up stable nodes, but don't worry if some are still busy */
@@ -1663,8 +1661,15 @@ next_mm:
 		up_read(&mm->mmap_sem);
 		mmdrop(mm);
 	} else {
-		spin_unlock(&ksm_mmlist_lock);
 		up_read(&mm->mmap_sem);
+		/*
+		 * up_read(&mm->mmap_sem) first because after
+		 * spin_unlock(&ksm_mmlist_lock) run, the "mm" may
+		 * already have been freed under us by __ksm_exit()
+		 * because the "mm_slot" is still hashed and
+		 * ksm_scan.mm_slot doesn't point to it anymore.
+		 */
+		spin_unlock(&ksm_mmlist_lock);
 	}
 
 	/* Repeat until we've completed scanning the whole list */
-- 
cgit v1.2.3


From 6d0a07edd17cfc12fdc1f36de8072fa17cc3666f Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Thu, 12 May 2016 15:42:25 -0700
Subject: mm: thp: calculate the mapcount correctly for THP pages during WP
 faults

This will provide full accuracy to the mapcount calculation in the
write protect faults, so page pinning will not get broken by false
positive copy-on-writes.

total_mapcount() isn't the right calculation needed in
reuse_swap_page(), so this introduces a page_trans_huge_mapcount()
that is effectively the full accurate return value for page_mapcount()
if dealing with Transparent Hugepages, however we only use the
page_trans_huge_mapcount() during COW faults where it is strictly needed,
due to its higher runtime cost.

This also provides, at practically zero cost, the total_mapcount
information which is needed to know if we can still relocate the page
anon_vma to the local vma. If page_trans_huge_mapcount() returns 1 we
can reuse the page no matter if it's a pte or a pmd_trans_huge
triggering the fault, but we can only relocate the page anon_vma to
the local vma->anon_vma if we're sure it's only this "vma" mapping the
whole THP physical range.

Kirill A. Shutemov discovered the problem with moving the page
anon_vma to the local vma->anon_vma in a previous version of this
patch and another problem in the way page_move_anon_rmap() was called.

Andrew Morton discovered that CONFIG_SWAP=n wouldn't build in a
previous version, because reuse_swap_page must be a macro to call
page_trans_huge_mapcount from swap.h, so this uses a macro again
instead of an inline function. With this change at least it's a less
dangerous usage than it was before, because "page" is used only once
now, while with the previous code reuse_swap_page(page++) would have
called page_mapcount on page+1 and it would have increased page twice
instead of just once.

Dean Luick noticed an uninitialized variable that could result in a
rmap inefficiency for the non-THP case in a previous version.

Mike Marciniszyn said:

: Our RDMA tests are seeing an issue with memory locking that bisects to
: commit 61f5d698cc97 ("mm: re-enable THP")
:
: The test program registers two rather large MRs (512M) and RDMA
: writes data to a passive peer using the first and RDMA reads it back
: into the second MR and compares that data.  The sizes are chosen randomly
: between 0 and 1024 bytes.
:
: The test will get through a few (<= 4 iterations) and then gets a
: compare error.
:
: Tracing indicates the kernel logical addresses associated with the individual
: pages at registration ARE correct , the data in the "RDMA read response only"
: packets ARE correct.
:
: The "corruption" occurs when the packet crosses two pages that are not physically
: contiguous.   The second page reads back as zero in the program.
:
: It looks like the user VA at the point of the compare error no longer points to
: the same physical address as was registered.
:
: This patch totally resolves the issue!

Link: http://lkml.kernel.org/r/1462547040-1737-2-git-send-email-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Reviewed-by: "Kirill A. Shutemov" <kirill@shutemov.name>
Reviewed-by: Dean Luick <dean.luick@intel.com>
Tested-by: Alex Williamson <alex.williamson@redhat.com>
Tested-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Tested-by: Josh Collier <josh.d.collier@intel.com>
Cc: Marc Haber <mh+linux-kernel@zugschlus.de>
Cc: <stable@vger.kernel.org>	[4.5]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h   |  9 +++++++
 include/linux/swap.h |  6 ++---
 mm/huge_memory.c     | 71 +++++++++++++++++++++++++++++++++++++++++++++-------
 mm/memory.c          | 22 ++++++++++------
 mm/swapfile.c        | 13 +++++-----
 5 files changed, 95 insertions(+), 26 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 864d7221de84..8f468e0d2534 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 int total_mapcount(struct page *page);
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
 #else
 static inline int total_mapcount(struct page *page)
 {
 	return page_mapcount(page);
 }
+static inline int page_trans_huge_mapcount(struct page *page,
+					   int *total_mapcount)
+{
+	int mapcount = page_mapcount(page);
+	if (total_mapcount)
+		*total_mapcount = mapcount;
+	return mapcount;
+}
 #endif
 
 static inline struct page *virt_to_head_page(const void *x)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0a4cd4703f40..ad220359f1b0 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern int page_swapcount(struct page *);
 extern int swp_swapcount(swp_entry_t entry);
 extern struct swap_info_struct *page_swap_info(struct page *);
-extern int reuse_swap_page(struct page *);
+extern bool reuse_swap_page(struct page *, int *);
 extern int try_to_free_swap(struct page *);
 struct backing_dev_info;
 
@@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry)
 	return 0;
 }
 
-#define reuse_swap_page(page) \
-	(!PageTransCompound(page) && page_mapcount(page) == 1)
+#define reuse_swap_page(page, total_mapcount) \
+	(page_trans_huge_mapcount(page, total_mapcount) == 1)
 
 static inline int try_to_free_swap(struct page *page)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7daa7de8f48..b49ee126d4d1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
 	/*
 	 * We can only reuse the page if nobody else maps the huge page or it's
-	 * part. We can do it by checking page_mapcount() on each sub-page, but
-	 * it's expensive.
-	 * The cheaper way is to check page_count() to be equal 1: every
-	 * mapcount takes page reference reference, so this way we can
-	 * guarantee, that the PMD is the only mapping.
-	 * This can give false negative if somebody pinned the page, but that's
-	 * fine.
+	 * part.
 	 */
-	if (page_mapcount(page) == 1 && page_count(page) == 1) {
+	if (page_trans_huge_mapcount(page, NULL) == 1) {
 		pmd_t entry;
 		entry = pmd_mkyoung(orig_pmd);
 		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		if (pte_write(pteval)) {
 			writable = true;
 		} else {
-			if (PageSwapCache(page) && !reuse_swap_page(page)) {
+			if (PageSwapCache(page) &&
+			    !reuse_swap_page(page, NULL)) {
 				unlock_page(page);
 				result = SCAN_SWAP_CACHE_PAGE;
 				goto out;
@@ -3222,6 +3217,64 @@ int total_mapcount(struct page *page)
 	return ret;
 }
 
+/*
+ * This calculates accurately how many mappings a transparent hugepage
+ * has (unlike page_mapcount() which isn't fully accurate). This full
+ * accuracy is primarily needed to know if copy-on-write faults can
+ * reuse the page and change the mapping to read-write instead of
+ * copying them. At the same time this returns the total_mapcount too.
+ *
+ * The function returns the highest mapcount any one of the subpages
+ * has. If the return value is one, even if different processes are
+ * mapping different subpages of the transparent hugepage, they can
+ * all reuse it, because each process is reusing a different subpage.
+ *
+ * The total_mapcount is instead counting all virtual mappings of the
+ * subpages. If the total_mapcount is equal to "one", it tells the
+ * caller all mappings belong to the same "mm" and in turn the
+ * anon_vma of the transparent hugepage can become the vma->anon_vma
+ * local one as no other process may be mapping any of the subpages.
+ *
+ * It would be more accurate to replace page_mapcount() with
+ * page_trans_huge_mapcount(), however we only use
+ * page_trans_huge_mapcount() in the copy-on-write faults where we
+ * need full accuracy to avoid breaking page pinning, because
+ * page_trans_huge_mapcount() is slower than page_mapcount().
+ */
+int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+{
+	int i, ret, _total_mapcount, mapcount;
+
+	/* hugetlbfs shouldn't call it */
+	VM_BUG_ON_PAGE(PageHuge(page), page);
+
+	if (likely(!PageTransCompound(page))) {
+		mapcount = atomic_read(&page->_mapcount) + 1;
+		if (total_mapcount)
+			*total_mapcount = mapcount;
+		return mapcount;
+	}
+
+	page = compound_head(page);
+
+	_total_mapcount = ret = 0;
+	for (i = 0; i < HPAGE_PMD_NR; i++) {
+		mapcount = atomic_read(&page[i]._mapcount) + 1;
+		ret = max(ret, mapcount);
+		_total_mapcount += mapcount;
+	}
+	if (PageDoubleMap(page)) {
+		ret -= 1;
+		_total_mapcount -= HPAGE_PMD_NR;
+	}
+	mapcount = compound_mapcount(page);
+	ret += mapcount;
+	_total_mapcount += mapcount;
+	if (total_mapcount)
+		*total_mapcount = _total_mapcount;
+	return ret;
+}
+
 /*
  * This function splits huge page into normal pages. @page can point to any
  * subpage of huge page to split. Split doesn't change the position of @page.
diff --git a/mm/memory.c b/mm/memory.c
index 52c218e2b724..07493e34ab7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * not dirty accountable.
 	 */
 	if (PageAnon(old_page) && !PageKsm(old_page)) {
+		int total_mapcount;
 		if (!trylock_page(old_page)) {
 			get_page(old_page);
 			pte_unmap_unlock(page_table, ptl);
@@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			}
 			put_page(old_page);
 		}
-		if (reuse_swap_page(old_page)) {
-			/*
-			 * The page is all ours.  Move it to our anon_vma so
-			 * the rmap code will not search our parent or siblings.
-			 * Protected against the rmap code by the page lock.
-			 */
-			page_move_anon_rmap(old_page, vma, address);
+		if (reuse_swap_page(old_page, &total_mapcount)) {
+			if (total_mapcount == 1) {
+				/*
+				 * The page is all ours. Move it to
+				 * our anon_vma so the rmap code will
+				 * not search our parent or siblings.
+				 * Protected against the rmap code by
+				 * the page lock.
+				 */
+				page_move_anon_rmap(compound_head(old_page),
+						    vma, address);
+			}
 			unlock_page(old_page);
 			return wp_page_reuse(mm, vma, address, page_table, ptl,
 					     orig_pte, old_page, 0, 0);
@@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	dec_mm_counter_fast(mm, MM_SWAPENTS);
 	pte = mk_pte(page, vma->vm_page_prot);
-	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
+	if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		flags &= ~FAULT_FLAG_WRITE;
 		ret |= VM_FAULT_WRITE;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 83874eced5bf..031713ab40ce 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -922,18 +922,19 @@ out:
  * to it.  And as a side-effect, free up its swap: because the old content
  * on disk will never be read, and seeking back there to write new content
  * later would only waste time away from clustering.
+ *
+ * NOTE: total_mapcount should not be relied upon by the caller if
+ * reuse_swap_page() returns false, but it may be always overwritten
+ * (see the other implementation for CONFIG_SWAP=n).
  */
-int reuse_swap_page(struct page *page)
+bool reuse_swap_page(struct page *page, int *total_mapcount)
 {
 	int count;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	if (unlikely(PageKsm(page)))
-		return 0;
-	/* The page is part of THP and cannot be reused */
-	if (PageTransCompound(page))
-		return 0;
-	count = page_mapcount(page);
+		return false;
+	count = page_trans_huge_mapcount(page, total_mapcount);
 	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
 		if (count == 1 && !PageWriteback(page)) {
-- 
cgit v1.2.3


From 4f622938a5e2b7f1374ffb1e5fc212744898f513 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:17 +0200
Subject: dax: Allow DAX code to replace exceptional entries

Currently we forbid page_cache_tree_insert() to replace exceptional radix
tree entries for DAX inodes. However to make DAX faults race free we will
lock radix tree entries and when hole is created, we need to replace
such locked radix tree entry with a hole page. So modify
page_cache_tree_insert() to allow that.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 include/linux/dax.h |  1 +
 mm/filemap.c        | 21 ++++++++++++++-------
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/include/linux/dax.h b/include/linux/dax.h
index 70600b63083f..aa148937bb3f 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -3,6 +3,7 @@
 
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/radix-tree.h>
 #include <asm/pgtable.h>
 
 /* We use lowest available exceptional entry bit for locking */
diff --git a/mm/filemap.c b/mm/filemap.c
index f2479af09da9..dfe55c2cfb34 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -597,14 +597,21 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		if (!radix_tree_exceptional_entry(p))
 			return -EEXIST;
 
-		if (WARN_ON(dax_mapping(mapping)))
-			return -EINVAL;
-
-		if (shadowp)
-			*shadowp = p;
 		mapping->nrexceptional--;
-		if (node)
-			workingset_node_shadows_dec(node);
+		if (!dax_mapping(mapping)) {
+			if (shadowp)
+				*shadowp = p;
+			if (node)
+				workingset_node_shadows_dec(node);
+		} else {
+			/* DAX can replace empty locked entry with a hole */
+			WARN_ON_ONCE(p !=
+				(void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+					 RADIX_DAX_ENTRY_LOCK));
+			/* DAX accounts exceptional entries as normal pages */
+			if (node)
+				workingset_node_pages_dec(node);
+		}
 	}
 	radix_tree_replace_slot(slot, page);
 	mapping->nrpages++;
-- 
cgit v1.2.3


From ac401cc782429cc8560ce4840b1405d603740917 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:18 +0200
Subject: dax: New fault locking

Currently DAX page fault locking is racy.

CPU0 (write fault)		CPU1 (read fault)

__dax_fault()			__dax_fault()
  get_block(inode, block, &bh, 0) -> not mapped
				  get_block(inode, block, &bh, 0)
				    -> not mapped
  if (!buffer_mapped(&bh))
    if (vmf->flags & FAULT_FLAG_WRITE)
      get_block(inode, block, &bh, 1) -> allocates blocks
  if (page) -> no
				  if (!buffer_mapped(&bh))
				    if (vmf->flags & FAULT_FLAG_WRITE) {
				    } else {
				      dax_load_hole();
				    }
  dax_insert_mapping()

And we are in a situation where we fail in dax_radix_entry() with -EIO.

Another problem with the current DAX page fault locking is that there is
no race-free way to clear dirty tag in the radix tree. We can always
end up with clean radix tree and dirty data in CPU cache.

We fix the first problem by introducing locking of exceptional radix
tree entries in DAX mappings acting very similarly to page lock and thus
synchronizing properly faults against the same mapping index. The same
lock can later be used to avoid races when clearing radix tree dirty
tag.

Reviewed-by: NeilBrown <neilb@suse.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c            | 553 ++++++++++++++++++++++++++++++++++++++--------------
 include/linux/dax.h |   3 +
 mm/filemap.c        |   9 +-
 mm/truncate.c       |  62 +++---
 4 files changed, 447 insertions(+), 180 deletions(-)

(limited to 'mm')

diff --git a/fs/dax.c b/fs/dax.c
index 351afd3cf8be..f43c3d806fb6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -46,6 +46,30 @@
 		RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \
 		RADIX_TREE_EXCEPTIONAL_ENTRY))
 
+/* We choose 4096 entries - same as per-zone page wait tables */
+#define DAX_WAIT_TABLE_BITS 12
+#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
+
+wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
+
+static int __init init_dax_wait_table(void)
+{
+	int i;
+
+	for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
+		init_waitqueue_head(wait_table + i);
+	return 0;
+}
+fs_initcall(init_dax_wait_table);
+
+static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping,
+					      pgoff_t index)
+{
+	unsigned long hash = hash_long((unsigned long)mapping ^ index,
+				       DAX_WAIT_TABLE_BITS);
+	return wait_table + hash;
+}
+
 static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
 	struct request_queue *q = bdev->bd_queue;
@@ -267,6 +291,263 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 }
 EXPORT_SYMBOL_GPL(dax_do_io);
 
+/*
+ * DAX radix tree locking
+ */
+struct exceptional_entry_key {
+	struct address_space *mapping;
+	unsigned long index;
+};
+
+struct wait_exceptional_entry_queue {
+	wait_queue_t wait;
+	struct exceptional_entry_key key;
+};
+
+static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode,
+				       int sync, void *keyp)
+{
+	struct exceptional_entry_key *key = keyp;
+	struct wait_exceptional_entry_queue *ewait =
+		container_of(wait, struct wait_exceptional_entry_queue, wait);
+
+	if (key->mapping != ewait->key.mapping ||
+	    key->index != ewait->key.index)
+		return 0;
+	return autoremove_wake_function(wait, mode, sync, NULL);
+}
+
+/*
+ * Check whether the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline int slot_locked(struct address_space *mapping, void **slot)
+{
+	unsigned long entry = (unsigned long)
+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+	return entry & RADIX_DAX_ENTRY_LOCK;
+}
+
+/*
+ * Mark the given slot is locked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *lock_slot(struct address_space *mapping, void **slot)
+{
+	unsigned long entry = (unsigned long)
+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+	entry |= RADIX_DAX_ENTRY_LOCK;
+	radix_tree_replace_slot(slot, (void *)entry);
+	return (void *)entry;
+}
+
+/*
+ * Mark the given slot is unlocked. The function must be called with
+ * mapping->tree_lock held
+ */
+static inline void *unlock_slot(struct address_space *mapping, void **slot)
+{
+	unsigned long entry = (unsigned long)
+		radix_tree_deref_slot_protected(slot, &mapping->tree_lock);
+
+	entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK;
+	radix_tree_replace_slot(slot, (void *)entry);
+	return (void *)entry;
+}
+
+/*
+ * Lookup entry in radix tree, wait for it to become unlocked if it is
+ * exceptional entry and return it. The caller must call
+ * put_unlocked_mapping_entry() when he decided not to lock the entry or
+ * put_locked_mapping_entry() when he locked the entry and now wants to
+ * unlock it.
+ *
+ * The function must be called with mapping->tree_lock held.
+ */
+static void *get_unlocked_mapping_entry(struct address_space *mapping,
+					pgoff_t index, void ***slotp)
+{
+	void *ret, **slot;
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+	ewait.key.mapping = mapping;
+	ewait.key.index = index;
+
+	for (;;) {
+		ret = __radix_tree_lookup(&mapping->page_tree, index, NULL,
+					  &slot);
+		if (!ret || !radix_tree_exceptional_entry(ret) ||
+		    !slot_locked(mapping, slot)) {
+			if (slotp)
+				*slotp = slot;
+			return ret;
+		}
+		prepare_to_wait_exclusive(wq, &ewait.wait,
+					  TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&mapping->tree_lock);
+		schedule();
+		finish_wait(wq, &ewait.wait);
+		spin_lock_irq(&mapping->tree_lock);
+	}
+}
+
+/*
+ * Find radix tree entry at given index. If it points to a page, return with
+ * the page locked. If it points to the exceptional entry, return with the
+ * radix tree entry locked. If the radix tree doesn't contain given index,
+ * create empty exceptional entry for the index and return with it locked.
+ *
+ * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
+ * persistent memory the benefit is doubtful. We can add that later if we can
+ * show it helps.
+ */
+static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	void *ret, **slot;
+
+restart:
+	spin_lock_irq(&mapping->tree_lock);
+	ret = get_unlocked_mapping_entry(mapping, index, &slot);
+	/* No entry for given index? Make sure radix tree is big enough. */
+	if (!ret) {
+		int err;
+
+		spin_unlock_irq(&mapping->tree_lock);
+		err = radix_tree_preload(
+				mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM);
+		if (err)
+			return ERR_PTR(err);
+		ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY |
+			       RADIX_DAX_ENTRY_LOCK);
+		spin_lock_irq(&mapping->tree_lock);
+		err = radix_tree_insert(&mapping->page_tree, index, ret);
+		radix_tree_preload_end();
+		if (err) {
+			spin_unlock_irq(&mapping->tree_lock);
+			/* Someone already created the entry? */
+			if (err == -EEXIST)
+				goto restart;
+			return ERR_PTR(err);
+		}
+		/* Good, we have inserted empty locked entry into the tree. */
+		mapping->nrexceptional++;
+		spin_unlock_irq(&mapping->tree_lock);
+		return ret;
+	}
+	/* Normal page in radix tree? */
+	if (!radix_tree_exceptional_entry(ret)) {
+		struct page *page = ret;
+
+		get_page(page);
+		spin_unlock_irq(&mapping->tree_lock);
+		lock_page(page);
+		/* Page got truncated? Retry... */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			put_page(page);
+			goto restart;
+		}
+		return page;
+	}
+	ret = lock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+				   pgoff_t index, bool wake_all)
+{
+	wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index);
+
+	/*
+	 * Checking for locked entry and prepare_to_wait_exclusive() happens
+	 * under mapping->tree_lock, ditto for entry handling in our callers.
+	 * So at this point all tasks that could have seen our entry locked
+	 * must be in the waitqueue and the following check will see them.
+	 */
+	if (waitqueue_active(wq)) {
+		struct exceptional_entry_key key;
+
+		key.mapping = mapping;
+		key.index = index;
+		__wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
+	}
+}
+
+static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	void *ret, **slot;
+
+	spin_lock_irq(&mapping->tree_lock);
+	ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot);
+	if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) ||
+			 !slot_locked(mapping, slot))) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return;
+	}
+	unlock_slot(mapping, slot);
+	spin_unlock_irq(&mapping->tree_lock);
+	dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+static void put_locked_mapping_entry(struct address_space *mapping,
+				     pgoff_t index, void *entry)
+{
+	if (!radix_tree_exceptional_entry(entry)) {
+		unlock_page(entry);
+		put_page(entry);
+	} else {
+		unlock_mapping_entry(mapping, index);
+	}
+}
+
+/*
+ * Called when we are done with radix tree entry we looked up via
+ * get_unlocked_mapping_entry() and which we didn't lock in the end.
+ */
+static void put_unlocked_mapping_entry(struct address_space *mapping,
+				       pgoff_t index, void *entry)
+{
+	if (!radix_tree_exceptional_entry(entry))
+		return;
+
+	/* We have to wake up next waiter for the radix tree entry lock */
+	dax_wake_mapping_entry_waiter(mapping, index, false);
+}
+
+/*
+ * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree
+ * entry to get unlocked before deleting it.
+ */
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
+{
+	void *entry;
+
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	/*
+	 * This gets called from truncate / punch_hole path. As such, the caller
+	 * must hold locks protecting against concurrent modifications of the
+	 * radix tree (usually fs-private i_mmap_sem for writing). Since the
+	 * caller has seen exceptional entry for this index, we better find it
+	 * at that index as well...
+	 */
+	if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) {
+		spin_unlock_irq(&mapping->tree_lock);
+		return 0;
+	}
+	radix_tree_delete(&mapping->page_tree, index);
+	mapping->nrexceptional--;
+	spin_unlock_irq(&mapping->tree_lock);
+	dax_wake_mapping_entry_waiter(mapping, index, true);
+
+	return 1;
+}
+
 /*
  * The user has performed a load from a hole in the file.  Allocating
  * a new page in the file would cause excessive storage usage for
@@ -275,15 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io);
  * otherwise it will simply fall out of the page cache under memory
  * pressure without ever having been dirtied.
  */
-static int dax_load_hole(struct address_space *mapping, struct page *page,
-							struct vm_fault *vmf)
+static int dax_load_hole(struct address_space *mapping, void *entry,
+			 struct vm_fault *vmf)
 {
-	if (!page)
-		page = find_or_create_page(mapping, vmf->pgoff,
-						GFP_KERNEL | __GFP_ZERO);
-	if (!page)
-		return VM_FAULT_OOM;
+	struct page *page;
+
+	/* Hole page already exists? Return it...  */
+	if (!radix_tree_exceptional_entry(entry)) {
+		vmf->page = entry;
+		return VM_FAULT_LOCKED;
+	}
 
+	/* This will replace locked radix tree entry with a hole page */
+	page = find_or_create_page(mapping, vmf->pgoff,
+				   vmf->gfp_mask | __GFP_ZERO);
+	if (!page) {
+		put_locked_mapping_entry(mapping, vmf->pgoff, entry);
+		return VM_FAULT_OOM;
+	}
 	vmf->page = page;
 	return VM_FAULT_LOCKED;
 }
@@ -307,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode,
 	return 0;
 }
 
-#define NO_SECTOR -1
 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT))
 
-static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
-		sector_t sector, bool pmd_entry, bool dirty)
+static void *dax_insert_mapping_entry(struct address_space *mapping,
+				      struct vm_fault *vmf,
+				      void *entry, sector_t sector)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	pgoff_t pmd_index = DAX_PMD_INDEX(index);
-	int type, error = 0;
-	void *entry;
+	int error = 0;
+	bool hole_fill = false;
+	void *new_entry;
+	pgoff_t index = vmf->pgoff;
 
-	WARN_ON_ONCE(pmd_entry && !dirty);
-	if (dirty)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 
-	spin_lock_irq(&mapping->tree_lock);
-
-	entry = radix_tree_lookup(page_tree, pmd_index);
-	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
-		index = pmd_index;
-		goto dirty;
+	/* Replacing hole page with block mapping? */
+	if (!radix_tree_exceptional_entry(entry)) {
+		hole_fill = true;
+		/*
+		 * Unmap the page now before we remove it from page cache below.
+		 * The page is locked so it cannot be faulted in again.
+		 */
+		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+				    PAGE_SIZE, 0);
+		error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM);
+		if (error)
+			return ERR_PTR(error);
 	}
 
-	entry = radix_tree_lookup(page_tree, index);
-	if (entry) {
-		type = RADIX_DAX_TYPE(entry);
-		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
-					type != RADIX_DAX_PMD)) {
-			error = -EIO;
+	spin_lock_irq(&mapping->tree_lock);
+	new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) |
+		       RADIX_DAX_ENTRY_LOCK);
+	if (hole_fill) {
+		__delete_from_page_cache(entry, NULL);
+		/* Drop pagecache reference */
+		put_page(entry);
+		error = radix_tree_insert(page_tree, index, new_entry);
+		if (error) {
+			new_entry = ERR_PTR(error);
 			goto unlock;
 		}
+		mapping->nrexceptional++;
+	} else {
+		void **slot;
+		void *ret;
 
-		if (!pmd_entry || type == RADIX_DAX_PMD)
-			goto dirty;
-
-		/*
-		 * We only insert dirty PMD entries into the radix tree.  This
-		 * means we don't need to worry about removing a dirty PTE
-		 * entry and inserting a clean PMD entry, thus reducing the
-		 * range we would flush with a follow-up fsync/msync call.
-		 */
-		radix_tree_delete(&mapping->page_tree, index);
-		mapping->nrexceptional--;
-	}
-
-	if (sector == NO_SECTOR) {
-		/*
-		 * This can happen during correct operation if our pfn_mkwrite
-		 * fault raced against a hole punch operation.  If this
-		 * happens the pte that was hole punched will have been
-		 * unmapped and the radix tree entry will have been removed by
-		 * the time we are called, but the call will still happen.  We
-		 * will return all the way up to wp_pfn_shared(), where the
-		 * pte_same() check will fail, eventually causing page fault
-		 * to be retried by the CPU.
-		 */
-		goto unlock;
+		ret = __radix_tree_lookup(page_tree, index, NULL, &slot);
+		WARN_ON_ONCE(ret != entry);
+		radix_tree_replace_slot(slot, new_entry);
 	}
-
-	error = radix_tree_insert(page_tree, index,
-			RADIX_DAX_ENTRY(sector, pmd_entry));
-	if (error)
-		goto unlock;
-
-	mapping->nrexceptional++;
- dirty:
-	if (dirty)
+	if (vmf->flags & FAULT_FLAG_WRITE)
 		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
  unlock:
 	spin_unlock_irq(&mapping->tree_lock);
-	return error;
+	if (hole_fill) {
+		radix_tree_preload_end();
+		/*
+		 * We don't need hole page anymore, it has been replaced with
+		 * locked radix tree entry now.
+		 */
+		if (mapping->a_ops->freepage)
+			mapping->a_ops->freepage(entry);
+		unlock_page(entry);
+		put_page(entry);
+	}
+	return new_entry;
 }
 
 static int dax_writeback_one(struct block_device *bdev,
@@ -503,17 +788,19 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
 
-static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+static int dax_insert_mapping(struct address_space *mapping,
+			struct buffer_head *bh, void **entryp,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	struct address_space *mapping = inode->i_mapping;
 	struct block_device *bdev = bh->b_bdev;
 	struct blk_dax_ctl dax = {
-		.sector = to_sector(bh, inode),
+		.sector = to_sector(bh, mapping->host),
 		.size = bh->b_size,
 	};
 	int error;
+	void *ret;
+	void *entry = *entryp;
 
 	i_mmap_lock_read(mapping);
 
@@ -523,16 +810,16 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 	}
 	dax_unmap_atomic(bdev, &dax);
 
-	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
-			vmf->flags & FAULT_FLAG_WRITE);
-	if (error)
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
+	if (IS_ERR(ret)) {
+		error = PTR_ERR(ret);
 		goto out;
+	}
+	*entryp = ret;
 
 	error = vm_insert_mixed(vma, vaddr, dax.pfn);
-
  out:
 	i_mmap_unlock_read(mapping);
-
 	return error;
 }
 
@@ -552,7 +839,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	struct file *file = vma->vm_file;
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	struct page *page;
+	void *entry;
 	struct buffer_head bh;
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	unsigned blkbits = inode->i_blkbits;
@@ -561,6 +848,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	int error;
 	int major = 0;
 
+	/*
+	 * Check whether offset isn't beyond end of file now. Caller is supposed
+	 * to hold locks serializing us with truncate / punch hole so this is
+	 * a reliable test.
+	 */
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		return VM_FAULT_SIGBUS;
@@ -570,40 +862,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
- repeat:
-	page = find_get_page(mapping, vmf->pgoff);
-	if (page) {
-		if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
-			put_page(page);
-			return VM_FAULT_RETRY;
-		}
-		if (unlikely(page->mapping != mapping)) {
-			unlock_page(page);
-			put_page(page);
-			goto repeat;
-		}
+	entry = grab_mapping_entry(mapping, vmf->pgoff);
+	if (IS_ERR(entry)) {
+		error = PTR_ERR(entry);
+		goto out;
 	}
 
 	error = get_block(inode, block, &bh, 0);
 	if (!error && (bh.b_size < PAGE_SIZE))
 		error = -EIO;		/* fs corruption? */
 	if (error)
-		goto unlock_page;
-
-	if (!buffer_mapped(&bh) && !vmf->cow_page) {
-		if (vmf->flags & FAULT_FLAG_WRITE) {
-			error = get_block(inode, block, &bh, 1);
-			count_vm_event(PGMAJFAULT);
-			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-			major = VM_FAULT_MAJOR;
-			if (!error && (bh.b_size < PAGE_SIZE))
-				error = -EIO;
-			if (error)
-				goto unlock_page;
-		} else {
-			return dax_load_hole(mapping, page, vmf);
-		}
-	}
+		goto unlock_entry;
 
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
@@ -612,30 +881,37 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
-			goto unlock_page;
-		vmf->page = page;
-		if (!page)
+			goto unlock_entry;
+		if (!radix_tree_exceptional_entry(entry)) {
+			vmf->page = entry;
+		} else {
+			unlock_mapping_entry(mapping, vmf->pgoff);
 			i_mmap_lock_read(mapping);
+			vmf->page = NULL;
+		}
 		return VM_FAULT_LOCKED;
 	}
 
-	/* Check we didn't race with a read fault installing a new page */
-	if (!page && major)
-		page = find_lock_page(mapping, vmf->pgoff);
-
-	if (page) {
-		unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
-							PAGE_SIZE, 0);
-		delete_from_page_cache(page);
-		unlock_page(page);
-		put_page(page);
-		page = NULL;
+	if (!buffer_mapped(&bh)) {
+		if (vmf->flags & FAULT_FLAG_WRITE) {
+			error = get_block(inode, block, &bh, 1);
+			count_vm_event(PGMAJFAULT);
+			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+			major = VM_FAULT_MAJOR;
+			if (!error && (bh.b_size < PAGE_SIZE))
+				error = -EIO;
+			if (error)
+				goto unlock_entry;
+		} else {
+			return dax_load_hole(mapping, entry, vmf);
+		}
 	}
 
 	/* Filesystem should not return unwritten buffers to us! */
 	WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
-	error = dax_insert_mapping(inode, &bh, vma, vmf);
-
+	error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf);
+ unlock_entry:
+	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
@@ -643,13 +919,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if ((error < 0) && (error != -EBUSY))
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
-
- unlock_page:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
-	}
-	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
 
@@ -718,7 +987,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	struct block_device *bdev;
 	pgoff_t size, pgoff;
 	sector_t block;
-	int error, result = 0;
+	int result = 0;
 	bool alloc = false;
 
 	/* dax pmd mappings require pfn_t_devmap() */
@@ -865,13 +1134,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		 * the write to insert a dirty entry.
 		 */
 		if (write) {
-			error = dax_radix_entry(mapping, pgoff, dax.sector,
-					true, true);
-			if (error) {
-				dax_pmd_dbg(&bh, address,
-						"PMD radix insertion failed");
-				goto fallback;
-			}
+			/*
+			 * We should insert radix-tree entry and dirty it here.
+			 * For now this is broken...
+			 */
 		}
 
 		dev_dbg(part_to_dev(bdev->bd_part),
@@ -931,23 +1197,18 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct file *file = vma->vm_file;
-	int error;
-
-	/*
-	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
-	 * RADIX_DAX_PTE entry already exists in the radix tree from a
-	 * previous call to __dax_fault().  We just want to look up that PTE
-	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
-	 * saves us from having to make a call to get_block() here to look
-	 * up the sector.
-	 */
-	error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
-			true);
+	struct address_space *mapping = file->f_mapping;
+	void *entry;
+	pgoff_t index = vmf->pgoff;
 
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM;
-	if (error)
-		return VM_FAULT_SIGBUS;
+	spin_lock_irq(&mapping->tree_lock);
+	entry = get_unlocked_mapping_entry(mapping, index, NULL);
+	if (!entry || !radix_tree_exceptional_entry(entry))
+		goto out;
+	radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
+	put_unlocked_mapping_entry(mapping, index, entry);
+out:
+	spin_unlock_irq(&mapping->tree_lock);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
diff --git a/include/linux/dax.h b/include/linux/dax.h
index aa148937bb3f..756625c6d0dd 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
 int dax_truncate_page(struct inode *, loff_t from, get_block_t);
 int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
 int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
+int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
+void dax_wake_mapping_entry_waiter(struct address_space *mapping,
+				   pgoff_t index, bool wake_all);
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
diff --git a/mm/filemap.c b/mm/filemap.c
index dfe55c2cfb34..7b9a4b180cae 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping,
 			return;
 
 	/*
-	 * Track node that only contains shadow entries.
+	 * Track node that only contains shadow entries. DAX mappings contain
+	 * no shadow entries and may contain other exceptional entries so skip
+	 * those.
 	 *
 	 * Avoid acquiring the list_lru lock if already tracked.  The
 	 * list_empty() test is safe as node->private_list is
 	 * protected by mapping->tree_lock.
 	 */
-	if (!workingset_node_pages(node) &&
+	if (!dax_mapping(mapping) && !workingset_node_pages(node) &&
 	    list_empty(&node->private_list)) {
 		node->private_data = mapping;
 		list_lru_add(&workingset_shadow_nodes, &node->private_list);
@@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping,
 			/* DAX accounts exceptional entries as normal pages */
 			if (node)
 				workingset_node_pages_dec(node);
+			/* Wakeup waiters for exceptional entry lock */
+			dax_wake_mapping_entry_waiter(mapping, page->index,
+						      false);
 		}
 	}
 	radix_tree_replace_slot(slot, page);
diff --git a/mm/truncate.c b/mm/truncate.c
index b00272810871..4064f8f53daa 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping,
 	if (shmem_mapping(mapping))
 		return;
 
-	spin_lock_irq(&mapping->tree_lock);
-
 	if (dax_mapping(mapping)) {
-		if (radix_tree_delete_item(&mapping->page_tree, index, entry))
-			mapping->nrexceptional--;
-	} else {
-		/*
-		 * Regular page slots are stabilized by the page lock even
-		 * without the tree itself locked.  These unlocked entries
-		 * need verification under the tree lock.
-		 */
-		if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
-					&slot))
-			goto unlock;
-		if (*slot != entry)
-			goto unlock;
-		radix_tree_replace_slot(slot, NULL);
-		mapping->nrexceptional--;
-		if (!node)
-			goto unlock;
-		workingset_node_shadows_dec(node);
-		/*
-		 * Don't track node without shadow entries.
-		 *
-		 * Avoid acquiring the list_lru lock if already untracked.
-		 * The list_empty() test is safe as node->private_list is
-		 * protected by mapping->tree_lock.
-		 */
-		if (!workingset_node_shadows(node) &&
-		    !list_empty(&node->private_list))
-			list_lru_del(&workingset_shadow_nodes,
-					&node->private_list);
-		__radix_tree_delete_node(&mapping->page_tree, node);
+		dax_delete_mapping_entry(mapping, index);
+		return;
 	}
+	spin_lock_irq(&mapping->tree_lock);
+	/*
+	 * Regular page slots are stabilized by the page lock even
+	 * without the tree itself locked.  These unlocked entries
+	 * need verification under the tree lock.
+	 */
+	if (!__radix_tree_lookup(&mapping->page_tree, index, &node,
+				&slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+	radix_tree_replace_slot(slot, NULL);
+	mapping->nrexceptional--;
+	if (!node)
+		goto unlock;
+	workingset_node_shadows_dec(node);
+	/*
+	 * Don't track node without shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already untracked.
+	 * The list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_shadows(node) &&
+	    !list_empty(&node->private_list))
+		list_lru_del(&workingset_shadow_nodes,
+				&node->private_list);
+	__radix_tree_delete_node(&mapping->page_tree, node);
 unlock:
 	spin_unlock_irq(&mapping->tree_lock);
 }
-- 
cgit v1.2.3


From bc2466e4257369d0ebee2b6265070d323343fa72 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:19 +0200
Subject: dax: Use radix tree entry lock to protect cow faults

When doing cow faults, we cannot directly fill in the PTE as we do for other
faults, because we rely on generic code to do proper accounting of the cowed
page. Unlike other faults, we also have no page to lock to protect against
races with truncate, and we need the protection to extend until the moment
generic code inserts the cowed page into the PTE; at that point we no longer
have the protection of the fs-specific i_mmap_sem. So far we have relied on
i_mmap_lock for this protection; however, that is completely special to cow
faults. To make fault locking more uniform, use the DAX entry lock instead.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c            | 12 +++++-------
 include/linux/dax.h |  7 +++++++
 include/linux/mm.h  |  7 +++++++
 mm/memory.c         | 38 ++++++++++++++++++--------------------
 4 files changed, 37 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/fs/dax.c b/fs/dax.c
index f43c3d806fb6..be74635e05a6 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -478,7 +478,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 	}
 }
 
-static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index)
 {
 	void *ret, **slot;
 
@@ -501,7 +501,7 @@ static void put_locked_mapping_entry(struct address_space *mapping,
 		unlock_page(entry);
 		put_page(entry);
 	} else {
-		unlock_mapping_entry(mapping, index);
+		dax_unlock_mapping_entry(mapping, index);
 	}
 }
 
@@ -884,12 +884,10 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 			goto unlock_entry;
 		if (!radix_tree_exceptional_entry(entry)) {
 			vmf->page = entry;
-		} else {
-			unlock_mapping_entry(mapping, vmf->pgoff);
-			i_mmap_lock_read(mapping);
-			vmf->page = NULL;
+			return VM_FAULT_LOCKED;
 		}
-		return VM_FAULT_LOCKED;
+		vmf->entry = entry;
+		return VM_FAULT_DAX_LOCKED;
 	}
 
 	if (!buffer_mapped(&bh)) {
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 756625c6d0dd..7bf12277c006 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -21,6 +21,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping,
 
 #ifdef CONFIG_FS_DAX
 struct page *read_dax_sector(struct block_device *bdev, sector_t n);
+void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index);
 int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
 		unsigned int offset, unsigned int length);
 #else
@@ -29,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev,
 {
 	return ERR_PTR(-ENXIO);
 }
+/* Shouldn't ever be called when dax is disabled. */
+static inline void dax_unlock_mapping_entry(struct address_space *mapping,
+					    pgoff_t index)
+{
+	BUG();
+}
 static inline int __dax_zero_page_range(struct block_device *bdev,
 		sector_t sector, unsigned int offset, unsigned int length)
 {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a55e5be0894f..0ef9dc720ec3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -299,6 +299,12 @@ struct vm_fault {
 					 * is set (which is also implied by
 					 * VM_FAULT_ERROR).
 					 */
+	void *entry;			/* ->fault handler can alternatively
+					 * return locked DAX entry. In that
+					 * case handler should return
+					 * VM_FAULT_DAX_LOCKED and fill in
+					 * entry here.
+					 */
 	/* for ->map_pages() only */
 	pgoff_t max_pgoff;		/* map pages for offset from pgoff till
 					 * max_pgoff inclusive */
@@ -1084,6 +1090,7 @@ static inline void clear_page_pfmemalloc(struct page *page)
 #define VM_FAULT_LOCKED	0x0200	/* ->fault locked the returned page */
 #define VM_FAULT_RETRY	0x0400	/* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800	/* huge page fault failed, fall back to small */
+#define VM_FAULT_DAX_LOCKED 0x1000	/* ->fault has locked DAX entry */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
diff --git a/mm/memory.c b/mm/memory.c
index 93897f23cc11..f09cdb8d48fa 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -63,6 +63,7 @@
 #include <linux/dma-debug.h>
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/dax.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2785,7 +2786,8 @@ oom:
  */
 static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 			pgoff_t pgoff, unsigned int flags,
-			struct page *cow_page, struct page **page)
+			struct page *cow_page, struct page **page,
+			void **entry)
 {
 	struct vm_fault vmf;
 	int ret;
@@ -2800,8 +2802,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
-	if (!vmf.page)
-		goto out;
+	if (ret & VM_FAULT_DAX_LOCKED) {
+		*entry = vmf.entry;
+		return ret;
+	}
 
 	if (unlikely(PageHWPoison(vmf.page))) {
 		if (ret & VM_FAULT_LOCKED)
@@ -2815,7 +2819,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	else
 		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
- out:
 	*page = vmf.page;
 	return ret;
 }
@@ -2987,7 +2990,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
-	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
@@ -3010,6 +3013,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	struct page *fault_page, *new_page;
+	void *fault_entry;
 	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pte_t *pte;
@@ -3027,26 +3031,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		return VM_FAULT_OOM;
 	}
 
-	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page,
+			 &fault_entry);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		goto uncharge_out;
 
-	if (fault_page)
+	if (!(ret & VM_FAULT_DAX_LOCKED))
 		copy_user_highpage(new_page, fault_page, address, vma);
 	__SetPageUptodate(new_page);
 
 	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (unlikely(!pte_same(*pte, orig_pte))) {
 		pte_unmap_unlock(pte, ptl);
-		if (fault_page) {
+		if (!(ret & VM_FAULT_DAX_LOCKED)) {
 			unlock_page(fault_page);
 			put_page(fault_page);
 		} else {
-			/*
-			 * The fault handler has no page to lock, so it holds
-			 * i_mmap_lock for read to protect against truncate.
-			 */
-			i_mmap_unlock_read(vma->vm_file->f_mapping);
+			dax_unlock_mapping_entry(vma->vm_file->f_mapping,
+						 pgoff);
 		}
 		goto uncharge_out;
 	}
@@ -3054,15 +3056,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	mem_cgroup_commit_charge(new_page, memcg, false, false);
 	lru_cache_add_active_or_unevictable(new_page, vma);
 	pte_unmap_unlock(pte, ptl);
-	if (fault_page) {
+	if (!(ret & VM_FAULT_DAX_LOCKED)) {
 		unlock_page(fault_page);
 		put_page(fault_page);
 	} else {
-		/*
-		 * The fault handler has no page to lock, so it holds
-		 * i_mmap_lock for read to protect against truncate.
-		 */
-		i_mmap_unlock_read(vma->vm_file->f_mapping);
+		dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff);
 	}
 	return ret;
 uncharge_out:
@@ -3082,7 +3080,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	int dirtied = 0;
 	int ret, tmp;
 
-	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
+	ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
 
-- 
cgit v1.2.3


From 4d9a2c8746671efbb0c27d3ae28c7474597a7aad Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 12 May 2016 18:29:20 +0200
Subject: dax: Remove i_mmap_lock protection

Currently faults are protected against truncate by the filesystem-specific
i_mmap_sem and by the page lock in the case of a hole page. COW faults are
protected by DAX radix tree entry locking. So there's no need for
i_mmap_lock in DAX code. Remove it.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
---
 fs/dax.c    | 24 +++++-------------------
 mm/memory.c |  2 --
 2 files changed, 5 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/fs/dax.c b/fs/dax.c
index be74635e05a6..6dbe6021cab7 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -798,29 +798,19 @@ static int dax_insert_mapping(struct address_space *mapping,
 		.sector = to_sector(bh, mapping->host),
 		.size = bh->b_size,
 	};
-	int error;
 	void *ret;
 	void *entry = *entryp;
 
-	i_mmap_lock_read(mapping);
-
-	if (dax_map_atomic(bdev, &dax) < 0) {
-		error = PTR_ERR(dax.addr);
-		goto out;
-	}
+	if (dax_map_atomic(bdev, &dax) < 0)
+		return PTR_ERR(dax.addr);
 	dax_unmap_atomic(bdev, &dax);
 
 	ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
-	if (IS_ERR(ret)) {
-		error = PTR_ERR(ret);
-		goto out;
-	}
+	if (IS_ERR(ret))
+		return PTR_ERR(ret);
 	*entryp = ret;
 
-	error = vm_insert_mixed(vma, vaddr, dax.pfn);
- out:
-	i_mmap_unlock_read(mapping);
-	return error;
+	return vm_insert_mixed(vma, vaddr, dax.pfn);
 }
 
 /**
@@ -1058,8 +1048,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		truncate_pagecache_range(inode, lstart, lend);
 	}
 
-	i_mmap_lock_read(mapping);
-
 	if (!write && !buffer_mapped(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
@@ -1148,8 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	}
 
  out:
-	i_mmap_unlock_read(mapping);
-
 	return result;
 
  fallback:
diff --git a/mm/memory.c b/mm/memory.c
index f09cdb8d48fa..06f552504e79 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2453,8 +2453,6 @@ void unmap_mapping_range(struct address_space *mapping,
 	if (details.last_index < details.first_index)
 		details.last_index = ULONG_MAX;
 
-
-	/* DAX uses i_mmap_lock to serialise file truncate vs page fault */
 	i_mmap_lock_write(mapping);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
-- 
cgit v1.2.3


From 18726ca8b34bbfb3ab5a1c0a52a5d8dd392466ed Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:02 -0700
Subject: mm/slab: fix the theoretical race by holding proper lock

While processing concurrent allocations, SLAB could be heavily contended
because it did a lot of work while holding a lock.  This patchset tries
to reduce the number of critical sections in order to reduce lock
contention.  The major changes are a lockless decision to allocate more
slabs and a lockless cpu cache refill from the newly allocated slab.

Below is the result of concurrent allocation/free in the slab allocation
benchmark written by Christoph a long time ago.  I have simplified the
output.  The numbers show the cycle counts for alloc/free respectively,
so lower is better.

  * Before
  Kmalloc N*alloc N*free(32): Average=365/806
  Kmalloc N*alloc N*free(64): Average=452/690
  Kmalloc N*alloc N*free(128): Average=736/886
  Kmalloc N*alloc N*free(256): Average=1167/985
  Kmalloc N*alloc N*free(512): Average=2088/1125
  Kmalloc N*alloc N*free(1024): Average=4115/1184
  Kmalloc N*alloc N*free(2048): Average=8451/1748
  Kmalloc N*alloc N*free(4096): Average=16024/2048

  * After
  Kmalloc N*alloc N*free(32): Average=344/792
  Kmalloc N*alloc N*free(64): Average=347/882
  Kmalloc N*alloc N*free(128): Average=390/959
  Kmalloc N*alloc N*free(256): Average=393/1067
  Kmalloc N*alloc N*free(512): Average=683/1229
  Kmalloc N*alloc N*free(1024): Average=1295/1325
  Kmalloc N*alloc N*free(2048): Average=2513/1664
  Kmalloc N*alloc N*free(4096): Average=4742/2172

It shows that performance improves greatly (roughly more than 50%) for
the object class whose size is more than 128 bytes.

This patch (of 11):

If we hold neither the slab_mutex nor the node lock, the node's shared
array cache could be freed and re-populated.  If __kmem_cache_shrink()
is called at the same time, it will call drain_array() with n->shared
without holding the node lock, so a problem can occur.  This patch fixes
the situation by holding the node lock before trying to drain the shared
array.

In addition, add a debug check to confirm that n->shared access race
doesn't exist.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 68 ++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 45 insertions(+), 23 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 17e2848979c5..3f1cc1ca4d88 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2180,6 +2180,11 @@ static void check_irq_on(void)
 	BUG_ON(irqs_disabled());
 }
 
+static void check_mutex_acquired(void)
+{
+	BUG_ON(!mutex_is_locked(&slab_mutex));
+}
+
 static void check_spinlock_acquired(struct kmem_cache *cachep)
 {
 #ifdef CONFIG_SMP
@@ -2199,13 +2204,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 #else
 #define check_irq_off()	do { } while(0)
 #define check_irq_on()	do { } while(0)
+#define check_mutex_acquired()	do { } while(0)
 #define check_spinlock_acquired(x) do { } while(0)
 #define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
-static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
-			struct array_cache *ac,
-			int force, int node);
+static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
+				int node, bool free_all, struct list_head *list)
+{
+	int tofree;
+
+	if (!ac || !ac->avail)
+		return;
+
+	tofree = free_all ? ac->avail : (ac->limit + 4) / 5;
+	if (tofree > ac->avail)
+		tofree = (ac->avail + 1) / 2;
+
+	free_block(cachep, ac->entry, tofree, node, list);
+	ac->avail -= tofree;
+	memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail);
+}
 
 static void do_drain(void *arg)
 {
@@ -2229,6 +2248,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 {
 	struct kmem_cache_node *n;
 	int node;
+	LIST_HEAD(list);
 
 	on_each_cpu(do_drain, cachep, 1);
 	check_irq_on();
@@ -2236,8 +2256,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 		if (n->alien)
 			drain_alien_cache(cachep, n->alien);
 
-	for_each_kmem_cache_node(cachep, node, n)
-		drain_array(cachep, n, n->shared, 1, node);
+	for_each_kmem_cache_node(cachep, node, n) {
+		spin_lock_irq(&n->list_lock);
+		drain_array_locked(cachep, n->shared, node, true, &list);
+		spin_unlock_irq(&n->list_lock);
+
+		slabs_destroy(cachep, &list);
+	}
 }
 
 /*
@@ -3869,29 +3894,26 @@ skip_setup:
  * if drain_array() is used on the shared array.
  */
 static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n,
-			 struct array_cache *ac, int force, int node)
+			 struct array_cache *ac, int node)
 {
 	LIST_HEAD(list);
-	int tofree;
+
+	/* ac from n->shared can be freed if we don't hold the slab_mutex. */
+	check_mutex_acquired();
 
 	if (!ac || !ac->avail)
 		return;
-	if (ac->touched && !force) {
+
+	if (ac->touched) {
 		ac->touched = 0;
-	} else {
-		spin_lock_irq(&n->list_lock);
-		if (ac->avail) {
-			tofree = force ? ac->avail : (ac->limit + 4) / 5;
-			if (tofree > ac->avail)
-				tofree = (ac->avail + 1) / 2;
-			free_block(cachep, ac->entry, tofree, node, &list);
-			ac->avail -= tofree;
-			memmove(ac->entry, &(ac->entry[tofree]),
-				sizeof(void *) * ac->avail);
-		}
-		spin_unlock_irq(&n->list_lock);
-		slabs_destroy(cachep, &list);
+		return;
 	}
+
+	spin_lock_irq(&n->list_lock);
+	drain_array_locked(cachep, ac, node, false, &list);
+	spin_unlock_irq(&n->list_lock);
+
+	slabs_destroy(cachep, &list);
 }
 
 /**
@@ -3929,7 +3951,7 @@ static void cache_reap(struct work_struct *w)
 
 		reap_alien(searchp, n);
 
-		drain_array(searchp, n, cpu_cache_get(searchp), 0, node);
+		drain_array(searchp, n, cpu_cache_get(searchp), node);
 
 		/*
 		 * These are racy checks but it does not matter
@@ -3940,7 +3962,7 @@ static void cache_reap(struct work_struct *w)
 
 		n->next_reap = jiffies + REAPTIMEOUT_NODE;
 
-		drain_array(searchp, n, n->shared, 0, node);
+		drain_array(searchp, n, n->shared, node);
 
 		if (n->free_touched)
 			n->free_touched = 0;
-- 
cgit v1.2.3


From 8888177ea116d4d14ca0a2ba054d02f35b0dae29 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:05 -0700
Subject: mm/slab: remove BAD_ALIEN_MAGIC again

The initial attempt to remove BAD_ALIEN_MAGIC was reverted by 'commit
edcad2509550 ("Revert "slab: remove BAD_ALIEN_MAGIC"")' because it
caused a problem on m68k, which has many nodes but !CONFIG_NUMA.  In
this case, although the alien cache isn't used at all, a garbage value
is used to cope with some initialization path, and that value is
BAD_ALIEN_MAGIC.  Now that this patch sets use_alien_caches to 0 when
!CONFIG_NUMA, there is no initialization path problem, so we don't need
BAD_ALIEN_MAGIC at all.  So remove it.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Tested-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3f1cc1ca4d88..f36d3493f49f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -421,8 +421,6 @@ static struct kmem_cache kmem_cache_boot = {
 	.name = "kmem_cache",
 };
 
-#define BAD_ALIEN_MAGIC 0x01020304ul
-
 static DEFINE_PER_CPU(struct delayed_work, slab_reap_work);
 
 static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
@@ -644,7 +642,7 @@ static int transfer_objects(struct array_cache *to,
 static inline struct alien_cache **alloc_alien_cache(int node,
 						int limit, gfp_t gfp)
 {
-	return (struct alien_cache **)BAD_ALIEN_MAGIC;
+	return NULL;
 }
 
 static inline void free_alien_cache(struct alien_cache **ac_ptr)
@@ -1212,7 +1210,7 @@ void __init kmem_cache_init(void)
 					sizeof(struct rcu_head));
 	kmem_cache = &kmem_cache_boot;
 
-	if (num_possible_nodes() == 1)
+	if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1)
 		use_alien_caches = 0;
 
 	for (i = 0; i < NUM_INIT_LISTS; i++)
-- 
cgit v1.2.3


From a5aa63a5f7352aa8991f64d46854dcb8d3788d55 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:08 -0700
Subject: mm/slab: drain the free slab as much as possible

slabs_tofree() implies freeing all free slabs.  We can do that by simply
passing INT_MAX.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f36d3493f49f..a998d35599a3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -895,12 +895,6 @@ static int init_cache_node_node(int node)
 	return 0;
 }
 
-static inline int slabs_tofree(struct kmem_cache *cachep,
-						struct kmem_cache_node *n)
-{
-	return (n->free_objects + cachep->num - 1) / cachep->num;
-}
-
 static void cpuup_canceled(long cpu)
 {
 	struct kmem_cache *cachep;
@@ -965,7 +959,7 @@ free_slab:
 		n = get_node(cachep, node);
 		if (!n)
 			continue;
-		drain_freelist(cachep, n, slabs_tofree(cachep, n));
+		drain_freelist(cachep, n, INT_MAX);
 	}
 }
 
@@ -1117,7 +1111,7 @@ static int __meminit drain_cache_node_node(int node)
 		if (!n)
 			continue;
 
-		drain_freelist(cachep, n, slabs_tofree(cachep, n));
+		drain_freelist(cachep, n, INT_MAX);
 
 		if (!list_empty(&n->slabs_full) ||
 		    !list_empty(&n->slabs_partial)) {
@@ -2311,7 +2305,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 
 	check_irq_on();
 	for_each_kmem_cache_node(cachep, node, n) {
-		drain_freelist(cachep, n, slabs_tofree(cachep, n));
+		drain_freelist(cachep, n, INT_MAX);
 
 		ret += !list_empty(&n->slabs_full) ||
 			!list_empty(&n->slabs_partial);
-- 
cgit v1.2.3


From ded0ecf61118930988f0943e741056c8fd5d439c Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:11 -0700
Subject: mm/slab: factor out kmem_cache_node initialization code

It can be reused in other places, so factor it out.  A following patch
will use it.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 74 ++++++++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 45 insertions(+), 29 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index a998d35599a3..9bef33bc4daa 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -848,6 +848,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
 }
 #endif
 
+static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
+{
+	struct kmem_cache_node *n;
+
+	/*
+	 * Set up the kmem_cache_node for cpu before we can
+	 * begin anything. Make sure some other cpu on this
+	 * node has not already allocated this
+	 */
+	n = get_node(cachep, node);
+	if (n) {
+		spin_lock_irq(&n->list_lock);
+		n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount +
+				cachep->num;
+		spin_unlock_irq(&n->list_lock);
+
+		return 0;
+	}
+
+	n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
+	if (!n)
+		return -ENOMEM;
+
+	kmem_cache_node_init(n);
+	n->next_reap = jiffies + REAPTIMEOUT_NODE +
+		    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
+
+	n->free_limit =
+		(1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num;
+
+	/*
+	 * The kmem_cache_nodes don't come and go as CPUs
+	 * come and go.  slab_mutex is sufficient
+	 * protection here.
+	 */
+	cachep->node[node] = n;
+
+	return 0;
+}
+
 /*
  * Allocates and initializes node for a node on each slab cache, used for
  * either memory or cpu hotplug.  If memory is being hot-added, the kmem_cache_node
@@ -859,39 +899,15 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
  */
 static int init_cache_node_node(int node)
 {
+	int ret;
 	struct kmem_cache *cachep;
-	struct kmem_cache_node *n;
-	const size_t memsize = sizeof(struct kmem_cache_node);
 
 	list_for_each_entry(cachep, &slab_caches, list) {
-		/*
-		 * Set up the kmem_cache_node for cpu before we can
-		 * begin anything. Make sure some other cpu on this
-		 * node has not already allocated this
-		 */
-		n = get_node(cachep, node);
-		if (!n) {
-			n = kmalloc_node(memsize, GFP_KERNEL, node);
-			if (!n)
-				return -ENOMEM;
-			kmem_cache_node_init(n);
-			n->next_reap = jiffies + REAPTIMEOUT_NODE +
-			    ((unsigned long)cachep) % REAPTIMEOUT_NODE;
-
-			/*
-			 * The kmem_cache_nodes don't come and go as CPUs
-			 * come and go.  slab_mutex is sufficient
-			 * protection here.
-			 */
-			cachep->node[node] = n;
-		}
-
-		spin_lock_irq(&n->list_lock);
-		n->free_limit =
-			(1 + nr_cpus_node(node)) *
-			cachep->batchcount + cachep->num;
-		spin_unlock_irq(&n->list_lock);
+		ret = init_cache_node(cachep, node, GFP_KERNEL);
+		if (ret)
+			return ret;
 	}
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From c3d332b6b2c11ddda9cce3e2f3135b68929d4b82 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:14 -0700
Subject: mm/slab: clean-up kmem_cache_node setup

There is mostly the same code for setting up kmem_cache_node in both
cpuup_prepare() and alloc_kmem_cache_node().  Factor it out and clean it
up.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Tested-by: Nishanth Menon <nm@ti.com>
Tested-by: Jon Hunter <jonathanh@nvidia.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 168 +++++++++++++++++++++++++-------------------------------------
 1 file changed, 68 insertions(+), 100 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 9bef33bc4daa..f1db679c2b5d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -911,6 +911,63 @@ static int init_cache_node_node(int node)
 	return 0;
 }
 
+static int setup_kmem_cache_node(struct kmem_cache *cachep,
+				int node, gfp_t gfp, bool force_change)
+{
+	int ret = -ENOMEM;
+	struct kmem_cache_node *n;
+	struct array_cache *old_shared = NULL;
+	struct array_cache *new_shared = NULL;
+	struct alien_cache **new_alien = NULL;
+	LIST_HEAD(list);
+
+	if (use_alien_caches) {
+		new_alien = alloc_alien_cache(node, cachep->limit, gfp);
+		if (!new_alien)
+			goto fail;
+	}
+
+	if (cachep->shared) {
+		new_shared = alloc_arraycache(node,
+			cachep->shared * cachep->batchcount, 0xbaadf00d, gfp);
+		if (!new_shared)
+			goto fail;
+	}
+
+	ret = init_cache_node(cachep, node, gfp);
+	if (ret)
+		goto fail;
+
+	n = get_node(cachep, node);
+	spin_lock_irq(&n->list_lock);
+	if (n->shared && force_change) {
+		free_block(cachep, n->shared->entry,
+				n->shared->avail, node, &list);
+		n->shared->avail = 0;
+	}
+
+	if (!n->shared || force_change) {
+		old_shared = n->shared;
+		n->shared = new_shared;
+		new_shared = NULL;
+	}
+
+	if (!n->alien) {
+		n->alien = new_alien;
+		new_alien = NULL;
+	}
+
+	spin_unlock_irq(&n->list_lock);
+	slabs_destroy(cachep, &list);
+
+fail:
+	kfree(old_shared);
+	kfree(new_shared);
+	free_alien_cache(new_alien);
+
+	return ret;
+}
+
 static void cpuup_canceled(long cpu)
 {
 	struct kmem_cache *cachep;
@@ -982,7 +1039,6 @@ free_slab:
 static int cpuup_prepare(long cpu)
 {
 	struct kmem_cache *cachep;
-	struct kmem_cache_node *n = NULL;
 	int node = cpu_to_mem(cpu);
 	int err;
 
@@ -1001,44 +1057,9 @@ static int cpuup_prepare(long cpu)
 	 * array caches
 	 */
 	list_for_each_entry(cachep, &slab_caches, list) {
-		struct array_cache *shared = NULL;
-		struct alien_cache **alien = NULL;
-
-		if (cachep->shared) {
-			shared = alloc_arraycache(node,
-				cachep->shared * cachep->batchcount,
-				0xbaadf00d, GFP_KERNEL);
-			if (!shared)
-				goto bad;
-		}
-		if (use_alien_caches) {
-			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
-			if (!alien) {
-				kfree(shared);
-				goto bad;
-			}
-		}
-		n = get_node(cachep, node);
-		BUG_ON(!n);
-
-		spin_lock_irq(&n->list_lock);
-		if (!n->shared) {
-			/*
-			 * We are serialised from CPU_DEAD or
-			 * CPU_UP_CANCELLED by the cpucontrol lock
-			 */
-			n->shared = shared;
-			shared = NULL;
-		}
-#ifdef CONFIG_NUMA
-		if (!n->alien) {
-			n->alien = alien;
-			alien = NULL;
-		}
-#endif
-		spin_unlock_irq(&n->list_lock);
-		kfree(shared);
-		free_alien_cache(alien);
+		err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false);
+		if (err)
+			goto bad;
 	}
 
 	return 0;
@@ -3678,72 +3699,19 @@ EXPORT_SYMBOL(kfree);
 /*
  * This initializes kmem_cache_node or resizes various caches for all nodes.
  */
-static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp)
+static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp)
 {
+	int ret;
 	int node;
 	struct kmem_cache_node *n;
-	struct array_cache *new_shared;
-	struct alien_cache **new_alien = NULL;
 
 	for_each_online_node(node) {
-
-		if (use_alien_caches) {
-			new_alien = alloc_alien_cache(node, cachep->limit, gfp);
-			if (!new_alien)
-				goto fail;
-		}
-
-		new_shared = NULL;
-		if (cachep->shared) {
-			new_shared = alloc_arraycache(node,
-				cachep->shared*cachep->batchcount,
-					0xbaadf00d, gfp);
-			if (!new_shared) {
-				free_alien_cache(new_alien);
-				goto fail;
-			}
-		}
-
-		n = get_node(cachep, node);
-		if (n) {
-			struct array_cache *shared = n->shared;
-			LIST_HEAD(list);
-
-			spin_lock_irq(&n->list_lock);
-
-			if (shared)
-				free_block(cachep, shared->entry,
-						shared->avail, node, &list);
-
-			n->shared = new_shared;
-			if (!n->alien) {
-				n->alien = new_alien;
-				new_alien = NULL;
-			}
-			n->free_limit = (1 + nr_cpus_node(node)) *
-					cachep->batchcount + cachep->num;
-			spin_unlock_irq(&n->list_lock);
-			slabs_destroy(cachep, &list);
-			kfree(shared);
-			free_alien_cache(new_alien);
-			continue;
-		}
-		n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node);
-		if (!n) {
-			free_alien_cache(new_alien);
-			kfree(new_shared);
+		ret = setup_kmem_cache_node(cachep, node, gfp, true);
+		if (ret)
 			goto fail;
-		}
 
-		kmem_cache_node_init(n);
-		n->next_reap = jiffies + REAPTIMEOUT_NODE +
-				((unsigned long)cachep) % REAPTIMEOUT_NODE;
-		n->shared = new_shared;
-		n->alien = new_alien;
-		n->free_limit = (1 + nr_cpus_node(node)) *
-					cachep->batchcount + cachep->num;
-		cachep->node[node] = n;
 	}
+
 	return 0;
 
 fail:
@@ -3785,7 +3753,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	cachep->shared = shared;
 
 	if (!prev)
-		goto alloc_node;
+		goto setup_node;
 
 	for_each_online_cpu(cpu) {
 		LIST_HEAD(list);
@@ -3802,8 +3770,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	}
 	free_percpu(prev);
 
-alloc_node:
-	return alloc_kmem_cache_node(cachep, gfp);
+setup_node:
+	return setup_kmem_cache_nodes(cachep, gfp);
 }
 
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
-- 
cgit v1.2.3


From 6052b7880a95554993898f7cac075c2669f1dd7c Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:17 -0700
Subject: mm/slab: don't keep free slabs if free_objects exceeds free_limit

Currently, the decision to free a slab is made each time a freed object
is put back into its slab.  This has the following problem.

Assume free_limit = 10 and nr_free = 9.

Frees happen in the following sequence and nr_free changes as follows.

  free(becomes a free slab) free(does not become a free slab)
  nr_free: 9 -> 10 (at first free) -> 11 (at second free)

If we check whether the current slab can be freed on each object free,
we can't free any slab in this situation, because the current slab isn't
a free slab when nr_free exceeds free_limit (at the second free), even
though there is a free slab.

However, if we do the check at the end, we can free 1 free slab.

This problem causes too much memory to be kept in the slab subsystem.
This patch tries to fix it by checking the number of free objects after
all the free work is done.  If there are free slabs at that time, we can
free as many of them as possible, so we keep the number of free slabs
minimal.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f1db679c2b5d..3f16475b7189 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3296,6 +3296,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 {
 	int i;
 	struct kmem_cache_node *n = get_node(cachep, node);
+	struct page *page;
+
+	n->free_objects += nr_objects;
 
 	for (i = 0; i < nr_objects; i++) {
 		void *objp;
@@ -3308,17 +3311,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 		check_spinlock_acquired_node(cachep, node);
 		slab_put_obj(cachep, page, objp);
 		STATS_DEC_ACTIVE(cachep);
-		n->free_objects++;
 
 		/* fixup slab chains */
-		if (page->active == 0) {
-			if (n->free_objects > n->free_limit) {
-				n->free_objects -= cachep->num;
-				list_add_tail(&page->lru, list);
-			} else {
-				list_add(&page->lru, &n->slabs_free);
-			}
-		} else {
+		if (page->active == 0)
+			list_add(&page->lru, &n->slabs_free);
+		else {
 			/* Unconditionally move a slab to the end of the
 			 * partial list on free - maximum time for the
 			 * other objects to be freed, too.
@@ -3326,6 +3323,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp,
 			list_add_tail(&page->lru, &n->slabs_partial);
 		}
 	}
+
+	while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
+		n->free_objects -= cachep->num;
+
+		page = list_last_entry(&n->slabs_free, struct page, lru);
+		list_del(&page->lru);
+		list_add(&page->lru, list);
+	}
 }
 
 static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
-- 
cgit v1.2.3


From 03d1d43a1262b347a9aa814980438fff8eb32edc Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:20 -0700
Subject: mm/slab: racy access/modify the slab color

The slab color does not need to be updated strictly.  Because taking a
lock to change the slab color could cause more lock contention, this
patch implements racy access/modification of the slab color.  This is a
preparation step for implementing a lockless allocation path for when
there are no free objects in the kmem_cache.

Below is the result of concurrent allocation/free in the slab allocation
benchmark written by Christoph a long time ago.  I have simplified the
output.  The numbers show the cycle counts for alloc/free respectively,
so lower is better.

  * Before
  Kmalloc N*alloc N*free(32): Average=365/806
  Kmalloc N*alloc N*free(64): Average=452/690
  Kmalloc N*alloc N*free(128): Average=736/886
  Kmalloc N*alloc N*free(256): Average=1167/985
  Kmalloc N*alloc N*free(512): Average=2088/1125
  Kmalloc N*alloc N*free(1024): Average=4115/1184
  Kmalloc N*alloc N*free(2048): Average=8451/1748
  Kmalloc N*alloc N*free(4096): Average=16024/2048

  * After
  Kmalloc N*alloc N*free(32): Average=355/750
  Kmalloc N*alloc N*free(64): Average=452/812
  Kmalloc N*alloc N*free(128): Average=559/1070
  Kmalloc N*alloc N*free(256): Average=1176/980
  Kmalloc N*alloc N*free(512): Average=1939/1189
  Kmalloc N*alloc N*free(1024): Average=3521/1278
  Kmalloc N*alloc N*free(2048): Average=7152/1838
  Kmalloc N*alloc N*free(4096): Average=13438/2013

It shows that contention is reduced for object size >= 1024 and
performance increases by roughly 15%.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3f16475b7189..e181cfbf026b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2574,20 +2574,7 @@ static int cache_grow(struct kmem_cache *cachep,
 	}
 	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
 
-	/* Take the node list lock to change the colour_next on this node */
 	check_irq_off();
-	n = get_node(cachep, nodeid);
-	spin_lock(&n->list_lock);
-
-	/* Get colour for the slab, and cal the next value. */
-	offset = n->colour_next;
-	n->colour_next++;
-	if (n->colour_next >= cachep->colour)
-		n->colour_next = 0;
-	spin_unlock(&n->list_lock);
-
-	offset *= cachep->colour_off;
-
 	if (gfpflags_allow_blocking(local_flags))
 		local_irq_enable();
 
@@ -2608,6 +2595,19 @@ static int cache_grow(struct kmem_cache *cachep,
 	if (!page)
 		goto failed;
 
+	n = get_node(cachep, nodeid);
+
+	/* Get colour for the slab, and cal the next value. */
+	n->colour_next++;
+	if (n->colour_next >= cachep->colour)
+		n->colour_next = 0;
+
+	offset = n->colour_next;
+	if (offset >= cachep->colour)
+		offset = 0;
+
+	offset *= cachep->colour_off;
+
 	/* Get slab management. */
 	freelist = alloc_slabmgmt(cachep, page, offset,
 			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
-- 
cgit v1.2.3


From 511e3a05881221a7fc63e36f3d604887040fc845 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:23 -0700
Subject: mm/slab: make cache_grow() handle the page allocated on arbitrary
 node

Currently, cache_grow() assumes that the allocated page's nodeid is the
same as the nodeid parameter used for the allocation request.  If we
discard this assumption, we can handle the fallback_alloc() case
gracefully.  So, this patch makes cache_grow() handle a page allocated
on an arbitrary node and cleans up the relevant code.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 60 +++++++++++++++++++++---------------------------------------
 1 file changed, 21 insertions(+), 39 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index e181cfbf026b..b303c04c1565 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2556,13 +2556,14 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep,
-		gfp_t flags, int nodeid, struct page *page)
+static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 {
 	void *freelist;
 	size_t offset;
 	gfp_t local_flags;
+	int page_node;
 	struct kmem_cache_node *n;
+	struct page *page;
 
 	/*
 	 * Be lazy and only check for valid flags here,  keeping it out of the
@@ -2590,12 +2591,12 @@ static int cache_grow(struct kmem_cache *cachep,
 	 * Get mem for the objs.  Attempt to allocate a physical page from
 	 * 'nodeid'.
 	 */
-	if (!page)
-		page = kmem_getpages(cachep, local_flags, nodeid);
+	page = kmem_getpages(cachep, local_flags, nodeid);
 	if (!page)
 		goto failed;
 
-	n = get_node(cachep, nodeid);
+	page_node = page_to_nid(page);
+	n = get_node(cachep, page_node);
 
 	/* Get colour for the slab, and cal the next value. */
 	n->colour_next++;
@@ -2610,7 +2611,7 @@ static int cache_grow(struct kmem_cache *cachep,
 
 	/* Get slab management. */
 	freelist = alloc_slabmgmt(cachep, page, offset,
-			local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
+			local_flags & ~GFP_CONSTRAINT_MASK, page_node);
 	if (OFF_SLAB(cachep) && !freelist)
 		goto opps1;
 
@@ -2629,13 +2630,13 @@ static int cache_grow(struct kmem_cache *cachep,
 	STATS_INC_GROWN(cachep);
 	n->free_objects += cachep->num;
 	spin_unlock(&n->list_lock);
-	return 1;
+	return page_node;
 opps1:
 	kmem_freepages(cachep, page);
 failed:
 	if (gfpflags_allow_blocking(local_flags))
 		local_irq_disable();
-	return 0;
+	return -1;
 }
 
 #if DEBUG
@@ -2916,14 +2917,14 @@ alloc_done:
 				return obj;
 		}
 
-		x = cache_grow(cachep, gfp_exact_node(flags), node, NULL);
+		x = cache_grow(cachep, gfp_exact_node(flags), node);
 
 		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
 		node = numa_mem_id();
 
 		/* no objects in sight? abort */
-		if (!x && ac->avail == 0)
+		if (x < 0 && ac->avail == 0)
 			return NULL;
 
 		if (!ac->avail)		/* objects refilled by interrupt? */
@@ -3052,7 +3053,6 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
 static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 {
 	struct zonelist *zonelist;
-	gfp_t local_flags;
 	struct zoneref *z;
 	struct zone *zone;
 	enum zone_type high_zoneidx = gfp_zone(flags);
@@ -3063,8 +3063,6 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	if (flags & __GFP_THISNODE)
 		return NULL;
 
-	local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
-
 retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();
 	zonelist = node_zonelist(mempolicy_slab_node(), flags);
@@ -3094,33 +3092,17 @@ retry:
 		 * We may trigger various forms of reclaim on the allowed
 		 * set and go into memory reserves if necessary.
 		 */
-		struct page *page;
+		nid = cache_grow(cache, flags, numa_mem_id());
+		if (nid >= 0) {
+			obj = ____cache_alloc_node(cache,
+				gfp_exact_node(flags), nid);
 
-		if (gfpflags_allow_blocking(local_flags))
-			local_irq_enable();
-		kmem_flagcheck(cache, flags);
-		page = kmem_getpages(cache, local_flags, numa_mem_id());
-		if (gfpflags_allow_blocking(local_flags))
-			local_irq_disable();
-		if (page) {
 			/*
-			 * Insert into the appropriate per node queues
+			 * Another processor may allocate the objects in
+			 * the slab since we are not holding any locks.
 			 */
-			nid = page_to_nid(page);
-			if (cache_grow(cache, flags, nid, page)) {
-				obj = ____cache_alloc_node(cache,
-					gfp_exact_node(flags), nid);
-				if (!obj)
-					/*
-					 * Another processor may allocate the
-					 * objects in the slab since we are
-					 * not holding any locks.
-					 */
-					goto retry;
-			} else {
-				/* cache_grow already freed obj */
-				obj = NULL;
-			}
+			if (!obj)
+				goto retry;
 		}
 	}
 
@@ -3171,8 +3153,8 @@ retry:
 
 must_grow:
 	spin_unlock(&n->list_lock);
-	x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL);
-	if (x)
+	x = cache_grow(cachep, gfp_exact_node(flags), nodeid);
+	if (x >= 0)
 		goto retry;
 
 	return fallback_alloc(cachep, flags);
-- 
cgit v1.2.3


From 76b342bdc71badea2cbac7bf6590aa86e895c507 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:26 -0700
Subject: mm/slab: separate cache_grow() to two parts

This is a preparation step to implement a lockless allocation path when
there are no free objects in the kmem_cache.

What we'd like to do here is to refill the cpu cache without holding a
node lock.  To accomplish this, the refill should be done after the new
slab is allocated but before the slab is attached to the management
list.  So, this patch separates cache_grow() into two parts, allocation
and attaching to the list, in order to add some code in between them in
the following patch.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 74 ++++++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 52 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index b303c04c1565..8c4db214b05b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list);
 static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 static void cache_reap(struct work_struct *unused);
 
+static inline void fixup_objfreelist_debug(struct kmem_cache *cachep,
+						void **list);
+static inline void fixup_slab_list(struct kmem_cache *cachep,
+				struct kmem_cache_node *n, struct page *page,
+				void **list);
 static int slab_early_init = 1;
 
 #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node))
@@ -1810,7 +1815,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 
 			/*
 			 * Needed to avoid possible looping condition
-			 * in cache_grow()
+			 * in cache_grow_begin()
 			 */
 			if (OFF_SLAB(freelist_cache))
 				continue;
@@ -2556,7 +2561,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static struct page *cache_grow_begin(struct kmem_cache *cachep,
+				gfp_t flags, int nodeid)
 {
 	void *freelist;
 	size_t offset;
@@ -2622,21 +2628,40 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 
 	if (gfpflags_allow_blocking(local_flags))
 		local_irq_disable();
-	check_irq_off();
-	spin_lock(&n->list_lock);
 
-	/* Make slab active. */
-	list_add_tail(&page->lru, &(n->slabs_free));
-	STATS_INC_GROWN(cachep);
-	n->free_objects += cachep->num;
-	spin_unlock(&n->list_lock);
-	return page_node;
+	return page;
+
 opps1:
 	kmem_freepages(cachep, page);
 failed:
 	if (gfpflags_allow_blocking(local_flags))
 		local_irq_disable();
-	return -1;
+	return NULL;
+}
+
+static void cache_grow_end(struct kmem_cache *cachep, struct page *page)
+{
+	struct kmem_cache_node *n;
+	void *list = NULL;
+
+	check_irq_off();
+
+	if (!page)
+		return;
+
+	INIT_LIST_HEAD(&page->lru);
+	n = get_node(cachep, page_to_nid(page));
+
+	spin_lock(&n->list_lock);
+	if (!page->active)
+		list_add_tail(&page->lru, &(n->slabs_free));
+	else
+		fixup_slab_list(cachep, n, page, &list);
+	STATS_INC_GROWN(cachep);
+	n->free_objects += cachep->num - page->active;
+	spin_unlock(&n->list_lock);
+
+	fixup_objfreelist_debug(cachep, &list);
 }
 
 #if DEBUG
@@ -2847,6 +2872,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 	int node;
 	void *list = NULL;
+	struct page *page;
 
 	check_irq_off();
 	node = numa_mem_id();
@@ -2874,7 +2900,6 @@ retry:
 	}
 
 	while (batchcount > 0) {
-		struct page *page;
 		/* Get slab alloc is to come from. */
 		page = get_first_slab(n, false);
 		if (!page)
@@ -2907,8 +2932,6 @@ alloc_done:
 	fixup_objfreelist_debug(cachep, &list);
 
 	if (unlikely(!ac->avail)) {
-		int x;
-
 		/* Check if we can use obj in pfmemalloc slab */
 		if (sk_memalloc_socks()) {
 			void *obj = cache_alloc_pfmemalloc(cachep, n, flags);
@@ -2917,14 +2940,18 @@ alloc_done:
 				return obj;
 		}
 
-		x = cache_grow(cachep, gfp_exact_node(flags), node);
+		page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
+		cache_grow_end(cachep, page);
 
-		/* cache_grow can reenable interrupts, then ac could change. */
+		/*
+		 * cache_grow_begin() can reenable interrupts,
+		 * then ac could change.
+		 */
 		ac = cpu_cache_get(cachep);
 		node = numa_mem_id();
 
 		/* no objects in sight? abort */
-		if (x < 0 && ac->avail == 0)
+		if (!page && ac->avail == 0)
 			return NULL;
 
 		if (!ac->avail)		/* objects refilled by interrupt? */
@@ -3057,6 +3084,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
 	struct zone *zone;
 	enum zone_type high_zoneidx = gfp_zone(flags);
 	void *obj = NULL;
+	struct page *page;
 	int nid;
 	unsigned int cpuset_mems_cookie;
 
@@ -3092,8 +3120,10 @@ retry:
 		 * We may trigger various forms of reclaim on the allowed
 		 * set and go into memory reserves if necessary.
 		 */
-		nid = cache_grow(cache, flags, numa_mem_id());
-		if (nid >= 0) {
+		page = cache_grow_begin(cache, flags, numa_mem_id());
+		cache_grow_end(cache, page);
+		if (page) {
+			nid = page_to_nid(page);
 			obj = ____cache_alloc_node(cache,
 				gfp_exact_node(flags), nid);
 
@@ -3121,7 +3151,6 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 	struct kmem_cache_node *n;
 	void *obj;
 	void *list = NULL;
-	int x;
 
 	VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
 	n = get_node(cachep, nodeid);
@@ -3153,8 +3182,9 @@ retry:
 
 must_grow:
 	spin_unlock(&n->list_lock);
-	x = cache_grow(cachep, gfp_exact_node(flags), nodeid);
-	if (x >= 0)
+	page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+	cache_grow_end(cachep, page);
+	if (page)
 		goto retry;
 
 	return fallback_alloc(cachep, flags);
-- 
cgit v1.2.3


From 213b46958c65c7adaaf3201102da16ce0264e9cf Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:29 -0700
Subject: mm/slab: refill cpu cache through a new slab without holding a node
 lock

Until now, growing a cache puts a free slab on the node's slab list and
then we allocate free objects from it.  This necessarily requires
holding the node lock, which is heavily contended.  If we refill the cpu
cache from the new slab before attaching the slab to the node's slab
list, we can avoid holding the node lock as much as possible because the
newly allocated slab is only visible to the current task.  This will
reduce lock contention.

Below is the result of concurrent allocation/free in the slab allocation
benchmark made by Christoph a long time ago.  I have simplified the
output.  The numbers show the cycle counts for alloc/free respectively,
so lower is better.

  * Before
  Kmalloc N*alloc N*free(32): Average=355/750
  Kmalloc N*alloc N*free(64): Average=452/812
  Kmalloc N*alloc N*free(128): Average=559/1070
  Kmalloc N*alloc N*free(256): Average=1176/980
  Kmalloc N*alloc N*free(512): Average=1939/1189
  Kmalloc N*alloc N*free(1024): Average=3521/1278
  Kmalloc N*alloc N*free(2048): Average=7152/1838
  Kmalloc N*alloc N*free(4096): Average=13438/2013

  * After
  Kmalloc N*alloc N*free(32): Average=248/966
  Kmalloc N*alloc N*free(64): Average=261/949
  Kmalloc N*alloc N*free(128): Average=314/1016
  Kmalloc N*alloc N*free(256): Average=741/1061
  Kmalloc N*alloc N*free(512): Average=1246/1152
  Kmalloc N*alloc N*free(1024): Average=2437/1259
  Kmalloc N*alloc N*free(2048): Average=4980/1800
  Kmalloc N*alloc N*free(4096): Average=9000/2078

It shows that contention is reduced for all the object sizes and
performance increases by 30 ~ 40%.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 68 +++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 8c4db214b05b..37600e91742f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2865,6 +2865,30 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep,
 	return obj;
 }
 
+/*
+ * Slab list should be fixed up by fixup_slab_list() for existing slab
+ * or cache_grow_end() for new slab
+ */
+static __always_inline int alloc_block(struct kmem_cache *cachep,
+		struct array_cache *ac, struct page *page, int batchcount)
+{
+	/*
+	 * There must be at least one object available for
+	 * allocation.
+	 */
+	BUG_ON(page->active >= cachep->num);
+
+	while (page->active < cachep->num && batchcount--) {
+		STATS_INC_ALLOCED(cachep);
+		STATS_INC_ACTIVE(cachep);
+		STATS_SET_HIGH(cachep);
+
+		ac->entry[ac->avail++] = slab_get_obj(cachep, page);
+	}
+
+	return batchcount;
+}
+
 static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 {
 	int batchcount;
@@ -2877,7 +2901,6 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 	check_irq_off();
 	node = numa_mem_id();
 
-retry:
 	ac = cpu_cache_get(cachep);
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2907,21 +2930,7 @@ retry:
 
 		check_spinlock_acquired(cachep);
 
-		/*
-		 * The slab was either on partial or free list so
-		 * there must be at least one object available for
-		 * allocation.
-		 */
-		BUG_ON(page->active >= cachep->num);
-
-		while (page->active < cachep->num && batchcount--) {
-			STATS_INC_ALLOCED(cachep);
-			STATS_INC_ACTIVE(cachep);
-			STATS_SET_HIGH(cachep);
-
-			ac->entry[ac->avail++] = slab_get_obj(cachep, page);
-		}
-
+		batchcount = alloc_block(cachep, ac, page, batchcount);
 		fixup_slab_list(cachep, n, page, &list);
 	}
 
@@ -2941,21 +2950,18 @@ alloc_done:
 		}
 
 		page = cache_grow_begin(cachep, gfp_exact_node(flags), node);
-		cache_grow_end(cachep, page);
 
 		/*
 		 * cache_grow_begin() can reenable interrupts,
 		 * then ac could change.
 		 */
 		ac = cpu_cache_get(cachep);
-		node = numa_mem_id();
+		if (!ac->avail && page)
+			alloc_block(cachep, ac, page, batchcount);
+		cache_grow_end(cachep, page);
 
-		/* no objects in sight? abort */
-		if (!page && ac->avail == 0)
+		if (!ac->avail)
 			return NULL;
-
-		if (!ac->avail)		/* objects refilled by interrupt? */
-			goto retry;
 	}
 	ac->touched = 1;
 
@@ -3149,14 +3155,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
 {
 	struct page *page;
 	struct kmem_cache_node *n;
-	void *obj;
+	void *obj = NULL;
 	void *list = NULL;
 
 	VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES);
 	n = get_node(cachep, nodeid);
 	BUG_ON(!n);
 
-retry:
 	check_irq_off();
 	spin_lock(&n->list_lock);
 	page = get_first_slab(n, false);
@@ -3178,19 +3183,18 @@ retry:
 
 	spin_unlock(&n->list_lock);
 	fixup_objfreelist_debug(cachep, &list);
-	goto done;
+	return obj;
 
 must_grow:
 	spin_unlock(&n->list_lock);
 	page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid);
+	if (page) {
+		/* This slab isn't counted yet so don't update free_objects */
+		obj = slab_get_obj(cachep, page);
+	}
 	cache_grow_end(cachep, page);
-	if (page)
-		goto retry;
 
-	return fallback_alloc(cachep, flags);
-
-done:
-	return obj;
+	return obj ? obj : fallback_alloc(cachep, flags);
 }
 
 static __always_inline void *
-- 
cgit v1.2.3


From 801faf0db8947e01877920e848a4d338dd7a99e7 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:31 -0700
Subject: mm/slab: lockless decision to grow cache

To check precisely whether free objects exist, we need to grab a lock.
But accuracy isn't that important here, because the race window is
small, and if there are too many free objects the cache reaper will
reap them.  So, this patch makes the check for free object existence
lockless.  This will reduce lock contention in allocation-heavy
workloads.

Note that until now, n->shared could be freed during the processing by a
write to slabinfo, but with the synchronize_sched() added in this patch,
we can access it safely while interrupts are disabled.

Below is the result of concurrent allocation/free in the slab allocation
benchmark made by Christoph a long time ago.  I have simplified the
output.  The numbers show the cycle counts for alloc/free respectively,
so lower is better.

  * Before
  Kmalloc N*alloc N*free(32): Average=248/966
  Kmalloc N*alloc N*free(64): Average=261/949
  Kmalloc N*alloc N*free(128): Average=314/1016
  Kmalloc N*alloc N*free(256): Average=741/1061
  Kmalloc N*alloc N*free(512): Average=1246/1152
  Kmalloc N*alloc N*free(1024): Average=2437/1259
  Kmalloc N*alloc N*free(2048): Average=4980/1800
  Kmalloc N*alloc N*free(4096): Average=9000/2078

  * After
  Kmalloc N*alloc N*free(32): Average=344/792
  Kmalloc N*alloc N*free(64): Average=347/882
  Kmalloc N*alloc N*free(128): Average=390/959
  Kmalloc N*alloc N*free(256): Average=393/1067
  Kmalloc N*alloc N*free(512): Average=683/1229
  Kmalloc N*alloc N*free(1024): Average=1295/1325
  Kmalloc N*alloc N*free(2048): Average=2513/1664
  Kmalloc N*alloc N*free(4096): Average=4742/2172

It shows that allocation performance decreases for object sizes up to
128, which may be due to the extra checks in cache_alloc_refill().  But,
considering the improvement in free performance, the net result looks
the same.  Results for the other size classes look very promising:
roughly a 50% performance improvement.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 37600e91742f..8133ebea77a4 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -965,6 +965,15 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep,
 	spin_unlock_irq(&n->list_lock);
 	slabs_destroy(cachep, &list);
 
+	/*
+	 * To protect lockless access to n->shared during irq disabled context.
+	 * If n->shared isn't NULL in irq disabled context, accessing to it is
+	 * guaranteed to be valid until irq is re-enabled, because it will be
+	 * freed after synchronize_sched().
+	 */
+	if (force_change)
+		synchronize_sched();
+
 fail:
 	kfree(old_shared);
 	kfree(new_shared);
@@ -2893,7 +2902,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 {
 	int batchcount;
 	struct kmem_cache_node *n;
-	struct array_cache *ac;
+	struct array_cache *ac, *shared;
 	int node;
 	void *list = NULL;
 	struct page *page;
@@ -2914,11 +2923,16 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 	n = get_node(cachep, node);
 
 	BUG_ON(ac->avail > 0 || !n);
+	shared = READ_ONCE(n->shared);
+	if (!n->free_objects && (!shared || !shared->avail))
+		goto direct_grow;
+
 	spin_lock(&n->list_lock);
+	shared = READ_ONCE(n->shared);
 
 	/* See if we can refill from the shared array */
-	if (n->shared && transfer_objects(ac, n->shared, batchcount)) {
-		n->shared->touched = 1;
+	if (shared && transfer_objects(ac, shared, batchcount)) {
+		shared->touched = 1;
 		goto alloc_done;
 	}
 
@@ -2940,6 +2954,7 @@ alloc_done:
 	spin_unlock(&n->list_lock);
 	fixup_objfreelist_debug(cachep, &list);
 
+direct_grow:
 	if (unlikely(!ac->avail)) {
 		/* Check if we can use obj in pfmemalloc slab */
 		if (sk_memalloc_socks()) {
-- 
cgit v1.2.3


From 81ae6d03952c1bfb96e1a716809bd65e7cd14360 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Thu, 19 May 2016 17:10:34 -0700
Subject: mm/slub.c: replace kick_all_cpus_sync() with synchronize_sched() in
 kmem_cache_shrink()

When we call __kmem_cache_shrink on memory cgroup removal, we need to
synchronize kmem_cache->cpu_partial update with put_cpu_partial that
might be running on other cpus.  Currently, we achieve that by using
kick_all_cpus_sync, which works as a system wide memory barrier.  Fast
as it is, this method has a flaw - it issues a lot of IPIs, which might
hurt high performance or real-time workloads.

To fix this, let's replace kick_all_cpus_sync with synchronize_sched.
Although the latter one may take much longer to finish, it shouldn't be
a problem in this particular case, because memory cgroups are destroyed
asynchronously from a workqueue so that no user visible effects should
be introduced.  OTOH, it will save us from excessive IPIs when someone
removes a cgroup.

Anyway, even if using synchronize_sched turns out to take too long, we
can always introduce a kind of __kmem_cache_shrink batching so that this
method would only be called once per one cgroup destruction (not per
each per memcg kmem cache as it is now).

Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Reported-by: Peter Zijlstra <peterz@infradead.org>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index 4dbb109eb8cd..ba81cf672f08 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3697,7 +3697,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 		 * s->cpu_partial is checked locklessly (see put_cpu_partial),
 		 * so we have to make sure the change is visible.
 		 */
-		kick_all_cpus_sync();
+		synchronize_sched();
 	}
 
 	flush_all(s);
-- 
cgit v1.2.3


From c7ce4f60ac199fb3521c5fcd64da21cee801ec2b Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Thu, 19 May 2016 17:10:37 -0700
Subject: mm: SLAB freelist randomization

Provides an optional config (CONFIG_SLAB_FREELIST_RANDOM) to randomize
the SLAB freelist.  The list is randomized during initialization of a
new set of pages.  The order on different freelist sizes is pre-computed
at boot for performance.  Each kmem_cache has its own randomized
freelist.  Before pre-computed lists are available freelists are
generated dynamically.  This security feature reduces the predictability
of the kernel SLAB allocator against heap overflows rendering attacks
much less stable.

For example this attack against SLUB (also applicable against SLAB)
would be affected:

  https://jon.oberheide.org/blog/2010/09/10/linux-kernel-can-slub-overflow/

Also, since v4.6 the freelist has been located at the end of the SLAB.
This means a controllable heap is open to new attacks not yet publicly
discussed.  A kernel heap overflow can be transformed into multiple
use-after-frees.  This feature makes this type of attack harder too.

To generate entropy, we use get_random_bytes_arch because 0 bits of
entropy are available in the boot stage.  In the worst case this
function will fall back to the get_random_bytes sub API.  We also
generate a random shift value to shift the pre-computed freelist for
each new set of pages.

The config option name is not specific to the SLAB as this approach will
be extended to other allocators like SLUB.

Performance results highlighted no major changes:

Hackbench (running 90 10 times):

  Before average: 0.0698
  After average: 0.0663 (-5.01%)

slab_test 1 run on boot.  A difference is only seen on the 2048 size
test, which is the worst-case scenario covered by freelist
randomization.  New slab pages are constantly being created during the
10000 allocations.  The variance should be mainly due to getting new
pages every few allocations.

Before:

  Single thread testing
  =====================
  1. Kmalloc: Repeatedly allocate then free test
  10000 times kmalloc(8) -> 99 cycles kfree -> 112 cycles
  10000 times kmalloc(16) -> 109 cycles kfree -> 140 cycles
  10000 times kmalloc(32) -> 129 cycles kfree -> 137 cycles
  10000 times kmalloc(64) -> 141 cycles kfree -> 141 cycles
  10000 times kmalloc(128) -> 152 cycles kfree -> 148 cycles
  10000 times kmalloc(256) -> 195 cycles kfree -> 167 cycles
  10000 times kmalloc(512) -> 257 cycles kfree -> 199 cycles
  10000 times kmalloc(1024) -> 393 cycles kfree -> 251 cycles
  10000 times kmalloc(2048) -> 649 cycles kfree -> 228 cycles
  10000 times kmalloc(4096) -> 806 cycles kfree -> 370 cycles
  10000 times kmalloc(8192) -> 814 cycles kfree -> 411 cycles
  10000 times kmalloc(16384) -> 892 cycles kfree -> 455 cycles
  2. Kmalloc: alloc/free test
  10000 times kmalloc(8)/kfree -> 121 cycles
  10000 times kmalloc(16)/kfree -> 121 cycles
  10000 times kmalloc(32)/kfree -> 121 cycles
  10000 times kmalloc(64)/kfree -> 121 cycles
  10000 times kmalloc(128)/kfree -> 121 cycles
  10000 times kmalloc(256)/kfree -> 119 cycles
  10000 times kmalloc(512)/kfree -> 119 cycles
  10000 times kmalloc(1024)/kfree -> 119 cycles
  10000 times kmalloc(2048)/kfree -> 119 cycles
  10000 times kmalloc(4096)/kfree -> 121 cycles
  10000 times kmalloc(8192)/kfree -> 119 cycles
  10000 times kmalloc(16384)/kfree -> 119 cycles

After:

  Single thread testing
  =====================
  1. Kmalloc: Repeatedly allocate then free test
  10000 times kmalloc(8) -> 130 cycles kfree -> 86 cycles
  10000 times kmalloc(16) -> 118 cycles kfree -> 86 cycles
  10000 times kmalloc(32) -> 121 cycles kfree -> 85 cycles
  10000 times kmalloc(64) -> 176 cycles kfree -> 102 cycles
  10000 times kmalloc(128) -> 178 cycles kfree -> 100 cycles
  10000 times kmalloc(256) -> 205 cycles kfree -> 109 cycles
  10000 times kmalloc(512) -> 262 cycles kfree -> 136 cycles
  10000 times kmalloc(1024) -> 342 cycles kfree -> 157 cycles
  10000 times kmalloc(2048) -> 701 cycles kfree -> 238 cycles
  10000 times kmalloc(4096) -> 803 cycles kfree -> 364 cycles
  10000 times kmalloc(8192) -> 835 cycles kfree -> 404 cycles
  10000 times kmalloc(16384) -> 896 cycles kfree -> 441 cycles
  2. Kmalloc: alloc/free test
  10000 times kmalloc(8)/kfree -> 121 cycles
  10000 times kmalloc(16)/kfree -> 121 cycles
  10000 times kmalloc(32)/kfree -> 123 cycles
  10000 times kmalloc(64)/kfree -> 142 cycles
  10000 times kmalloc(128)/kfree -> 121 cycles
  10000 times kmalloc(256)/kfree -> 119 cycles
  10000 times kmalloc(512)/kfree -> 119 cycles
  10000 times kmalloc(1024)/kfree -> 119 cycles
  10000 times kmalloc(2048)/kfree -> 119 cycles
  10000 times kmalloc(4096)/kfree -> 119 cycles
  10000 times kmalloc(8192)/kfree -> 119 cycles
  10000 times kmalloc(16384)/kfree -> 119 cycles

[akpm@linux-foundation.org: propagate gfp_t into cache_random_seq_create()]
Signed-off-by: Thomas Garnier <thgarnie@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab_def.h |   4 ++
 init/Kconfig             |   9 +++
 mm/slab.c                | 167 ++++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 178 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 9edbbf352340..8694f7a5d92b 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -80,6 +80,10 @@ struct kmem_cache {
 	struct kasan_cache kasan_info;
 #endif
 
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+	void *random_seq;
+#endif
+
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
diff --git a/init/Kconfig b/init/Kconfig
index 0dfd09d54c65..79a91a2c0444 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1742,6 +1742,15 @@ config SLOB
 
 endchoice
 
+config SLAB_FREELIST_RANDOM
+	default n
+	depends on SLAB
+	bool "SLAB freelist randomization"
+	help
+	  Randomizes the freelist order used on creating new SLABs. This
+	  security feature reduces the predictability of the kernel slab
+	  allocator against heap overflows.
+
 config SLUB_CPU_PARTIAL
 	default y
 	depends on SLUB && SMP
diff --git a/mm/slab.c b/mm/slab.c
index 8133ebea77a4..1ee26a0d358f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1243,6 +1243,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index)
 	}
 }
 
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list,
+			size_t count)
+{
+	size_t i;
+	unsigned int rand;
+
+	for (i = 0; i < count; i++)
+		list[i] = i;
+
+	/* Fisher-Yates shuffle */
+	for (i = count - 1; i > 0; i--) {
+		rand = prandom_u32_state(state);
+		rand %= (i + 1);
+		swap(list[i], list[rand]);
+	}
+}
+
+/* Create a random sequence per cache */
+static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+	unsigned int seed, count = cachep->num;
+	struct rnd_state state;
+
+	if (count < 2)
+		return 0;
+
+	/* If it fails, we will just use the global lists */
+	cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp);
+	if (!cachep->random_seq)
+		return -ENOMEM;
+
+	/* Get best entropy at this stage */
+	get_random_bytes_arch(&seed, sizeof(seed));
+	prandom_seed_state(&state, seed);
+
+	freelist_randomize(&state, cachep->random_seq, count);
+	return 0;
+}
+
+/* Destroy the per-cache random freelist sequence */
+static void cache_random_seq_destroy(struct kmem_cache *cachep)
+{
+	kfree(cachep->random_seq);
+	cachep->random_seq = NULL;
+}
+#else
+static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp)
+{
+	return 0;
+}
+static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
+
 /*
  * Initialisation.  Called after the page allocator have been initialised and
  * before smp_init().
@@ -2374,6 +2429,8 @@ void __kmem_cache_release(struct kmem_cache *cachep)
 	int i;
 	struct kmem_cache_node *n;
 
+	cache_random_seq_destroy(cachep);
+
 	free_percpu(cachep->cpu_cache);
 
 	/* NUMA: free the node structures */
@@ -2480,15 +2537,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page)
 #endif
 }
 
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+/* Hold information during a freelist initialization */
+union freelist_init_state {
+	struct {
+		unsigned int pos;
+		freelist_idx_t *list;
+		unsigned int count;
+		unsigned int rand;
+	};
+	struct rnd_state rnd_state;
+};
+
+/*
+ * Initialize the state based on the randomization methode available.
+ * return true if the pre-computed list is available, false otherwize.
+ */
+static bool freelist_state_initialize(union freelist_init_state *state,
+				struct kmem_cache *cachep,
+				unsigned int count)
+{
+	bool ret;
+	unsigned int rand;
+
+	/* Use best entropy available to define a random shift */
+	get_random_bytes_arch(&rand, sizeof(rand));
+
+	/* Use a random state if the pre-computed list is not available */
+	if (!cachep->random_seq) {
+		prandom_seed_state(&state->rnd_state, rand);
+		ret = false;
+	} else {
+		state->list = cachep->random_seq;
+		state->count = count;
+		state->pos = 0;
+		state->rand = rand;
+		ret = true;
+	}
+	return ret;
+}
+
+/* Get the next entry on the list and randomize it using a random shift */
+static freelist_idx_t next_random_slot(union freelist_init_state *state)
+{
+	return (state->list[state->pos++] + state->rand) % state->count;
+}
+
+/*
+ * Shuffle the freelist initialization state based on pre-computed lists.
+ * return true if the list was successfully shuffled, false otherwise.
+ */
+static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page)
+{
+	unsigned int objfreelist = 0, i, count = cachep->num;
+	union freelist_init_state state;
+	bool precomputed;
+
+	if (count < 2)
+		return false;
+
+	precomputed = freelist_state_initialize(&state, cachep, count);
+
+	/* Take a random entry as the objfreelist */
+	if (OBJFREELIST_SLAB(cachep)) {
+		if (!precomputed)
+			objfreelist = count - 1;
+		else
+			objfreelist = next_random_slot(&state);
+		page->freelist = index_to_obj(cachep, page, objfreelist) +
+						obj_offset(cachep);
+		count--;
+	}
+
+	/*
+	 * On early boot, generate the list dynamically.
+	 * Later use a pre-computed list for speed.
+	 */
+	if (!precomputed) {
+		freelist_randomize(&state.rnd_state, page->freelist, count);
+	} else {
+		for (i = 0; i < count; i++)
+			set_free_obj(page, i, next_random_slot(&state));
+	}
+
+	if (OBJFREELIST_SLAB(cachep))
+		set_free_obj(page, cachep->num - 1, objfreelist);
+
+	return true;
+}
+#else
+static inline bool shuffle_freelist(struct kmem_cache *cachep,
+				struct page *page)
+{
+	return false;
+}
+#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+
 static void cache_init_objs(struct kmem_cache *cachep,
 			    struct page *page)
 {
 	int i;
 	void *objp;
+	bool shuffled;
 
 	cache_init_objs_debug(cachep, page);
 
-	if (OBJFREELIST_SLAB(cachep)) {
+	/* Try to randomize the freelist if enabled */
+	shuffled = shuffle_freelist(cachep, page);
+
+	if (!shuffled && OBJFREELIST_SLAB(cachep)) {
 		page->freelist = index_to_obj(cachep, page, cachep->num - 1) +
 						obj_offset(cachep);
 	}
@@ -2502,7 +2659,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
 			kasan_poison_object_data(cachep, objp);
 		}
 
-		set_free_obj(page, i, i);
+		if (!shuffled)
+			set_free_obj(page, i, i);
 	}
 }
 
@@ -3841,6 +3999,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 	int shared = 0;
 	int batchcount = 0;
 
+	err = cache_random_seq_create(cachep, gfp);
+	if (err)
+		goto end;
+
 	if (!is_root_cache(cachep)) {
 		struct kmem_cache *root = memcg_root_cache(cachep);
 		limit = root->limit;
@@ -3894,6 +4056,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 	batchcount = (limit + 1) / 2;
 skip_setup:
 	err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
+end:
 	if (err)
 		pr_err("enable_cpucache failed for %s, error %d\n",
 		       cachep->name, -err);
-- 
cgit v1.2.3


From a3187e438bc6565d6e54a550a19073d1b453f041 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Thu, 19 May 2016 17:10:41 -0700
Subject: mm: slab: remove ZONE_DMA_FLAG

Now that we have the IS_ENABLED helper to check whether a Kconfig option
is enabled, ZONE_DMA_FLAG is no longer useful.

Moreover, the use of ZONE_DMA_FLAG in slab looks pointless according to
the comment [1] from Johannes Weiner, so remove it.  ORing the passed-in
flags with the cache gfp flags is already done in kmem_getpages().

[1] https://lkml.org/lkml/2014/9/25/553

Link: http://lkml.kernel.org/r/1462381297-11009-1-git-send-email-yang.shi@linaro.org
Signed-off-by: Yang Shi <yang.shi@linaro.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Kconfig |  5 -----
 mm/slab.c  | 23 +----------------------
 2 files changed, 1 insertion(+), 27 deletions(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 989f8f3d77e0..d6e9042b99e0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -268,11 +268,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 
-config ZONE_DMA_FLAG
-	int
-	default "0" if !ZONE_DMA
-	default "1"
-
 config BOUNCE
 	bool "Enable bounce buffers"
 	default y
diff --git a/mm/slab.c b/mm/slab.c
index 1ee26a0d358f..d81565a92864 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2236,7 +2236,7 @@ done:
 	cachep->freelist_size = cachep->num * sizeof(freelist_idx_t);
 	cachep->flags = flags;
 	cachep->allocflags = __GFP_COMP;
-	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+	if (flags & SLAB_CACHE_DMA)
 		cachep->allocflags |= GFP_DMA;
 	cachep->size = size;
 	cachep->reciprocal_buffer_size = reciprocal_value(size);
@@ -2664,16 +2664,6 @@ static void cache_init_objs(struct kmem_cache *cachep,
 	}
 }
 
-static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
-{
-	if (CONFIG_ZONE_DMA_FLAG) {
-		if (flags & GFP_DMA)
-			BUG_ON(!(cachep->allocflags & GFP_DMA));
-		else
-			BUG_ON(cachep->allocflags & GFP_DMA);
-	}
-}
-
 static void *slab_get_obj(struct kmem_cache *cachep, struct page *page)
 {
 	void *objp;
@@ -2752,14 +2742,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
 	if (gfpflags_allow_blocking(local_flags))
 		local_irq_enable();
 
-	/*
-	 * The test for missing atomic flag is performed here, rather than
-	 * the more obvious place, simply to reduce the critical path length
-	 * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
-	 * will eventually be caught here (where it matters).
-	 */
-	kmem_flagcheck(cachep, flags);
-
 	/*
 	 * Get mem for the objs.  Attempt to allocate a physical page from
 	 * 'nodeid'.
@@ -3145,9 +3127,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
 						gfp_t flags)
 {
 	might_sleep_if(gfpflags_allow_blocking(flags));
-#if DEBUG
-	kmem_flagcheck(cachep, flags);
-#endif
 }
 
 #if DEBUG
-- 
cgit v1.2.3


From 43efd3ea64f3cf8920e8793e6953321a466023e3 Mon Sep 17 00:00:00 2001
From: Li Peng <lip@dtdream.com>
Date: Thu, 19 May 2016 17:10:43 -0700
Subject: mm/slub.c: fix sysfs filename in comment

/sys/kernel/slab/xx/defrag_ratio should be remote_node_defrag_ratio.

Link: http://lkml.kernel.org/r/1463449242-5366-1-git-send-email-lip@dtdream.com
Signed-off-by: Li Peng <lip@dtdream.com>
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slub.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slub.c b/mm/slub.c
index ba81cf672f08..8671de2e5b12 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 	 * may return off node objects because partial slabs are obtained
 	 * from other nodes and filled up.
 	 *
-	 * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes
-	 * defrag_ratio = 1000) then every (well almost) allocation will
-	 * first attempt to defrag slab caches on other nodes. This means
-	 * scanning over all nodes to look for partial slabs which may be
-	 * expensive if we do it every time we are trying to find a slab
+	 * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100
+	 * (which makes defrag_ratio = 1000) then every (well almost)
+	 * allocation will first attempt to defrag slab caches on other nodes.
+	 * This means scanning over all nodes to look for partial slabs which
+	 * may be expensive if we do it every time we are trying to find a slab
 	 * with available objects.
 	 */
 	if (!s->remote_node_defrag_ratio ||
-- 
cgit v1.2.3


From 6d061f9f6136d477932088c24ce155d7dc785746 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:46 -0700
Subject: mm/page_ref: use page_ref helper instead of direct modification of
 _count

Page reference manipulation functions were introduced to track changes
to a page's reference count.  Use them instead of modifying _count
directly.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 2 +-
 drivers/net/ethernet/qlogic/qede/qede_main.c       | 2 +-
 mm/filemap.c                                       | 2 +-
 net/wireless/util.c                                | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 06b819db51b1..0ff8e60deccb 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -23,7 +23,7 @@ static void nicvf_get_page(struct nicvf *nic)
 	if (!nic->rb_pageref || !nic->rb_page)
 		return;
 
-	atomic_add(nic->rb_pageref, &nic->rb_page->_count);
+	page_ref_add(nic->rb_page, nic->rb_pageref);
 	nic->rb_pageref = 0;
 }
 
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 8114541f327c..3aabfc0adefe 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -920,7 +920,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev,
 		 * network stack to take the ownership of the page
 		 * which can be recycled multiple times by the driver.
 		 */
-		atomic_inc(&curr_cons->data->_count);
+		page_ref_inc(curr_cons->data);
 		qede_reuse_page(edev, rxq, curr_cons);
 	}
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 182b21825255..01690338e3d2 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -213,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 			 * some other bad page check should catch it later.
 			 */
 			page_mapcount_reset(page);
-			atomic_sub(mapcount, &page->_count);
+			page_ref_sub(page, mapcount);
 		}
 	}
 
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 219bd197039e..4e809e978b7d 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page,
 	struct skb_shared_info *sh = skb_shinfo(skb);
 	int page_offset;
 
-	atomic_inc(&page->_count);
+	page_ref_inc(page);
 	page_offset = ptr - page_address(page);
 	skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
 }
-- 
cgit v1.2.3


From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:49 -0700
Subject: mm: rename _count, field of the struct page, to _refcount

Many developers already know that the reference count field of struct
page is _count and that it is an atomic type.  They may try to modify it
directly, which would defeat the purpose of the page reference count
tracepoint.  To prevent direct _count modification, this patch renames
the field to _refcount and adds a warning comment in the code.  After
that, developers who need to handle the reference count will find that
the field should not be accessed directly.

[akpm@linux-foundation.org: fix comments, per Vlastimil]
[akpm@linux-foundation.org: Documentation/vm/transhuge.txt too]
[sfr@canb.auug.org.au: sync ethernet driver changes]
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Manish Chopra <manish.chopra@qlogic.com>
Cc: Yuval Mintz <yuval.mintz@qlogic.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/transhuge.txt                  | 10 +++++-----
 arch/tile/mm/init.c                             |  2 +-
 drivers/block/aoe/aoecmd.c                      |  2 +-
 drivers/hwtracing/intel_th/msu.c                |  2 +-
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 +++++++++----------
 drivers/net/ethernet/qlogic/qede/qede_main.c    |  4 ++--
 fs/proc/page.c                                  |  2 +-
 include/linux/mm.h                              |  2 +-
 include/linux/mm_types.h                        | 14 ++++++++-----
 include/linux/page_ref.h                        | 26 ++++++++++++-------------
 include/linux/pagemap.h                         |  8 ++++----
 kernel/kexec_core.c                             |  2 +-
 mm/huge_memory.c                                |  4 ++--
 mm/internal.h                                   |  2 +-
 mm/page_alloc.c                                 |  4 ++--
 mm/slub.c                                       |  4 ++--
 mm/vmscan.c                                     |  4 ++--
 17 files changed, 58 insertions(+), 54 deletions(-)

(limited to 'mm')

diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt
index d9cb65cf5cfd..fb0e1f2a19cc 100644
--- a/Documentation/vm/transhuge.txt
+++ b/Documentation/vm/transhuge.txt
@@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock.
 Refcounting on THP is mostly consistent with refcounting on other compound
 pages:
 
-  - get_page()/put_page() and GUP operate in head page's ->_count.
+  - get_page()/put_page() and GUP operate in head page's ->_refcount.
 
-  - ->_count in tail pages is always zero: get_page_unless_zero() never
+  - ->_refcount in tail pages is always zero: get_page_unless_zero() never
     succeed on tail pages.
 
   - map/unmap of the pages with PTE entry increment/decrement ->_mapcount
@@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to
 sum of mapcount of all sub-pages plus one (split_huge_page caller must
 have reference for head page).
 
-split_huge_page uses migration entries to stabilize page->_count and
+split_huge_page uses migration entries to stabilize page->_refcount and
 page->_mapcount.
 
 We safe against physical memory scanners too: the only legitimate way
 scanner can get reference to a page is get_page_unless_zero().
 
-All tail pages has zero ->_count until atomic_add(). It prevent scanner
+All tail pages has zero ->_refcount until atomic_add(). It prevent scanner
 from geting reference to tail page up to the point. After the atomic_add()
-we don't care about ->_count value.  We already known how many references
+we don't care about ->_refcount value.  We already known how many references
 with should uncharge from head page.
 
 For head page get_page_unless_zero() will succeed and we don't mind. It's
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index a0582b7f41d3..adce25462b0d 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end)
 			 * Hacky direct set to avoid unnecessary
 			 * lock take/release for EVERY page here.
 			 */
-			p->_count.counter = 0;
+			p->_refcount.counter = 0;
 			p->_mapcount.counter = -1;
 		}
 		init_page_count(page);
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 437b3a822f44..d597e432e195 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -861,7 +861,7 @@ rqbiocnt(struct request *r)
  * discussion.
  *
  * We cannot use get_page in the workaround, because it insists on a
- * positive page count as a precondition.  So we use _count directly.
+ * positive page count as a precondition.  So we use _refcount directly.
  */
 static void
 bio_pageinc(struct bio *bio)
diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c
index d9d6022c5aca..d2209147dc89 100644
--- a/drivers/hwtracing/intel_th/msu.c
+++ b/drivers/hwtracing/intel_th/msu.c
@@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma)
 	if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex))
 		return;
 
-	/* drop page _counts */
+	/* drop page _refcounts */
 	for (pg = 0; pg < msc->nr_pages; pg++) {
 		struct page *page = msc_buffer_get_page(msc, pg);
 
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index f3456798c596..bd947704b59c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
 		if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i)))
 			goto err_unmap;
-		atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_add(wi->umr.dma_info[i].page,
+			     mlx5e_mpwqe_strides_per_page(rq));
 		wi->skbs_frags[i] = 0;
 	}
 
@@ -452,8 +452,8 @@ err_unmap:
 	while (--i >= 0) {
 		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
 			       PCI_DMA_FROMDEVICE);
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_sub(wi->umr.dma_info[i].page,
+			     mlx5e_mpwqe_strides_per_page(rq));
 		put_page(wi->umr.dma_info[i].page);
 	}
 	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq,
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
 		dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE,
 			       PCI_DMA_FROMDEVICE);
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-			   &wi->umr.dma_info[i].page->_count);
+		page_ref_sub(wi->umr.dma_info[i].page,
+			mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
 		put_page(wi->umr.dma_info[i].page);
 	}
 	dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE);
@@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq,
 	 */
 	split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER);
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-		atomic_add(mlx5e_mpwqe_strides_per_page(rq),
-			   &wi->dma_info.page[i]._count);
+		page_ref_add(&wi->dma_info.page[i],
+			     mlx5e_mpwqe_strides_per_page(rq));
 		wi->skbs_frags[i] = 0;
 	}
 
@@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq,
 	dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz,
 		       PCI_DMA_FROMDEVICE);
 	for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) {
-		atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i],
-			   &wi->dma_info.page[i]._count);
+		page_ref_sub(&wi->dma_info.page[i],
+			mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]);
 		put_page(&wi->dma_info.page[i]);
 	}
 }
diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c
index 3aabfc0adefe..73dd525fbf08 100644
--- a/drivers/net/ethernet/qlogic/qede/qede_main.c
+++ b/drivers/net/ethernet/qlogic/qede/qede_main.c
@@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev,
 		/* Incr page ref count to reuse on allocation failure
 		 * so that it doesn't get freed while freeing SKB.
 		 */
-		atomic_inc(&current_bd->data->_count);
+		page_ref_inc(current_bd->data);
 		goto out;
 	}
 
@@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget)
 				 * freeing SKB.
 				 */
 
-				atomic_inc(&sw_rx_data->data->_count);
+				page_ref_inc(sw_rx_data->data);
 				rxq->rx_alloc_errors++;
 				qede_recycle_rx_bd_ring(rxq, edev,
 							fp_cqe->bd_num);
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 712f1b9992cc..3ecd445e830d 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page)
 
 
 	/*
-	 * Caveats on high order pages: page->_count will only be set
+	 * Caveats on high order pages: page->_refcount will only be set
 	 * -1 on the head page; SLUB/SLQB do the same for PG_slab;
 	 * SLOB won't set PG_slab at all on compound pages.
 	 */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 727f799757ab..1193a54ea2b3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -734,7 +734,7 @@ static inline void get_page(struct page *page)
 	page = compound_head(page);
 	/*
 	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_count.
+	 * requires to already have an elevated page->_refcount.
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c2d75b4fa86c..1fda9c99ef95 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -73,9 +73,9 @@ struct page {
 			unsigned long counters;
 #else
 			/*
-			 * Keep _count separate from slub cmpxchg_double data.
-			 * As the rest of the double word is protected by
-			 * slab_lock but _count is not.
+			 * Keep _refcount separate from slub cmpxchg_double
+			 * data.  As the rest of the double word is protected by
+			 * slab_lock but _refcount is not.
 			 */
 			unsigned counters;
 #endif
@@ -97,7 +97,11 @@ struct page {
 					};
 					int units;	/* SLOB */
 				};
-				atomic_t _count;		/* Usage count, see below. */
+				/*
+				 * Usage count, *USE WRAPPER FUNCTION*
+				 * when manual accounting. See page_ref.h
+				 */
+				atomic_t _refcount;
 			};
 			unsigned int active;	/* SLAB */
 		};
@@ -248,7 +252,7 @@ struct page_frag_cache {
 	__u32 offset;
 #endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_count every time we allocate a fragment.
+	 * containing page->_refcount every time we allocate a fragment.
 	 */
 	unsigned int		pagecnt_bias;
 	bool pfmemalloc;
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index e596d5d9540e..8b5e0a9f2431 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
 
 static inline int page_ref_count(struct page *page)
 {
-	return atomic_read(&page->_count);
+	return atomic_read(&page->_refcount);
 }
 
 static inline int page_count(struct page *page)
 {
-	return atomic_read(&compound_head(page)->_count);
+	return atomic_read(&compound_head(page)->_refcount);
 }
 
 static inline void set_page_count(struct page *page, int v)
 {
-	atomic_set(&page->_count, v);
+	atomic_set(&page->_refcount, v);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
 		__page_ref_set(page, v);
 }
@@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page)
 
 static inline void page_ref_add(struct page *page, int nr)
 {
-	atomic_add(nr, &page->_count);
+	atomic_add(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, nr);
 }
 
 static inline void page_ref_sub(struct page *page, int nr)
 {
-	atomic_sub(nr, &page->_count);
+	atomic_sub(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -nr);
 }
 
 static inline void page_ref_inc(struct page *page)
 {
-	atomic_inc(&page->_count);
+	atomic_inc(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, 1);
 }
 
 static inline void page_ref_dec(struct page *page)
 {
-	atomic_dec(&page->_count);
+	atomic_dec(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -1);
 }
 
 static inline int page_ref_sub_and_test(struct page *page, int nr)
 {
-	int ret = atomic_sub_and_test(nr, &page->_count);
+	int ret = atomic_sub_and_test(nr, &page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -nr, ret);
@@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
 
 static inline int page_ref_dec_and_test(struct page *page)
 {
-	int ret = atomic_dec_and_test(&page->_count);
+	int ret = atomic_dec_and_test(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -1, ret);
@@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page)
 
 static inline int page_ref_dec_return(struct page *page)
 {
-	int ret = atomic_dec_return(&page->_count);
+	int ret = atomic_dec_return(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
 		__page_ref_mod_and_return(page, -1, ret);
@@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page)
 
 static inline int page_ref_add_unless(struct page *page, int nr, int u)
 {
-	int ret = atomic_add_unless(&page->_count, nr, u);
+	int ret = atomic_add_unless(&page->_refcount, nr, u);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
 		__page_ref_mod_unless(page, nr, ret);
@@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u)
 
 static inline int page_ref_freeze(struct page *page, int count)
 {
-	int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+	int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
 		__page_ref_freeze(page, count, ret);
@@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
 	VM_BUG_ON_PAGE(page_count(page) != 0, page);
 	VM_BUG_ON(count == 0);
 
-	atomic_set(&page->_count, count);
+	atomic_set(&page->_refcount, count);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
 		__page_ref_unfreeze(page, count);
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7e1ab155c67c..fe1513ffb7bf 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold);
 
 /*
  * speculatively take a reference to a page.
- * If the page is free (_count == 0), then _count is untouched, and 0
- * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ * If the page is free (_refcount == 0), then _refcount is untouched, and 0
+ * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
  *
  * This function must be called inside the same rcu_read_lock() section as has
  * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
  *
  * Unless an RCU grace period has passed, the count of all pages coming out
  * of the allocator must be considered unstable. page_count may return higher
@@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold);
  * 2. conditionally increment refcount
  * 3. check the page is still in pagecache (if no, goto 1)
  *
- * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * Remove-side that cares about stability of _refcount (eg. reclaim) has the
  * following (with tree_lock held for write):
  * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
  * B. remove page from pagecache
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 1391d3ee3b86..1c03dfb4abfd 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_STRUCT_SIZE(list_head);
 	VMCOREINFO_SIZE(nodemask_t);
 	VMCOREINFO_OFFSET(page, flags);
-	VMCOREINFO_OFFSET(page, _count);
+	VMCOREINFO_OFFSET(page, _refcount);
 	VMCOREINFO_OFFSET(page, mapping);
 	VMCOREINFO_OFFSET(page, lru);
 	VMCOREINFO_OFFSET(page, _mapcount);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b49ee126d4d1..f8ac8f582fd8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
 	VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail);
 
 	/*
-	 * tail_page->_count is zero and not changing from under us. But
+	 * tail_page->_refcount is zero and not changing from under us. But
 	 * get_page_unless_zero() may be running from under us on the
 	 * tail_page. If we used atomic_set() below instead of atomic_inc(), we
 	 * would then run atomic_set() concurrently with
@@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mlocked)
 		lru_add_drain();
 
-	/* Prevent deferred_split_scan() touching ->_count */
+	/* Prevent deferred_split_scan() touching ->_refcount */
 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	count = page_count(head);
 	mapcount = total_mapcount(head);
diff --git a/mm/internal.h b/mm/internal.h
index b79abb6721cf..098a89e3b97c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra,
 }
 
 /*
- * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * Turn a non-refcounted page (->_refcount == 0) into refcounted with
  * a count of one.
  */
 static inline void set_page_refcounted(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c1069efcc4d7..4ce57f938b7f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page)
 	if (unlikely(page->mapping != NULL))
 		bad_reason = "non-NULL mapping";
 	if (unlikely(page_ref_count(page) != 0))
-		bad_reason = "nonzero _count";
+		bad_reason = "nonzero _refcount";
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
 		bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
@@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		 * We can't use page_count without pin a page
 		 * because another CPU can free compound page.
 		 * This check already skips compound tails of THP
-		 * because their page->_count is zero at all time.
+		 * because their page->_refcount is zero at all time.
 		 */
 		if (!page_ref_count(page)) {
 			if (PageBuddy(page))
diff --git a/mm/slub.c b/mm/slub.c
index 8671de2e5b12..cf1faa4d3992 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count
 	tmp.counters = counters_new;
 	/*
 	 * page->counters can cover frozen/inuse/objects as well
-	 * as page->_count.  If we assign to ->counters directly
-	 * we run the risk of losing updates to page->_count, so
+	 * as page->_refcount.  If we assign to ->counters directly
+	 * we run the risk of losing updates to page->_refcount, so
 	 * be careful and only assign to the fields we need.
 	 */
 	page->frozen  = tmp.frozen;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 142cb61f4822..d3a02ac3eed7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 	 *
 	 * Reversing the order of the tests ensures such a situation cannot
 	 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
-	 * load is not satisfied before that of page->_count.
+	 * load is not satisfied before that of page->_refcount.
 	 *
 	 * Note that if SetPageDirty is always performed via set_page_dirty,
 	 * and thus under tree_lock, then this ordering is not required.
@@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
  * It is safe to rely on PG_active against the non-LRU pages in here because
  * nobody will play with that bit on a non-LRU page.
  *
- * The downside is that we have to touch page->_count against each page.
+ * The downside is that we have to touch page->_refcount against each page.
  * But we had to alter page->flags anyway.
  */
 
-- 
cgit v1.2.3


From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:10:58 -0700
Subject: include/linux/nodemask.h: create next_node_in() helper

Lots of code does

	node = next_node(node, XXX);
	if (node == MAX_NUMNODES)
		node = first_node(XXX);

so create next_node_in() to do this and use it in various places.

[mhocko@suse.com: use next_node_in() helper]
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Hui Zhu <zhuhui@xiaomi.com>
Cc: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/tile/kernel/setup.c |  4 +---
 arch/x86/mm/numa.c       |  4 +---
 include/linux/nodemask.h | 11 ++++++++++-
 kernel/cpuset.c          |  8 +-------
 lib/Makefile             |  2 +-
 lib/nodemask.c           | 30 ++++++++++++++++++++++++++++++
 mm/hugetlb.c             |  4 +---
 mm/memcontrol.c          |  4 +---
 mm/mempolicy.c           | 24 ++----------------------
 mm/page_isolation.c      |  9 +++------
 mm/slab.c                | 13 +++----------
 11 files changed, 54 insertions(+), 59 deletions(-)
 create mode 100644 lib/nodemask.c

(limited to 'mm')

diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index a992238e9b58..153020abd2f5 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void)
 		cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]);
 		cpu_2_node[best_cpu] = node;
 		cpumask_clear_cpu(best_cpu, &unbound_cpus);
-		node = next_node(node, default_nodes);
-		if (node == MAX_NUMNODES)
-			node = first_node(default_nodes);
+		node = next_node_in(node, default_nodes);
 	}
 
 	/* Print out node assignments and set defaults for disabled cpus */
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index f70c1ff46125..9c086c57105c 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -617,9 +617,7 @@ static void __init numa_init_array(void)
 		if (early_cpu_to_node(i) != NUMA_NO_NODE)
 			continue;
 		numa_set_node(i, rr);
-		rr = next_node(rr, node_online_map);
-		if (rr == MAX_NUMNODES)
-			rr = first_node(node_online_map);
+		rr = next_node_in(rr, node_online_map);
 	}
 }
 
diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 6e85889cf9ab..f746e44d4046 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -43,8 +43,10 @@
  *
  * int first_node(mask)			Number lowest set bit, or MAX_NUMNODES
  * int next_node(node, mask)		Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask)		Next node past 'node', or wrap to first,
+ *					or MAX_NUMNODES
  * int first_unset_node(mask)		First node not set in mask, or 
- *					MAX_NUMNODES.
+ *					MAX_NUMNODES
  *
  * nodemask_t nodemask_of_node(node)	Return nodemask with bit 'node' set
  * NODE_MASK_ALL			Initializer - all bits set
@@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp)
 	return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
 }
 
+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+int __next_node_in(int node, const nodemask_t *srcp);
+
 static inline void init_nodemask_of_node(nodemask_t *mask, int node)
 {
 	nodes_clear(*mask);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1902956baba1..611cc69af8f0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 
 static int cpuset_spread_node(int *rotor)
 {
-	int node;
-
-	node = next_node(*rotor, current->mems_allowed);
-	if (node == MAX_NUMNODES)
-		node = first_node(current->mems_allowed);
-	*rotor = node;
-	return node;
+	return *rotor = next_node_in(*rotor, current->mems_allowed);
 }
 
 int cpuset_mem_spread_node(void)
diff --git a/lib/Makefile b/lib/Makefile
index 931396ada5eb..42b69185f963 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o seq_buf.o nmi_backtrace.o
+	 earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
diff --git a/lib/nodemask.c b/lib/nodemask.c
new file mode 100644
index 000000000000..e42a5bf44d33
--- /dev/null
+++ b/lib/nodemask.c
@@ -0,0 +1,30 @@
+#include <linux/nodemask.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+int __next_node_in(int node, const nodemask_t *srcp)
+{
+	int ret = __next_node(node, srcp);
+
+	if (ret == MAX_NUMNODES)
+		ret = __first_node(srcp);
+	return ret;
+}
+EXPORT_SYMBOL(__next_node_in);
+
+#ifdef CONFIG_NUMA
+/*
+ * Return the bit number of a random bit set in the nodemask.
+ * (returns NUMA_NO_NODE if nodemask is empty)
+ */
+int node_random(const nodemask_t *maskp)
+{
+	int w, bit = NUMA_NO_NODE;
+
+	w = nodes_weight(*maskp);
+	if (w)
+		bit = bitmap_ord_to_pos(maskp->bits,
+			get_random_int() % w, MAX_NUMNODES);
+	return bit;
+}
+#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08b396f..5856093f9062 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -937,9 +937,7 @@ err:
  */
 static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-	nid = next_node(nid, *nodes_allowed);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(*nodes_allowed);
+	nid = next_node_in(nid, *nodes_allowed);
 	VM_BUG_ON(nid >= MAX_NUMNODES);
 
 	return nid;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fe787f5c41bd..6740c4c2b550 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 	mem_cgroup_may_update_nodemask(memcg);
 	node = memcg->last_scanned_node;
 
-	node = next_node(node, memcg->scan_nodes);
-	if (node == MAX_NUMNODES)
-		node = first_node(memcg->scan_nodes);
+	node = next_node_in(node, memcg->scan_nodes);
 	/*
 	 * We call this when we hit limit, not when pages are added to LRU.
 	 * No LRU may hold pages because all pages are UNEVICTABLE or
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 36cc01bc950a..8d369cee0cd6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -97,7 +97,6 @@
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
-#include <linux/random.h>
 
 #include "internal.h"
 
@@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
 		BUG();
 
 	if (!node_isset(current->il_next, tmp)) {
-		current->il_next = next_node(current->il_next, tmp);
-		if (current->il_next >= MAX_NUMNODES)
-			current->il_next = first_node(tmp);
+		current->il_next = next_node_in(current->il_next, tmp);
 		if (current->il_next >= MAX_NUMNODES)
 			current->il_next = numa_node_id();
 	}
@@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 	struct task_struct *me = current;
 
 	nid = me->il_next;
-	next = next_node(nid, policy->v.nodes);
-	if (next >= MAX_NUMNODES)
-		next = first_node(policy->v.nodes);
+	next = next_node_in(nid, policy->v.nodes);
 	if (next < MAX_NUMNODES)
 		me->il_next = next;
 	return nid;
@@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
 		return interleave_nodes(pol);
 }
 
-/*
- * Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
- */
-int node_random(const nodemask_t *maskp)
-{
-	int w, bit = NUMA_NO_NODE;
-
-	w = nodes_weight(*maskp);
-	if (w)
-		bit = bitmap_ord_to_pos(maskp->bits,
-			get_random_int() % w, MAX_NUMNODES);
-	return bit;
-}
-
 #ifdef CONFIG_HUGETLBFS
 /*
  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c4f568206544..67bedd18429c 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private,
 	 * accordance with memory policy of the user process if possible. For
 	 * now as a simple work-around, we use the next node for destination.
 	 */
-	if (PageHuge(page)) {
-		int node = next_online_node(page_to_nid(page));
-		if (node == MAX_NUMNODES)
-			node = first_online_node;
+	if (PageHuge(page))
 		return alloc_huge_page_node(page_hstate(compound_head(page)),
-					    node);
-	}
+					    next_node_in(page_to_nid(page),
+							 node_online_map));
 
 	if (PageHighMem(page))
 		gfp_mask |= __GFP_HIGHMEM;
diff --git a/mm/slab.c b/mm/slab.c
index d81565a92864..c11bf5007952 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node);
 
 static void init_reap_node(int cpu)
 {
-	int node;
-
-	node = next_node(cpu_to_mem(cpu), node_online_map);
-	if (node == MAX_NUMNODES)
-		node = first_node(node_online_map);
-
-	per_cpu(slab_reap_node, cpu) = node;
+	per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu),
+						    node_online_map);
 }
 
 static void next_reap_node(void)
 {
 	int node = __this_cpu_read(slab_reap_node);
 
-	node = next_node(node, node_online_map);
-	if (unlikely(node >= MAX_NUMNODES))
-		node = first_node(node_online_map);
+	node = next_node_in(node, node_online_map);
 	__this_cpu_write(slab_reap_node, node);
 }
 
-- 
cgit v1.2.3


From 09a95e29cb30a3930db22d340ddd072a82b6b0db Mon Sep 17 00:00:00 2001
From: Mike Kravetz <mike.kravetz@oracle.com>
Date: Thu, 19 May 2016 17:11:01 -0700
Subject: mm/hugetlb: optimize minimum size (min_size) accounting

It was observed that minimum size accounting associated with the
hugetlbfs min_size mount option may not perform optimally and as
expected.  As huge pages/reservations are released from the filesystem
and given back to the global pools, they are reserved for subsequent
filesystem use as long as the subpool reserved count is less than
subpool minimum size.  It does not take into account used pages within
the filesystem.  The filesystem size limits are not exceeded and this is
technically not a bug.  However, better behavior would be to wait for
the number of used pages/reservations associated with the filesystem to
drop below the minimum size before taking reservations to satisfy
minimum size.

An optimization is also made to the hugepage_subpool_get_pages() routine
which is called when pages/reservations are allocated.  This does not
change behavior, but simply avoids the accounting if all reservations
have already been taken (subpool reserved count == 0).

Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5856093f9062..fb37ef810655 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -144,7 +144,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
 		}
 	}
 
-	if (spool->min_hpages != -1) {		/* minimum size accounting */
+	/* minimum size accounting */
+	if (spool->min_hpages != -1 && spool->rsv_hpages) {
 		if (delta > spool->rsv_hpages) {
 			/*
 			 * Asking for more reserves than those already taken on
@@ -182,7 +183,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
 	if (spool->max_hpages != -1)		/* maximum size accounting */
 		spool->used_hpages -= delta;
 
-	if (spool->min_hpages != -1) {		/* minimum size accounting */
+	 /* minimum size accounting */
+	if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) {
 		if (spool->rsv_hpages + delta <= spool->min_hpages)
 			ret = 0;
 		else
-- 
cgit v1.2.3


From 9fee021d15ddd884d40d1540913474e8112313fe Mon Sep 17 00:00:00 2001
From: Vaishali Thakkar <vaishali.thakkar@oracle.com>
Date: Thu, 19 May 2016 17:11:04 -0700
Subject: mm/hugetlb: introduce hugetlb_bad_size()

When an unsupported hugepage size is specified, 'hugepagesz=' and
'hugepages=' should be ignored during command line parsing until a
supported hugepage size is found.  But currently an incorrect number of
hugepages is allocated when an unsupported size is specified, because
the parser fails to ignore the 'hugepages=' parameter.

Test case:

Note that this is specific to x86 architecture.

Boot the kernel with the command line option 'hugepagesz=256M hugepages=X'.
After boot, the dmesg output shows that X hugepages of size 2M are
pre-allocated instead of 0.

So, to handle such command line options, introduce a new routine,
hugetlb_bad_size().  The routine hugetlb_bad_size() sets the global
variable parsed_valid_hugepagesz to false.  We use
parsed_valid_hugepagesz to record that an unsupported hugepage size was
found, so that we can ignore the 'hugepages=' parameters after that, and
then reset the variable when a supported hugepage size is found.

The routine hugetlb_bad_size can be called while setting 'hugepagesz='
parameter in an architecture specific code.

Signed-off-by: Vaishali Thakkar <vaishali.thakkar@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Cc: Dominik Dingel <dingel@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h |  1 +
 mm/hugetlb.c            | 14 +++++++++++++-
 2 files changed, 14 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d953c2542a8..e44c57876e89 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);
 
+void __init hugetlb_bad_size(void);
 void __init hugetlb_add_hstate(unsigned order);
 struct hstate *size_to_hstate(unsigned long size);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fb37ef810655..0adb74d0a4e1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages);
 static struct hstate * __initdata parsed_hstate;
 static unsigned long __initdata default_hstate_max_huge_pages;
 static unsigned long __initdata default_hstate_size;
+static bool __initdata parsed_valid_hugepagesz = true;
 
 /*
  * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -2659,6 +2660,11 @@ static int __init hugetlb_init(void)
 subsys_initcall(hugetlb_init);
 
 /* Should be called on processing a hugepagesz=... option */
+void __init hugetlb_bad_size(void)
+{
+	parsed_valid_hugepagesz = false;
+}
+
 void __init hugetlb_add_hstate(unsigned int order)
 {
 	struct hstate *h;
@@ -2691,11 +2697,17 @@ static int __init hugetlb_nrpages_setup(char *s)
 	unsigned long *mhp;
 	static unsigned long *last_mhp;
 
+	if (!parsed_valid_hugepagesz) {
+		pr_warn("hugepages = %s preceded by "
+			"an unsupported hugepagesz, ignoring\n", s);
+		parsed_valid_hugepagesz = true;
+		return 1;
+	}
 	/*
 	 * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
 	 * so this hugepages= parameter goes to the "default hstate".
 	 */
-	if (!hugetlb_max_hstate)
+	else if (!hugetlb_max_hstate)
 		mhp = &default_hstate_max_huge_pages;
 	else
 		mhp = &parsed_hstate->max_huge_pages;
-- 
cgit v1.2.3


From c98940f6fa3d06fa8fec75aa2362b25227573d06 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:26 -0700
Subject: mm/memory_hotplug: is_mem_section_removable() can return bool

Make is_mem_section_removable() return bool to improve readability,
since this particular function only ever uses one or zero as its return
value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 6 +++---
 mm/memory_hotplug.c            | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index adbef586e696..20d8a5d4d133 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {}
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
-extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
 extern void try_offline_node(int nid);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern void remove_memory(int nid, u64 start, u64 size);
 
 #else
-static inline int is_mem_section_removable(unsigned long pfn,
+static inline bool is_mem_section_removable(unsigned long pfn,
 					unsigned long nr_pages)
 {
-	return 0;
+	return false;
 }
 
 static inline void try_offline_node(int nid) {}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index aa34431c3f31..b21d8895ea41 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1410,7 +1410,7 @@ static struct page *next_active_pageblock(struct page *page)
 }
 
 /* Checks if this range of memory is likely to be hot-removable. */
-int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
+bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 {
 	struct page *page = pfn_to_page(start_pfn);
 	struct page *end_page = page + nr_pages;
@@ -1418,12 +1418,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
 	/* Check the starting page of each pageblock within the range */
 	for (; page < end_page; page = next_active_pageblock(page)) {
 		if (!is_pageblock_removable_nolock(page))
-			return 0;
+			return false;
 		cond_resched();
 	}
 
 	/* All pageblocks in the memory block are likely to be hot-removable */
-	return 1;
+	return true;
 }
 
 /*
-- 
cgit v1.2.3


From fda3d69be9fe7a24ad32b840cb2ed7c30b6ba1c9 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@kernel.org>
Date: Thu, 19 May 2016 17:11:34 -0700
Subject: mm/memcontrol.c:mem_cgroup_select_victim_node(): clarify comment

> The comment seems to have not much to do with the code?

I guess the comment tries to say that the code path is triggered when we
charge the page, which happens _before_ it is added to the LRU list, and
so last_scanned_node might contain stale data.

Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6740c4c2b550..011dac8ab5d7 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1391,10 +1391,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
 
 	node = next_node_in(node, memcg->scan_nodes);
 	/*
-	 * We call this when we hit limit, not when pages are added to LRU.
-	 * No LRU may hold pages because all pages are UNEVICTABLE or
-	 * memcg is too small and all pages are not on LRU. In that case,
-	 * we use curret node.
+	 * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
+	 * last time it really checked all the LRUs due to rate limiting.
+	 * Fallback to the current node in that case for simplicity.
 	 */
 	if (unlikely(node == MAX_NUMNODES))
 		node = numa_node_id();
-- 
cgit v1.2.3


From 949698a31af3b3808b0ff0cca26f36e68953bd1f Mon Sep 17 00:00:00 2001
From: Li Zhang <zhlcindy@linux.vnet.ibm.com>
Date: Thu, 19 May 2016 17:11:37 -0700
Subject: mm/page_alloc: Remove useless parameter of __free_pages_boot_core

__free_pages_boot_core() has a parameter, pfn, which is not used at all.
Remove it.

Signed-off-by: Li Zhang <zhlcindy@linux.vnet.ibm.com>
Reviewed-by: Pan Xinhui <xinhui.pan@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4ce57f938b7f..34f688bc5c7a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1076,8 +1076,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
-static void __init __free_pages_boot_core(struct page *page,
-					unsigned long pfn, unsigned int order)
+static void __init __free_pages_boot_core(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
 	struct page *p = page;
@@ -1154,7 +1153,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
 {
 	if (early_page_uninitialised(pfn))
 		return;
-	return __free_pages_boot_core(page, pfn, order);
+	return __free_pages_boot_core(page, order);
 }
 
 /*
@@ -1239,12 +1238,12 @@ static void __init deferred_free_range(struct page *page,
 	if (nr_pages == MAX_ORDER_NR_PAGES &&
 	    (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) {
 		set_pageblock_migratetype(page, MIGRATE_MOVABLE);
-		__free_pages_boot_core(page, pfn, MAX_ORDER-1);
+		__free_pages_boot_core(page, MAX_ORDER-1);
 		return;
 	}
 
-	for (i = 0; i < nr_pages; i++, page++, pfn++)
-		__free_pages_boot_core(page, pfn, 0);
+	for (i = 0; i < nr_pages; i++, page++)
+		__free_pages_boot_core(page, 0);
 }
 
 /* Completion tracking for deferred_init_memmap() threads */
-- 
cgit v1.2.3


From 54f18d35263334ebcc6bf409fee3c0c8c22e5588 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:11:40 -0700
Subject: mm/hugetlb.c: use first_memory_node

Instead of open-coding it.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0adb74d0a4e1..0f580ea7f41d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2684,8 +2684,8 @@ void __init hugetlb_add_hstate(unsigned int order)
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
 	INIT_LIST_HEAD(&h->hugepage_activelist);
-	h->next_nid_to_alloc = first_node(node_states[N_MEMORY]);
-	h->next_nid_to_free = first_node(node_states[N_MEMORY]);
+	h->next_nid_to_alloc = first_memory_node;
+	h->next_nid_to_free = first_memory_node;
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
-- 
cgit v1.2.3


From fee83b3aba4b7ddb0cb1497a04ddebcaa43f236e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:11:43 -0700
Subject: mm/mempolicy.c:offset_il_node() document and clarify

This code was pretty obscure and was relying upon obscure side-effects
of next_node(-1, ...) and was relying upon NUMA_NO_NODE being equal to
-1.

Clean that all up and document the function's intent.

Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Laura Abbott <lauraa@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mempolicy.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 8d369cee0cd6..7f80ebcd6552 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1758,23 +1758,25 @@ unsigned int mempolicy_slab_node(void)
 	}
 }
 
-/* Do static interleaving for a VMA with known offset. */
+/*
+ * Do static interleaving for a VMA with known offset @n.  Returns the n'th
+ * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the
+ * number of present nodes.
+ */
 static unsigned offset_il_node(struct mempolicy *pol,
-		struct vm_area_struct *vma, unsigned long off)
+			       struct vm_area_struct *vma, unsigned long n)
 {
 	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target;
-	int c;
-	int nid = NUMA_NO_NODE;
+	int i;
+	int nid;
 
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)off % nnodes;
-	c = 0;
-	do {
+	target = (unsigned int)n % nnodes;
+	nid = first_node(pol->v.nodes);
+	for (i = 0; i < target; i++)
 		nid = next_node(nid, pol->v.nodes);
-		c++;
-	} while (c <= target);
 	return nid;
 }
 
-- 
cgit v1.2.3


From e4c5800a3991f0c6a766983535dfc10d51802cf6 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Thu, 19 May 2016 17:11:46 -0700
Subject: mm/rmap: replace BUG_ON(anon_vma->degree) with VM_WARN_ON

This check effectively catches anon vma hierarchy inconsistencies and some
vma corruptions.  It was effective for catching corner cases in the anon vma
reuse logic.  For now this code seems stable, so the check can be hidden
under CONFIG_DEBUG_VM and replaced with a WARN because it is not that fatal.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Suggested-by: Vasily Averin <vvs@virtuozzo.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 307b555024ef..4cebe8a7c2cb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
 	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
 		struct anon_vma *anon_vma = avc->anon_vma;
 
-		BUG_ON(anon_vma->degree);
+		VM_WARN_ON(anon_vma->degree);
 		put_anon_vma(anon_vma);
 
 		list_del(&avc->same_vma);
-- 
cgit v1.2.3


From 06b6640a3902d6d50c1bb4fb1f29a46b207dbf08 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:11:48 -0700
Subject: mm, compaction: wrap calculating first and last pfn of pageblock

Compaction code has accumulated numerous instances of manual
calculations of the first (inclusive) and last (exclusive) pfn of a
pageblock (or a smaller block of given order), given a pfn within the
pageblock.

Wrap these calculations by introducing pageblock_start_pfn(pfn) and
pageblock_end_pfn(pfn) macros.

[vbabka@suse.cz: fix crash in get_pfnblock_flags_mask() from isolate_freepages():]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 8fa254043801..017a1a1963cb 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
 #define CREATE_TRACE_POINTS
 #include <trace/events/compaction.h>
 
+#define block_start_pfn(pfn, order)	round_down(pfn, 1UL << (order))
+#define block_end_pfn(pfn, order)	ALIGN((pfn) + 1, 1UL << (order))
+#define pageblock_start_pfn(pfn)	block_start_pfn(pfn, pageblock_order)
+#define pageblock_end_pfn(pfn)		block_end_pfn(pfn, pageblock_order)
+
 static unsigned long release_freepages(struct list_head *freelist)
 {
 	struct page *page, *next;
@@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone)
 	zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
 	zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 	zone->compact_cached_free_pfn =
-			round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages);
+				pageblock_start_pfn(zone_end_pfn(zone) - 1);
 }
 
 /*
@@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc,
 	LIST_HEAD(freelist);
 
 	pfn = start_pfn;
-	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	block_start_pfn = pageblock_start_pfn(pfn);
 	if (block_start_pfn < cc->zone->zone_start_pfn)
 		block_start_pfn = cc->zone->zone_start_pfn;
-	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+	block_end_pfn = pageblock_end_pfn(pfn);
 
 	for (; pfn < end_pfn; pfn += isolated,
 				block_start_pfn = block_end_pfn,
@@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc,
 		 * scanning range to right one.
 		 */
 		if (pfn >= block_end_pfn) {
-			block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
-			block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+			block_start_pfn = pageblock_start_pfn(pfn);
+			block_end_pfn = pageblock_end_pfn(pfn);
 			block_end_pfn = min(block_end_pfn, end_pfn);
 		}
 
@@ -834,10 +839,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 
 	/* Scan block by block. First and last block may be incomplete */
 	pfn = start_pfn;
-	block_start_pfn = pfn & ~(pageblock_nr_pages - 1);
+	block_start_pfn = pageblock_start_pfn(pfn);
 	if (block_start_pfn < cc->zone->zone_start_pfn)
 		block_start_pfn = cc->zone->zone_start_pfn;
-	block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
+	block_end_pfn = pageblock_end_pfn(pfn);
 
 	for (; pfn < end_pfn; pfn = block_end_pfn,
 				block_start_pfn = block_end_pfn,
@@ -924,10 +929,10 @@ static void isolate_freepages(struct compact_control *cc)
 	 * is using.
 	 */
 	isolate_start_pfn = cc->free_pfn;
-	block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1);
+	block_start_pfn = pageblock_start_pfn(cc->free_pfn);
 	block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
 						zone_end_pfn(zone));
-	low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages);
+	low_pfn = pageblock_end_pfn(cc->migrate_pfn);
 
 	/*
 	 * Isolate free pages until enough are available to migrate the
@@ -1081,12 +1086,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	 * initialized by compact_zone()
 	 */
 	low_pfn = cc->migrate_pfn;
-	block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1);
+	block_start_pfn = pageblock_start_pfn(low_pfn);
 	if (block_start_pfn < zone->zone_start_pfn)
 		block_start_pfn = zone->zone_start_pfn;
 
 	/* Only scan within a pageblock boundary */
-	block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);
+	block_end_pfn = pageblock_end_pfn(low_pfn);
 
 	/*
 	 * Iterate over whole pageblocks until we find the first suitable.
@@ -1343,7 +1348,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 	cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
 	cc->free_pfn = zone->compact_cached_free_pfn;
 	if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
-		cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages);
+		cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
 		zone->compact_cached_free_pfn = cc->free_pfn;
 	}
 	if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
@@ -1411,7 +1416,7 @@ check_drain:
 		if (cc->order > 0 && cc->last_migrated_pfn) {
 			int cpu;
 			unsigned long current_block_start =
-				cc->migrate_pfn & ~((1UL << cc->order) - 1);
+				block_start_pfn(cc->migrate_pfn, cc->order);
 
 			if (cc->last_migrated_pfn < current_block_start) {
 				cpu = get_cpu();
@@ -1436,7 +1441,7 @@ out:
 		cc->nr_freepages = 0;
 		VM_BUG_ON(free_pfn == 0);
 		/* The cached pfn is always the first in a pageblock */
-		free_pfn &= ~(pageblock_nr_pages-1);
+		free_pfn = pageblock_start_pfn(free_pfn);
 		/*
 		 * Only go back, not forward. The cached pfn might have been
 		 * already reset to zone end in compact_finished()
-- 
cgit v1.2.3


From a34753d275576896b06af9baa6f54bee258368c2 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:11:51 -0700
Subject: mm, compaction: reduce spurious pcplist drains

Compaction drains the local pcplists each time migration scanner moves
away from a cc->order aligned block where it isolated pages for
migration, so that the pages freed by migrations can merge into higher
orders.

The detection is currently coarser than it could be.  The
cc->last_migrated_pfn variable should track the lowest pfn that was
isolated for migration.  But it is set to the pfn where
isolate_migratepages_block() starts scanning, which is typically the
first pfn of the pageblock.  There, the scanner might fail to isolate
several order-aligned blocks, and then isolate COMPACT_CLUSTER_MAX in
another block.  This would cause the pcplists drain to be performed,
although the scanner didn't yet finish the block where it isolated from.

This patch thus makes cc->last_migrated_pfn handling more accurate by
setting it to the pfn of an actually isolated page in
isolate_migratepages_block().  Although practical effects of this patch
are likely low, it arguably makes the intent of the code more obvious.
Also the next patch will make async direct compaction skip blocks more
aggressively, and draining pcplists due to skipped blocks is wasteful.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 017a1a1963cb..329973a1ae45 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -787,6 +787,15 @@ isolate_success:
 		cc->nr_migratepages++;
 		nr_isolated++;
 
+		/*
+		 * Record where we could have freed pages by migration and not
+		 * yet flushed them to buddy allocator.
+		 * - this is the lowest page that was isolated and likely be
+		 * then freed by migration.
+		 */
+		if (!cc->last_migrated_pfn)
+			cc->last_migrated_pfn = low_pfn;
+
 		/* Avoid isolating too much */
 		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
 			++low_pfn;
@@ -1075,7 +1084,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 	unsigned long block_start_pfn;
 	unsigned long block_end_pfn;
 	unsigned long low_pfn;
-	unsigned long isolate_start_pfn;
 	struct page *page;
 	const isolate_mode_t isolate_mode =
 		(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
@@ -1130,7 +1138,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			continue;
 
 		/* Perform the isolation */
-		isolate_start_pfn = low_pfn;
 		low_pfn = isolate_migratepages_block(cc, low_pfn,
 						block_end_pfn, isolate_mode);
 
@@ -1139,15 +1146,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
 			return ISOLATE_ABORT;
 		}
 
-		/*
-		 * Record where we could have freed pages by migration and not
-		 * yet flushed them to buddy allocator.
-		 * - this is the lowest page that could have been isolated and
-		 * then freed by migration.
-		 */
-		if (cc->nr_migratepages && !cc->last_migrated_pfn)
-			cc->last_migrated_pfn = isolate_start_pfn;
-
 		/*
 		 * Either we isolated something and proceed with migration. Or
 		 * we failed and compact_zone should decide if we should
-- 
cgit v1.2.3


From fdd048e12c9a46d058f69822cb15641adae181e1 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:11:55 -0700
Subject: mm, compaction: skip blocks where isolation fails in async direct
 compaction

The goal of direct compaction is to quickly make a high-order page
available for the pending allocation.  Within an aligned block of pages
of desired order, a single allocated page that cannot be isolated for
migration means that the block cannot fully merge to a buddy page that
would satisfy the allocation request.  Therefore we can reduce the
allocation stall by skipping the rest of the block immediately on
isolation failure.  For async compaction, this also means a higher
chance of succeeding until it detects contention.

We however shouldn't completely sacrifice the second objective of
compaction, which is to reduce overall long-term memory fragmentation.
As a compromise, perform the eager skipping only in direct async
compaction, while sync compaction (including kcompactd) remains
thorough.

Testing was done using stress-highalloc from mmtests, configured for
order-4 GFP_KERNEL allocations:

                                 4.6-rc1               4.6-rc1
                                  before                 after
  Success 1 Min         24.00 (  0.00%)       27.00 (-12.50%)
  Success 1 Mean        30.20 (  0.00%)       31.60 ( -4.64%)
  Success 1 Max         37.00 (  0.00%)       35.00 (  5.41%)
  Success 2 Min         42.00 (  0.00%)       32.00 ( 23.81%)
  Success 2 Mean        44.00 (  0.00%)       44.80 ( -1.82%)
  Success 2 Max         48.00 (  0.00%)       52.00 ( -8.33%)
  Success 3 Min         91.00 (  0.00%)       92.00 ( -1.10%)
  Success 3 Mean        92.20 (  0.00%)       92.80 ( -0.65%)
  Success 3 Max         94.00 (  0.00%)       93.00 (  1.06%)

We can see that success rates are unaffected by the skipping.

                4.6-rc1     4.6-rc1
                 before       after
  User         2587.42     2566.53
  System        482.89      471.20
  Elapsed      1395.68     1382.00

Times are not a very useful metric for this benchmark, as the main portion
is the interfering kernel builds, but the results do hint at reduced system
times.

                                      4.6-rc1     4.6-rc1
                                       before       after
  Direct pages scanned                163614      159608
  Kswapd pages scanned               2070139     2078790
  Kswapd pages reclaimed             2061707     2069757
  Direct pages reclaimed              163354      159505

Reduced direct reclaim was unintended, but could be explained by more
successful first attempt at (async) direct compaction, which is
attempted before the first reclaim attempt in __alloc_pages_slowpath().

  Compaction stalls                    33052       39853
  Compaction success                   12121       19773
  Compaction failures                  20931       20079

Compaction is indeed more successful, and thus less likely to get
deferred, so there are also more direct compaction stalls.

  Page migrate success               3781876     3326819
  Page migrate failure                 45817       41774
  Compaction pages isolated          7868232     6941457
  Compaction migrate scanned       168160492   127269354
  Compaction migrate prescanned            0           0
  Compaction free scanned         2522142582  2326342620
  Compaction free direct alloc             0           0
  Compaction free dir. all. miss           0           0
  Compaction cost                       5252        4476

The patch reduces migration scanned pages by 25% thanks to the eager
skipping.

[hughd@google.com: prevent nr_isolated_* from going negative]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Rik van Riel <riel@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 329973a1ae45..7487067b4613 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -638,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 {
 	struct zone *zone = cc->zone;
 	unsigned long nr_scanned = 0, nr_isolated = 0;
-	struct list_head *migratelist = &cc->migratepages;
 	struct lruvec *lruvec;
 	unsigned long flags = 0;
 	bool locked = false;
 	struct page *page = NULL, *valid_page = NULL;
 	unsigned long start_pfn = low_pfn;
+	bool skip_on_failure = false;
+	unsigned long next_skip_pfn = 0;
 
 	/*
 	 * Ensure that there are not too many pages isolated from the LRU
@@ -664,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 	if (compact_should_abort(cc))
 		return 0;
 
+	if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
+		skip_on_failure = true;
+		next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+	}
+
 	/* Time to isolate some pages for migration */
 	for (; low_pfn < end_pfn; low_pfn++) {
 		bool is_lru;
 
+		if (skip_on_failure && low_pfn >= next_skip_pfn) {
+			/*
+			 * We have isolated all migration candidates in the
+			 * previous order-aligned block, and did not skip it due
+			 * to failure. We should migrate the pages now and
+			 * hopefully succeed compaction.
+			 */
+			if (nr_isolated)
+				break;
+
+			/*
+			 * We failed to isolate in the previous order-aligned
+			 * block. Set the new boundary to the end of the
+			 * current block. Note we can't simply increase
+			 * next_skip_pfn by 1 << order, as low_pfn might have
+			 * been incremented by a higher number due to skipping
+			 * a compound or a high-order buddy page in the
+			 * previous loop iteration.
+			 */
+			next_skip_pfn = block_end_pfn(low_pfn, cc->order);
+		}
+
 		/*
 		 * Periodically drop the lock (if held) regardless of its
 		 * contention, to give chance to IRQs. Abort async compaction
@@ -679,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			break;
 
 		if (!pfn_valid_within(low_pfn))
-			continue;
+			goto isolate_fail;
 		nr_scanned++;
 
 		page = pfn_to_page(low_pfn);
@@ -734,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			if (likely(comp_order < MAX_ORDER))
 				low_pfn += (1UL << comp_order) - 1;
 
-			continue;
+			goto isolate_fail;
 		}
 
 		if (!is_lru)
-			continue;
+			goto isolate_fail;
 
 		/*
 		 * Migration will fail if an anonymous page is pinned in memory,
@@ -747,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		 */
 		if (!page_mapping(page) &&
 		    page_count(page) > page_mapcount(page))
-			continue;
+			goto isolate_fail;
 
 		/* If we already hold the lock, we can skip some rechecking */
 		if (!locked) {
@@ -758,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 			/* Recheck PageLRU and PageCompound under lock */
 			if (!PageLRU(page))
-				continue;
+				goto isolate_fail;
 
 			/*
 			 * Page become compound since the non-locked check,
@@ -767,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 			 */
 			if (unlikely(PageCompound(page))) {
 				low_pfn += (1UL << compound_order(page)) - 1;
-				continue;
+				goto isolate_fail;
 			}
 		}
 
@@ -775,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 
 		/* Try isolate the page */
 		if (__isolate_lru_page(page, isolate_mode) != 0)
-			continue;
+			goto isolate_fail;
 
 		VM_BUG_ON_PAGE(PageCompound(page), page);
 
@@ -783,7 +811,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 
 isolate_success:
-		list_add(&page->lru, migratelist);
+		list_add(&page->lru, &cc->migratepages);
 		cc->nr_migratepages++;
 		nr_isolated++;
 
@@ -801,6 +829,37 @@ isolate_success:
 			++low_pfn;
 			break;
 		}
+
+		continue;
+isolate_fail:
+		if (!skip_on_failure)
+			continue;
+
+		/*
+		 * We have isolated some pages, but then failed. Release them
+		 * instead of migrating, as we cannot form the cc->order buddy
+		 * page anyway.
+		 */
+		if (nr_isolated) {
+			if (locked) {
+				spin_unlock_irqrestore(&zone->lru_lock,	flags);
+				locked = false;
+			}
+			acct_isolated(zone, cc);
+			putback_movable_pages(&cc->migratepages);
+			cc->nr_migratepages = 0;
+			cc->last_migrated_pfn = 0;
+			nr_isolated = 0;
+		}
+
+		if (low_pfn < next_skip_pfn) {
+			low_pfn = next_skip_pfn - 1;
+			/*
+			 * The check near the loop beginning would have updated
+			 * next_skip_pfn too, but this is a bit simpler.
+			 */
+			next_skip_pfn += 1UL << cc->order;
+		}
 	}
 
 	/*
@@ -1401,6 +1460,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
 				ret = COMPACT_CONTENDED;
 				goto out;
 			}
+			/*
+			 * We failed to migrate at least one page in the current
+			 * order-aligned block, so skip the rest of it.
+			 */
+			if (cc->direct_compaction &&
+						(cc->mode == MIGRATE_ASYNC)) {
+				cc->migrate_pfn = block_end_pfn(
+						cc->migrate_pfn - 1, cc->order);
+				/* Draining pcplists is useless in this case */
+				cc->last_migrated_pfn = 0;
+
+			}
 		}
 
 check_drain:
-- 
cgit v1.2.3


From 1aa8aea535977f0e0b398f39d052e7befff81da6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:12:00 -0700
Subject: mm: uninline page_mapped()

It's huge.  Uninlining it saves 206 bytes per callsite.  Shaves 4924
bytes from the x86_64 allmodconfig vmlinux.

[akpm@linux-foundation.org: coding-style fixes]
Cc: Steve Capper <steve.capper@arm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 21 +--------------------
 mm/util.c          | 23 +++++++++++++++++++++++
 2 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5b375133c695..9c2852cabf01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1032,26 +1032,7 @@ static inline pgoff_t page_file_index(struct page *page)
 	return page->index;
 }
 
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
- */
-static inline bool page_mapped(struct page *page)
-{
-	int i;
-	if (likely(!PageCompound(page)))
-		return atomic_read(&page->_mapcount) >= 0;
-	page = compound_head(page);
-	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
-		return true;
-	if (PageHuge(page))
-		return false;
-	for (i = 0; i < hpage_nr_pages(page); i++) {
-		if (atomic_read(&page[i]._mapcount) >= 0)
-			return true;
-	}
-	return false;
-}
+bool page_mapped(struct page *page);
 
 /*
  * Return true only if the page has been allocated with
diff --git a/mm/util.c b/mm/util.c
index 6cc81e7b8705..8a1b3a1fb595 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -346,6 +346,29 @@ void *page_rmapping(struct page *page)
 	return __page_rmapping(page);
 }
 
+/*
+ * Return true if this page is mapped into pagetables.
+ * For compound page it returns true if any subpage of compound page is mapped.
+ */
+bool page_mapped(struct page *page)
+{
+	int i;
+
+	if (likely(!PageCompound(page)))
+		return atomic_read(&page->_mapcount) >= 0;
+	page = compound_head(page);
+	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
+		return true;
+	if (PageHuge(page))
+		return false;
+	for (i = 0; i < hpage_nr_pages(page); i++) {
+		if (atomic_read(&page[i]._mapcount) >= 0)
+			return true;
+	}
+	return false;
+}
+EXPORT_SYMBOL(page_mapped);
+
 struct anon_vma *page_anon_vma(struct page *page)
 {
 	unsigned long mapping;
-- 
cgit v1.2.3


From f44b2dda8bc29de36ccdc1e04092de7d0b2d5868 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:03 -0700
Subject: mm/hugetlb: add same zone check in pfn_range_valid_gigantic()

This patchset deals with some problematic sites that iterate pfn ranges.

There is a system whose nodes' pfns overlap as follows:

  -----pfn-------->
  N0 N1 N2 N0 N1 N2

Therefore, we need to take care of this overlapping when iterating pfn
range.

I audited many sites that iterate using pfn_valid(), pfn_valid_within(),
zone_start_pfn, etc., and the others look safe to me.  This is a
preparation step for a new CMA implementation, ZONE_CMA
(https://lkml.org/lkml/2015/2/12/95), because it would easily overlap
with other zones.  But the zone overlap check is also needed for the
general case, so I am sending it separately.

This patch (of 5):

alloc_gigantic_page() uses alloc_contig_range() and this requires that
the requested range is in a single zone.  To satisfy this requirement,
add this check to pfn_range_valid_gigantic().

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0f580ea7f41d..949d80609a32 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1031,8 +1031,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn,
 	return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
 }
 
-static bool pfn_range_valid_gigantic(unsigned long start_pfn,
-				unsigned long nr_pages)
+static bool pfn_range_valid_gigantic(struct zone *z,
+			unsigned long start_pfn, unsigned long nr_pages)
 {
 	unsigned long i, end_pfn = start_pfn + nr_pages;
 	struct page *page;
@@ -1043,6 +1043,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn,
 
 		page = pfn_to_page(i);
 
+		if (page_zone(page) != z)
+			return false;
+
 		if (PageReserved(page))
 			return false;
 
@@ -1075,7 +1078,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order)
 
 		pfn = ALIGN(z->zone_start_pfn, nr_pages);
 		while (zone_spans_last_pfn(z, pfn, nr_pages)) {
-			if (pfn_range_valid_gigantic(pfn, nr_pages)) {
+			if (pfn_range_valid_gigantic(z, pfn, nr_pages)) {
 				/*
 				 * We release the zone lock here because
 				 * alloc_contig_range() will also lock the zone
-- 
cgit v1.2.3


From b9eb63191a6ad70127ebdd83650b810cdc1d1117 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:06 -0700
Subject: mm/memory_hotplug: add comment to some functions related to memory
 hotplug

__offline_isolated_pages() and test_pages_isolated() are used by memory
hotplug.  These functions require that the range is in a single zone, but
there is no code to check this because memory hotplug checks it before
calling these functions.  To avoid confusing future users of these
functions, this patch adds comments to them.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c     | 3 ++-
 mm/page_isolation.c | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 34f688bc5c7a..19fe7e9c39a6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -7176,7 +7176,8 @@ void zone_pcp_reset(struct zone *zone)
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
- * All pages in the range must be isolated before calling this.
+ * All pages in the range must be in a single zone and isolated
+ * before calling this.
  */
 void
 __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 67bedd18429c..612122bf6a42 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
 	return pfn;
 }
 
+/* Caller should ensure that requested range is in a single zone */
 int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
 			bool skip_hwpoisoned_pages)
 {
-- 
cgit v1.2.3


From a91c43c7313a995a8908f8f6b911a85d00fdbffd Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:10 -0700
Subject: mm/vmstat: add zone range overlapping check

There is a system whose nodes' pfns overlap as follows:

  -----pfn-------->
  N0 N1 N2 N0 N1 N2

Therefore, we need to take care of this overlap when iterating over a pfn
range.

There are two places in vmstat.c that iterate over a pfn range, and they
don't consider this overlap.  Add the check.

Without this patch, the above system could overcount the number of
pageblocks in a zone.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5e4300482897..0a726e398cfd 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1010,6 +1010,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
 		if (!memmap_valid_within(pfn, page, zone))
 			continue;
 
+		if (page_zone(page) != zone)
+			continue;
+
 		mtype = get_pageblock_migratetype(page);
 
 		if (mtype < MIGRATE_TYPES)
@@ -1076,6 +1079,10 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 				continue;
 
 			page = pfn_to_page(pfn);
+
+			if (page_zone(page) != zone)
+				continue;
+
 			if (PageBuddy(page)) {
 				pfn += (1UL << page_order(page)) - 1;
 				continue;
-- 
cgit v1.2.3


From 9d43f5aec9506d98ad492a783aa8a18226c5d474 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:13 -0700
Subject: mm/page_owner: add zone range overlapping check

There is a system whose nodes' pfns overlap as follows:

  -----pfn-------->
  N0 N1 N2 N0 N1 N2

Therefore, we need to take care of this overlap when iterating over a pfn
range.

There is one place in page_owner.c that iterates over a pfn range, and it
doesn't consider this overlap.  Add the check.

Without this patch, the above system could overcount the number of early
allocated pages before page_owner is activated.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_owner.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'mm')

diff --git a/mm/page_owner.c b/mm/page_owner.c
index ac3d8d129974..438768c092ac 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone)
 
 			page = pfn_to_page(pfn);
 
+			if (page_zone(page) != zone)
+				continue;
+
 			/*
 			 * We are safe to check buddy flag and order, because
 			 * this is init stage and only single thread runs.
-- 
cgit v1.2.3


From ba6b0979e346fd91d3b7ef6956d7155877308f0f Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:16 -0700
Subject: power: add zone range overlapping check

There is a system whose nodes' pfns overlap as follows:

  -----pfn-------->
  N0 N1 N2 N0 N1 N2

Therefore, we need to take care of this overlap when iterating over a pfn
range.

mark_free_pages() first iterates over the requested zone's pfn range and
unsets the bitmap for the whole range.  Then it marks the free pages of
the zone in the bitmap.  If there is an overlapping zone, the unset step
could clear previously marked bits, and later references to this bitmap
will cause problems.  To prevent this, this patch adds a zone check in
mark_free_pages().

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 19fe7e9c39a6..e132705d19fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2156,6 +2156,10 @@ void mark_free_pages(struct zone *zone)
 	for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
+
+			if (page_zone(page) != zone)
+				continue;
+
 			if (!swsusp_page_is_forbidden(page))
 				swsusp_unset_page_free(page);
 		}
-- 
cgit v1.2.3


From 09b4ab3c433733127f8644290e94548a4cb8122f Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:20 -0700
Subject: mm/writeback: correct dirty page calculation for highmem

ZONE_MOVABLE could be treated as highmem, so we need to consider it for
accurate calculation of dirty pages.  In following patches, ZONE_CMA
will be introduced, and it can be treated as highmem, too.  So, instead
of manually adding the stat of ZONE_MOVABLE, loop over all zones, check
whether each zone is highmem, and add the stat of every zone that can be
treated as highmem.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page-writeback.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bc5149d5ec38..3b88795ab46e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
 #ifdef CONFIG_HIGHMEM
 	int node;
 	unsigned long x = 0;
+	int i;
 
 	for_each_node_state(node, N_HIGH_MEMORY) {
-		struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			struct zone *z = &NODE_DATA(node)->node_zones[i];
 
-		x += zone_dirtyable_memory(z);
+			if (is_highmem(z))
+				x += zone_dirtyable_memory(z);
+		}
 	}
 	/*
 	 * Unreclaimable memory (kernel memory or anonymous memory
-- 
cgit v1.2.3


From fc2bd799c7c79c84a59da6f9221370bc6f38c503 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:23 -0700
Subject: mm/page_alloc: correct highmem memory statistics

ZONE_MOVABLE could be treated as highmem, so we need to consider it for
accurate statistics.  In following patches, ZONE_CMA will be introduced,
and it can be treated as highmem, too.  So, instead of manually adding
the stat of ZONE_MOVABLE, loop over all zones, check whether each zone is
highmem, and add the stat of every zone that can be treated as highmem.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e132705d19fd..da6d339f1936 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3793,6 +3793,8 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 {
 	int zone_type;		/* needs to be signed */
 	unsigned long managed_pages = 0;
+	unsigned long managed_highpages = 0;
+	unsigned long free_highpages = 0;
 	pg_data_t *pgdat = NODE_DATA(nid);
 
 	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
@@ -3801,12 +3803,19 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 	val->sharedram = node_page_state(nid, NR_SHMEM);
 	val->freeram = node_page_state(nid, NR_FREE_PAGES);
 #ifdef CONFIG_HIGHMEM
-	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages;
-	val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
-			NR_FREE_PAGES);
+	for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
+		struct zone *zone = &pgdat->node_zones[zone_type];
+
+		if (is_highmem(zone)) {
+			managed_highpages += zone->managed_pages;
+			free_highpages += zone_page_state(zone, NR_FREE_PAGES);
+		}
+	}
+	val->totalhigh = managed_highpages;
+	val->freehigh = free_highpages;
 #else
-	val->totalhigh = 0;
-	val->freehigh = 0;
+	val->totalhigh = managed_highpages;
+	val->freehigh = free_highpages;
 #endif
 	val->mem_unit = PAGE_SIZE;
 }
-- 
cgit v1.2.3


From 33499bfe507c844f2c6f55ae8cec17705d0eda95 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:26 -0700
Subject: mm/highmem: make nr_free_highpages() handles all highmem zones by
 itself

nr_free_highpages() manually adds statistics per each highmem zone and
returns a total value for them.  Whenever we add a new highmem zone, we
need to consider this function and it's really troublesome.  Make it
handle all highmem zones by itself.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/highmem.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/highmem.c b/mm/highmem.c
index 123bcd3ed4f2..50b4ca6787f0 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
 
 unsigned int nr_free_highpages (void)
 {
-	pg_data_t *pgdat;
+	struct zone *zone;
 	unsigned int pages = 0;
 
-	for_each_online_pgdat(pgdat) {
-		pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM],
-			NR_FREE_PAGES);
-		if (zone_movable_is_highmem())
-			pages += zone_page_state(
-					&pgdat->node_zones[ZONE_MOVABLE],
-					NR_FREE_PAGES);
+	for_each_populated_zone(zone) {
+		if (is_highmem(zone))
+			pages += zone_page_state(zone, NR_FREE_PAGES);
 	}
 
 	return pages;
-- 
cgit v1.2.3


From e87d59f7a2002aa2d4582d4ea16da91dd3c72752 Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:12:29 -0700
Subject: mm/vmstat: make node_page_state() handles all zones by itself

node_page_state() manually adds statistics per each zone and returns
total value for all zones.  Whenever we add a new zone, we need to
consider this function and it's really troublesome.  Make it handle all
zones by itself.

Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 0a726e398cfd..a7de9adacbd9 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -600,19 +600,13 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
 unsigned long node_page_state(int node, enum zone_stat_item item)
 {
 	struct zone *zones = NODE_DATA(node)->node_zones;
+	int i;
+	unsigned long count = 0;
 
-	return
-#ifdef CONFIG_ZONE_DMA
-		zone_page_state(&zones[ZONE_DMA], item) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
-		zone_page_state(&zones[ZONE_DMA32], item) +
-#endif
-#ifdef CONFIG_HIGHMEM
-		zone_page_state(&zones[ZONE_HIGHMEM], item) +
-#endif
-		zone_page_state(&zones[ZONE_NORMAL], item) +
-		zone_page_state(&zones[ZONE_MOVABLE], item);
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		count += zone_page_state(zones + i, item);
+
+	return count;
 }
 
 #endif
-- 
cgit v1.2.3


From 1269019e69a6798db15edea8921f83215ef954d6 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <koct9i@gmail.com>
Date: Thu, 19 May 2016 17:12:32 -0700
Subject: mm/mmap: kill hook arch_rebalance_pgtables()

Nobody uses it.

Signed-off-by: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index bd2e1a533bc1..fba246b8f1a5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -55,10 +55,6 @@
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
 
-#ifndef arch_rebalance_pgtables
-#define arch_rebalance_pgtables(addr, len)		(addr)
-#endif
-
 #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
 const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN;
 const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX;
@@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 	if (offset_in_page(addr))
 		return -EINVAL;
 
-	addr = arch_rebalance_pgtables(addr, len);
 	error = security_mmap_addr(addr);
 	return error ? error : addr;
 }
-- 
cgit v1.2.3


From ca707239e8a7958ffb1c31737d41cae1a674c938 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:35 -0700
Subject: mm: update_lru_size warn and reset bad lru_size

Though debug kernels have a VM_BUG_ON to help protect from misaccounting
lru_size, non-debug kernels are liable to wrap it around: and then the
vast unsigned long size draws page reclaim into a loop of repeatedly
doing nothing on an empty list, without even a cond_resched().

That soft lockup looks confusingly like an over-busy reclaim scenario,
with lots of contention on the lru_lock in shrink_inactive_list(): yet
has a totally different origin.

Help differentiate with a custom warning in
mem_cgroup_update_lru_size(), even in non-debug kernels; and reset the
size to avoid the lockup.  But the particular bug which suggested this
change was mine alone, and since fixed.

Make it a WARN_ONCE: the first occurrence is the most informative, a
flurry may follow, yet even when rate-limited little more is learnt.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h |  2 +-
 mm/memcontrol.c           | 24 ++++++++++++++++++++----
 2 files changed, 21 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 712e8c37a200..d8cea81ab1ac 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -35,8 +35,8 @@ static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
 	int nr_pages = hpage_nr_pages(page);
-	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 	list_del(&page->lru);
+	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
 }
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 011dac8ab5d7..6a0199706f00 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1023,22 +1023,38 @@ out:
  * @lru: index of lru list the page is sitting on
  * @nr_pages: positive when adding or negative when removing
  *
- * This function must be called when a page is added to or removed from an
- * lru list.
+ * This function must be called under lru_lock, just before a page is added
+ * to or just after a page is removed from an lru list (that ordering being
+ * so as to allow it to check that lru_size 0 is consistent with list_empty).
  */
 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 				int nr_pages)
 {
 	struct mem_cgroup_per_zone *mz;
 	unsigned long *lru_size;
+	long size;
+	bool empty;
 
 	if (mem_cgroup_disabled())
 		return;
 
 	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
 	lru_size = mz->lru_size + lru;
-	*lru_size += nr_pages;
-	VM_BUG_ON((long)(*lru_size) < 0);
+	empty = list_empty(lruvec->lists + lru);
+
+	if (nr_pages < 0)
+		*lru_size += nr_pages;
+
+	size = *lru_size;
+	if (WARN_ONCE(size < 0 || empty != !size,
+		"%s(%p, %d, %d): lru_size %ld but %sempty\n",
+		__func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
+		VM_BUG_ON(1);
+		*lru_size = 0;
+	}
+
+	if (nr_pages > 0)
+		*lru_size += nr_pages;
 }
 
 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-- 
cgit v1.2.3


From 9d5e6a9f22311b00a20ff9b072760ad3e73f0d99 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:38 -0700
Subject: mm: update_lru_size do the __mod_zone_page_state

Konstantin Khlebnikov pointed out (nearly four years ago, when lumpy
reclaim was removed) that lru_size can be updated by -nr_taken once per
call to isolate_lru_pages(), instead of page by page.

Update it inside isolate_lru_pages(), or at its two callsites? I chose
to update it at the callsites, rearranging and grouping the updates by
nr_taken and nr_scanned together in both.

With one exception, mem_cgroup_update_lru_size(,lru,) is then used where
__mod_zone_page_state(,NR_LRU_BASE+lru,) is used; and we shall be adding
some more calls in a future commit.  Make the code a little smaller and
simpler by incorporating stat update in lru_size update.

The exception was move_active_pages_to_lru(), which aggregated the
pgmoved stat update separately from the individual lru_size updates; but
I still think this a simplification worth making.

However, the __mod_zone_page_state is not peculiar to mem_cgroups: so
better use the name update_lru_size, which calls
mem_cgroup_update_lru_size when CONFIG_MEMCG.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ------
 include/linux/mm_inline.h  | 24 ++++++++++++++++++------
 mm/memcontrol.c            |  2 ++
 mm/vmscan.c                | 23 ++++++++++-------------
 4 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'mm')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1191d79aa495..94da96738df3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 	return 0;
 }
 
-static inline void
-mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-			      int increment)
-{
-}
-
 static inline unsigned long
 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 			     int nid, unsigned int lru_mask)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index d8cea81ab1ac..5bd29ba4f174 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page)
 	return !PageSwapBacked(page);
 }
 
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, int nr_pages)
+{
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, int nr_pages)
+{
+#ifdef CONFIG_MEMCG
+	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+#else
+	__update_lru_size(lruvec, lru, nr_pages);
+#endif
+}
+
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	int nr_pages = hpage_nr_pages(page);
-	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	update_lru_size(lruvec, lru, hpage_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	int nr_pages = hpage_nr_pages(page);
 	list_del(&page->lru);
-	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
+	update_lru_size(lruvec, lru, -hpage_nr_pages(page));
 }
 
 /**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6a0199706f00..1b40dcad2b90 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1035,6 +1035,8 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 	long size;
 	bool empty;
 
+	__update_lru_size(lruvec, lru, nr_pages);
+
 	if (mem_cgroup_disabled())
 		return;
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d3a02ac3eed7..dcfdfc1a0942 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
 					!list_empty(src); scan++) {
 		struct page *page;
-		int nr_pages;
 
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
@@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 
 		switch (__isolate_lru_page(page, mode)) {
 		case 0:
-			nr_pages = hpage_nr_pages(page);
-			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+			nr_taken += hpage_nr_pages(page);
 			list_move(&page->lru, dst);
-			nr_taken += nr_pages;
 			break;
 
 		case -EBUSY:
@@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
 				     &nr_scanned, sc, isolate_mode, lru);
 
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
+	update_lru_size(lruvec, lru, -nr_taken);
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+	reclaim_stat->recent_scanned[file] += nr_taken;
 
 	if (global_reclaim(sc)) {
 		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
@@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	spin_lock_irq(&zone->lru_lock);
 
-	reclaim_stat->recent_scanned[file] += nr_taken;
-
 	if (global_reclaim(sc)) {
 		if (current_is_kswapd())
 			__count_zone_vm_events(PGSTEAL_KSWAPD, zone,
@@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 		SetPageLRU(page);
 
 		nr_pages = hpage_nr_pages(page);
-		mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+		update_lru_size(lruvec, lru, nr_pages);
 		list_move(&page->lru, &lruvec->lists[lru]);
 		pgmoved += nr_pages;
 
@@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 				list_add(&page->lru, pages_to_free);
 		}
 	}
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
+
 	if (!is_active_lru(lru))
 		__count_vm_events(PGDEACTIVATE, pgmoved);
 }
@@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
 
 	nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
 				     &nr_scanned, sc, isolate_mode, lru);
-	if (global_reclaim(sc))
-		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
 
+	update_lru_size(lruvec, lru, -nr_taken);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
 	reclaim_stat->recent_scanned[file] += nr_taken;
 
+	if (global_reclaim(sc))
+		__mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned);
 	__count_zone_vm_events(PGREFILL, zone, nr_scanned);
-	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken);
-	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken);
+
 	spin_unlock_irq(&zone->lru_lock);
 
 	while (!list_empty(&l_hold)) {
-- 
cgit v1.2.3


From fa9949da59a15017a02c86b087c7499d7b5702be Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:41 -0700
Subject: mm: use __SetPageSwapBacked and dont ClearPageSwapBacked

v3.16 commit 07a427884348 ("mm: shmem: avoid atomic operation during
shmem_getpage_gfp") rightly replaced one instance of SetPageSwapBacked
by __SetPageSwapBacked, pointing out that the newly allocated page is
not yet visible to other users (except speculative get_page_unless_zero-
ers, who may not update page flags before their further checks).

That was part of a series in which Mel was focused on tmpfs profiles:
but almost all SetPageSwapBacked uses can be so optimized, with the same
justification.

Remove ClearPageSwapBacked from __read_swap_cache_async() error path:
it's not an error to free a page with PG_swapbacked set.

Follow a convention of __SetPageLocked, __SetPageSwapBacked instead of
doing it differently in different places; but that's for tidiness - if
the ordering actually mattered, we should not be using the __variants.

There's probably scope for further __SetPageFlags in other places, but
SwapBacked is the one I'm interested in at the moment.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Reviewed-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c    | 6 +++---
 mm/rmap.c       | 2 +-
 mm/shmem.c      | 4 ++--
 mm/swap_state.c | 3 +--
 4 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index f9dfb18a4eba..53ab6398e7a2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 		newpage->index = page->index;
 		newpage->mapping = page->mapping;
 		if (PageSwapBacked(page))
-			SetPageSwapBacked(newpage);
+			__SetPageSwapBacked(newpage);
 
 		return MIGRATEPAGE_SUCCESS;
 	}
@@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	newpage->index = page->index;
 	newpage->mapping = page->mapping;
 	if (PageSwapBacked(page))
-		SetPageSwapBacked(newpage);
+		__SetPageSwapBacked(newpage);
 
 	get_page(newpage);	/* add cache reference */
 	if (PageSwapCache(page)) {
@@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 	/* Prepare a page as a migration target */
 	__SetPageLocked(new_page);
-	SetPageSwapBacked(new_page);
+	__SetPageSwapBacked(new_page);
 
 	/* anon mapping, we can simply copy page->mapping to the new page: */
 	new_page->mapping = page->mapping;
diff --git a/mm/rmap.c b/mm/rmap.c
index 4cebe8a7c2cb..8a839935b18c 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page,
 	int nr = compound ? hpage_nr_pages(page) : 1;
 
 	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-	SetPageSwapBacked(page);
+	__SetPageSwapBacked(page);
 	if (compound) {
 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 		/* increment count (starts at -1) */
diff --git a/mm/shmem.c b/mm/shmem.c
index e684a9140228..9e609d58df73 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1085,8 +1085,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	flush_dcache_page(newpage);
 
 	__SetPageLocked(newpage);
+	__SetPageSwapBacked(newpage);
 	SetPageUptodate(newpage);
-	SetPageSwapBacked(newpage);
 	set_page_private(newpage, swap_index);
 	SetPageSwapCache(newpage);
 
@@ -1276,8 +1276,8 @@ repeat:
 			goto decused;
 		}
 
-		__SetPageSwapBacked(page);
 		__SetPageLocked(page);
+		__SetPageSwapBacked(page);
 		if (sgp == SGP_WRITE)
 			__SetPageReferenced(page);
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 366ce3518703..0d457e7db8d6 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 
 		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
 		__SetPageLocked(new_page);
-		SetPageSwapBacked(new_page);
+		__SetPageSwapBacked(new_page);
 		err = __add_to_swap_cache(new_page, entry);
 		if (likely(!err)) {
 			radix_tree_preload_end();
@@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 			return new_page;
 		}
 		radix_tree_preload_end();
-		ClearPageSwapBacked(new_page);
 		__ClearPageLocked(new_page);
 		/*
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
-- 
cgit v1.2.3


From 75edd345e8ede51bc8f00672feff5d622f2b3af6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:44 -0700
Subject: tmpfs: preliminary minor tidyups

Make a few cleanups in mm/shmem.c, before going on to complicate it.

shmem_alloc_page() will become more complicated: we can't afford to
have that complication duplicated between a CONFIG_NUMA version and a
!CONFIG_NUMA version, so rearrange the #ifdef'ery there to yield a
single shmem_swapin() and a single shmem_alloc_page().

Yes, it's a shame to inflict the horrid pseudo-vma on non-NUMA
configurations, but eliminating it is a larger cleanup: I have an
alloc_pages_mpol() patchset not yet ready - mpol handling is subtle and
bug-prone, and changed yet again since my last version.

Move __SetPageLocked, __SetPageSwapBacked from shmem_getpage_gfp() to
shmem_alloc_page(): that SwapBacked flag will be useful in future, to
help to distinguish different cases appropriately.

And the SGP_DIRTY variant of SGP_CACHE is hard to understand and of
little use (IIRC it dates back to when shmem_getpage() returned the page
unlocked): kill it and do the necessary in shmem_file_read_iter().

But an arm64 build then complained that info may be uninitialized (where
shmem_getpage_gfp() deletes a freshly alloced page beyond eof), and
advancing to an "sgp <= SGP_CACHE" test jogged it back to reality.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h |  6 +++++
 mm/shmem.c                | 69 ++++++++++++++++++-----------------------------
 2 files changed, 32 insertions(+), 43 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 6978a99e571f..4429d255c8ab 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
 {
 }
 
+static inline struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+	return NULL;
+}
+
 #define vma_policy(vma) NULL
 
 static inline int
diff --git a/mm/shmem.c b/mm/shmem.c
index 9e609d58df73..6d2de2c1bf11 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -101,7 +101,6 @@ struct shmem_falloc {
 enum sgp_type {
 	SGP_READ,	/* don't exceed i_size, don't allocate page */
 	SGP_CACHE,	/* don't exceed i_size, may allocate page */
-	SGP_DIRTY,	/* like SGP_CACHE, but set new page dirty */
 	SGP_WRITE,	/* may exceed i_size, may allocate !Uptodate page */
 	SGP_FALLOC,	/* like SGP_WRITE, but make existing page Uptodate */
 };
@@ -169,7 +168,7 @@ static inline int shmem_reacct_size(unsigned long flags,
 
 /*
  * ... whereas tmpfs objects are accounted incrementally as
- * pages are allocated, in order to allow huge sparse files.
+ * pages are allocated, in order to allow large sparse files.
  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
  */
@@ -947,8 +946,7 @@ redirty:
 	return 0;
 }
 
-#ifdef CONFIG_NUMA
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 {
 	char buffer[64];
@@ -972,7 +970,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 	}
 	return mpol;
 }
-#endif /* CONFIG_TMPFS */
+#else /* !CONFIG_NUMA || !CONFIG_TMPFS */
+static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
+{
+}
+static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
+{
+	return NULL;
+}
+#endif /* CONFIG_NUMA && CONFIG_TMPFS */
+#ifndef CONFIG_NUMA
+#define vm_policy vm_private_data
+#endif
 
 static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 			struct shmem_inode_info *info, pgoff_t index)
@@ -1008,39 +1017,17 @@ static struct page *shmem_alloc_page(gfp_t gfp,
 	pvma.vm_ops = NULL;
 	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
 
-	page = alloc_page_vma(gfp, &pvma, 0);
+	page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false);
+	if (page) {
+		__SetPageLocked(page);
+		__SetPageSwapBacked(page);
+	}
 
 	/* Drop reference taken by mpol_shared_policy_lookup() */
 	mpol_cond_put(pvma.vm_policy);
 
 	return page;
 }
-#else /* !CONFIG_NUMA */
-#ifdef CONFIG_TMPFS
-static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
-{
-}
-#endif /* CONFIG_TMPFS */
-
-static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	return swapin_readahead(swap, gfp, NULL, 0);
-}
-
-static inline struct page *shmem_alloc_page(gfp_t gfp,
-			struct shmem_inode_info *info, pgoff_t index)
-{
-	return alloc_page(gfp);
-}
-#endif /* CONFIG_NUMA */
-
-#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
-static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
-{
-	return NULL;
-}
-#endif
 
 /*
  * When a page is moved from swapcache to shmem filecache (either by the
@@ -1084,8 +1071,6 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 	copy_highpage(newpage, oldpage);
 	flush_dcache_page(newpage);
 
-	__SetPageLocked(newpage);
-	__SetPageSwapBacked(newpage);
 	SetPageUptodate(newpage);
 	set_page_private(newpage, swap_index);
 	SetPageSwapCache(newpage);
@@ -1155,7 +1140,7 @@ repeat:
 		page = NULL;
 	}
 
-	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+	if (sgp <= SGP_CACHE &&
 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 		error = -EINVAL;
 		goto unlock;
@@ -1275,9 +1260,6 @@ repeat:
 			error = -ENOMEM;
 			goto decused;
 		}
-
-		__SetPageLocked(page);
-		__SetPageSwapBacked(page);
 		if (sgp == SGP_WRITE)
 			__SetPageReferenced(page);
 
@@ -1321,12 +1303,10 @@ clear:
 			flush_dcache_page(page);
 			SetPageUptodate(page);
 		}
-		if (sgp == SGP_DIRTY)
-			set_page_dirty(page);
 	}
 
 	/* Perhaps the file has been truncated since we checked */
-	if (sgp != SGP_WRITE && sgp != SGP_FALLOC &&
+	if (sgp <= SGP_CACHE &&
 	    ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
 		if (alloced) {
 			ClearPageDirty(page);
@@ -1633,7 +1613,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	 * and even mark them dirty, so it cannot exceed the max_blocks limit.
 	 */
 	if (!iter_is_iovec(to))
-		sgp = SGP_DIRTY;
+		sgp = SGP_CACHE;
 
 	index = *ppos >> PAGE_SHIFT;
 	offset = *ppos & ~PAGE_MASK;
@@ -1659,8 +1639,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 				error = 0;
 			break;
 		}
-		if (page)
+		if (page) {
+			if (sgp == SGP_CACHE)
+				set_page_dirty(page);
 			unlock_page(page);
+		}
 
 		/*
 		 * We must evaluate after, since reads (unlike writes)
-- 
cgit v1.2.3


From 9e18eb29356b7dfd55183bd42cf73919d1590835 Mon Sep 17 00:00:00 2001
From: Andres Lagar-Cavilla <andreslc@google.com>
Date: Thu, 19 May 2016 17:12:47 -0700
Subject: tmpfs: mem_cgroup charge fault to vm_mm not current mm

Although shmem_fault() has been careful to count a major fault to vm_mm,
shmem_getpage_gfp() has been careless in charging a remote access fault
to current->mm owner's memcg instead of to vma->vm_mm owner's memcg:
that is inconsistent with all the mem_cgroup charging on remote access
faults in mm/memory.c.

Fix it by passing fault_mm along with fault_type to
shmem_getpage_gfp(); but in that case, now knowing the right mm, it's
better for it to handle the PGMAJFAULT updates itself.

And let's keep this clutter out of most callers' way: change the common
shmem_getpage() wrapper to hide fault_mm and fault_type as well as gfp.

Signed-off-by: Andres Lagar-Cavilla <andreslc@google.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/shmem.c | 61 ++++++++++++++++++++++++++++++++++---------------------------
 1 file changed, 34 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index 6d2de2c1bf11..e418a995427d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -121,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
 static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 				struct shmem_inode_info *info, pgoff_t index);
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
-	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
+		struct page **pagep, enum sgp_type sgp,
+		gfp_t gfp, struct mm_struct *fault_mm, int *fault_type);
 
 static inline int shmem_getpage(struct inode *inode, pgoff_t index,
-	struct page **pagep, enum sgp_type sgp, int *fault_type)
+		struct page **pagep, enum sgp_type sgp)
 {
 	return shmem_getpage_gfp(inode, index, pagep, sgp,
-			mapping_gfp_mask(inode->i_mapping), fault_type);
+		mapping_gfp_mask(inode->i_mapping), NULL, NULL);
 }
 
 static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@ -527,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 
 	if (partial_start) {
 		struct page *page = NULL;
-		shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
+		shmem_getpage(inode, start - 1, &page, SGP_READ);
 		if (page) {
 			unsigned int top = PAGE_SIZE;
 			if (start > end) {
@@ -542,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 	}
 	if (partial_end) {
 		struct page *page = NULL;
-		shmem_getpage(inode, end, &page, SGP_READ, NULL);
+		shmem_getpage(inode, end, &page, SGP_READ);
 		if (page) {
 			zero_user_segment(page, 0, partial_end);
 			set_page_dirty(page);
@@ -1115,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
  *
  * If we allocate a new one we do not mark it dirty. That's up to the
  * vm. If we swap it in we mark it dirty since we also free the swap
- * entry since a page cannot live in both the swap and page cache
+ * entry since a page cannot live in both the swap and page cache.
+ *
+ * fault_mm and fault_type are only supplied by shmem_fault:
+ * otherwise they are NULL.
  */
 static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
-	struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
+	struct page **pagep, enum sgp_type sgp, gfp_t gfp,
+	struct mm_struct *fault_mm, int *fault_type)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo;
+	struct mm_struct *charge_mm;
 	struct mem_cgroup *memcg;
 	struct page *page;
 	swp_entry_t swap;
@@ -1168,14 +1174,19 @@ repeat:
 	 */
 	info = SHMEM_I(inode);
 	sbinfo = SHMEM_SB(inode->i_sb);
+	charge_mm = fault_mm ? : current->mm;
 
 	if (swap.val) {
 		/* Look it up and read it in.. */
 		page = lookup_swap_cache(swap);
 		if (!page) {
-			/* here we actually do the io */
-			if (fault_type)
+			/* Or update major stats only when swapin succeeds?? */
+			if (fault_type) {
 				*fault_type |= VM_FAULT_MAJOR;
+				count_vm_event(PGMAJFAULT);
+				mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT);
+			}
+			/* Here we actually start the io */
 			page = shmem_swapin(swap, gfp, info, index);
 			if (!page) {
 				error = -ENOMEM;
@@ -1202,7 +1213,7 @@ repeat:
 				goto failed;
 		}
 
-		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+		error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
 				false);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
@@ -1263,7 +1274,7 @@ repeat:
 		if (sgp == SGP_WRITE)
 			__SetPageReferenced(page);
 
-		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg,
+		error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg,
 				false);
 		if (error)
 			goto decused;
@@ -1352,6 +1363,7 @@ unlock:
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vma->vm_file);
+	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
 	int error;
 	int ret = VM_FAULT_LOCKED;
 
@@ -1413,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		spin_unlock(&inode->i_lock);
 	}
 
-	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
+	error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE,
+				  gfp, vma->vm_mm, &ret);
 	if (error)
 		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
-
-	if (ret & VM_FAULT_MAJOR) {
-		count_vm_event(PGMAJFAULT);
-		mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-	}
 	return ret;
 }
 
@@ -1567,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			return -EPERM;
 	}
 
-	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
+	return shmem_getpage(inode, index, pagep, SGP_WRITE);
 }
 
 static int
@@ -1633,7 +1641,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 				break;
 		}
 
-		error = shmem_getpage(inode, index, &page, sgp, NULL);
+		error = shmem_getpage(inode, index, &page, sgp);
 		if (error) {
 			if (error == -EINVAL)
 				error = 0;
@@ -1749,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 	error = 0;
 
 	while (spd.nr_pages < nr_pages) {
-		error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
+		error = shmem_getpage(inode, index, &page, SGP_CACHE);
 		if (error)
 			break;
 		unlock_page(page);
@@ -1771,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 		page = spd.pages[page_nr];
 
 		if (!PageUptodate(page) || page->mapping != mapping) {
-			error = shmem_getpage(inode, index, &page,
-							SGP_CACHE, NULL);
+			error = shmem_getpage(inode, index, &page, SGP_CACHE);
 			if (error)
 				break;
 			unlock_page(page);
@@ -2215,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
 			error = -ENOMEM;
 		else
-			error = shmem_getpage(inode, index, &page, SGP_FALLOC,
-									NULL);
+			error = shmem_getpage(inode, index, &page, SGP_FALLOC);
 		if (error) {
 			/* Remove the !PageUptodate pages we added */
 			shmem_undo_range(inode,
@@ -2534,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
 		inode->i_op = &shmem_short_symlink_operations;
 	} else {
 		inode_nohighmem(inode);
-		error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
+		error = shmem_getpage(inode, 0, &page, SGP_WRITE);
 		if (error) {
 			iput(inode);
 			return error;
@@ -2575,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry,
 			return ERR_PTR(-ECHILD);
 		}
 	} else {
-		error = shmem_getpage(inode, 0, &page, SGP_READ, NULL);
+		error = shmem_getpage(inode, 0, &page, SGP_READ);
 		if (error)
 			return ERR_PTR(error);
 		unlock_page(page);
@@ -3479,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
 	int error;
 
 	BUG_ON(mapping->a_ops != &shmem_aops);
-	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
+	error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+				  gfp, NULL, NULL);
 	if (error)
 		page = ERR_PTR(error);
 	else
-- 
cgit v1.2.3


From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:50 -0700
Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update

Provide /proc/sys/vm/stat_refresh to force an immediate update of
per-cpu into global vmstats: useful to avoid a sleep(2) or whatever
before checking counts when testing.  Originally added to work around a
bug which left counts stranded indefinitely on a cpu going idle (an
inaccuracy magnified when small below-batch numbers represent "huge"
amounts of memory), but I believe that bug is now fixed: nonetheless,
this is still a useful knob.

Its schedule_on_each_cpu() is probably too expensive just to fold into
reading /proc/meminfo itself: give this mode 0600 to prevent abuse.
Allow a write or a read to do the same: nothing to read, but "grep -h
Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient.  Oh, and
since global_page_state() itself is careful to disguise any underflow as
0, hack in an "Invalid argument" and pr_warn() if a counter is negative
after the refresh - this helped to fix a misaccounting of
NR_ISOLATED_FILE in my migration code.

But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED
often go negative some of the time.  I have not yet worked out why, but
have no evidence that it's actually harmful.  Punt for the moment by
just ignoring the anomaly on those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 14 +++++++++++
 include/linux/vmstat.h      |  4 +++
 kernel/sysctl.c             |  7 ++++++
 mm/vmstat.c                 | 60 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+)

(limited to 'mm')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fece3121..720355cbdf45 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
 - panic_on_oom
 - percpu_pagelist_fraction
 - stat_interval
+- stat_refresh
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
 
 ==============================================================
 
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..02fce415b3d9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos);
+
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
 
 int calculate_pressure_threshold(struct zone *zone);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_jiffies,
 	},
+	{
+		.procname	= "stat_refresh",
+		.data		= NULL,
+		.maxlen		= 0,
+		.mode		= 0600,
+		.proc_handler	= vmstat_refresh,
+	},
 #endif
 #ifdef CONFIG_MMU
 	{
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7de9adacbd9..c831be32a1a3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 static cpumask_var_t cpu_stat_off;
 
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+	refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	long val;
+	int err;
+	int i;
+
+	/*
+	 * The regular update, every sysctl_stat_interval, may come later
+	 * than expected: leaving a significant amount in per_cpu buckets.
+	 * This is particularly misleading when checking a quantity of HUGE
+	 * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+	 * which can equally be echo'ed to or cat'ted from (by root),
+	 * can be used to update the stats just before reading them.
+	 *
+	 * Oh, and since global_page_state() etc. are so careful to hide
+	 * transiently negative values, report an error here if any of
+	 * the stats is negative, so we know to go looking for imbalance.
+	 */
+	err = schedule_on_each_cpu(refresh_vm_stats);
+	if (err)
+		return err;
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+		val = atomic_long_read(&vm_stat[i]);
+		if (val < 0) {
+			switch (i) {
+			case NR_ALLOC_BATCH:
+			case NR_PAGES_SCANNED:
+				/*
+				 * These are often seen to go negative in
+				 * recent kernels, but not to go permanently
+				 * negative.  Whilst it would be nicer not to
+				 * have exceptions, rooting them out would be
+				 * another task, of rather low priority.
+				 */
+				break;
+			default:
+				pr_warn("%s: %s %ld\n",
+					__func__, vmstat_text[i], val);
+				err = -EINVAL;
+				break;
+			}
+		}
+	}
+	if (err)
+		return err;
+	if (write)
+		*ppos += *lenp;
+	else
+		*lenp = 0;
+	return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
 static void vmstat_update(struct work_struct *w)
 {
 	if (refresh_cpu_vm_stats(true)) {
-- 
cgit v1.2.3


From bf8616d5fa179d6c755f06726567c6d63c6fbbc7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:54 -0700
Subject: huge mm: move_huge_pmd does not need new_vma

Remove move_huge_pmd()'s redundant new_vma arg: all it was used for was
a VM_NOHUGEPAGE check on new_vma flags, but the new_vma is cloned from
the old vma, so a trans_huge_pmd in the new_vma will be as acceptable as
it was in the old vma, alignment and size permitting.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 4 +---
 mm/huge_memory.c        | 7 ++-----
 mm/mremap.c             | 5 ++---
 3 files changed, 5 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d7b9e5346fba..419fb9e03447 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
-extern bool move_huge_pmd(struct vm_area_struct *vma,
-			 struct vm_area_struct *new_vma,
-			 unsigned long old_addr,
+extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 			 unsigned long new_addr, unsigned long old_end,
 			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f8ac8f582fd8..66675eed67be 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	return 1;
 }
 
-bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
-		  unsigned long old_addr,
+bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 		  unsigned long new_addr, unsigned long old_end,
 		  pmd_t *old_pmd, pmd_t *new_pmd)
 {
 	spinlock_t *old_ptl, *new_ptl;
 	pmd_t pmd;
-
 	struct mm_struct *mm = vma->vm_mm;
 
 	if ((old_addr & ~HPAGE_PMD_MASK) ||
 	    (new_addr & ~HPAGE_PMD_MASK) ||
-	    old_end - old_addr < HPAGE_PMD_SIZE ||
-	    (new_vma->vm_flags & VM_NOHUGEPAGE))
+	    old_end - old_addr < HPAGE_PMD_SIZE)
 		return false;
 
 	/*
diff --git a/mm/mremap.c b/mm/mremap.c
index 3fa0a467df66..7d98fe1adc12 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -198,9 +198,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 				/* See comment in move_ptes() */
 				if (need_rmap_locks)
 					anon_vma_lock_write(vma->anon_vma);
-				moved = move_huge_pmd(vma, new_vma, old_addr,
-						    new_addr, old_end,
-						    old_pmd, new_pmd);
+				moved = move_huge_pmd(vma, old_addr, new_addr,
+						    old_end, old_pmd, new_pmd);
 				if (need_rmap_locks)
 					anon_vma_unlock_write(vma->anon_vma);
 				if (moved) {
-- 
cgit v1.2.3


From 1d069b7dd56728a0eb6acb138dce0d37600dee00 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:57 -0700
Subject: huge pagecache: extend mremap pmd rmap lockout to files

Whatever huge pagecache implementation we go with, file rmap locking
must be added to anon rmap locking, when mremap's move_page_tables()
finds a pmd_trans_huge pmd entry: a simple change, let's do it now.

Factor out take_rmap_locks() and drop_rmap_locks() to handle the locking
for move_ptes() and move_page_tables(), and delete the
VM_BUG_ON_VMA which rejected vm_file and required anon_vma.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mremap.c | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/mm/mremap.c b/mm/mremap.c
index 7d98fe1adc12..9dc499977924 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
 	return pmd;
 }
 
+static void take_rmap_locks(struct vm_area_struct *vma)
+{
+	if (vma->vm_file)
+		i_mmap_lock_write(vma->vm_file->f_mapping);
+	if (vma->anon_vma)
+		anon_vma_lock_write(vma->anon_vma);
+}
+
+static void drop_rmap_locks(struct vm_area_struct *vma)
+{
+	if (vma->anon_vma)
+		anon_vma_unlock_write(vma->anon_vma);
+	if (vma->vm_file)
+		i_mmap_unlock_write(vma->vm_file->f_mapping);
+}
+
 static pte_t move_soft_dirty_pte(pte_t pte)
 {
 	/*
@@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		struct vm_area_struct *new_vma, pmd_t *new_pmd,
 		unsigned long new_addr, bool need_rmap_locks)
 {
-	struct address_space *mapping = NULL;
-	struct anon_vma *anon_vma = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
 	spinlock_t *old_ptl, *new_ptl;
@@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	 *   serialize access to individual ptes, but only rmap traversal
 	 *   order guarantees that we won't miss both the old and new ptes).
 	 */
-	if (need_rmap_locks) {
-		if (vma->vm_file) {
-			mapping = vma->vm_file->f_mapping;
-			i_mmap_lock_write(mapping);
-		}
-		if (vma->anon_vma) {
-			anon_vma = vma->anon_vma;
-			anon_vma_lock_write(anon_vma);
-		}
-	}
+	if (need_rmap_locks)
+		take_rmap_locks(vma);
 
 	/*
 	 * We don't have to worry about the ordering of src and dst
@@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		spin_unlock(new_ptl);
 	pte_unmap(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
-	if (anon_vma)
-		anon_vma_unlock_write(anon_vma);
-	if (mapping)
-		i_mmap_unlock_write(mapping);
+	if (need_rmap_locks)
+		drop_rmap_locks(vma);
 }
 
 #define LATENCY_LIMIT	(64 * PAGE_SIZE)
@@ -193,15 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
 		if (pmd_trans_huge(*old_pmd)) {
 			if (extent == HPAGE_PMD_SIZE) {
 				bool moved;
-				VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma,
-					      vma);
 				/* See comment in move_ptes() */
 				if (need_rmap_locks)
-					anon_vma_lock_write(vma->anon_vma);
+					take_rmap_locks(vma);
 				moved = move_huge_pmd(vma, old_addr, new_addr,
 						    old_end, old_pmd, new_pmd);
 				if (need_rmap_locks)
-					anon_vma_unlock_write(vma->anon_vma);
+					drop_rmap_locks(vma);
 				if (moved) {
 					need_flush = true;
 					continue;
-- 
cgit v1.2.3


From 8604d9e534a3e662600e288bcfd1a5acd2763d28 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 19 May 2016 17:13:03 -0700
Subject: memory_hotplug: introduce CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE

This patchset continues the work I started with commit 31bc3858ea3e
("memory-hotplug: add automatic onlining policy for the newly added
memory").

Initially I was going to stop there and bring the policy setting logic
to userspace.  I ran into two issues along the way:

 1) It is possible to have memory hotplugged at boot (e.g.  with QEMU).
    These blocks stay offlined if we turn the onlining policy on by
    userspace.

 2) My attempt to bring this policy setting to systemd failed, systemd
    maintainers suggest to change the default in kernel or ...  to use
    tmpfiles.d to alter the policy (which looks like a hack to me):
        https://github.com/systemd/systemd/pull/2938

Here I suggest to add a config option to set the default value for the
policy and a kernel command line parameter to make the override.

This patch (of 2):

Introduce config option to set the default value for memory hotplug
onlining policy (/sys/devices/system/memory/auto_online_blocks).  The
reasons one would want to turn this option on are to have early onlining
for hotpluggable memory available at boot and to not require any
userspace actions to make memory hotplug work.

[akpm@linux-foundation.org: tweak Kconfig text]
Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Lennart Poettering <lennart@poettering.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/memory-hotplug.txt |  9 +++++----
 mm/Kconfig                       | 16 ++++++++++++++++
 mm/memory_hotplug.c              |  4 ++++
 3 files changed, 25 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
index 443f4b44ad97..0d7cb955aa01 100644
--- a/Documentation/memory-hotplug.txt
+++ b/Documentation/memory-hotplug.txt
@@ -261,10 +261,11 @@ it according to the policy which can be read from "auto_online_blocks" file:
 
 % cat /sys/devices/system/memory/auto_online_blocks
 
-The default is "offline" which means the newly added memory is not in a
-ready-to-use state and you have to "online" the newly added memory blocks
-manually. Automatic onlining can be requested by writing "online" to
-"auto_online_blocks" file:
+The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+option. If it is disabled the default is "offline" which means the newly added
+memory is not in a ready-to-use state and you have to "online" the newly added
+memory blocks manually. Automatic onlining can be requested by writing "online"
+to "auto_online_blocks" file:
 
 % echo online > /sys/devices/system/memory/auto_online_blocks
 
diff --git a/mm/Kconfig b/mm/Kconfig
index d6e9042b99e0..b0432b71137d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE
 	def_bool y
 	depends on SPARSEMEM && MEMORY_HOTPLUG
 
+config MEMORY_HOTPLUG_DEFAULT_ONLINE
+        bool "Online the newly added memory blocks by default"
+        default n
+        depends on MEMORY_HOTPLUG
+        help
+	  This option sets the default policy setting for memory hotplug
+	  onlining policy (/sys/devices/system/memory/auto_online_blocks) which
+	  determines what happens to newly added memory regions. Policy setting
+	  can always be changed at runtime.
+	  See Documentation/memory-hotplug.txt for more information.
+
+	  Say Y here if you want all hot-plugged memory blocks to appear in
+	  'online' state by default.
+	  Say N here if you want the default policy to keep all hot-plugged
+	  memory blocks in 'offline' state.
+
 config MEMORY_HOTREMOVE
 	bool "Allow for memory hot remove"
 	select MEMORY_ISOLATION
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b21d8895ea41..fcafbfcff044 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -78,7 +78,11 @@ static struct {
 #define memhp_lock_acquire()      lock_map_acquire(&mem_hotplug.dep_map)
 #define memhp_lock_release()      lock_map_release(&mem_hotplug.dep_map)
 
+#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
 bool memhp_auto_online;
+#else
+bool memhp_auto_online = true;
+#endif
 EXPORT_SYMBOL_GPL(memhp_auto_online);
 
 void get_online_mems(void)
-- 
cgit v1.2.3


From 86dd995d63241039e0ad9123f9b424013c611510 Mon Sep 17 00:00:00 2001
From: Vitaly Kuznetsov <vkuznets@redhat.com>
Date: Thu, 19 May 2016 17:13:06 -0700
Subject: memory_hotplug: introduce memhp_default_state= command line parameter

CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE specifies the default value for the
memory hotplug onlining policy.  Add a command line parameter to make it
possible to override the default.  It may come in handy for debug and
testing purposes.

Signed-off-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Lennart Poettering <lennart@poettering.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kernel-parameters.txt |  8 ++++++++
 mm/memory_hotplug.c                 | 11 +++++++++++
 2 files changed, 19 insertions(+)

(limited to 'mm')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 794403166a8f..2edb27bdc680 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2161,6 +2161,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			[KNL,SH] Allow user to override the default size for
 			per-device physically contiguous DMA buffers.
 
+        memhp_default_state=online/offline
+			[KNL] Set the initial state for the memory hotplug
+			onlining policy. If not specified, the default value is
+			set according to the
+			CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config
+			option.
+			See Documentation/memory-hotplug.txt.
+
 	memmap=exactmap	[KNL,X86] Enable setting of an exact
 			E820 memory map, as specified by the user.
 			Such memmap=exactmap lines can be constructed based on
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fcafbfcff044..caf2a14c37ad 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -85,6 +85,17 @@ bool memhp_auto_online = true;
 #endif
 EXPORT_SYMBOL_GPL(memhp_auto_online);
 
+static int __init setup_memhp_default_state(char *str)
+{
+	if (!strcmp(str, "online"))
+		memhp_auto_online = true;
+	else if (!strcmp(str, "offline"))
+		memhp_auto_online = false;
+
+	return 1;
+}
+__setup("memhp_default_state=", setup_memhp_default_state);
+
 void get_online_mems(void)
 {
 	might_sleep();
-- 
cgit v1.2.3


From 3da88fb3bacfaa33ff9d13730d17110bb2d9604d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 19 May 2016 17:13:09 -0700
Subject: mm, oom: move GFP_NOFS check to out_of_memory

__alloc_pages_may_oom is the central place to decide when the
out_of_memory should be invoked.  This is a good approach for most
checks there because they are page allocator specific and the allocation
fails right after for all of them.

The notable exception is the GFP_NOFS context, which fakes
did_some_progress and keeps the page allocator looping even though there
couldn't have been any progress from the OOM killer.  This patch doesn't
change this behavior because we are not ready to allow those allocation
requests to fail yet (and maybe we will face the reality that we will
never manage to safely fail these requests).  Instead the __GFP_FS check
is moved down to out_of_memory, where it prevents OOM victim selection.
There are two reasons for that:

	- OOM notifiers might release some memory even from this context
	  as none of the registered notifiers seems to be FS related
	- this might help a dying thread to get access to memory
          reserves and move on which will make the behavior more
          consistent with the case when the task gets killed from a
          different context.

Keep a comment in __alloc_pages_may_oom to make sure we do not forget
how GFP_NOFS is special and that we really want to do something about
it.

Note to the current oom_notifier users:

The observable difference for you is that oom notifiers cannot depend on
any fs locks because we could deadlock.  Not that this would be allowed
today because that would just lock up the machine in most cases,
ruling out the OOM killer along the way.  Another difference is that
callbacks might be invoked sooner now because GFP_NOFS is a weaker
reclaim context and so there could be reclaimable memory which is just
not reachable now.  That would require GFP_NOFS-only loads, which are
really rare, and more importantly the observable result would be dropping
of reconstructible objects and potential performance drop which is not
such a big deal when we are struggling to fulfill other important
allocation requests.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Raushaniya Maksudova <rmaksudova@parallels.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c   |  9 +++++++++
 mm/page_alloc.c | 24 ++++++++++--------------
 2 files changed, 19 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 86349586eacb..32d8210b8773 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -876,6 +876,15 @@ bool out_of_memory(struct oom_control *oc)
 		return true;
 	}
 
+	/*
+	 * The OOM killer does not compensate for IO-less reclaim.
+	 * pagefault_out_of_memory lost its gfp context so we have to
+	 * make sure exclude 0 mask - all other users should have at least
+	 * ___GFP_DIRECT_RECLAIM to get here.
+	 */
+	if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL)))
+		return true;
+
 	/*
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index da6d339f1936..6d1c8b06b458 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2875,22 +2875,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 		/* The OOM killer does not needlessly kill tasks for lowmem */
 		if (ac->high_zoneidx < ZONE_NORMAL)
 			goto out;
-		/* The OOM killer does not compensate for IO-less reclaim */
-		if (!(gfp_mask & __GFP_FS)) {
-			/*
-			 * XXX: Page reclaim didn't yield anything,
-			 * and the OOM killer can't be invoked, but
-			 * keep looping as per tradition.
-			 *
-			 * But do not keep looping if oom_killer_disable()
-			 * was already called, for the system is trying to
-			 * enter a quiescent state during suspend.
-			 */
-			*did_some_progress = !oom_killer_disabled;
-			goto out;
-		}
 		if (pm_suspended_storage())
 			goto out;
+		/*
+		 * XXX: GFP_NOFS allocations should rather fail than rely on
+		 * other request to make a forward progress.
+		 * We are in an unfortunate situation where out_of_memory cannot
+		 * do much for this context but let's try it to at least get
+		 * access to memory reserved if the current task is killed (see
+		 * out_of_memory). Once filesystems are ready to handle allocation
+		 * failures more gracefully we should just bail out here.
+		 */
+
 		/* The OOM killer may not free memory on a specific node */
 		if (gfp_mask & __GFP_THISNODE)
 			goto out;
-- 
cgit v1.2.3


From 3ef22dfff2390e75b379f9715388a852aa56e0d5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 19 May 2016 17:13:12 -0700
Subject: oom, oom_reaper: try to reap tasks which skip regular OOM killer path

If either the current task is already killed or PF_EXITING or a selected
task is PF_EXITING then the oom killer is suppressed and so is the oom
reaper.  This patch adds try_oom_reaper which checks the given task and
queues it for the oom reaper if that is safe to do, meaning that the
task doesn't share the mm with a live process.

This might help to release the memory pressure while the task tries to
exit.

[akpm@linux-foundation.org: fix nommu build]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Raushaniya Maksudova <rmaksudova@parallels.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h |  8 +++++
 mm/memcontrol.c     |  1 +
 mm/oom_kill.c       | 86 ++++++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 77 insertions(+), 18 deletions(-)

(limited to 'mm')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 628a43242a34..83b9c39bd8b7 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p)
 
 extern void mark_oom_victim(struct task_struct *tsk);
 
+#ifdef CONFIG_MMU
+extern void try_oom_reaper(struct task_struct *tsk);
+#else
+static inline void try_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b40dcad2b90..d71d387868e6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1275,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 */
 	if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 		mark_oom_victim(current);
+		try_oom_reaper(current);
 		goto unlock;
 	}
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 32d8210b8773..850b6ff66bdf 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly;
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+/*
+ * task->mm can be NULL if the task is the exited group leader.  So to
+ * determine whether the task is using a particular mm, we examine all the
+ * task's threads: if one of those is using this mm then this task was also
+ * using it.
+ */
+static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
+{
+	struct task_struct *t;
+
+	for_each_thread(p, t) {
+		struct mm_struct *t_mm = READ_ONCE(t->mm);
+		if (t_mm)
+			return t_mm == mm;
+	}
+	return false;
+}
+
+
 #ifdef CONFIG_MMU
 /*
  * OOM Reaper kernel thread which tries to reap the memory used by the OOM
@@ -563,6 +582,53 @@ static void wake_oom_reaper(struct task_struct *tsk)
 	wake_up(&oom_reaper_wait);
 }
 
+/* Check if we can reap the given task. This has to be called with stable
+ * tsk->mm
+ */
+void try_oom_reaper(struct task_struct *tsk)
+{
+	struct mm_struct *mm = tsk->mm;
+	struct task_struct *p;
+
+	if (!mm)
+		return;
+
+	/*
+	 * There might be other threads/processes which are either not
+	 * dying or even not killable.
+	 */
+	if (atomic_read(&mm->mm_users) > 1) {
+		rcu_read_lock();
+		for_each_process(p) {
+			bool exiting;
+
+			if (!process_shares_mm(p, mm))
+				continue;
+			if (same_thread_group(p, tsk))
+				continue;
+			if (fatal_signal_pending(p))
+				continue;
+
+			/*
+			 * If the task is exiting make sure the whole thread group
+			 * is exiting and cannot acces mm anymore.
+			 */
+			spin_lock_irq(&p->sighand->siglock);
+			exiting = signal_group_exit(p->signal);
+			spin_unlock_irq(&p->sighand->siglock);
+			if (exiting)
+				continue;
+
+			/* Give up */
+			rcu_read_unlock();
+			return;
+		}
+		rcu_read_unlock();
+	}
+
+	wake_oom_reaper(tsk);
+}
+
 static int __init oom_init(void)
 {
 	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -652,24 +718,6 @@ void oom_killer_enable(void)
 	oom_killer_disabled = false;
 }
 
-/*
- * task->mm can be NULL if the task is the exited group leader.  So to
- * determine whether the task is using a particular mm, we examine all the
- * task's threads: if one of those is using this mm then this task was also
- * using it.
- */
-static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm)
-{
-	struct task_struct *t;
-
-	for_each_thread(p, t) {
-		struct mm_struct *t_mm = READ_ONCE(t->mm);
-		if (t_mm)
-			return t_mm == mm;
-	}
-	return false;
-}
-
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
@@ -694,6 +742,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p,
 	task_lock(p);
 	if (p->mm && task_will_free_mem(p)) {
 		mark_oom_victim(p);
+		try_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -873,6 +922,7 @@ bool out_of_memory(struct oom_control *oc)
 	if (current->mm &&
 	    (fatal_signal_pending(current) || task_will_free_mem(current))) {
 		mark_oom_victim(current);
+		try_oom_reaper(current);
 		return true;
 	}
 
-- 
cgit v1.2.3


From 449d777d7ad6d7f9ac5ed8f618fa13e6ff36c32f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 19 May 2016 17:13:15 -0700
Subject: mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper

Right now the oom reaper will clear TIF_MEMDIE only for tasks which were
successfully reaped.  This is the safest option because we know that
such an oom victim would only block forward progress of the oom killer
without a good reason because it is highly unlikely it would release
much more memory.  Basically most of its memory has been already torn
down.

We can relax this assumption to catch more corner cases though.

The first obvious one is when the oom victim clears its mm and gets
stuck later on.  oom_reaper would back off on find_lock_task_mm returning
NULL.  We can safely try to clear TIF_MEMDIE in this case because such a
task would be ignored by the oom killer anyway.  The flag would be
cleared by that time already most of the time anyway.

The less obvious one is when the oom reaper fails due to mmap_sem
contention.  Even if we clear TIF_MEMDIE for this task then it is not
very likely that we would select another task too easily because we
haven't reaped the last victim and so it would be still the #1
candidate.  There is a rare race condition possible when the current
victim terminates before the next select_bad_process but considering
that oom_reap_task had retried several times before giving up then this
sounds like a borderline thing.

After this patch we should have a guarantee that the OOM killer will not
be blocked for an unbounded amount of time in most cases.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Raushaniya Maksudova <rmaksudova@parallels.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 850b6ff66bdf..415f7eb913fa 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -510,14 +510,10 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	up_read(&mm->mmap_sem);
 
 	/*
-	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
-	 * reasonably reclaimable memory anymore. OOM killer can continue
-	 * by selecting other victim if unmapping hasn't led to any
-	 * improvements. This also means that selecting this task doesn't
-	 * make any sense.
+	 * This task can be safely ignored because we cannot do much more
+	 * to release its memory.
 	 */
 	tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
-	exit_oom_victim(tsk);
 out:
 	mmput(mm);
 	return ret;
@@ -538,6 +534,15 @@ static void oom_reap_task(struct task_struct *tsk)
 		debug_show_all_locks();
 	}
 
+	/*
+	 * Clear TIF_MEMDIE because the task shouldn't be sitting on a
+	 * reasonably reclaimable memory anymore or it is not a good candidate
+	 * for the oom victim right now because it cannot release its memory
+	 * itself nor by the oom reaper.
+	 */
+	tsk->oom_reaper_list = NULL;
+	exit_oom_victim(tsk);
+
 	/* Drop a reference taken by wake_oom_reaper */
 	put_task_struct(tsk);
 }
-- 
cgit v1.2.3


From d61f8590397480981f0d3ee7a2b38b5ea990db52 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:18 -0700
Subject: mm, page_alloc: only check PageCompound for high-order pages

Another year, another round of page allocator optimisations focusing
this time on the alloc and free fast paths.  This should be of help to
workloads that are allocator-intensive from kernel space where the cost
of zeroing is not necessarily incurred.

The series is motivated by the observation that page alloc
microbenchmarks on multiple machines regressed between 3.12.44 and 4.4.
Second, there are discussions before LSF/MM considering the possibility
of adding another page allocator which is potentially hazardous but a
patch series improving performance is better than whining.

After the series is applied, there are still hazards.  In the free
paths, the debugging checking and page zone/pageblock lookups dominate
but there was not an obvious solution to that.  In the alloc path, the
major contributors are dealing with zonelists, new page preparation, the
fair zone allocation and numerous statistic updates.  The fair zone
allocator is removed by the per-node LRU series if that gets merged so
it's not a major concern at the moment.

On normal userspace benchmarks, there is little impact as the zeroing
cost is significant but it's visible

  aim9
                                 4.6.0-rc3             4.6.0-rc3
                                   vanilla         deferalloc-v3
  Min      page_test   828693.33 (  0.00%)   887060.00 (  7.04%)
  Min      brk_test   4847266.67 (  0.00%)  4966266.67 (  2.45%)
  Min      exec_test     1271.00 (  0.00%)     1275.67 (  0.37%)
  Min      fork_test    12371.75 (  0.00%)    12380.00 (  0.07%)

The overall impact on a page allocator microbenchmark for a range of orders
and number of pages allocated in a batch is

                                            4.6.0-rc3                  4.6.0-rc3
                                               vanilla            deferalloc-v3r7
  Min      alloc-odr0-1               428.00 (  0.00%)           316.00 ( 26.17%)
  Min      alloc-odr0-2               314.00 (  0.00%)           231.00 ( 26.43%)
  Min      alloc-odr0-4               256.00 (  0.00%)           192.00 ( 25.00%)
  Min      alloc-odr0-8               222.00 (  0.00%)           166.00 ( 25.23%)
  Min      alloc-odr0-16              207.00 (  0.00%)           154.00 ( 25.60%)
  Min      alloc-odr0-32              197.00 (  0.00%)           148.00 ( 24.87%)
  Min      alloc-odr0-64              193.00 (  0.00%)           144.00 ( 25.39%)
  Min      alloc-odr0-128             191.00 (  0.00%)           143.00 ( 25.13%)
  Min      alloc-odr0-256             203.00 (  0.00%)           153.00 ( 24.63%)
  Min      alloc-odr0-512             212.00 (  0.00%)           165.00 ( 22.17%)
  Min      alloc-odr0-1024            221.00 (  0.00%)           172.00 ( 22.17%)
  Min      alloc-odr0-2048            225.00 (  0.00%)           179.00 ( 20.44%)
  Min      alloc-odr0-4096            232.00 (  0.00%)           185.00 ( 20.26%)
  Min      alloc-odr0-8192            235.00 (  0.00%)           187.00 ( 20.43%)
  Min      alloc-odr0-16384           236.00 (  0.00%)           188.00 ( 20.34%)
  Min      alloc-odr1-1               519.00 (  0.00%)           450.00 ( 13.29%)
  Min      alloc-odr1-2               391.00 (  0.00%)           336.00 ( 14.07%)
  Min      alloc-odr1-4               313.00 (  0.00%)           268.00 ( 14.38%)
  Min      alloc-odr1-8               277.00 (  0.00%)           235.00 ( 15.16%)
  Min      alloc-odr1-16              256.00 (  0.00%)           218.00 ( 14.84%)
  Min      alloc-odr1-32              252.00 (  0.00%)           212.00 ( 15.87%)
  Min      alloc-odr1-64              244.00 (  0.00%)           206.00 ( 15.57%)
  Min      alloc-odr1-128             244.00 (  0.00%)           207.00 ( 15.16%)
  Min      alloc-odr1-256             243.00 (  0.00%)           207.00 ( 14.81%)
  Min      alloc-odr1-512             245.00 (  0.00%)           209.00 ( 14.69%)
  Min      alloc-odr1-1024            248.00 (  0.00%)           214.00 ( 13.71%)
  Min      alloc-odr1-2048            253.00 (  0.00%)           220.00 ( 13.04%)
  Min      alloc-odr1-4096            258.00 (  0.00%)           224.00 ( 13.18%)
  Min      alloc-odr1-8192            261.00 (  0.00%)           229.00 ( 12.26%)
  Min      alloc-odr2-1               560.00 (  0.00%)           753.00 (-34.46%)
  Min      alloc-odr2-2               424.00 (  0.00%)           351.00 ( 17.22%)
  Min      alloc-odr2-4               339.00 (  0.00%)           393.00 (-15.93%)
  Min      alloc-odr2-8               298.00 (  0.00%)           246.00 ( 17.45%)
  Min      alloc-odr2-16              276.00 (  0.00%)           227.00 ( 17.75%)
  Min      alloc-odr2-32              271.00 (  0.00%)           221.00 ( 18.45%)
  Min      alloc-odr2-64              264.00 (  0.00%)           217.00 ( 17.80%)
  Min      alloc-odr2-128             264.00 (  0.00%)           217.00 ( 17.80%)
  Min      alloc-odr2-256             264.00 (  0.00%)           218.00 ( 17.42%)
  Min      alloc-odr2-512             269.00 (  0.00%)           223.00 ( 17.10%)
  Min      alloc-odr2-1024            279.00 (  0.00%)           230.00 ( 17.56%)
  Min      alloc-odr2-2048            283.00 (  0.00%)           235.00 ( 16.96%)
  Min      alloc-odr2-4096            285.00 (  0.00%)           239.00 ( 16.14%)
  Min      alloc-odr3-1               629.00 (  0.00%)           505.00 ( 19.71%)
  Min      alloc-odr3-2               472.00 (  0.00%)           374.00 ( 20.76%)
  Min      alloc-odr3-4               383.00 (  0.00%)           301.00 ( 21.41%)
  Min      alloc-odr3-8               341.00 (  0.00%)           266.00 ( 21.99%)
  Min      alloc-odr3-16              316.00 (  0.00%)           248.00 ( 21.52%)
  Min      alloc-odr3-32              308.00 (  0.00%)           241.00 ( 21.75%)
  Min      alloc-odr3-64              305.00 (  0.00%)           241.00 ( 20.98%)
  Min      alloc-odr3-128             308.00 (  0.00%)           244.00 ( 20.78%)
  Min      alloc-odr3-256             317.00 (  0.00%)           249.00 ( 21.45%)
  Min      alloc-odr3-512             327.00 (  0.00%)           256.00 ( 21.71%)
  Min      alloc-odr3-1024            331.00 (  0.00%)           261.00 ( 21.15%)
  Min      alloc-odr3-2048            333.00 (  0.00%)           266.00 ( 20.12%)
  Min      alloc-odr4-1               767.00 (  0.00%)           572.00 ( 25.42%)
  Min      alloc-odr4-2               578.00 (  0.00%)           429.00 ( 25.78%)
  Min      alloc-odr4-4               474.00 (  0.00%)           346.00 ( 27.00%)
  Min      alloc-odr4-8               422.00 (  0.00%)           310.00 ( 26.54%)
  Min      alloc-odr4-16              399.00 (  0.00%)           295.00 ( 26.07%)
  Min      alloc-odr4-32              392.00 (  0.00%)           293.00 ( 25.26%)
  Min      alloc-odr4-64              394.00 (  0.00%)           293.00 ( 25.63%)
  Min      alloc-odr4-128             405.00 (  0.00%)           305.00 ( 24.69%)
  Min      alloc-odr4-256             417.00 (  0.00%)           319.00 ( 23.50%)
  Min      alloc-odr4-512             425.00 (  0.00%)           326.00 ( 23.29%)
  Min      alloc-odr4-1024            426.00 (  0.00%)           329.00 ( 22.77%)
  Min      free-odr0-1                216.00 (  0.00%)           178.00 ( 17.59%)
  Min      free-odr0-2                152.00 (  0.00%)           125.00 ( 17.76%)
  Min      free-odr0-4                120.00 (  0.00%)            99.00 ( 17.50%)
  Min      free-odr0-8                106.00 (  0.00%)            85.00 ( 19.81%)
  Min      free-odr0-16                97.00 (  0.00%)            80.00 ( 17.53%)
  Min      free-odr0-32                92.00 (  0.00%)            76.00 ( 17.39%)
  Min      free-odr0-64                89.00 (  0.00%)            74.00 ( 16.85%)
  Min      free-odr0-128               89.00 (  0.00%)            73.00 ( 17.98%)
  Min      free-odr0-256              107.00 (  0.00%)            90.00 ( 15.89%)
  Min      free-odr0-512              117.00 (  0.00%)           108.00 (  7.69%)
  Min      free-odr0-1024             125.00 (  0.00%)           118.00 (  5.60%)
  Min      free-odr0-2048             132.00 (  0.00%)           125.00 (  5.30%)
  Min      free-odr0-4096             135.00 (  0.00%)           130.00 (  3.70%)
  Min      free-odr0-8192             137.00 (  0.00%)           130.00 (  5.11%)
  Min      free-odr0-16384            137.00 (  0.00%)           131.00 (  4.38%)
  Min      free-odr1-1                318.00 (  0.00%)           289.00 (  9.12%)
  Min      free-odr1-2                228.00 (  0.00%)           207.00 (  9.21%)
  Min      free-odr1-4                182.00 (  0.00%)           165.00 (  9.34%)
  Min      free-odr1-8                163.00 (  0.00%)           146.00 ( 10.43%)
  Min      free-odr1-16               151.00 (  0.00%)           135.00 ( 10.60%)
  Min      free-odr1-32               146.00 (  0.00%)           129.00 ( 11.64%)
  Min      free-odr1-64               145.00 (  0.00%)           130.00 ( 10.34%)
  Min      free-odr1-128              148.00 (  0.00%)           134.00 (  9.46%)
  Min      free-odr1-256              148.00 (  0.00%)           137.00 (  7.43%)
  Min      free-odr1-512              151.00 (  0.00%)           140.00 (  7.28%)
  Min      free-odr1-1024             154.00 (  0.00%)           143.00 (  7.14%)
  Min      free-odr1-2048             156.00 (  0.00%)           144.00 (  7.69%)
  Min      free-odr1-4096             156.00 (  0.00%)           142.00 (  8.97%)
  Min      free-odr1-8192             156.00 (  0.00%)           140.00 ( 10.26%)
  Min      free-odr2-1                361.00 (  0.00%)           457.00 (-26.59%)
  Min      free-odr2-2                258.00 (  0.00%)           224.00 ( 13.18%)
  Min      free-odr2-4                208.00 (  0.00%)           223.00 ( -7.21%)
  Min      free-odr2-8                185.00 (  0.00%)           160.00 ( 13.51%)
  Min      free-odr2-16               173.00 (  0.00%)           149.00 ( 13.87%)
  Min      free-odr2-32               166.00 (  0.00%)           145.00 ( 12.65%)
  Min      free-odr2-64               166.00 (  0.00%)           146.00 ( 12.05%)
  Min      free-odr2-128              169.00 (  0.00%)           148.00 ( 12.43%)
  Min      free-odr2-256              170.00 (  0.00%)           152.00 ( 10.59%)
  Min      free-odr2-512              177.00 (  0.00%)           156.00 ( 11.86%)
  Min      free-odr2-1024             182.00 (  0.00%)           162.00 ( 10.99%)
  Min      free-odr2-2048             181.00 (  0.00%)           160.00 ( 11.60%)
  Min      free-odr2-4096             180.00 (  0.00%)           159.00 ( 11.67%)
  Min      free-odr3-1                431.00 (  0.00%)           367.00 ( 14.85%)
  Min      free-odr3-2                306.00 (  0.00%)           259.00 ( 15.36%)
  Min      free-odr3-4                249.00 (  0.00%)           208.00 ( 16.47%)
  Min      free-odr3-8                224.00 (  0.00%)           186.00 ( 16.96%)
  Min      free-odr3-16               208.00 (  0.00%)           176.00 ( 15.38%)
  Min      free-odr3-32               206.00 (  0.00%)           174.00 ( 15.53%)
  Min      free-odr3-64               210.00 (  0.00%)           178.00 ( 15.24%)
  Min      free-odr3-128              215.00 (  0.00%)           182.00 ( 15.35%)
  Min      free-odr3-256              224.00 (  0.00%)           189.00 ( 15.62%)
  Min      free-odr3-512              232.00 (  0.00%)           195.00 ( 15.95%)
  Min      free-odr3-1024             230.00 (  0.00%)           195.00 ( 15.22%)
  Min      free-odr3-2048             229.00 (  0.00%)           193.00 ( 15.72%)
  Min      free-odr4-1                561.00 (  0.00%)           439.00 ( 21.75%)
  Min      free-odr4-2                418.00 (  0.00%)           318.00 ( 23.92%)
  Min      free-odr4-4                339.00 (  0.00%)           269.00 ( 20.65%)
  Min      free-odr4-8                299.00 (  0.00%)           239.00 ( 20.07%)
  Min      free-odr4-16               289.00 (  0.00%)           234.00 ( 19.03%)
  Min      free-odr4-32               291.00 (  0.00%)           235.00 ( 19.24%)
  Min      free-odr4-64               298.00 (  0.00%)           238.00 ( 20.13%)
  Min      free-odr4-128              308.00 (  0.00%)           251.00 ( 18.51%)
  Min      free-odr4-256              321.00 (  0.00%)           267.00 ( 16.82%)
  Min      free-odr4-512              327.00 (  0.00%)           269.00 ( 17.74%)
  Min      free-odr4-1024             326.00 (  0.00%)           271.00 ( 16.87%)
  Min      total-odr0-1               644.00 (  0.00%)           494.00 ( 23.29%)
  Min      total-odr0-2               466.00 (  0.00%)           356.00 ( 23.61%)
  Min      total-odr0-4               376.00 (  0.00%)           291.00 ( 22.61%)
  Min      total-odr0-8               328.00 (  0.00%)           251.00 ( 23.48%)
  Min      total-odr0-16              304.00 (  0.00%)           234.00 ( 23.03%)
  Min      total-odr0-32              289.00 (  0.00%)           224.00 ( 22.49%)
  Min      total-odr0-64              282.00 (  0.00%)           218.00 ( 22.70%)
  Min      total-odr0-128             280.00 (  0.00%)           216.00 ( 22.86%)
  Min      total-odr0-256             310.00 (  0.00%)           243.00 ( 21.61%)
  Min      total-odr0-512             329.00 (  0.00%)           273.00 ( 17.02%)
  Min      total-odr0-1024            346.00 (  0.00%)           290.00 ( 16.18%)
  Min      total-odr0-2048            357.00 (  0.00%)           304.00 ( 14.85%)
  Min      total-odr0-4096            367.00 (  0.00%)           315.00 ( 14.17%)
  Min      total-odr0-8192            372.00 (  0.00%)           317.00 ( 14.78%)
  Min      total-odr0-16384           373.00 (  0.00%)           319.00 ( 14.48%)
  Min      total-odr1-1               838.00 (  0.00%)           739.00 ( 11.81%)
  Min      total-odr1-2               619.00 (  0.00%)           543.00 ( 12.28%)
  Min      total-odr1-4               495.00 (  0.00%)           433.00 ( 12.53%)
  Min      total-odr1-8               440.00 (  0.00%)           382.00 ( 13.18%)
  Min      total-odr1-16              407.00 (  0.00%)           353.00 ( 13.27%)
  Min      total-odr1-32              398.00 (  0.00%)           341.00 ( 14.32%)
  Min      total-odr1-64              389.00 (  0.00%)           336.00 ( 13.62%)
  Min      total-odr1-128             392.00 (  0.00%)           341.00 ( 13.01%)
  Min      total-odr1-256             391.00 (  0.00%)           344.00 ( 12.02%)
  Min      total-odr1-512             396.00 (  0.00%)           349.00 ( 11.87%)
  Min      total-odr1-1024            402.00 (  0.00%)           357.00 ( 11.19%)
  Min      total-odr1-2048            409.00 (  0.00%)           364.00 ( 11.00%)
  Min      total-odr1-4096            414.00 (  0.00%)           366.00 ( 11.59%)
  Min      total-odr1-8192            417.00 (  0.00%)           369.00 ( 11.51%)
  Min      total-odr2-1               921.00 (  0.00%)          1210.00 (-31.38%)
  Min      total-odr2-2               682.00 (  0.00%)           576.00 ( 15.54%)
  Min      total-odr2-4               547.00 (  0.00%)           616.00 (-12.61%)
  Min      total-odr2-8               483.00 (  0.00%)           406.00 ( 15.94%)
  Min      total-odr2-16              449.00 (  0.00%)           376.00 ( 16.26%)
  Min      total-odr2-32              437.00 (  0.00%)           366.00 ( 16.25%)
  Min      total-odr2-64              431.00 (  0.00%)           363.00 ( 15.78%)
  Min      total-odr2-128             433.00 (  0.00%)           365.00 ( 15.70%)
  Min      total-odr2-256             434.00 (  0.00%)           371.00 ( 14.52%)
  Min      total-odr2-512             446.00 (  0.00%)           379.00 ( 15.02%)
  Min      total-odr2-1024            461.00 (  0.00%)           392.00 ( 14.97%)
  Min      total-odr2-2048            464.00 (  0.00%)           395.00 ( 14.87%)
  Min      total-odr2-4096            465.00 (  0.00%)           398.00 ( 14.41%)
  Min      total-odr3-1              1060.00 (  0.00%)           872.00 ( 17.74%)
  Min      total-odr3-2               778.00 (  0.00%)           633.00 ( 18.64%)
  Min      total-odr3-4               632.00 (  0.00%)           510.00 ( 19.30%)
  Min      total-odr3-8               565.00 (  0.00%)           452.00 ( 20.00%)
  Min      total-odr3-16              524.00 (  0.00%)           424.00 ( 19.08%)
  Min      total-odr3-32              514.00 (  0.00%)           415.00 ( 19.26%)
  Min      total-odr3-64              515.00 (  0.00%)           419.00 ( 18.64%)
  Min      total-odr3-128             523.00 (  0.00%)           426.00 ( 18.55%)
  Min      total-odr3-256             541.00 (  0.00%)           438.00 ( 19.04%)
  Min      total-odr3-512             559.00 (  0.00%)           451.00 ( 19.32%)
  Min      total-odr3-1024            561.00 (  0.00%)           456.00 ( 18.72%)
  Min      total-odr3-2048            562.00 (  0.00%)           459.00 ( 18.33%)
  Min      total-odr4-1              1328.00 (  0.00%)          1011.00 ( 23.87%)
  Min      total-odr4-2               997.00 (  0.00%)           747.00 ( 25.08%)
  Min      total-odr4-4               813.00 (  0.00%)           615.00 ( 24.35%)
  Min      total-odr4-8               721.00 (  0.00%)           550.00 ( 23.72%)
  Min      total-odr4-16              689.00 (  0.00%)           529.00 ( 23.22%)
  Min      total-odr4-32              683.00 (  0.00%)           528.00 ( 22.69%)
  Min      total-odr4-64              692.00 (  0.00%)           531.00 ( 23.27%)
  Min      total-odr4-128             713.00 (  0.00%)           556.00 ( 22.02%)
  Min      total-odr4-256             738.00 (  0.00%)           586.00 ( 20.60%)
  Min      total-odr4-512             753.00 (  0.00%)           595.00 ( 20.98%)
  Min      total-odr4-1024            752.00 (  0.00%)           600.00 ( 20.21%)

This patch (of 27):

order-0 pages by definition cannot be compound so avoid the check in the
fast path for those pages.

[akpm@linux-foundation.org: use unlikely(order) in free_pages_prepare(), per Vlastimil]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d1c8b06b458..087ba3e417ec 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1024,24 +1024,33 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
 
 static bool free_pages_prepare(struct page *page, unsigned int order)
 {
-	bool compound = PageCompound(page);
-	int i, bad = 0;
+	int bad = 0;
 
 	VM_BUG_ON_PAGE(PageTail(page), page);
-	VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
 
 	trace_mm_page_free(page, order);
 	kmemcheck_free_shadow(page, order);
 	kasan_free_pages(page, order);
 
+	/*
+	 * Check tail pages before head page information is cleared to
+	 * avoid checking PageCompound for order-0 pages.
+	 */
+	if (unlikely(order)) {
+		bool compound = PageCompound(page);
+		int i;
+
+		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
+
+		for (i = 1; i < (1 << order); i++) {
+			if (compound)
+				bad += free_tail_pages_check(page, page + i);
+			bad += free_pages_check(page + i);
+		}
+	}
 	if (PageAnon(page))
 		page->mapping = NULL;
 	bad += free_pages_check(page);
-	for (i = 1; i < (1 << order); i++) {
-		if (compound)
-			bad += free_tail_pages_check(page, page + i);
-		bad += free_pages_check(page + i);
-	}
 	if (bad)
 		return false;
 
-- 
cgit v1.2.3


From 175145748d00794369317070dd19ce12dd816241 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:21 -0700
Subject: mm, page_alloc: use new PageAnonHead helper in the free page fast
 path

The PageAnon check always checks for compound_head but this is a
relatively expensive check if the caller already knows the page is a
head page.  This patch creates a helper and uses it in the page free
path which only operates on head pages.

With this patch and "Only check PageCompound for high-order pages", the
performance difference on a page allocator microbenchmark is;

                                             4.6.0-rc2                  4.6.0-rc2
                                               vanilla           nocompound-v1r20
  Min      alloc-odr0-1               425.00 (  0.00%)           417.00 (  1.88%)
  Min      alloc-odr0-2               313.00 (  0.00%)           308.00 (  1.60%)
  Min      alloc-odr0-4               257.00 (  0.00%)           253.00 (  1.56%)
  Min      alloc-odr0-8               224.00 (  0.00%)           221.00 (  1.34%)
  Min      alloc-odr0-16              208.00 (  0.00%)           205.00 (  1.44%)
  Min      alloc-odr0-32              199.00 (  0.00%)           199.00 (  0.00%)
  Min      alloc-odr0-64              195.00 (  0.00%)           193.00 (  1.03%)
  Min      alloc-odr0-128             192.00 (  0.00%)           191.00 (  0.52%)
  Min      alloc-odr0-256             204.00 (  0.00%)           200.00 (  1.96%)
  Min      alloc-odr0-512             213.00 (  0.00%)           212.00 (  0.47%)
  Min      alloc-odr0-1024            219.00 (  0.00%)           219.00 (  0.00%)
  Min      alloc-odr0-2048            225.00 (  0.00%)           225.00 (  0.00%)
  Min      alloc-odr0-4096            230.00 (  0.00%)           231.00 ( -0.43%)
  Min      alloc-odr0-8192            235.00 (  0.00%)           234.00 (  0.43%)
  Min      alloc-odr0-16384           235.00 (  0.00%)           234.00 (  0.43%)
  Min      free-odr0-1                215.00 (  0.00%)           191.00 ( 11.16%)
  Min      free-odr0-2                152.00 (  0.00%)           136.00 ( 10.53%)
  Min      free-odr0-4                119.00 (  0.00%)           107.00 ( 10.08%)
  Min      free-odr0-8                106.00 (  0.00%)            96.00 (  9.43%)
  Min      free-odr0-16                97.00 (  0.00%)            87.00 ( 10.31%)
  Min      free-odr0-32                91.00 (  0.00%)            83.00 (  8.79%)
  Min      free-odr0-64                89.00 (  0.00%)            81.00 (  8.99%)
  Min      free-odr0-128               88.00 (  0.00%)            80.00 (  9.09%)
  Min      free-odr0-256              106.00 (  0.00%)            95.00 ( 10.38%)
  Min      free-odr0-512              116.00 (  0.00%)           111.00 (  4.31%)
  Min      free-odr0-1024             125.00 (  0.00%)           118.00 (  5.60%)
  Min      free-odr0-2048             133.00 (  0.00%)           126.00 (  5.26%)
  Min      free-odr0-4096             136.00 (  0.00%)           130.00 (  4.41%)
  Min      free-odr0-8192             138.00 (  0.00%)           130.00 (  5.80%)
  Min      free-odr0-16384            137.00 (  0.00%)           130.00 (  5.11%)

There is a sizable boost to the free allocator performance.  While there
is an apparent boost on the allocation side, it's likely a coincidence
or due to the patches slightly reducing cache footprint.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 7 ++++++-
 mm/page_alloc.c            | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6b052aa7b5b7..a61e06e5fbce 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY)
 #define PAGE_MAPPING_KSM	2
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
 
+static __always_inline int PageAnonHead(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
 static __always_inline int PageAnon(struct page *page)
 {
 	page = compound_head(page);
-	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+	return PageAnonHead(page);
 }
 
 #ifdef CONFIG_KSM
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 087ba3e417ec..7be1ce8b6be0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1048,7 +1048,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 			bad += free_pages_check(page + i);
 		}
 	}
-	if (PageAnon(page))
+	if (PageAnonHead(page))
 		page->mapping = NULL;
 	bad += free_pages_check(page);
 	if (bad)
-- 
cgit v1.2.3


From b9f00e147f27d86691f7f52a3c8126d25432477c Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:24 -0700
Subject: mm, page_alloc: reduce branches in zone_statistics

zone_statistics has more branches than it really needs to take an
unlikely GFP flag into account.  Reduce the number and annotate the
unlikely flag.

The performance difference on a page allocator microbenchmark is:

                                             4.6.0-rc2                  4.6.0-rc2
                                      nocompound-v1r10           statbranch-v1r10
  Min      alloc-odr0-1               417.00 (  0.00%)           419.00 ( -0.48%)
  Min      alloc-odr0-2               308.00 (  0.00%)           305.00 (  0.97%)
  Min      alloc-odr0-4               253.00 (  0.00%)           250.00 (  1.19%)
  Min      alloc-odr0-8               221.00 (  0.00%)           219.00 (  0.90%)
  Min      alloc-odr0-16              205.00 (  0.00%)           203.00 (  0.98%)
  Min      alloc-odr0-32              199.00 (  0.00%)           195.00 (  2.01%)
  Min      alloc-odr0-64              193.00 (  0.00%)           191.00 (  1.04%)
  Min      alloc-odr0-128             191.00 (  0.00%)           189.00 (  1.05%)
  Min      alloc-odr0-256             200.00 (  0.00%)           198.00 (  1.00%)
  Min      alloc-odr0-512             212.00 (  0.00%)           210.00 (  0.94%)
  Min      alloc-odr0-1024            219.00 (  0.00%)           216.00 (  1.37%)
  Min      alloc-odr0-2048            225.00 (  0.00%)           221.00 (  1.78%)
  Min      alloc-odr0-4096            231.00 (  0.00%)           227.00 (  1.73%)
  Min      alloc-odr0-8192            234.00 (  0.00%)           232.00 (  0.85%)
  Min      alloc-odr0-16384           234.00 (  0.00%)           232.00 (  0.85%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index c831be32a1a3..d585de27e960 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -581,17 +581,21 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
  */
 void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
 {
-	if (z->zone_pgdat == preferred_zone->zone_pgdat) {
+	int local_nid = numa_node_id();
+	enum zone_stat_item local_stat = NUMA_LOCAL;
+
+	if (unlikely(flags & __GFP_OTHER_NODE)) {
+		local_stat = NUMA_OTHER;
+		local_nid = preferred_zone->node;
+	}
+
+	if (z->node == local_nid) {
 		__inc_zone_state(z, NUMA_HIT);
+		__inc_zone_state(z, local_stat);
 	} else {
 		__inc_zone_state(z, NUMA_MISS);
 		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
 	}
-	if (z->node == ((flags & __GFP_OTHER_NODE) ?
-			preferred_zone->node : numa_node_id()))
-		__inc_zone_state(z, NUMA_LOCAL);
-	else
-		__inc_zone_state(z, NUMA_OTHER);
 }
 
 /*
-- 
cgit v1.2.3


From 060e74173f292fb3e0398b3dca8765568d195ff1 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:27 -0700
Subject: mm, page_alloc: inline zone_statistics

zone_statistics has one call-site but it's a public function.  Make it
static and inline.

The performance difference on a page allocator microbenchmark is:

                                             4.6.0-rc2                  4.6.0-rc2
                                      statbranch-v1r20           statinline-v1r20
  Min      alloc-odr0-1               419.00 (  0.00%)           412.00 (  1.67%)
  Min      alloc-odr0-2               305.00 (  0.00%)           301.00 (  1.31%)
  Min      alloc-odr0-4               250.00 (  0.00%)           247.00 (  1.20%)
  Min      alloc-odr0-8               219.00 (  0.00%)           215.00 (  1.83%)
  Min      alloc-odr0-16              203.00 (  0.00%)           199.00 (  1.97%)
  Min      alloc-odr0-32              195.00 (  0.00%)           191.00 (  2.05%)
  Min      alloc-odr0-64              191.00 (  0.00%)           187.00 (  2.09%)
  Min      alloc-odr0-128             189.00 (  0.00%)           185.00 (  2.12%)
  Min      alloc-odr0-256             198.00 (  0.00%)           193.00 (  2.53%)
  Min      alloc-odr0-512             210.00 (  0.00%)           207.00 (  1.43%)
  Min      alloc-odr0-1024            216.00 (  0.00%)           213.00 (  1.39%)
  Min      alloc-odr0-2048            221.00 (  0.00%)           220.00 (  0.45%)
  Min      alloc-odr0-4096            227.00 (  0.00%)           226.00 (  0.44%)
  Min      alloc-odr0-8192            232.00 (  0.00%)           229.00 (  1.29%)
  Min      alloc-odr0-16384           232.00 (  0.00%)           229.00 (  1.29%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h |  2 --
 mm/page_alloc.c        | 31 +++++++++++++++++++++++++++++++
 mm/vmstat.c            | 29 -----------------------------
 3 files changed, 31 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 02fce415b3d9..d2da8e053210 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 #ifdef CONFIG_NUMA
 
 extern unsigned long node_page_state(int node, enum zone_stat_item item);
-extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
 
 #else
 
 #define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl, _z, gfp) do { } while (0)
 
 #endif /* CONFIG_NUMA */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7be1ce8b6be0..36384baa74e1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2354,6 +2354,37 @@ int split_free_page(struct page *page)
 	return nr_pages;
 }
 
+/*
+ * Update NUMA hit/miss statistics
+ *
+ * Must be called with interrupts disabled.
+ *
+ * When __GFP_OTHER_NODE is set assume the node of the preferred
+ * zone is the local node. This is useful for daemons who allocate
+ * memory on behalf of other processes.
+ */
+static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
+								gfp_t flags)
+{
+#ifdef CONFIG_NUMA
+	int local_nid = numa_node_id();
+	enum zone_stat_item local_stat = NUMA_LOCAL;
+
+	if (unlikely(flags & __GFP_OTHER_NODE)) {
+		local_stat = NUMA_OTHER;
+		local_nid = preferred_zone->node;
+	}
+
+	if (z->node == local_nid) {
+		__inc_zone_state(z, NUMA_HIT);
+		__inc_zone_state(z, local_stat);
+	} else {
+		__inc_zone_state(z, NUMA_MISS);
+		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
+	}
+#endif
+}
+
 /*
  * Allocate a page from the given zone. Use pcplists for order-0 allocations.
  */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d585de27e960..f1a73bfb77b5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -569,35 +569,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 #endif
 
 #ifdef CONFIG_NUMA
-/*
- * zonelist = the list of zones passed to the allocator
- * z 	    = the zone from which the allocation occurred.
- *
- * Must be called with interrupts disabled.
- *
- * When __GFP_OTHER_NODE is set assume the node of the preferred
- * zone is the local node. This is useful for daemons who allocate
- * memory on behalf of other processes.
- */
-void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags)
-{
-	int local_nid = numa_node_id();
-	enum zone_stat_item local_stat = NUMA_LOCAL;
-
-	if (unlikely(flags & __GFP_OTHER_NODE)) {
-		local_stat = NUMA_OTHER;
-		local_nid = preferred_zone->node;
-	}
-
-	if (z->node == local_nid) {
-		__inc_zone_state(z, NUMA_HIT);
-		__inc_zone_state(z, local_stat);
-	} else {
-		__inc_zone_state(z, NUMA_MISS);
-		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
-	}
-}
-
 /*
  * Determine the per node value of a stat item.
  */
-- 
cgit v1.2.3


From 682a3385e7734fa3abbd504cbeb5fe91793f1827 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:30 -0700
Subject: mm, page_alloc: inline the fast path of the zonelist iterator

The page allocator iterates through a zonelist for zones that match the
addressing limitations and nodemask of the caller but many allocations
will not be restricted.  Despite this, there is always function call
overhead which builds up.

This patch inlines the optimistic basic case and only calls the iterator
function for the complex case.  A hindrance was the fact that
cpuset_current_mems_allowed is used in the fastpath as the allowed
nodemask even though all nodes are allowed on most systems.  The patch
handles this by only considering cpuset_current_mems_allowed if a cpuset
exists.  As well as being faster in the fast-path, this removes some
junk in the slowpath.

The performance difference on a page allocator microbenchmark is:

                                             4.6.0-rc2                  4.6.0-rc2
                                      statinline-v1r20              optiter-v1r20
  Min      alloc-odr0-1               412.00 (  0.00%)           382.00 (  7.28%)
  Min      alloc-odr0-2               301.00 (  0.00%)           282.00 (  6.31%)
  Min      alloc-odr0-4               247.00 (  0.00%)           233.00 (  5.67%)
  Min      alloc-odr0-8               215.00 (  0.00%)           203.00 (  5.58%)
  Min      alloc-odr0-16              199.00 (  0.00%)           188.00 (  5.53%)
  Min      alloc-odr0-32              191.00 (  0.00%)           182.00 (  4.71%)
  Min      alloc-odr0-64              187.00 (  0.00%)           177.00 (  5.35%)
  Min      alloc-odr0-128             185.00 (  0.00%)           175.00 (  5.41%)
  Min      alloc-odr0-256             193.00 (  0.00%)           184.00 (  4.66%)
  Min      alloc-odr0-512             207.00 (  0.00%)           197.00 (  4.83%)
  Min      alloc-odr0-1024            213.00 (  0.00%)           203.00 (  4.69%)
  Min      alloc-odr0-2048            220.00 (  0.00%)           209.00 (  5.00%)
  Min      alloc-odr0-4096            226.00 (  0.00%)           214.00 (  5.31%)
  Min      alloc-odr0-8192            229.00 (  0.00%)           218.00 (  4.80%)
  Min      alloc-odr0-16384           229.00 (  0.00%)           219.00 (  4.37%)

perf indicated that next_zones_zonelist disappeared in the profile and
__next_zones_zonelist did not appear.  This is expected as the
micro-benchmark would hit the inlined fast-path every time.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 13 +++++++++++--
 mm/mmzone.c            |  2 +-
 mm/page_alloc.c        | 26 +++++++++-----------------
 3 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 150c6049f961..cfcd7723edb6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -919,6 +919,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
 #endif /* CONFIG_NUMA */
 }
 
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes);
+
 /**
  * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
  * @z - The cursor used as a starting point for the search
@@ -931,9 +935,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
  * being examined. It should be advanced by one before calling
  * next_zones_zonelist again.
  */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
 					enum zone_type highest_zoneidx,
-					nodemask_t *nodes);
+					nodemask_t *nodes)
+{
+	if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
+		return z;
+	return __next_zones_zonelist(z, highest_zoneidx, nodes);
+}
 
 /**
  * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 52687fb4de6f..5652be858e5e 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
 }
 
 /* Returns the next zone at or below highest_zoneidx in a zonelist */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
 					enum zone_type highest_zoneidx,
 					nodemask_t *nodes)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 36384baa74e1..789e5f065e8d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3192,17 +3192,6 @@ retry:
 	 */
 	alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
-	/*
-	 * Find the true preferred zone if the allocation is unconstrained by
-	 * cpusets.
-	 */
-	if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
-		struct zoneref *preferred_zoneref;
-		preferred_zoneref = first_zones_zonelist(ac->zonelist,
-				ac->high_zoneidx, NULL, &ac->preferred_zone);
-		ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
-	}
-
 	/* This is the last chance, in general, before the goto nopage. */
 	page = get_page_from_freelist(gfp_mask, order,
 				alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
@@ -3358,14 +3347,21 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
+	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
 	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = {
 		.high_zoneidx = gfp_zone(gfp_mask),
+		.zonelist = zonelist,
 		.nodemask = nodemask,
 		.migratetype = gfpflags_to_migratetype(gfp_mask),
 	};
 
+	if (cpusets_enabled()) {
+		alloc_flags |= ALLOC_CPUSET;
+		if (!ac.nodemask)
+			ac.nodemask = &cpuset_current_mems_allowed;
+	}
+
 	gfp_mask &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfp_mask);
@@ -3389,16 +3385,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
-	/* We set it here, as __alloc_pages_slowpath might have changed it */
-	ac.zonelist = zonelist;
-
 	/* Dirty zone balancing only done in the fast path */
 	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
 	/* The preferred zone is used for statistics later */
 	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
-				ac.nodemask ? : &cpuset_current_mems_allowed,
-				&ac.preferred_zone);
+				ac.nodemask, &ac.preferred_zone);
 	if (!ac.preferred_zone)
 		goto out;
 	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
-- 
cgit v1.2.3


From 754078eb45df8069f389f3371002e7e24962e1a2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:33 -0700
Subject: mm, page_alloc: use __dec_zone_state for order-0 page allocation

__dec_zone_state is cheaper to use for removing an order-0 page as it
has fewer conditions to check.

The performance difference on a page allocator microbenchmark is:

                                             4.6.0-rc2                  4.6.0-rc2
                                         optiter-v1r20              decstat-v1r20
  Min      alloc-odr0-1               382.00 (  0.00%)           381.00 (  0.26%)
  Min      alloc-odr0-2               282.00 (  0.00%)           275.00 (  2.48%)
  Min      alloc-odr0-4               233.00 (  0.00%)           229.00 (  1.72%)
  Min      alloc-odr0-8               203.00 (  0.00%)           199.00 (  1.97%)
  Min      alloc-odr0-16              188.00 (  0.00%)           186.00 (  1.06%)
  Min      alloc-odr0-32              182.00 (  0.00%)           179.00 (  1.65%)
  Min      alloc-odr0-64              177.00 (  0.00%)           174.00 (  1.69%)
  Min      alloc-odr0-128             175.00 (  0.00%)           172.00 (  1.71%)
  Min      alloc-odr0-256             184.00 (  0.00%)           181.00 (  1.63%)
  Min      alloc-odr0-512             197.00 (  0.00%)           193.00 (  2.03%)
  Min      alloc-odr0-1024            203.00 (  0.00%)           201.00 (  0.99%)
  Min      alloc-odr0-2048            209.00 (  0.00%)           206.00 (  1.44%)
  Min      alloc-odr0-4096            214.00 (  0.00%)           212.00 (  0.93%)
  Min      alloc-odr0-8192            218.00 (  0.00%)           215.00 (  1.38%)
  Min      alloc-odr0-16384           219.00 (  0.00%)           216.00 (  1.37%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 789e5f065e8d..8f6b6eab074f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2417,6 +2417,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		else
 			page = list_first_entry(list, struct page, lru);
 
+		__dec_zone_state(zone, NR_ALLOC_BATCH);
 		list_del(&page->lru);
 		pcp->count--;
 	} else {
@@ -2438,11 +2439,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
+		__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 		__mod_zone_freepage_state(zone, -(1 << order),
 					  get_pcppage_migratetype(page));
 	}
 
-	__mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order));
 	if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 &&
 	    !test_bit(ZONE_FAIR_DEPLETED, &zone->flags))
 		set_bit(ZONE_FAIR_DEPLETED, &zone->flags);
-- 
cgit v1.2.3


From f75fb889d18d362e336f8d3fba158a8636d0a063 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:36 -0700
Subject: mm, page_alloc: avoid unnecessary zone lookups during pageblock
 operations

Pageblocks have an associated bitmap to store migrate types and whether
the pageblock should be skipped during compaction.  The bitmap may be
associated with a memory section or a zone but the zone is looked up
unconditionally.  The compiler should optimise this away automatically
so this is a cosmetic patch only in many cases.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8f6b6eab074f..7f328cfb137d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6767,23 +6767,23 @@ void *__init alloc_large_system_hash(const char *tablename,
 }
 
 /* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
 							unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
 	return __pfn_to_section(pfn)->pageblock_flags;
 #else
-	return zone->pageblock_flags;
+	return page_zone(page)->pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */
 }
 
-static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn)
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
 {
 #ifdef CONFIG_SPARSEMEM
 	pfn &= (PAGES_PER_SECTION-1);
 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #else
-	pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages);
+	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
 	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 #endif /* CONFIG_SPARSEMEM */
 }
@@ -6801,14 +6801,12 @@ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
 					unsigned long end_bitidx,
 					unsigned long mask)
 {
-	struct zone *zone;
 	unsigned long *bitmap;
 	unsigned long bitidx, word_bitidx;
 	unsigned long word;
 
-	zone = page_zone(page);
-	bitmap = get_pageblock_bitmap(zone, pfn);
-	bitidx = pfn_to_bitidx(zone, pfn);
+	bitmap = get_pageblock_bitmap(page, pfn);
+	bitidx = pfn_to_bitidx(page, pfn);
 	word_bitidx = bitidx / BITS_PER_LONG;
 	bitidx &= (BITS_PER_LONG-1);
 
@@ -6830,20 +6828,18 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 					unsigned long end_bitidx,
 					unsigned long mask)
 {
-	struct zone *zone;
 	unsigned long *bitmap;
 	unsigned long bitidx, word_bitidx;
 	unsigned long old_word, word;
 
 	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
 
-	zone = page_zone(page);
-	bitmap = get_pageblock_bitmap(zone, pfn);
-	bitidx = pfn_to_bitidx(zone, pfn);
+	bitmap = get_pageblock_bitmap(page, pfn);
+	bitidx = pfn_to_bitidx(page, pfn);
 	word_bitidx = bitidx / BITS_PER_LONG;
 	bitidx &= (BITS_PER_LONG-1);
 
-	VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page);
+	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
 
 	bitidx += end_bitidx;
 	mask <<= (BITS_PER_LONG - bitidx - 1);
-- 
cgit v1.2.3


From c603844bdcb5238980de8d58b393f52d7729d651 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:38 -0700
Subject: mm, page_alloc: convert alloc_flags to unsigned

alloc_flags is a bitmask of flags but it is signed which does not
necessarily generate the best code depending on the compiler.  Even
without an impact, it makes more sense that this be unsigned.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h |  6 +++---
 include/linux/mmzone.h     |  3 ++-
 mm/compaction.c            | 12 +++++++-----
 mm/internal.h              |  2 +-
 mm/page_alloc.c            | 26 ++++++++++++++------------
 5 files changed, 27 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index d7c8de583a23..242b660f64e6 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
-			int alloc_flags, const struct alloc_context *ac,
-			enum migrate_mode mode, int *contended);
+		unsigned int alloc_flags, const struct alloc_context *ac,
+		enum migrate_mode mode, int *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx);
+		unsigned int alloc_flags, int classzone_idx);
 
 extern void defer_compaction(struct zone *zone, int order);
 extern bool compaction_deferred(struct zone *zone, int order);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cfcd7723edb6..327f0fa1e1ce 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -747,7 +747,8 @@ extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
-		unsigned long mark, int classzone_idx, int alloc_flags);
+		unsigned long mark, int classzone_idx,
+		unsigned int alloc_flags);
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 		unsigned long mark, int classzone_idx);
 enum memmap_context {
diff --git a/mm/compaction.c b/mm/compaction.c
index 7487067b4613..8f339ca25621 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1313,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
  *   COMPACT_CONTINUE - If compaction should run now
  */
 static unsigned long __compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx)
+					unsigned int alloc_flags,
+					int classzone_idx)
 {
 	int fragindex;
 	unsigned long watermark;
@@ -1358,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
 }
 
 unsigned long compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx)
+					unsigned int alloc_flags,
+					int classzone_idx)
 {
 	unsigned long ret;
 
@@ -1530,7 +1532,7 @@ out:
 
 static unsigned long compact_zone_order(struct zone *zone, int order,
 		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
-		int alloc_flags, int classzone_idx)
+		unsigned int alloc_flags, int classzone_idx)
 {
 	unsigned long ret;
 	struct compact_control cc = {
@@ -1571,8 +1573,8 @@ int sysctl_extfrag_threshold = 500;
  * This is the main entry point for direct page compaction.
  */
 unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
-			int alloc_flags, const struct alloc_context *ac,
-			enum migrate_mode mode, int *contended)
+		unsigned int alloc_flags, const struct alloc_context *ac,
+		enum migrate_mode mode, int *contended)
 {
 	int may_enter_fs = gfp_mask & __GFP_FS;
 	int may_perform_io = gfp_mask & __GFP_IO;
diff --git a/mm/internal.h b/mm/internal.h
index 098a89e3b97c..114593aab55c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -175,7 +175,7 @@ struct compact_control {
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
-	const int alloc_flags;		/* alloc flags of a direct compactor */
+	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
 	const int classzone_idx;	/* zone index of a direct compactor */
 	struct zone *zone;
 	int contended;			/* Signal need_sched() or lock
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f328cfb137d..094587a4ed81 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1525,7 +1525,7 @@ static inline bool free_pages_prezeroed(bool poisoned)
 }
 
 static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
-								int alloc_flags)
+							unsigned int alloc_flags)
 {
 	int i;
 	bool poisoned = true;
@@ -2391,7 +2391,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
 static inline
 struct page *buffered_rmqueue(struct zone *preferred_zone,
 			struct zone *zone, unsigned int order,
-			gfp_t gfp_flags, int alloc_flags, int migratetype)
+			gfp_t gfp_flags, unsigned int alloc_flags,
+			int migratetype)
 {
 	unsigned long flags;
 	struct page *page;
@@ -2545,12 +2546,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  * to check in the allocation paths if no pages are free.
  */
 static bool __zone_watermark_ok(struct zone *z, unsigned int order,
-			unsigned long mark, int classzone_idx, int alloc_flags,
+			unsigned long mark, int classzone_idx,
+			unsigned int alloc_flags,
 			long free_pages)
 {
 	long min = mark;
 	int o;
-	const int alloc_harder = (alloc_flags & ALLOC_HARDER);
+	const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
 
 	/* free_pages may go negative - that's OK */
 	free_pages -= (1 << order) - 1;
@@ -2613,7 +2615,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order,
 }
 
 bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
-		      int classzone_idx, int alloc_flags)
+		      int classzone_idx, unsigned int alloc_flags)
 {
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 					zone_page_state(z, NR_FREE_PAGES));
@@ -2957,7 +2959,7 @@ out:
 /* Try memory compaction for high-order allocations before reclaim */
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-		int alloc_flags, const struct alloc_context *ac,
+		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended_compaction,
 		bool *deferred_compaction)
 {
@@ -3013,7 +3015,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 #else
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
-		int alloc_flags, const struct alloc_context *ac,
+		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended_compaction,
 		bool *deferred_compaction)
 {
@@ -3053,7 +3055,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
-		int alloc_flags, const struct alloc_context *ac,
+		unsigned int alloc_flags, const struct alloc_context *ac,
 		unsigned long *did_some_progress)
 {
 	struct page *page = NULL;
@@ -3092,10 +3094,10 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 		wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
 }
 
-static inline int
+static inline unsigned int
 gfp_to_alloc_flags(gfp_t gfp_mask)
 {
-	int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+	unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
 
 	/* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
 	BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
@@ -3156,7 +3158,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 {
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
 	struct page *page = NULL;
-	int alloc_flags;
+	unsigned int alloc_flags;
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
@@ -3348,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct zoneref *preferred_zoneref;
 	struct page *page = NULL;
 	unsigned int cpuset_mems_cookie;
-	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
+	unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
 	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = {
 		.high_zoneidx = gfp_zone(gfp_mask),
-- 
cgit v1.2.3


From fa379b9586c7507c607d031dadf3681ed29614a9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:41 -0700
Subject: mm, page_alloc: convert nr_fair_skipped to bool

The number of zones skipped due to a zone expiring its fair zone allocation
quota is irrelevant.  Convert to bool.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 094587a4ed81..41d20bf66ac8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2680,7 +2680,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 	struct zoneref *z;
 	struct page *page = NULL;
 	struct zone *zone;
-	int nr_fair_skipped = 0;
+	bool fair_skipped;
 	bool zonelist_rescan;
 
 zonelist_scan:
@@ -2708,7 +2708,7 @@ zonelist_scan:
 			if (!zone_local(ac->preferred_zone, zone))
 				break;
 			if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
-				nr_fair_skipped++;
+				fair_skipped = true;
 				continue;
 			}
 		}
@@ -2801,7 +2801,7 @@ try_this_zone:
 	 */
 	if (alloc_flags & ALLOC_FAIR) {
 		alloc_flags &= ~ALLOC_FAIR;
-		if (nr_fair_skipped) {
+		if (fair_skipped) {
 			zonelist_rescan = true;
 			reset_alloc_batches(ac->preferred_zone);
 		}
-- 
cgit v1.2.3


From 4dfa6cd8fdb1682586d463bc34888980fe98eb46 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:44 -0700
Subject: mm, page_alloc: remove unnecessary local variable in
 get_page_from_freelist

zonelist here is a copy of a struct field that is used once.  Ditch it.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 41d20bf66ac8..1d705803fc66 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2676,7 +2676,6 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 						const struct alloc_context *ac)
 {
-	struct zonelist *zonelist = ac->zonelist;
 	struct zoneref *z;
 	struct page *page = NULL;
 	struct zone *zone;
@@ -2690,7 +2689,7 @@ zonelist_scan:
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
 	 */
-	for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
 		unsigned long mark;
 
-- 
cgit v1.2.3


From be06af002f6d50de10fd602ce3a6aa5d28e88d38 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:47 -0700
Subject: mm, page_alloc: remove unnecessary initialisation in
 get_page_from_freelist

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1d705803fc66..1096ac8f5ed1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2677,7 +2677,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 						const struct alloc_context *ac)
 {
 	struct zoneref *z;
-	struct page *page = NULL;
 	struct zone *zone;
 	bool fair_skipped;
 	bool zonelist_rescan;
@@ -2691,6 +2690,7 @@ zonelist_scan:
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
+		struct page *page;
 		unsigned long mark;
 
 		if (cpusets_enabled() &&
-- 
cgit v1.2.3


From 5bb1b169757875a72e05bfcbb76e22602cb1a760 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:50 -0700
Subject: mm, page_alloc: remove unnecessary initialisation from
 __alloc_pages_nodemask()

page is guaranteed to be set before it is read with or without the
initialisation.

[akpm@linux-foundation.org: fix warning]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1096ac8f5ed1..f9ca6cc553c7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3347,7 +3347,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 			struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	struct zoneref *preferred_zoneref;
-	struct page *page = NULL;
+	struct page *page;
 	unsigned int cpuset_mems_cookie;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
 	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
@@ -3393,8 +3393,11 @@ retry_cpuset:
 	/* The preferred zone is used for statistics later */
 	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
 				ac.nodemask, &ac.preferred_zone);
-	if (!ac.preferred_zone)
+	if (!ac.preferred_zone) {
+		page = NULL;
 		goto out;
+	}
+
 	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
 	/* First allocation attempt */
-- 
cgit v1.2.3


From 83d4ca8148fd9092715fd8ef75b30bbfd67fd2a9 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:56 -0700
Subject: mm, page_alloc: move __GFP_HARDWALL modifications out of the fastpath

__GFP_HARDWALL only has meaning in the context of cpusets but the fast
path always applies the flag on the first attempt.  Move the
manipulations into the cpuset paths where they will be masked by a
static branch in the common case.

With the other micro-optimisations in this series combined, the impact
on a page allocator microbenchmark is

                                             4.6.0-rc2                  4.6.0-rc2
                                         decstat-v1r20                micro-v1r20
  Min      alloc-odr0-1               381.00 (  0.00%)           377.00 (  1.05%)
  Min      alloc-odr0-2               275.00 (  0.00%)           273.00 (  0.73%)
  Min      alloc-odr0-4               229.00 (  0.00%)           226.00 (  1.31%)
  Min      alloc-odr0-8               199.00 (  0.00%)           196.00 (  1.51%)
  Min      alloc-odr0-16              186.00 (  0.00%)           183.00 (  1.61%)
  Min      alloc-odr0-32              179.00 (  0.00%)           175.00 (  2.23%)
  Min      alloc-odr0-64              174.00 (  0.00%)           172.00 (  1.15%)
  Min      alloc-odr0-128             172.00 (  0.00%)           170.00 (  1.16%)
  Min      alloc-odr0-256             181.00 (  0.00%)           183.00 ( -1.10%)
  Min      alloc-odr0-512             193.00 (  0.00%)           191.00 (  1.04%)
  Min      alloc-odr0-1024            201.00 (  0.00%)           199.00 (  1.00%)
  Min      alloc-odr0-2048            206.00 (  0.00%)           204.00 (  0.97%)
  Min      alloc-odr0-4096            212.00 (  0.00%)           210.00 (  0.94%)
  Min      alloc-odr0-8192            215.00 (  0.00%)           213.00 (  0.93%)
  Min      alloc-odr0-16384           216.00 (  0.00%)           214.00 (  0.93%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f9ca6cc553c7..48afc1a42bbd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3350,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
-	gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+	gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = {
 		.high_zoneidx = gfp_zone(gfp_mask),
 		.zonelist = zonelist,
@@ -3359,6 +3359,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	};
 
 	if (cpusets_enabled()) {
+		alloc_mask |= __GFP_HARDWALL;
 		alloc_flags |= ALLOC_CPUSET;
 		if (!ac.nodemask)
 			ac.nodemask = &cpuset_current_mems_allowed;
@@ -3401,7 +3402,6 @@ retry_cpuset:
 	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
 	/* First allocation attempt */
-	alloc_mask = gfp_mask|__GFP_HARDWALL;
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
 	if (unlikely(!page)) {
 		/*
@@ -3427,8 +3427,10 @@ out:
 	 * the mask is being updated. If a page allocation is about to fail,
 	 * check if the cpuset changed during allocation and if so, retry.
 	 */
-	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
+	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) {
+		alloc_mask = gfp_mask;
 		goto retry_cpuset;
+	}
 
 	return page;
 }
-- 
cgit v1.2.3


From 3777999dd47ec00ec34a151b1d93c0a2b721e822 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:58 -0700
Subject: mm, page_alloc: check once if a zone has isolated pageblocks

When bulk freeing pages from the per-cpu lists the zone is checked for
isolated pageblocks on every release.  This patch checks it once per
drain.

[mgorman@techsingularity.net: fix locking race, per Vlastimil]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 48afc1a42bbd..a3b7eb86f912 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -831,8 +831,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int batch_free = 0;
 	int to_free = count;
 	unsigned long nr_scanned;
+	bool isolated_pageblocks;
 
 	spin_lock(&zone->lock);
+	isolated_pageblocks = has_isolate_pageblock(zone);
 	nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED);
 	if (nr_scanned)
 		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
@@ -870,7 +872,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* MIGRATE_ISOLATE page should not go to pcplists */
 			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
 			/* Pageblock could have been isolated meanwhile */
-			if (unlikely(has_isolate_pageblock(zone)))
+			if (unlikely(isolated_pageblocks))
 				mt = get_pageblock_migratetype(page);
 
 			__free_one_page(page, page_to_pfn(page), zone, 0, mt);
-- 
cgit v1.2.3


From 4fcb0971175f6037590d7b7772fe6619261d2165 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:01 -0700
Subject: mm, page_alloc: shorten the page allocator fast path

The page allocator fast path checks page multiple times unnecessarily.
This patch avoids all the slowpath checks if the first allocation
attempt succeeds.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3b7eb86f912..8380011d77db 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3398,31 +3398,26 @@ retry_cpuset:
 				ac.nodemask, &ac.preferred_zone);
 	if (!ac.preferred_zone) {
 		page = NULL;
-		goto out;
+		goto no_zone;
 	}
 
 	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
-	if (unlikely(!page)) {
-		/*
-		 * Runtime PM, block IO and its error handling path
-		 * can deadlock because I/O on the device might not
-		 * complete.
-		 */
-		alloc_mask = memalloc_noio_flags(gfp_mask);
-		ac.spread_dirty_pages = false;
-
-		page = __alloc_pages_slowpath(alloc_mask, order, &ac);
-	}
+	if (likely(page))
+		goto out;
 
-	if (kmemcheck_enabled && page)
-		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+	/*
+	 * Runtime PM, block IO and its error handling path can deadlock
+	 * because I/O on the device might not complete.
+	 */
+	alloc_mask = memalloc_noio_flags(gfp_mask);
+	ac.spread_dirty_pages = false;
 
-	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
-out:
+no_zone:
 	/*
 	 * When updating a task's mems_allowed, it is possible to race with
 	 * parallel threads in such a way that an allocation can fail while
@@ -3434,6 +3429,12 @@ out:
 		goto retry_cpuset;
 	}
 
+out:
+	if (kmemcheck_enabled && page)
+		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+	trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
+
 	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
-- 
cgit v1.2.3


From 305347550becd08fdb576df32fc0767842ed71a6 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:04 -0700
Subject: mm, page_alloc: reduce cost of fair zone allocation policy retry

The fair zone allocation policy is not without cost but it can be
reduced slightly.  This patch removes an unnecessary local variable,
checks the likely conditions of the fair zone policy first, uses a bool
instead of a flags check and falls through when a remote node is
encountered instead of doing a full restart.  The benefit is marginal
but it's there:

                                             4.6.0-rc2                  4.6.0-rc2
                                         decstat-v1r20              optfair-v1r20
  Min      alloc-odr0-1               377.00 (  0.00%)           380.00 ( -0.80%)
  Min      alloc-odr0-2               273.00 (  0.00%)           273.00 (  0.00%)
  Min      alloc-odr0-4               226.00 (  0.00%)           227.00 ( -0.44%)
  Min      alloc-odr0-8               196.00 (  0.00%)           196.00 (  0.00%)
  Min      alloc-odr0-16              183.00 (  0.00%)           183.00 (  0.00%)
  Min      alloc-odr0-32              175.00 (  0.00%)           173.00 (  1.14%)
  Min      alloc-odr0-64              172.00 (  0.00%)           169.00 (  1.74%)
  Min      alloc-odr0-128             170.00 (  0.00%)           169.00 (  0.59%)
  Min      alloc-odr0-256             183.00 (  0.00%)           180.00 (  1.64%)
  Min      alloc-odr0-512             191.00 (  0.00%)           190.00 (  0.52%)
  Min      alloc-odr0-1024            199.00 (  0.00%)           198.00 (  0.50%)
  Min      alloc-odr0-2048            204.00 (  0.00%)           204.00 (  0.00%)
  Min      alloc-odr0-4096            210.00 (  0.00%)           209.00 (  0.48%)
  Min      alloc-odr0-8192            213.00 (  0.00%)           213.00 (  0.00%)
  Min      alloc-odr0-16384           214.00 (  0.00%)           214.00 (  0.00%)

The benefit is marginal at best, but one of the most important benefits,
avoiding a second search when falling back to another node, is not
triggered by this particular test, so the benefit for some corner cases
is understated.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 32 ++++++++++++++------------------
 1 file changed, 14 insertions(+), 18 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8380011d77db..a4124936303b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2680,12 +2680,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 {
 	struct zoneref *z;
 	struct zone *zone;
-	bool fair_skipped;
-	bool zonelist_rescan;
+	bool fair_skipped = false;
+	bool apply_fair = (alloc_flags & ALLOC_FAIR);
 
 zonelist_scan:
-	zonelist_rescan = false;
-
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
@@ -2705,13 +2703,16 @@ zonelist_scan:
 		 * page was allocated in should have no effect on the
 		 * time the page has in memory before being reclaimed.
 		 */
-		if (alloc_flags & ALLOC_FAIR) {
-			if (!zone_local(ac->preferred_zone, zone))
-				break;
+		if (apply_fair) {
 			if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
 				fair_skipped = true;
 				continue;
 			}
+			if (!zone_local(ac->preferred_zone, zone)) {
+				if (fair_skipped)
+					goto reset_fair;
+				apply_fair = false;
+			}
 		}
 		/*
 		 * When allocating a page cache page for writing, we
@@ -2800,18 +2801,13 @@ try_this_zone:
 	 * include remote zones now, before entering the slowpath and waking
 	 * kswapd: prefer spilling to a remote zone over swapping locally.
 	 */
-	if (alloc_flags & ALLOC_FAIR) {
-		alloc_flags &= ~ALLOC_FAIR;
-		if (fair_skipped) {
-			zonelist_rescan = true;
-			reset_alloc_batches(ac->preferred_zone);
-		}
-		if (nr_online_nodes > 1)
-			zonelist_rescan = true;
-	}
-
-	if (zonelist_rescan)
+	if (fair_skipped) {
+reset_fair:
+		apply_fair = false;
+		fair_skipped = false;
+		reset_alloc_batches(ac->preferred_zone);
 		goto zonelist_scan;
+	}
 
 	return NULL;
 }
-- 
cgit v1.2.3


From 48ee5f3696f62496481a8b6d852bcad9b3ebbe37 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:07 -0700
Subject: mm, page_alloc: shortcut watermark checks for order-0 pages

Watermarks have to be checked on every allocation including the number
of pages being allocated and whether reserves can be accessed.  The
reserves only matter if memory is limited and the free_pages adjustment
only applies to high-order pages.  This patch adds a shortcut for
order-0 pages that avoids numerous calculations if there is plenty of
free memory, yielding the following performance difference in a page
allocator microbenchmark:

                                             4.6.0-rc2                  4.6.0-rc2
                                         optfair-v1r20             fastmark-v1r20
  Min      alloc-odr0-1               380.00 (  0.00%)           364.00 (  4.21%)
  Min      alloc-odr0-2               273.00 (  0.00%)           262.00 (  4.03%)
  Min      alloc-odr0-4               227.00 (  0.00%)           214.00 (  5.73%)
  Min      alloc-odr0-8               196.00 (  0.00%)           186.00 (  5.10%)
  Min      alloc-odr0-16              183.00 (  0.00%)           173.00 (  5.46%)
  Min      alloc-odr0-32              173.00 (  0.00%)           165.00 (  4.62%)
  Min      alloc-odr0-64              169.00 (  0.00%)           161.00 (  4.73%)
  Min      alloc-odr0-128             169.00 (  0.00%)           159.00 (  5.92%)
  Min      alloc-odr0-256             180.00 (  0.00%)           168.00 (  6.67%)
  Min      alloc-odr0-512             190.00 (  0.00%)           180.00 (  5.26%)
  Min      alloc-odr0-1024            198.00 (  0.00%)           190.00 (  4.04%)
  Min      alloc-odr0-2048            204.00 (  0.00%)           196.00 (  3.92%)
  Min      alloc-odr0-4096            209.00 (  0.00%)           202.00 (  3.35%)
  Min      alloc-odr0-8192            213.00 (  0.00%)           206.00 (  3.29%)
  Min      alloc-odr0-16384           214.00 (  0.00%)           206.00 (  3.74%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a4124936303b..732875b1bdfb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2623,6 +2623,32 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 					zone_page_state(z, NR_FREE_PAGES));
 }
 
+static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
+		unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+{
+	long free_pages = zone_page_state(z, NR_FREE_PAGES);
+	long cma_pages = 0;
+
+#ifdef CONFIG_CMA
+	/* If allocation can't use CMA areas don't use free CMA pages */
+	if (!(alloc_flags & ALLOC_CMA))
+		cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+	/*
+	 * Fast check for order-0 only. If this fails then the reserves
+	 * need to be calculated. There is a corner case where the check
+	 * passes but only the high-order atomic reserve are free. If
+	 * the caller is !atomic then it'll uselessly search the free
+	 * list. That corner case is then slower but it is harmless.
+	 */
+	if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+		return true;
+
+	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+					free_pages);
+}
+
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 			unsigned long mark, int classzone_idx)
 {
@@ -2744,7 +2770,7 @@ zonelist_scan:
 			continue;
 
 		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
-		if (!zone_watermark_ok(zone, order, mark,
+		if (!zone_watermark_fast(zone, order, mark,
 				       ac->classzone_idx, alloc_flags)) {
 			int ret;
 
-- 
cgit v1.2.3


From c33d6c06f60f710f0305ae792773e1c2560e1e51 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:10 -0700
Subject: mm, page_alloc: avoid looking up the first zone in a zonelist twice

The allocator fast path looks up the first usable zone in a zonelist and
then get_page_from_freelist does the same job in the zonelist iterator.
This patch preserves the necessary information.

                                             4.6.0-rc2                  4.6.0-rc2
                                        fastmark-v1r20             initonce-v1r20
  Min      alloc-odr0-1               364.00 (  0.00%)           359.00 (  1.37%)
  Min      alloc-odr0-2               262.00 (  0.00%)           260.00 (  0.76%)
  Min      alloc-odr0-4               214.00 (  0.00%)           214.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           186.00 (  0.00%)
  Min      alloc-odr0-16              173.00 (  0.00%)           173.00 (  0.00%)
  Min      alloc-odr0-32              165.00 (  0.00%)           165.00 (  0.00%)
  Min      alloc-odr0-64              161.00 (  0.00%)           162.00 ( -0.62%)
  Min      alloc-odr0-128             159.00 (  0.00%)           161.00 ( -1.26%)
  Min      alloc-odr0-256             168.00 (  0.00%)           170.00 ( -1.19%)
  Min      alloc-odr0-512             180.00 (  0.00%)           181.00 ( -0.56%)
  Min      alloc-odr0-1024            190.00 (  0.00%)           190.00 (  0.00%)
  Min      alloc-odr0-2048            196.00 (  0.00%)           196.00 (  0.00%)
  Min      alloc-odr0-4096            202.00 (  0.00%)           202.00 (  0.00%)
  Min      alloc-odr0-8192            206.00 (  0.00%)           205.00 (  0.49%)
  Min      alloc-odr0-16384           206.00 (  0.00%)           205.00 (  0.49%)

The benefit is negligible and the results are within the noise but each
cycle counts.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c            | 10 +++++-----
 include/linux/mmzone.h | 18 +++++++++++-------
 mm/internal.h          |  2 +-
 mm/mempolicy.c         | 19 ++++++++++---------
 mm/page_alloc.c        | 34 ++++++++++++++++------------------
 5 files changed, 43 insertions(+), 40 deletions(-)

(limited to 'mm')

diff --git a/fs/buffer.c b/fs/buffer.c
index af0d9a82a8ed..754813a6962b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -255,17 +255,17 @@ out:
  */
 static void free_more_memory(void)
 {
-	struct zone *zone;
+	struct zoneref *z;
 	int nid;
 
 	wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM);
 	yield();
 
 	for_each_online_node(nid) {
-		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
-						gfp_zone(GFP_NOFS), NULL,
-						&zone);
-		if (zone)
+
+		z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+						gfp_zone(GFP_NOFS), NULL);
+		if (z->zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 						GFP_NOFS, NULL);
 	}
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 327f0fa1e1ce..4b28d2f8125e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -959,13 +959,10 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
  */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 					enum zone_type highest_zoneidx,
-					nodemask_t *nodes,
-					struct zone **zone)
+					nodemask_t *nodes)
 {
-	struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
+	return next_zones_zonelist(zonelist->_zonerefs,
 							highest_zoneidx, nodes);
-	*zone = zonelist_zone(z);
-	return z;
 }
 
 /**
@@ -980,10 +977,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
  * within a given nodemask
  */
 #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
-	for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);	\
+	for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);	\
 		zone;							\
 		z = next_zones_zonelist(++z, highidx, nodemask),	\
-			zone = zonelist_zone(z))			\
+			zone = zonelist_zone(z))
+
+#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+	for (zone = z->zone;	\
+		zone;							\
+		z = next_zones_zonelist(++z, highidx, nodemask),	\
+			zone = zonelist_zone(z))
+
 
 /**
  * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
diff --git a/mm/internal.h b/mm/internal.h
index 114593aab55c..d1ddd71c1bbf 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -102,7 +102,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
 struct alloc_context {
 	struct zonelist *zonelist;
 	nodemask_t *nodemask;
-	struct zone *preferred_zone;
+	struct zoneref *preferred_zoneref;
 	int classzone_idx;
 	int migratetype;
 	enum zone_type high_zoneidx;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 7f80ebcd6552..297d6854f849 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1739,18 +1739,18 @@ unsigned int mempolicy_slab_node(void)
 		return interleave_nodes(policy);
 
 	case MPOL_BIND: {
+		struct zoneref *z;
+
 		/*
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
 		struct zonelist *zonelist;
-		struct zone *zone;
 		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
 		zonelist = &NODE_DATA(node)->node_zonelists[0];
-		(void)first_zones_zonelist(zonelist, highest_zoneidx,
-							&policy->v.nodes,
-							&zone);
-		return zone ? zone->node : node;
+		z = first_zones_zonelist(zonelist, highest_zoneidx,
+							&policy->v.nodes);
+		return z->zone ? z->zone->node : node;
 	}
 
 	default:
@@ -2266,7 +2266,7 @@ static void sp_free(struct sp_node *n)
 int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol;
-	struct zone *zone;
+	struct zoneref *z;
 	int curnid = page_to_nid(page);
 	unsigned long pgoff;
 	int thiscpu = raw_smp_processor_id();
@@ -2298,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		break;
 
 	case MPOL_BIND:
+
 		/*
 		 * allows binding to multiple nodes.
 		 * use current page if in policy nodemask,
@@ -2306,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
 		 */
 		if (node_isset(curnid, pol->v.nodes))
 			goto out;
-		(void)first_zones_zonelist(
+		z = first_zones_zonelist(
 				node_zonelist(numa_node_id(), GFP_HIGHUSER),
 				gfp_zone(GFP_HIGHUSER),
-				&pol->v.nodes, &zone);
-		polnid = zone->node;
+				&pol->v.nodes);
+		polnid = z->zone->node;
 		break;
 
 	default:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 732875b1bdfb..dba8cfd0b2d6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2704,7 +2704,7 @@ static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 						const struct alloc_context *ac)
 {
-	struct zoneref *z;
+	struct zoneref *z = ac->preferred_zoneref;
 	struct zone *zone;
 	bool fair_skipped = false;
 	bool apply_fair = (alloc_flags & ALLOC_FAIR);
@@ -2714,7 +2714,7 @@ zonelist_scan:
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
 	 */
-	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+	for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
 		struct page *page;
 		unsigned long mark;
@@ -2734,7 +2734,7 @@ zonelist_scan:
 				fair_skipped = true;
 				continue;
 			}
-			if (!zone_local(ac->preferred_zone, zone)) {
+			if (!zone_local(ac->preferred_zoneref->zone, zone)) {
 				if (fair_skipped)
 					goto reset_fair;
 				apply_fair = false;
@@ -2780,7 +2780,7 @@ zonelist_scan:
 				goto try_this_zone;
 
 			if (zone_reclaim_mode == 0 ||
-			    !zone_allows_reclaim(ac->preferred_zone, zone))
+			    !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
 				continue;
 
 			ret = zone_reclaim(zone, gfp_mask, order);
@@ -2802,7 +2802,7 @@ zonelist_scan:
 		}
 
 try_this_zone:
-		page = buffered_rmqueue(ac->preferred_zone, zone, order,
+		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
 				gfp_mask, alloc_flags, ac->migratetype);
 		if (page) {
 			if (prep_new_page(page, order, gfp_mask, alloc_flags))
@@ -2831,7 +2831,7 @@ try_this_zone:
 reset_fair:
 		apply_fair = false;
 		fair_skipped = false;
-		reset_alloc_batches(ac->preferred_zone);
+		reset_alloc_batches(ac->preferred_zoneref->zone);
 		goto zonelist_scan;
 	}
 
@@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
 						ac->high_zoneidx, ac->nodemask)
-		wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
+		wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref));
 }
 
 static inline unsigned int
@@ -3332,7 +3332,7 @@ retry:
 	if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
 	    ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
 		/* Wait for some write requests to complete then retry */
-		wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
+		wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
 		goto retry;
 	}
 
@@ -3370,7 +3370,6 @@ struct page *
 __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 			struct zonelist *zonelist, nodemask_t *nodemask)
 {
-	struct zoneref *preferred_zoneref;
 	struct page *page;
 	unsigned int cpuset_mems_cookie;
 	unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR;
@@ -3416,14 +3415,14 @@ retry_cpuset:
 	ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE);
 
 	/* The preferred zone is used for statistics later */
-	preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
-				ac.nodemask, &ac.preferred_zone);
-	if (!ac.preferred_zone) {
+	ac.preferred_zoneref = first_zones_zonelist(ac.zonelist,
+					ac.high_zoneidx, ac.nodemask);
+	if (!ac.preferred_zoneref) {
 		page = NULL;
 		goto no_zone;
 	}
 
-	ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
+	ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref);
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
@@ -4462,13 +4461,12 @@ static void build_zonelists(pg_data_t *pgdat)
  */
 int local_memory_node(int node)
 {
-	struct zone *zone;
+	struct zoneref *z;
 
-	(void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
+	z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
 				   gfp_zone(GFP_KERNEL),
-				   NULL,
-				   &zone);
-	return zone->node;
+				   NULL);
+	return z->zone->node;
 }
 #endif
 
-- 
cgit v1.2.3


From 93ea9964d14ad583492ffb9ab7543f015876aaf2 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:13 -0700
Subject: mm, page_alloc: remove field from alloc_context

The classzone_idx can be inferred from preferred_zoneref so remove the
unnecessary field and save stack space.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 4 ++--
 mm/internal.h   | 3 ++-
 mm/page_alloc.c | 8 +++-----
 3 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index 8f339ca25621..eda3c2244f30 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1602,7 +1602,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 
 		status = compact_zone_order(zone, order, gfp_mask, mode,
 				&zone_contended, alloc_flags,
-				ac->classzone_idx);
+				ac_classzone_idx(ac));
 		rc = max(status, rc);
 		/*
 		 * It takes at least one zone that wasn't lock contended
@@ -1612,7 +1612,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 
 		/* If a normal allocation would succeed, stop compacting */
 		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
-					ac->classzone_idx, alloc_flags)) {
+					ac_classzone_idx(ac), alloc_flags)) {
 			/*
 			 * We think the allocation will succeed in this zone,
 			 * but it is not certain, hence the false. The caller
diff --git a/mm/internal.h b/mm/internal.h
index d1ddd71c1bbf..3ac544f1963f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -103,12 +103,13 @@ struct alloc_context {
 	struct zonelist *zonelist;
 	nodemask_t *nodemask;
 	struct zoneref *preferred_zoneref;
-	int classzone_idx;
 	int migratetype;
 	enum zone_type high_zoneidx;
 	bool spread_dirty_pages;
 };
 
+#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
+
 /*
  * Locate the struct page for both the matching buddy in our
  * pair (buddy1) and the combined O(n+1) page they form (page).
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dba8cfd0b2d6..9da66e792e17 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2771,7 +2771,7 @@ zonelist_scan:
 
 		mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
 		if (!zone_watermark_fast(zone, order, mark,
-				       ac->classzone_idx, alloc_flags)) {
+				       ac_classzone_idx(ac), alloc_flags)) {
 			int ret;
 
 			/* Checked here to keep the fast path fast */
@@ -2794,7 +2794,7 @@ zonelist_scan:
 			default:
 				/* did we reclaim enough */
 				if (zone_watermark_ok(zone, order, mark,
-						ac->classzone_idx, alloc_flags))
+						ac_classzone_idx(ac), alloc_flags))
 					goto try_this_zone;
 
 				continue;
@@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
 						ac->high_zoneidx, ac->nodemask)
-		wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref));
+		wakeup_kswapd(zone, order, ac_classzone_idx(ac));
 }
 
 static inline unsigned int
@@ -3422,8 +3422,6 @@ retry_cpuset:
 		goto no_zone;
 	}
 
-	ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref);
-
 	/* First allocation attempt */
 	page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
 	if (likely(page))
-- 
cgit v1.2.3


From 7bfec6f47bb0ffd207c7e813e819235e6c1c0f34 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:15 -0700
Subject: mm, page_alloc: check multiple page fields with a single branch

Every page allocated or freed is checked for sanity to avoid corruptions
that are difficult to detect later.  A bad page could be due to a number
of fields.  Instead of using multiple branches, this patch combines
multiple fields into a single branch.  A detailed check is only
necessary if that check fails.

                                             4.6.0-rc2                  4.6.0-rc2
                                        initonce-v1r20            multcheck-v1r20
  Min      alloc-odr0-1               359.00 (  0.00%)           348.00 (  3.06%)
  Min      alloc-odr0-2               260.00 (  0.00%)           254.00 (  2.31%)
  Min      alloc-odr0-4               214.00 (  0.00%)           213.00 (  0.47%)
  Min      alloc-odr0-8               186.00 (  0.00%)           186.00 (  0.00%)
  Min      alloc-odr0-16              173.00 (  0.00%)           173.00 (  0.00%)
  Min      alloc-odr0-32              165.00 (  0.00%)           166.00 ( -0.61%)
  Min      alloc-odr0-64              162.00 (  0.00%)           162.00 (  0.00%)
  Min      alloc-odr0-128             161.00 (  0.00%)           160.00 (  0.62%)
  Min      alloc-odr0-256             170.00 (  0.00%)           169.00 (  0.59%)
  Min      alloc-odr0-512             181.00 (  0.00%)           180.00 (  0.55%)
  Min      alloc-odr0-1024            190.00 (  0.00%)           188.00 (  1.05%)
  Min      alloc-odr0-2048            196.00 (  0.00%)           194.00 (  1.02%)
  Min      alloc-odr0-4096            202.00 (  0.00%)           199.00 (  1.49%)
  Min      alloc-odr0-8192            205.00 (  0.00%)           202.00 (  1.46%)
  Min      alloc-odr0-16384           205.00 (  0.00%)           203.00 (  0.98%)

Again, the benefit is marginal but avoiding excessive branches is
important.  Ideally the paths would not have to check these conditions
at all but regrettably abandoning the tests would make use-after-free
bugs much harder to detect.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9da66e792e17..76a394812776 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -784,10 +784,42 @@ out:
 	zone->free_area[order].nr_free++;
 }
 
+/*
+ * A bad page could be due to a number of fields. Instead of multiple branches,
+ * try and check multiple fields with one check. The caller must do a detailed
+ * check if necessary.
+ */
+static inline bool page_expected_state(struct page *page,
+					unsigned long check_flags)
+{
+	if (unlikely(atomic_read(&page->_mapcount) != -1))
+		return false;
+
+	if (unlikely((unsigned long)page->mapping |
+			page_ref_count(page) |
+#ifdef CONFIG_MEMCG
+			(unsigned long)page->mem_cgroup |
+#endif
+			(page->flags & check_flags)))
+		return false;
+
+	return true;
+}
+
 static inline int free_pages_check(struct page *page)
 {
-	const char *bad_reason = NULL;
-	unsigned long bad_flags = 0;
+	const char *bad_reason;
+	unsigned long bad_flags;
+
+	if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)) {
+		page_cpupid_reset_last(page);
+		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+		return 0;
+	}
+
+	/* Something has gone sideways, find it */
+	bad_reason = NULL;
+	bad_flags = 0;
 
 	if (unlikely(atomic_read(&page->_mapcount) != -1))
 		bad_reason = "nonzero mapcount";
@@ -803,14 +835,8 @@ static inline int free_pages_check(struct page *page)
 	if (unlikely(page->mem_cgroup))
 		bad_reason = "page still charged to cgroup";
 #endif
-	if (unlikely(bad_reason)) {
-		bad_page(page, bad_reason, bad_flags);
-		return 1;
-	}
-	page_cpupid_reset_last(page);
-	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
-		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-	return 0;
+	bad_page(page, bad_reason, bad_flags);
+	return 1;
 }
 
 /*
@@ -1492,9 +1518,14 @@ static inline void expand(struct zone *zone, struct page *page,
  */
 static inline int check_new_page(struct page *page)
 {
-	const char *bad_reason = NULL;
-	unsigned long bad_flags = 0;
+	const char *bad_reason;
+	unsigned long bad_flags;
+
+	if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))
+		return 0;
 
+	bad_reason = NULL;
+	bad_flags = 0;
 	if (unlikely(atomic_read(&page->_mapcount) != -1))
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
-- 
cgit v1.2.3


From bb552ac6c6b4f24e7a7b491286f87b63f9478d42 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:18 -0700
Subject: mm, page_alloc: un-inline the bad part of free_pages_check

From: Vlastimil Babka <vbabka@suse.cz>

!DEBUG_VM size and bloat-o-meter:

  add/remove: 1/0 grow/shrink: 0/2 up/down: 124/-370 (-246)
  function                                     old     new   delta
  free_pages_check_bad                           -     124    +124
  free_pcppages_bulk                          1288    1171    -117
  __free_pages_ok                              948     695    -253

DEBUG_VM:

  add/remove: 1/0 grow/shrink: 0/1 up/down: 124/-214 (-90)
  function                                     old     new   delta
  free_pages_check_bad                           -     124    +124
  free_pages_prepare                          1112     898    -214

[akpm@linux-foundation.org: fix whitespace]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 76a394812776..d51543de1813 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -806,18 +806,11 @@ static inline bool page_expected_state(struct page *page,
 	return true;
 }
 
-static inline int free_pages_check(struct page *page)
+static void free_pages_check_bad(struct page *page)
 {
 	const char *bad_reason;
 	unsigned long bad_flags;
 
-	if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)) {
-		page_cpupid_reset_last(page);
-		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-		return 0;
-	}
-
-	/* Something has gone sideways, find it */
 	bad_reason = NULL;
 	bad_flags = 0;
 
@@ -836,6 +829,18 @@ static inline int free_pages_check(struct page *page)
 		bad_reason = "page still charged to cgroup";
 #endif
 	bad_page(page, bad_reason, bad_flags);
+}
+
+static inline int free_pages_check(struct page *page)
+{
+	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) {
+		page_cpupid_reset_last(page);
+		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+		return 0;
+	}
+
+	/* Something has gone sideways, find it */
+	free_pages_check_bad(page);
 	return 1;
 }
 
-- 
cgit v1.2.3


From da838d4fcba675cbf864f225d76f970e91220ee6 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:21 -0700
Subject: mm, page_alloc: pull out side effects from free_pages_check

A check without side effects should be easier to maintain.  It also
removes the duplicated cpupid and flags reset done in !DEBUG_VM variant
of both free_pcp_prepare() and then bulkfree_pcp_prepare().  Finally, it
enables the next patch.

It shouldn't result in new branches, thanks to inlining of the check.

!DEBUG_VM bloat-o-meter:

  add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-27 (-27)
  function                                     old     new   delta
  __free_pages_ok                              748     739      -9
  free_pcppages_bulk                          1403    1385     -18

DEBUG_VM:

  add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-28 (-28)
  function                                     old     new   delta
  free_pages_prepare                           806     778     -28

This is also slightly faster because cpupid information is not set on
tail pages so we can avoid resets there.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d51543de1813..fea50b0cb405 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -833,11 +833,8 @@ static void free_pages_check_bad(struct page *page)
 
 static inline int free_pages_check(struct page *page)
 {
-	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) {
-		page_cpupid_reset_last(page);
-		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
 		return 0;
-	}
 
 	/* Something has gone sideways, find it */
 	free_pages_check_bad(page);
@@ -1078,7 +1075,11 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 		for (i = 1; i < (1 << order); i++) {
 			if (compound)
 				bad += free_tail_pages_check(page, page + i);
-			bad += free_pages_check(page + i);
+			if (unlikely(free_pages_check(page + i))) {
+				bad++;
+				continue;
+			}
+			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 		}
 	}
 	if (PageAnonHead(page))
@@ -1087,6 +1088,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 	if (bad)
 		return false;
 
+	page_cpupid_reset_last(page);
+	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	reset_page_owner(page, order);
 
 	if (!PageHighMem(page)) {
-- 
cgit v1.2.3


From e5b31ac2ca2cd0cf6bf2fcbb708ed01466c89aaa Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:24 -0700
Subject: mm, page_alloc: remove unnecessary variable from free_pcppages_bulk

The original count is never reused so it can be removed.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fea50b0cb405..822ce86fc883 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -857,7 +857,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
-	int to_free = count;
 	unsigned long nr_scanned;
 	bool isolated_pageblocks;
 
@@ -867,7 +866,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	if (nr_scanned)
 		__mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned);
 
-	while (to_free) {
+	while (count) {
 		struct page *page;
 		struct list_head *list;
 
@@ -887,7 +886,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 
 		/* This is the only non-empty list. Free them all. */
 		if (batch_free == MIGRATE_PCPTYPES)
-			batch_free = to_free;
+			batch_free = count;
 
 		do {
 			int mt;	/* migratetype of the to-be-freed page */
@@ -905,7 +904,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 
 			__free_one_page(page, page_to_pfn(page), zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
-		} while (--to_free && --batch_free && !list_empty(list));
+		} while (--count && --batch_free && !list_empty(list));
 	}
 	spin_unlock(&zone->lock);
 }
-- 
cgit v1.2.3


From 0b423ca22f95a867f789aab1fe57ee4e378df43b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:27 -0700
Subject: mm, page_alloc: inline pageblock lookup in page free fast paths

The function call overhead of get_pfnblock_flags_mask() is measurable in
the page free paths.  This patch uses an inlined version that is faster.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h |   7 --
 mm/page_alloc.c        | 188 ++++++++++++++++++++++++++-----------------------
 mm/page_owner.c        |   2 +-
 mm/vmstat.c            |   2 +-
 4 files changed, 102 insertions(+), 97 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4b28d2f8125e..c60db2096fd8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled;
 	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
 			PB_migrate_end, MIGRATETYPE_MASK)
 
-static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
-{
-	BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
-	return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
-					MIGRATETYPE_MASK);
-}
-
 struct free_area {
 	struct list_head	free_list[MIGRATE_TYPES];
 	unsigned long		nr_free;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 822ce86fc883..bdf7a13311b5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat,
 }
 #endif
 
+/* Return a pointer to the bitmap storing bits affecting a block of pages */
+static inline unsigned long *get_pageblock_bitmap(struct page *page,
+							unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	return __pfn_to_section(pfn)->pageblock_flags;
+#else
+	return page_zone(page)->pageblock_flags;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
+{
+#ifdef CONFIG_SPARSEMEM
+	pfn &= (PAGES_PER_SECTION-1);
+	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#else
+	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
+	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
+#endif /* CONFIG_SPARSEMEM */
+}
+
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest to retrieve
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
+static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+					unsigned long pfn,
+					unsigned long end_bitidx,
+					unsigned long mask)
+{
+	unsigned long *bitmap;
+	unsigned long bitidx, word_bitidx;
+	unsigned long word;
+
+	bitmap = get_pageblock_bitmap(page, pfn);
+	bitidx = pfn_to_bitidx(page, pfn);
+	word_bitidx = bitidx / BITS_PER_LONG;
+	bitidx &= (BITS_PER_LONG-1);
+
+	word = bitmap[word_bitidx];
+	bitidx += end_bitidx;
+	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+}
+
+unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
+					unsigned long end_bitidx,
+					unsigned long mask)
+{
+	return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+}
+
+static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
+{
+	return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+}
+
+/**
+ * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @flags: The flags to set
+ * @pfn: The target page frame number
+ * @end_bitidx: The last bit of interest
+ * @mask: mask of bits that the caller is interested in
+ */
+void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
+					unsigned long pfn,
+					unsigned long end_bitidx,
+					unsigned long mask)
+{
+	unsigned long *bitmap;
+	unsigned long bitidx, word_bitidx;
+	unsigned long old_word, word;
+
+	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+
+	bitmap = get_pageblock_bitmap(page, pfn);
+	bitidx = pfn_to_bitidx(page, pfn);
+	word_bitidx = bitidx / BITS_PER_LONG;
+	bitidx &= (BITS_PER_LONG-1);
+
+	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
+
+	bitidx += end_bitidx;
+	mask <<= (BITS_PER_LONG - bitidx - 1);
+	flags <<= (BITS_PER_LONG - bitidx - 1);
+
+	word = READ_ONCE(bitmap[word_bitidx]);
+	for (;;) {
+		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
+		if (word == old_word)
+			break;
+		word = old_word;
+	}
+}
 
 void set_pageblock_migratetype(struct page *page, int migratetype)
 {
@@ -6831,94 +6931,6 @@ void *__init alloc_large_system_hash(const char *tablename,
 	return table;
 }
 
-/* Return a pointer to the bitmap storing bits affecting a block of pages */
-static inline unsigned long *get_pageblock_bitmap(struct page *page,
-							unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	return __pfn_to_section(pfn)->pageblock_flags;
-#else
-	return page_zone(page)->pageblock_flags;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
-{
-#ifdef CONFIG_SPARSEMEM
-	pfn &= (PAGES_PER_SECTION-1);
-	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#else
-	pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
-	return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
-#endif /* CONFIG_SPARSEMEM */
-}
-
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
-unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
-					unsigned long end_bitidx,
-					unsigned long mask)
-{
-	unsigned long *bitmap;
-	unsigned long bitidx, word_bitidx;
-	unsigned long word;
-
-	bitmap = get_pageblock_bitmap(page, pfn);
-	bitidx = pfn_to_bitidx(page, pfn);
-	word_bitidx = bitidx / BITS_PER_LONG;
-	bitidx &= (BITS_PER_LONG-1);
-
-	word = bitmap[word_bitidx];
-	bitidx += end_bitidx;
-	return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
-}
-
-/**
- * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @flags: The flags to set
- * @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
- * @mask: mask of bits that the caller is interested in
- */
-void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
-					unsigned long pfn,
-					unsigned long end_bitidx,
-					unsigned long mask)
-{
-	unsigned long *bitmap;
-	unsigned long bitidx, word_bitidx;
-	unsigned long old_word, word;
-
-	BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
-
-	bitmap = get_pageblock_bitmap(page, pfn);
-	bitidx = pfn_to_bitidx(page, pfn);
-	word_bitidx = bitidx / BITS_PER_LONG;
-	bitidx &= (BITS_PER_LONG-1);
-
-	VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
-
-	bitidx += end_bitidx;
-	mask <<= (BITS_PER_LONG - bitidx - 1);
-	flags <<= (BITS_PER_LONG - bitidx - 1);
-
-	word = READ_ONCE(bitmap[word_bitidx]);
-	for (;;) {
-		old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
-		if (word == old_word)
-			break;
-		word = old_word;
-	}
-}
-
 /*
  * This function checks whether pageblock includes unmovable pages or not.
  * If @count is not zero, it is okay to include less @count unmovable pages
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 438768c092ac..792b56da13d8 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
 		goto err;
 
 	/* Print information relevant to grouping pages by mobility */
-	pageblock_mt = get_pfnblock_migratetype(page, pfn);
+	pageblock_mt = get_pageblock_migratetype(page);
 	page_mt  = gfpflags_to_migratetype(page_ext->gfp_mask);
 	ret += snprintf(kbuf + ret, count - ret,
 			"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f1a73bfb77b5..5b72a8ad2813 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1041,7 +1041,7 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m,
 		block_end_pfn = min(block_end_pfn, end_pfn);
 
 		page = pfn_to_page(pfn);
-		pageblock_mt = get_pfnblock_migratetype(page, pfn);
+		pageblock_mt = get_pageblock_migratetype(page);
 
 		for (; pfn < block_end_pfn; pfn++) {
 			if (!pfn_valid_within(pfn))
-- 
cgit v1.2.3


From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:14:30 -0700
Subject: cpuset: use static key better and convert to new API

An important function for cpusets is cpuset_node_allowed(), which
optimizes on the fact that if there's a single root CPU set, it must be
trivially allowed.  But the check "nr_cpusets() <= 1" doesn't use the
cpusets_enabled_key static key the right way where static keys eliminate
branching overhead with jump labels.

This patch converts it so that static key is used properly.  It's also
switched to the new static key API and the checking functions are
converted to return bool instead of int.  We also provide a new variant
__cpuset_zone_allowed() which expects that the static key check was
already done and the key was enabled.  This is needed for
get_page_from_freelist() where we want to also avoid the relatively
slower check when ALLOC_CPUSET is not set in alloc_flags.

The impact on the page allocator microbenchmark is less than expected
but the cleanup in itself is worthwhile.

                                             4.6.0-rc2                  4.6.0-rc2
                                       multcheck-v1r20               cpuset-v1r20
  Min      alloc-odr0-1               348.00 (  0.00%)           348.00 (  0.00%)
  Min      alloc-odr0-2               254.00 (  0.00%)           254.00 (  0.00%)
  Min      alloc-odr0-4               213.00 (  0.00%)           213.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           183.00 (  1.61%)
  Min      alloc-odr0-16              173.00 (  0.00%)           171.00 (  1.16%)
  Min      alloc-odr0-32              166.00 (  0.00%)           163.00 (  1.81%)
  Min      alloc-odr0-64              162.00 (  0.00%)           159.00 (  1.85%)
  Min      alloc-odr0-128             160.00 (  0.00%)           157.00 (  1.88%)
  Min      alloc-odr0-256             169.00 (  0.00%)           166.00 (  1.78%)
  Min      alloc-odr0-512             180.00 (  0.00%)           180.00 (  0.00%)
  Min      alloc-odr0-1024            188.00 (  0.00%)           187.00 (  0.53%)
  Min      alloc-odr0-2048            194.00 (  0.00%)           193.00 (  0.52%)
  Min      alloc-odr0-4096            199.00 (  0.00%)           198.00 (  0.50%)
  Min      alloc-odr0-8192            202.00 (  0.00%)           201.00 (  0.50%)
  Min      alloc-odr0-16384           203.00 (  0.00%)           202.00 (  0.49%)

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++--------------
 kernel/cpuset.c        | 14 +++++++-------
 mm/page_alloc.c        |  2 +-
 3 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868ccb493..bfc204e70338 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
 
 #ifdef CONFIG_CPUSETS
 
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
-	return static_key_false(&cpusets_enabled_key);
+	return static_branch_unlikely(&cpusets_enabled_key);
 }
 
 static inline int nr_cpusets(void)
 {
 	/* jump label reference count + the top-level cpuset */
-	return static_key_count(&cpusets_enabled_key) + 1;
+	return static_key_count(&cpusets_enabled_key.key) + 1;
 }
 
 static inline void cpuset_inc(void)
 {
-	static_key_slow_inc(&cpusets_enabled_key);
+	static_branch_inc(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
-	static_key_slow_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_enabled_key);
 }
 
 extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+	if (cpusets_enabled())
+		return __cpuset_node_allowed(node, gfp_mask);
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	if (cpusets_enabled())
+		return __cpuset_zone_allowed(z, gfp_mask);
+	return true;
 }
 
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 	return 1;
 }
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return true;
 }
 
 static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 611cc69af8f0..73e93e53884d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,7 +61,7 @@
 #include <linux/cgroup.h>
 #include <linux/wait.h>
 
-struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE;
+DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /* See "Frequency meter" comments, below. */
 
@@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  */
-int __cpuset_node_allowed(int node, gfp_t gfp_mask)
+bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 	unsigned long flags;
 
 	if (in_interrupt())
-		return 1;
+		return true;
 	if (node_isset(node, current->mems_allowed))
-		return 1;
+		return true;
 	/*
 	 * Allow tasks that have access to memory reserves because they have
 	 * been OOM killed to get memory anywhere.
 	 */
 	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return 1;
+		return true;
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
-		return 0;
+		return false;
 
 	if (current->flags & PF_EXITING) /* Let dying task have memory */
-		return 1;
+		return true;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
 	spin_lock_irqsave(&callback_lock, flags);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdf7a13311b5..39c441bb8d61 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2859,7 +2859,7 @@ zonelist_scan:
 
 		if (cpusets_enabled() &&
 			(alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!__cpuset_zone_allowed(zone, gfp_mask))
 				continue;
 		/*
 		 * Distribute pages in proportion to the individual
-- 
cgit v1.2.3


From 4db7548ccbd9ec8e666f35df4a530f55904dec39 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:32 -0700
Subject: mm, page_alloc: defer debugging checks of freed pages until a PCP
 drain

Every page free checks a number of page fields for validity.  This
catches premature frees and corruptions but it is also expensive.  This
patch weakens the debugging check by checking PCP pages at the time they
are drained from the PCP list.  This will trigger the bug but the site
that freed the corrupt page will be lost.  To get the full context, a
kernel rebuild with DEBUG_VM is necessary.

[akpm@linux-foundation.org: fix build]
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 152 +++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 101 insertions(+), 51 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 39c441bb8d61..759d3f60bea0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -941,6 +941,103 @@ static inline int free_pages_check(struct page *page)
 	return 1;
 }
 
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+	int ret = 1;
+
+	/*
+	 * We rely page->lru.next never has bit 0 set, unless the page
+	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
+	 */
+	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
+
+	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
+		ret = 0;
+		goto out;
+	}
+	switch (page - head_page) {
+	case 1:
+		/* the first tail page: ->mapping is compound_mapcount() */
+		if (unlikely(compound_mapcount(page))) {
+			bad_page(page, "nonzero compound_mapcount", 0);
+			goto out;
+		}
+		break;
+	case 2:
+		/*
+		 * the second tail page: ->mapping is
+		 * page_deferred_list().next -- ignore value.
+		 */
+		break;
+	default:
+		if (page->mapping != TAIL_MAPPING) {
+			bad_page(page, "corrupted mapping in tail page", 0);
+			goto out;
+		}
+		break;
+	}
+	if (unlikely(!PageTail(page))) {
+		bad_page(page, "PageTail not set", 0);
+		goto out;
+	}
+	if (unlikely(compound_head(page) != head_page)) {
+		bad_page(page, "compound_head not consistent", 0);
+		goto out;
+	}
+	ret = 0;
+out:
+	page->mapping = NULL;
+	clear_compound_head(page);
+	return ret;
+}
+
+static bool free_pages_prepare(struct page *page, unsigned int order);
+
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+	return free_pages_prepare(page, 0);
+}
+
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+	return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+	VM_BUG_ON_PAGE(PageTail(page), page);
+
+	trace_mm_page_free(page, 0);
+	kmemcheck_free_shadow(page, 0);
+	kasan_free_pages(page, 0);
+
+	if (PageAnonHead(page))
+		page->mapping = NULL;
+
+	reset_page_owner(page, 0);
+
+	if (!PageHighMem(page)) {
+		debug_check_no_locks_freed(page_address(page),
+					   PAGE_SIZE);
+		debug_check_no_obj_freed(page_address(page),
+					   PAGE_SIZE);
+	}
+	arch_free_page(page, 0);
+	kernel_poison_pages(page, 0, 0);
+	kernel_map_pages(page, 0, 0);
+
+	page_cpupid_reset_last(page);
+	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	return true;
+}
+
+static bool bulkfree_pcp_prepare(struct page *page)
+{
+	return free_pages_check(page);
+}
+#endif /* CONFIG_DEBUG_VM */
+
 /*
  * Frees a number of pages from the PCP lists
  * Assumes all pages on list are in same zone, and of same order.
@@ -1002,6 +1099,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			if (unlikely(isolated_pageblocks))
 				mt = get_pageblock_migratetype(page);
 
+			if (bulkfree_pcp_prepare(page))
+				continue;
+
 			__free_one_page(page, page_to_pfn(page), zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
 		} while (--count && --batch_free && !list_empty(list));
@@ -1028,56 +1128,6 @@ static void free_one_page(struct zone *zone,
 	spin_unlock(&zone->lock);
 }
 
-static int free_tail_pages_check(struct page *head_page, struct page *page)
-{
-	int ret = 1;
-
-	/*
-	 * We rely page->lru.next never has bit 0 set, unless the page
-	 * is PageTail(). Let's make sure that's true even for poisoned ->lru.
-	 */
-	BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
-
-	if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
-		ret = 0;
-		goto out;
-	}
-	switch (page - head_page) {
-	case 1:
-		/* the first tail page: ->mapping is compound_mapcount() */
-		if (unlikely(compound_mapcount(page))) {
-			bad_page(page, "nonzero compound_mapcount", 0);
-			goto out;
-		}
-		break;
-	case 2:
-		/*
-		 * the second tail page: ->mapping is
-		 * page_deferred_list().next -- ignore value.
-		 */
-		break;
-	default:
-		if (page->mapping != TAIL_MAPPING) {
-			bad_page(page, "corrupted mapping in tail page", 0);
-			goto out;
-		}
-		break;
-	}
-	if (unlikely(!PageTail(page))) {
-		bad_page(page, "PageTail not set", 0);
-		goto out;
-	}
-	if (unlikely(compound_head(page) != head_page)) {
-		bad_page(page, "compound_head not consistent", 0);
-		goto out;
-	}
-	ret = 0;
-out:
-	page->mapping = NULL;
-	clear_compound_head(page);
-	return ret;
-}
-
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
 				unsigned long zone, int nid)
 {
@@ -2339,7 +2389,7 @@ void free_hot_cold_page(struct page *page, bool cold)
 	unsigned long pfn = page_to_pfn(page);
 	int migratetype;
 
-	if (!free_pages_prepare(page, 0))
+	if (!free_pcp_prepare(page))
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
-- 
cgit v1.2.3


From 479f854a207ce2b97545a0a83856778b541063d0 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:35 -0700
Subject: mm, page_alloc: defer debugging checks of pages allocated from the
 PCP

Every page allocated checks a number of page fields for validity.  This
catches corruption bugs of pages that are already freed but it is
expensive.  This patch weakens the debugging check by checking PCP pages
only when the PCP lists are being refilled.  All compound pages are
checked.  This potentially avoids debugging checks entirely if the PCP
lists are never emptied and refilled so some corruption issues may be
missed.  Full checking requires DEBUG_VM.

With the two deferred debugging patches applied, the impact to a page
allocator microbenchmark is

                                             4.6.0-rc3                  4.6.0-rc3
                                           inline-v3r6            deferalloc-v3r7
  Min      alloc-odr0-1               344.00 (  0.00%)           317.00 (  7.85%)
  Min      alloc-odr0-2               248.00 (  0.00%)           231.00 (  6.85%)
  Min      alloc-odr0-4               209.00 (  0.00%)           192.00 (  8.13%)
  Min      alloc-odr0-8               181.00 (  0.00%)           166.00 (  8.29%)
  Min      alloc-odr0-16              168.00 (  0.00%)           154.00 (  8.33%)
  Min      alloc-odr0-32              161.00 (  0.00%)           148.00 (  8.07%)
  Min      alloc-odr0-64              158.00 (  0.00%)           145.00 (  8.23%)
  Min      alloc-odr0-128             156.00 (  0.00%)           143.00 (  8.33%)
  Min      alloc-odr0-256             168.00 (  0.00%)           154.00 (  8.33%)
  Min      alloc-odr0-512             178.00 (  0.00%)           167.00 (  6.18%)
  Min      alloc-odr0-1024            186.00 (  0.00%)           174.00 (  6.45%)
  Min      alloc-odr0-2048            192.00 (  0.00%)           180.00 (  6.25%)
  Min      alloc-odr0-4096            198.00 (  0.00%)           184.00 (  7.07%)
  Min      alloc-odr0-8192            200.00 (  0.00%)           188.00 (  6.00%)
  Min      alloc-odr0-16384           201.00 (  0.00%)           188.00 (  6.47%)
  Min      free-odr0-1                189.00 (  0.00%)           180.00 (  4.76%)
  Min      free-odr0-2                132.00 (  0.00%)           126.00 (  4.55%)
  Min      free-odr0-4                104.00 (  0.00%)            99.00 (  4.81%)
  Min      free-odr0-8                 90.00 (  0.00%)            85.00 (  5.56%)
  Min      free-odr0-16                84.00 (  0.00%)            80.00 (  4.76%)
  Min      free-odr0-32                80.00 (  0.00%)            76.00 (  5.00%)
  Min      free-odr0-64                78.00 (  0.00%)            74.00 (  5.13%)
  Min      free-odr0-128               77.00 (  0.00%)            73.00 (  5.19%)
  Min      free-odr0-256               94.00 (  0.00%)            91.00 (  3.19%)
  Min      free-odr0-512              108.00 (  0.00%)           112.00 ( -3.70%)
  Min      free-odr0-1024             115.00 (  0.00%)           118.00 ( -2.61%)
  Min      free-odr0-2048             120.00 (  0.00%)           125.00 ( -4.17%)
  Min      free-odr0-4096             123.00 (  0.00%)           129.00 ( -4.88%)
  Min      free-odr0-8192             126.00 (  0.00%)           130.00 ( -3.17%)
  Min      free-odr0-16384            126.00 (  0.00%)           131.00 ( -3.97%)

Note that the free path for large numbers of pages is impacted as the
debugging cost gets shifted into that path when the page data is no
longer necessarily cache-hot.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 92 +++++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 64 insertions(+), 28 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 759d3f60bea0..193ed34a2780 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1714,7 +1714,41 @@ static inline bool free_pages_prezeroed(bool poisoned)
 		page_poisoning_enabled() && poisoned;
 }
 
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+#ifdef CONFIG_DEBUG_VM
+static bool check_pcp_refill(struct page *page)
+{
+	return false;
+}
+
+static bool check_new_pcp(struct page *page)
+{
+	return check_new_page(page);
+}
+#else
+static bool check_pcp_refill(struct page *page)
+{
+	return check_new_page(page);
+}
+static bool check_new_pcp(struct page *page)
+{
+	return false;
+}
+#endif /* CONFIG_DEBUG_VM */
+
+static bool check_new_pages(struct page *page, unsigned int order)
+{
+	int i;
+	for (i = 0; i < (1 << order); i++) {
+		struct page *p = page + i;
+
+		if (unlikely(check_new_page(p)))
+			return true;
+	}
+
+	return false;
+}
+
+static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 							unsigned int alloc_flags)
 {
 	int i;
@@ -1722,8 +1756,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 
 	for (i = 0; i < (1 << order); i++) {
 		struct page *p = page + i;
-		if (unlikely(check_new_page(p)))
-			return 1;
 		if (poisoned)
 			poisoned &= page_is_poisoned(p);
 	}
@@ -1755,8 +1787,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
 		set_page_pfmemalloc(page);
 	else
 		clear_page_pfmemalloc(page);
-
-	return 0;
 }
 
 /*
@@ -2178,6 +2208,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 		if (unlikely(page == NULL))
 			break;
 
+		if (unlikely(check_pcp_refill(page)))
+			continue;
+
 		/*
 		 * Split buddy pages returned by expand() are received here
 		 * in physical page order. The page is added to the callers and
@@ -2593,20 +2626,22 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		struct list_head *list;
 
 		local_irq_save(flags);
-		pcp = &this_cpu_ptr(zone->pageset)->pcp;
-		list = &pcp->lists[migratetype];
-		if (list_empty(list)) {
-			pcp->count += rmqueue_bulk(zone, 0,
-					pcp->batch, list,
-					migratetype, cold);
-			if (unlikely(list_empty(list)))
-				goto failed;
-		}
+		do {
+			pcp = &this_cpu_ptr(zone->pageset)->pcp;
+			list = &pcp->lists[migratetype];
+			if (list_empty(list)) {
+				pcp->count += rmqueue_bulk(zone, 0,
+						pcp->batch, list,
+						migratetype, cold);
+				if (unlikely(list_empty(list)))
+					goto failed;
+			}
 
-		if (cold)
-			page = list_last_entry(list, struct page, lru);
-		else
-			page = list_first_entry(list, struct page, lru);
+			if (cold)
+				page = list_last_entry(list, struct page, lru);
+			else
+				page = list_first_entry(list, struct page, lru);
+		} while (page && check_new_pcp(page));
 
 		__dec_zone_state(zone, NR_ALLOC_BATCH);
 		list_del(&page->lru);
@@ -2619,14 +2654,16 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
 		WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
 		spin_lock_irqsave(&zone->lock, flags);
 
-		page = NULL;
-		if (alloc_flags & ALLOC_HARDER) {
-			page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-			if (page)
-				trace_mm_page_alloc_zone_locked(page, order, migratetype);
-		}
-		if (!page)
-			page = __rmqueue(zone, order, migratetype);
+		do {
+			page = NULL;
+			if (alloc_flags & ALLOC_HARDER) {
+				page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+				if (page)
+					trace_mm_page_alloc_zone_locked(page, order, migratetype);
+			}
+			if (!page)
+				page = __rmqueue(zone, order, migratetype);
+		} while (page && check_new_pages(page, order));
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
@@ -2993,8 +3030,7 @@ try_this_zone:
 		page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order,
 				gfp_mask, alloc_flags, ac->migratetype);
 		if (page) {
-			if (prep_new_page(page, order, gfp_mask, alloc_flags))
-				goto try_this_zone;
+			prep_new_page(page, order, gfp_mask, alloc_flags);
 
 			/*
 			 * If this is a high-order atomic allocation then check
-- 
cgit v1.2.3


From e2769dbdc51f1baa1908ecf6c84d50f19577e1db Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:38 -0700
Subject: mm, page_alloc: don't duplicate code in free_pcp_prepare

The new free_pcp_prepare() function shares a lot of code with
free_pages_prepare(), which makes this a maintenance risk when some
future patch modifies only one of them.  We should be able to achieve
the same effect (skipping free_pages_check() from !DEBUG_VM configs) by
adding a parameter to free_pages_prepare() and making it inline, so the
checks (and the order != 0 parts) are eliminated from the call from
free_pcp_prepare().

!DEBUG_VM: bloat-o-meter reports no difference, as my gcc was already
inlining free_pages_prepare() and the elimination seems to work as
expected

DEBUG_VM bloat-o-meter:

  add/remove: 0/1 grow/shrink: 2/0 up/down: 1035/-778 (257)
  function                                     old     new   delta
  __free_pages_ok                              297    1060    +763
  free_hot_cold_page                           480     752    +272
  free_pages_prepare                           778       -    -778

Here inlining didn't occur before, and added some code, but it's ok for
a debug option.

[akpm@linux-foundation.org: fix build]
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 133 +++++++++++++++++++++++---------------------------------
 1 file changed, 55 insertions(+), 78 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 193ed34a2780..7d8f642c498d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -991,47 +991,77 @@ out:
 	return ret;
 }
 
-static bool free_pages_prepare(struct page *page, unsigned int order);
-
-#ifdef CONFIG_DEBUG_VM
-static inline bool free_pcp_prepare(struct page *page)
+static __always_inline bool free_pages_prepare(struct page *page,
+					unsigned int order, bool check_free)
 {
-	return free_pages_prepare(page, 0);
-}
+	int bad = 0;
 
-static inline bool bulkfree_pcp_prepare(struct page *page)
-{
-	return false;
-}
-#else
-static bool free_pcp_prepare(struct page *page)
-{
 	VM_BUG_ON_PAGE(PageTail(page), page);
 
-	trace_mm_page_free(page, 0);
-	kmemcheck_free_shadow(page, 0);
-	kasan_free_pages(page, 0);
+	trace_mm_page_free(page, order);
+	kmemcheck_free_shadow(page, order);
+	kasan_free_pages(page, order);
+
+	/*
+	 * Check tail pages before head page information is cleared to
+	 * avoid checking PageCompound for order-0 pages.
+	 */
+	if (unlikely(order)) {
+		bool compound = PageCompound(page);
+		int i;
+
+		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
 
+		for (i = 1; i < (1 << order); i++) {
+			if (compound)
+				bad += free_tail_pages_check(page, page + i);
+			if (unlikely(free_pages_check(page + i))) {
+				bad++;
+				continue;
+			}
+			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+		}
+	}
 	if (PageAnonHead(page))
 		page->mapping = NULL;
+	if (check_free)
+		bad += free_pages_check(page);
+	if (bad)
+		return false;
 
-	reset_page_owner(page, 0);
+	page_cpupid_reset_last(page);
+	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
+	reset_page_owner(page, order);
 
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page),
-					   PAGE_SIZE);
+					   PAGE_SIZE << order);
 		debug_check_no_obj_freed(page_address(page),
-					   PAGE_SIZE);
+					   PAGE_SIZE << order);
 	}
-	arch_free_page(page, 0);
-	kernel_poison_pages(page, 0, 0);
-	kernel_map_pages(page, 0, 0);
+	arch_free_page(page, order);
+	kernel_poison_pages(page, 1 << order, 0);
+	kernel_map_pages(page, 1 << order, 0);
 
-	page_cpupid_reset_last(page);
-	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return true;
 }
 
+#ifdef CONFIG_DEBUG_VM
+static inline bool free_pcp_prepare(struct page *page)
+{
+	return free_pages_prepare(page, 0, true);
+}
+
+static inline bool bulkfree_pcp_prepare(struct page *page)
+{
+	return false;
+}
+#else
+static bool free_pcp_prepare(struct page *page)
+{
+	return free_pages_prepare(page, 0, false);
+}
+
 static bool bulkfree_pcp_prepare(struct page *page)
 {
 	return free_pages_check(page);
@@ -1201,66 +1231,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
 	}
 }
 
-static bool free_pages_prepare(struct page *page, unsigned int order)
-{
-	int bad = 0;
-
-	VM_BUG_ON_PAGE(PageTail(page), page);
-
-	trace_mm_page_free(page, order);
-	kmemcheck_free_shadow(page, order);
-	kasan_free_pages(page, order);
-
-	/*
-	 * Check tail pages before head page information is cleared to
-	 * avoid checking PageCompound for order-0 pages.
-	 */
-	if (unlikely(order)) {
-		bool compound = PageCompound(page);
-		int i;
-
-		VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
-
-		for (i = 1; i < (1 << order); i++) {
-			if (compound)
-				bad += free_tail_pages_check(page, page + i);
-			if (unlikely(free_pages_check(page + i))) {
-				bad++;
-				continue;
-			}
-			(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-		}
-	}
-	if (PageAnonHead(page))
-		page->mapping = NULL;
-	bad += free_pages_check(page);
-	if (bad)
-		return false;
-
-	page_cpupid_reset_last(page);
-	page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
-	reset_page_owner(page, order);
-
-	if (!PageHighMem(page)) {
-		debug_check_no_locks_freed(page_address(page),
-					   PAGE_SIZE << order);
-		debug_check_no_obj_freed(page_address(page),
-					   PAGE_SIZE << order);
-	}
-	arch_free_page(page, order);
-	kernel_poison_pages(page, 1 << order, 0);
-	kernel_map_pages(page, 1 << order, 0);
-
-	return true;
-}
-
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
 	unsigned long flags;
 	int migratetype;
 	unsigned long pfn = page_to_pfn(page);
 
-	if (!free_pages_prepare(page, order))
+	if (!free_pages_prepare(page, order, true))
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
-- 
cgit v1.2.3


From 4e6118016eb7986109ad61b00186579f384f956a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:14:41 -0700
Subject: mm, page_alloc: uninline the bad page part of check_new_page()

Bad pages should be rare so the code handling them doesn't need to be
inline for performance reasons.  Put it into a separate function which
returns void.  This also assumes that the initial page_expected_state()
result will match the result of the thorough check, i.e.  the page
doesn't become "good" in the meantime.  This matches the same
expectations already in place in free_pages_check().

!DEBUG_VM bloat-o-meter:

  add/remove: 1/0 grow/shrink: 0/1 up/down: 134/-274 (-140)
  function                                     old     new   delta
  check_new_page_bad                             -     134    +134
  get_page_from_freelist                      3468    3194    -274

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7d8f642c498d..ecf663358b0d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1647,19 +1647,11 @@ static inline void expand(struct zone *zone, struct page *page,
 	}
 }
 
-/*
- * This page is about to be returned from the page allocator
- */
-static inline int check_new_page(struct page *page)
+static void check_new_page_bad(struct page *page)
 {
-	const char *bad_reason;
-	unsigned long bad_flags;
+	const char *bad_reason = NULL;
+	unsigned long bad_flags = 0;
 
-	if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))
-		return 0;
-
-	bad_reason = NULL;
-	bad_flags = 0;
 	if (unlikely(atomic_read(&page->_mapcount) != -1))
 		bad_reason = "nonzero mapcount";
 	if (unlikely(page->mapping != NULL))
@@ -1678,11 +1670,20 @@ static inline int check_new_page(struct page *page)
 	if (unlikely(page->mem_cgroup))
 		bad_reason = "page still charged to cgroup";
 #endif
-	if (unlikely(bad_reason)) {
-		bad_page(page, bad_reason, bad_flags);
-		return 1;
-	}
-	return 0;
+	bad_page(page, bad_reason, bad_flags);
+}
+
+/*
+ * This page is about to be returned from the page allocator
+ */
+static inline int check_new_page(struct page *page)
+{
+	if (likely(page_expected_state(page,
+				PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
+		return 0;
+
+	check_new_page_bad(page);
+	return 1;
 }
 
 static inline bool free_pages_prezeroed(bool poisoned)
-- 
cgit v1.2.3


From 4741526b83c5d3a3d661d1896f9e7414c5730bcb Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:44 -0700
Subject: mm, page_alloc: restore the original nodemask if the fast path
 allocation failed

The page allocator fast path uses either the requested nodemask or
cpuset_current_mems_allowed if cpusets are enabled.  If the allocation
context allows watermarks to be ignored then it can also ignore memory
policies.  However, on entering the allocator slowpath the nodemask may
still be cpuset_current_mems_allowed and the policies are enforced.
This patch resets the nodemask appropriately before entering the
slowpath.

Link: http://lkml.kernel.org/r/20160504143628.GU2858@techsingularity.net
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ecf663358b0d..5c469c1dfb8b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3636,6 +3636,12 @@ retry_cpuset:
 	alloc_mask = memalloc_noio_flags(gfp_mask);
 	ac.spread_dirty_pages = false;
 
+	/*
+	 * Restore the original nodemask if it was potentially replaced with
+	 * &cpuset_current_mems_allowed to optimize the fast-path attempt.
+	 */
+	if (cpusets_enabled())
+		ac.nodemask = nodemask;
 	page = __alloc_pages_slowpath(alloc_mask, order, &ac);
 
 no_zone:
-- 
cgit v1.2.3


From f0281a00fe80f0e689dd51e68c3aed5f6ef1bf58 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Fri, 20 May 2016 16:56:25 -0700
Subject: mm: workingset: only do workingset activations on reads

This is a follow-up to

  http://www.spinics.net/lists/linux-mm/msg101739.html

where Andres reported his database workingset being pushed out by the
minimum size enforcement of the inactive file list - currently 50% of
cache - as well as repeatedly written file pages that are never actually
read.

Two changes fell out of the discussions.  The first change observes that
pages that are only ever written don't benefit from caching beyond what
the writeback cache does for partial page writes, and so we shouldn't
promote them to the active file list where they compete with pages whose
cached data is actually accessed repeatedly.  This change comes in two
patches - one for in-cache write accesses and one for refaults triggered
by writes, neither of which should promote a cache page.

Second, with the refault detection we don't need to set 50% of the cache
aside for used-once cache anymore since we can detect frequently used
pages even when they are evicted between accesses.  We can allow the
active list to be bigger and thus protect a bigger workingset that isn't
challenged by streamers.  Depending on the access patterns, this can
increase major faults during workingset transitions for better
performance during stable phases.

This patch (of 3):

When rewriting a page, the data in that page is replaced with new data.
This means that evicting something else from the active file list, in
order to cache data that will be replaced by something else, is likely
to be a waste of memory.

It is better to save the active list for frequently read pages, because
reads actually use the data that is in the page.

This patch ignores partial writes, because it is unclear whether the
complexity of identifying those is worth any potential performance gain
obtained from better caching pages that see repeated partial writes at
large enough intervals to not get caught by the use-twice promotion code
used for the inactive file list.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index 01690338e3d2..beba6bd6b511 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -713,8 +713,12 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 		 * The page might have been evicted from cache only
 		 * recently, in which case it should be activated like
 		 * any other repeatedly accessed page.
+		 * The exception is pages getting rewritten; evicting other
+		 * data from the working set, only to cache data that will
+		 * get overwritten with something else, is a waste of memory.
 		 */
-		if (shadow && workingset_refault(shadow)) {
+		if (!(gfp_mask & __GFP_WRITE) &&
+		    shadow && workingset_refault(shadow)) {
 			SetPageActive(page);
 			workingset_activation(page);
 		} else
-- 
cgit v1.2.3


From bbddabe2e436aa7869b3ac5248df5c14ddde0cbf Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 20 May 2016 16:56:28 -0700
Subject: mm: filemap: only do access activations on reads

Andres observed that his database workload is struggling with the
transaction journal creating pressure on frequently read pages.

Access patterns like transaction journals frequently write the same
pages over and over, but in the majority of cases those pages are never
read back.  There are no caching benefits to be had for those pages, so
activating them and having them put pressure on pages that do benefit
from caching is a bad choice.

Leave page activations to read accesses and don't promote pages based on
writes alone.

It could be said that partially written pages do contain cache-worthy
data, because even if *userspace* does not access the unwritten part,
the kernel still has to read it from the filesystem for correctness.
However, a counter argument is that these pages enjoy at least *some*
protection over other inactive file pages through the writeback cache,
in the sense that dirty pages are written back with a delay and cache
reclaim leaves them alone until they have been written back to disk.
Should that turn out to be insufficient and we see increased read IO
from partial writes under memory pressure, we can always go back and
update grab_cache_page_write_begin() to take (pos, len) so that it can
tell partial writes from pages that don't need partial reads.  But for
now, keep it simple.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Andres Freund <andres@anarazel.de>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/filemap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index beba6bd6b511..8f4859989f1b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2578,7 +2578,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
 					pgoff_t index, unsigned flags)
 {
 	struct page *page;
-	int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT;
+	int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT;
 
 	if (flags & AOP_FLAG_NOFS)
 		fgp_flags |= FGP_NOFS;
-- 
cgit v1.2.3


From 59dc76b0d4dfdd7dc46a1010e4afb44f60f3e97f Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Fri, 20 May 2016 16:56:31 -0700
Subject: mm: vmscan: reduce size of inactive file list

The inactive file list should still be large enough to contain readahead
windows and freshly written file data, but it no longer is the only
source for detecting multiple accesses to file pages.  The workingset
refault measurement code causes recently evicted file pages that get
accessed again after a shorter interval to be promoted directly to the
active list.

With that mechanism in place, we can afford to (on a larger system)
dedicate more memory to the active file list, so we can actually cache
more of the frequently used file pages in memory, and not have them
pushed out by streaming writes, once-used streaming file reads, etc.

This can help things like database workloads, where only half the page
cache can currently be used to cache the database working set.  This
patch automatically increases that fraction on larger systems, using the
same ratio that has already been used for anonymous memory.

[hannes@cmpxchg.org: cgroup-awareness]
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reported-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  25 -----------
 mm/page_alloc.c            |  44 -------------------
 mm/vmscan.c                | 104 ++++++++++++++++++---------------------------
 3 files changed, 42 insertions(+), 131 deletions(-)

(limited to 'mm')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 94da96738df3..a805474df4ab 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -415,25 +415,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 	return mz->lru_size[lru];
 }
 
-static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-	unsigned long inactive_ratio;
-	unsigned long inactive;
-	unsigned long active;
-	unsigned long gb;
-
-	inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
-	active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
-
-	gb = (inactive + active) >> (30 - PAGE_SHIFT);
-	if (gb)
-		inactive_ratio = int_sqrt(10 * gb);
-	else
-		inactive_ratio = 1;
-
-	return inactive * inactive_ratio < active;
-}
-
 void mem_cgroup_handle_over_high(void);
 
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
@@ -646,12 +627,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
 	return true;
 }
 
-static inline bool
-mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
-{
-	return true;
-}
-
 static inline unsigned long
 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5c469c1dfb8b..edbdf56b3c9b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6670,49 +6670,6 @@ void setup_per_zone_wmarks(void)
 	mutex_unlock(&zonelists_mutex);
 }
 
-/*
- * The inactive anon list should be small enough that the VM never has to
- * do too much work, but large enough that each inactive page has a chance
- * to be referenced again before it is swapped out.
- *
- * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
- * INACTIVE_ANON pages on this zone's LRU, maintained by the
- * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
- * the anonymous pages are kept on the inactive list.
- *
- * total     target    max
- * memory    ratio     inactive anon
- * -------------------------------------
- *   10MB       1         5MB
- *  100MB       1        50MB
- *    1GB       3       250MB
- *   10GB      10       0.9GB
- *  100GB      31         3GB
- *    1TB     101        10GB
- *   10TB     320        32GB
- */
-static void __meminit calculate_zone_inactive_ratio(struct zone *zone)
-{
-	unsigned int gb, ratio;
-
-	/* Zone size in gigabytes */
-	gb = zone->managed_pages >> (30 - PAGE_SHIFT);
-	if (gb)
-		ratio = int_sqrt(10 * gb);
-	else
-		ratio = 1;
-
-	zone->inactive_ratio = ratio;
-}
-
-static void __meminit setup_per_zone_inactive_ratio(void)
-{
-	struct zone *zone;
-
-	for_each_zone(zone)
-		calculate_zone_inactive_ratio(zone);
-}
-
 /*
  * Initialise min_free_kbytes.
  *
@@ -6758,7 +6715,6 @@ int __meminit init_per_zone_wmark_min(void)
 	setup_per_zone_wmarks();
 	refresh_zone_stat_thresholds();
 	setup_per_zone_lowmem_reserve();
-	setup_per_zone_inactive_ratio();
 	return 0;
 }
 core_initcall(init_per_zone_wmark_min)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcfdfc1a0942..38d6d06c955f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1862,83 +1862,63 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	free_hot_cold_page_list(&l_hold, true);
 }
 
-#ifdef CONFIG_SWAP
-static bool inactive_anon_is_low_global(struct zone *zone)
-{
-	unsigned long active, inactive;
-
-	active = zone_page_state(zone, NR_ACTIVE_ANON);
-	inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-
-	return inactive * zone->inactive_ratio < active;
-}
-
-/**
- * inactive_anon_is_low - check if anonymous pages need to be deactivated
- * @lruvec: LRU vector to check
+/*
+ * The inactive anon list should be small enough that the VM never has
+ * to do too much work.
  *
- * Returns true if the zone does not have enough inactive anon pages,
- * meaning some active anon pages need to be deactivated.
- */
-static bool inactive_anon_is_low(struct lruvec *lruvec)
-{
-	/*
-	 * If we don't have swap space, anonymous page deactivation
-	 * is pointless.
-	 */
-	if (!total_swap_pages)
-		return false;
-
-	if (!mem_cgroup_disabled())
-		return mem_cgroup_inactive_anon_is_low(lruvec);
-
-	return inactive_anon_is_low_global(lruvec_zone(lruvec));
-}
-#else
-static inline bool inactive_anon_is_low(struct lruvec *lruvec)
-{
-	return false;
-}
-#endif
-
-/**
- * inactive_file_is_low - check if file pages need to be deactivated
- * @lruvec: LRU vector to check
+ * The inactive file list should be small enough to leave most memory
+ * to the established workingset on the scan-resistant active list,
+ * but large enough to avoid thrashing the aggregate readahead window.
  *
- * When the system is doing streaming IO, memory pressure here
- * ensures that active file pages get deactivated, until more
- * than half of the file pages are on the inactive list.
+ * Both inactive lists should also be large enough that each inactive
+ * page has a chance to be referenced again before it is reclaimed.
  *
- * Once we get to that situation, protect the system's working
- * set from being evicted by disabling active file page aging.
+ * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
+ * on this LRU, maintained by the pageout code. A zone->inactive_ratio
+ * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
  *
- * This uses a different ratio than the anonymous pages, because
- * the page cache uses a use-once replacement algorithm.
+ * total     target    max
+ * memory    ratio     inactive
+ * -------------------------------------
+ *   10MB       1         5MB
+ *  100MB       1        50MB
+ *    1GB       3       250MB
+ *   10GB      10       0.9GB
+ *  100GB      31         3GB
+ *    1TB     101        10GB
+ *   10TB     320        32GB
  */
-static bool inactive_file_is_low(struct lruvec *lruvec)
+static bool inactive_list_is_low(struct lruvec *lruvec, bool file)
 {
+	unsigned long inactive_ratio;
 	unsigned long inactive;
 	unsigned long active;
+	unsigned long gb;
 
-	inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE);
-	active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	/*
+	 * If we don't have swap space, anonymous page deactivation
+	 * is pointless.
+	 */
+	if (!file && !total_swap_pages)
+		return false;
 
-	return active > inactive;
-}
+	inactive = lruvec_lru_size(lruvec, file * LRU_FILE);
+	active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE);
 
-static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru)
-{
-	if (is_file_lru(lru))
-		return inactive_file_is_low(lruvec);
+	gb = (inactive + active) >> (30 - PAGE_SHIFT);
+	if (gb)
+		inactive_ratio = int_sqrt(10 * gb);
 	else
-		return inactive_anon_is_low(lruvec);
+		inactive_ratio = 1;
+
+	return inactive * inactive_ratio < active;
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
 				 struct lruvec *lruvec, struct scan_control *sc)
 {
 	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, lru))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru)))
 			shrink_active_list(nr_to_scan, lruvec, sc, lru);
 		return 0;
 	}
@@ -2059,7 +2039,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
 	 * lruvec even if it has plenty of old anonymous pages unless the
 	 * system is under heavy pressure.
 	 */
-	if (!inactive_file_is_low(lruvec) &&
+	if (!inactive_list_is_low(lruvec, true) &&
 	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) {
 		scan_balance = SCAN_FILE;
 		goto out;
@@ -2301,7 +2281,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg,
 	 * Even if we did not try to evict anon pages at all, we want to
 	 * rebalance the anon lru active/inactive ratio.
 	 */
-	if (inactive_anon_is_low(lruvec))
+	if (inactive_list_is_low(lruvec, false))
 		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 				   sc, LRU_ACTIVE_ANON);
 
@@ -2962,7 +2942,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc)
 	do {
 		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 
-		if (inactive_anon_is_low(lruvec))
+		if (inactive_list_is_low(lruvec, false))
 			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
 					   sc, LRU_ACTIVE_ANON);
 
-- 
cgit v1.2.3


From b6459cc154e804f0de0d61fa023c4946b742cc96 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:34 -0700
Subject: vmscan: consider classzone_idx in compaction_ready

Motivation:
As pointed out by Linus [2][3] relying on zone_reclaimable as a way to
communicate the reclaim progress is rather dubious. I tend to agree,
not only it is really obscure, it is not hard to imagine cases where a
single page freed in the loop keeps all the reclaimers looping without
getting any progress because their gfp_mask wouldn't allow to get that
page anyway (e.g. single GFP_ATOMIC alloc and free loop). This is rather
rare so it doesn't happen in the practice but the current logic which we
have is rather obscure, hard to follow and also non-deterministic.

This is an attempt to make the OOM detection more deterministic and
easier to follow because each reclaimer basically tracks its own
progress which is implemented at the page allocator layer rather than
spread out between the allocator and the reclaim.  More on the
implementation is described in the first patch.

I have tested several different scenarios but it should be clear that
testing OOM killer is quite hard to be representative.  There is usually
a tiny gap between almost OOM and full blown OOM which is often time
sensitive.  Anyway, I have tested the following 2 scenarios and I would
appreciate if there are more to test.

Testing environment: a virtual machine with 2G of RAM and 2CPUs without
any swap to make the OOM more deterministic.

1) 2 writers (each doing dd with 4M blocks to an xfs partition with 1G
   file size, removes the files and starts over again) running in
   parallel for 10s to build up a lot of dirty pages when 100 parallel
   mem_eaters (anon private populated mmap which waits until it gets
   signal) with 80M each.

   This causes an OOM flood of course and I have compared both patched
   and unpatched kernels. The test is considered finished after there
   are no OOM conditions detected. This should tell us whether there are
   any excessive kills or some of them premature (e.g. due to dirty pages):

I have performed two runs this time each after a fresh boot.

* base kernel
$ grep "Out of memory:" base-oom-run1.log | wc -l
78
$ grep "Out of memory:" base-oom-run2.log | wc -l
78

$ grep "Kill process" base-oom-run1.log | tail -n1
[   91.391203] Out of memory: Kill process 3061 (mem_eater) score 39 or sacrifice child
$ grep "Kill process" base-oom-run2.log | tail -n1
[   82.141919] Out of memory: Kill process 3086 (mem_eater) score 39 or sacrifice child

$ grep "DMA32 free:" base-oom-run1.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk
min: 5376.00 max: 6776.00 avg: 5530.75 std: 166.50 nr: 61
$ grep "DMA32 free:" base-oom-run2.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk
min: 5416.00 max: 5608.00 avg: 5514.15 std: 42.94 nr: 52

$ grep "DMA32.*all_unreclaimable? no" base-oom-run1.log | wc -l
1
$ grep "DMA32.*all_unreclaimable? no" base-oom-run2.log | wc -l
3

* patched kernel
$ grep "Out of memory:" patched-oom-run1.log | wc -l
78
miso@tiehlicka /mnt/share/devel/miso/kvm $ grep "Out of memory:" patched-oom-run2.log | wc -l
77

$ grep "Kill process" patched-oom-run1.log | tail -n1
[  497.317732] Out of memory: Kill process 3108 (mem_eater) score 39 or sacrifice child
$ grep "Kill process" patched-oom-run2.log | tail -n1
[  316.169920] Out of memory: Kill process 3093 (mem_eater) score 39 or sacrifice child

$ grep "DMA32 free:" patched-oom-run1.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk
min: 5420.00 max: 5808.00 avg: 5513.90 std: 60.45 nr: 78
$ grep "DMA32 free:" patched-oom-run2.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk
min: 5380.00 max: 6384.00 avg: 5520.94 std: 136.84 nr: 77

$ grep "DMA32.*all_unreclaimable? no" patched-oom-run1.log | wc -l
2
$ grep "DMA32.*all_unreclaimable? no" patched-oom-run2.log | wc -l
3

The patched kernel ran noticeably longer while invoking the OOM killer the
same number of times. This means that the original implementation is much
more aggressive and triggers the OOM killer sooner. free pages stats
show that neither kernels went OOM too early most of the time, though. I
guess the difference is in the backoff when retries without any progress
do sleep for a while if there is memory under writeback or dirty which
is highly likely considering the parallel IO.
Both kernels have seen races where zone wasn't marked unreclaimable
and we still hit the OOM killer. This is most likely a race where
a task managed to exit between the last allocation attempt and the oom
killer invocation.

2) 2 writers again with 10s of run and then 10 mem_eaters to consume as much
   memory as possible without triggering the OOM killer. This required a lot
   of tuning but I've considered 3 consecutive runs in three different boots
   without OOM as a success.

* base kernel
size=$(awk '/MemFree/{printf "%dK", ($2/10)-(16*1024)}' /proc/meminfo)

* patched kernel
size=$(awk '/MemFree/{printf "%dK", ($2/10)-(12*1024)}' /proc/meminfo)

That means 40M more memory was usable without triggering OOM killer. The
base kernel sometimes managed to handle the same as patched but it
wasn't consistent and failed in at least one of the 3 runs. This seems
like a minor improvement.

I was also testing __GFP_REPEAT costly requests (hugetlb) with fragmented
memory and under memory pressure. The results are in patch 11 where the
logic is implemented. In short I can see huge improvement there.

I am certainly interested in other usecases as well as any
feedback. Especially those which require higher order requests.

This patch (of 14):

While playing with the oom detection rework [1] I have noticed that my
heavy order-9 (hugetlb) load close to OOM ended up in an endless loop
where the reclaim hasn't made any progress but did_some_progress didn't
reflect that and compaction_suitable was backing off because no zone is
above low wmark + 1 << order.

It turned out that this is in fact an old standing bug in
compaction_ready which ignores the requested_highidx and did the
watermark check for 0 classzone_idx.  This succeeds for zone DMA most
of the time as the zone is mostly unused because of lowmem protection.
As a result costly high order allocations always report successful
progress even when there was none.  This wasn't a problem so far
because these allocations usually fail quite early or retry only few
times with __GFP_REPEAT but this will change after later patch in this
series so make sure to not lie about the progress and propagate
requested_highidx down to compaction_ready and use it for both the
watermark check and compaction_suitable to fix this issue.

[1] http://lkml.kernel.org/r/1459855533-4600-1-git-send-email-mhocko@kernel.org
[2] https://lkml.org/lkml/2015/10/12/808
[3] https://lkml.org/lkml/2015/10/13/597

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmscan.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 38d6d06c955f..a386454c015a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2459,7 +2459,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
  * Returns true if compaction should go ahead for a high-order request, or
  * the high-order allocation would succeed without compaction.
  */
-static inline bool compaction_ready(struct zone *zone, int order)
+static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx)
 {
 	unsigned long balance_gap, watermark;
 	bool watermark_ok;
@@ -2473,7 +2473,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
 	balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP(
 			zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO));
 	watermark = high_wmark_pages(zone) + balance_gap + (2UL << order);
-	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0);
+	watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx);
 
 	/*
 	 * If compaction is deferred, reclaim up to a point where
@@ -2486,7 +2486,7 @@ static inline bool compaction_ready(struct zone *zone, int order)
 	 * If compaction is not ready to start and allocation is not likely
 	 * to succeed without it, then keep reclaiming.
 	 */
-	if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED)
+	if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED)
 		return false;
 
 	return watermark_ok;
@@ -2566,7 +2566,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			if (IS_ENABLED(CONFIG_COMPACTION) &&
 			    sc->order > PAGE_ALLOC_COSTLY_ORDER &&
 			    zonelist_zone_idx(z) <= requested_highidx &&
-			    compaction_ready(zone, sc->order)) {
+			    compaction_ready(zone, sc->order, requested_highidx)) {
 				sc->compaction_ready = true;
 				continue;
 			}
-- 
cgit v1.2.3


From ea7ab982b6bdb7ce218fd3a7850bb2e2b414fdd0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:38 -0700
Subject: mm, compaction: change COMPACT_ constants into enum

Compaction code is doing weird dances between COMPACT_FOO -> int ->
unsigned long

But there doesn't seem to be any reason for that.  All functions which
return/use one of those constants are not expecting any other value so it
really makes sense to define an enum for them and make it clear that no
other values are expected.

This is a pure cleanup and shouldn't introduce any functional changes.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h | 45 +++++++++++++++++++++++++++------------------
 mm/compaction.c            | 27 ++++++++++++++-------------
 mm/page_alloc.c            |  2 +-
 3 files changed, 42 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 242b660f64e6..706cbf00e919 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -2,21 +2,29 @@
 #define _LINUX_COMPACTION_H
 
 /* Return values for compact_zone() and try_to_compact_pages() */
-/* compaction didn't start as it was deferred due to past failures */
-#define COMPACT_DEFERRED	0
-/* compaction didn't start as it was not possible or direct reclaim was more suitable */
-#define COMPACT_SKIPPED		1
-/* compaction should continue to another pageblock */
-#define COMPACT_CONTINUE	2
-/* direct compaction partially compacted a zone and there are suitable pages */
-#define COMPACT_PARTIAL		3
-/* The full zone was compacted */
-#define COMPACT_COMPLETE	4
-/* For more detailed tracepoint output */
-#define COMPACT_NO_SUITABLE_PAGE	5
-#define COMPACT_NOT_SUITABLE_ZONE	6
-#define COMPACT_CONTENDED		7
 /* When adding new states, please adjust include/trace/events/compaction.h */
+enum compact_result {
+	/* compaction didn't start as it was deferred due to past failures */
+	COMPACT_DEFERRED,
+	/*
+	 * compaction didn't start as it was not possible or direct reclaim
+	 * was more suitable
+	 */
+	COMPACT_SKIPPED,
+	/* compaction should continue to another pageblock */
+	COMPACT_CONTINUE,
+	/*
+	 * direct compaction partially compacted a zone and there are suitable
+	 * pages
+	 */
+	COMPACT_PARTIAL,
+	/* The full zone was compacted */
+	COMPACT_COMPLETE,
+	/* For more detailed tracepoint output */
+	COMPACT_NO_SUITABLE_PAGE,
+	COMPACT_NOT_SUITABLE_ZONE,
+	COMPACT_CONTENDED,
+};
 
 /* Used to signal whether compaction detected need_sched() or lock contention */
 /* No contention detected */
@@ -38,12 +46,13 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write,
 extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
-extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+extern enum compact_result try_to_compact_pages(gfp_t gfp_mask,
+			unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
-extern unsigned long compaction_suitable(struct zone *zone, int order,
+extern enum compact_result compaction_suitable(struct zone *zone, int order,
 		unsigned int alloc_flags, int classzone_idx);
 
 extern void defer_compaction(struct zone *zone, int order);
@@ -57,7 +66,7 @@ extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
 
 #else
-static inline unsigned long try_to_compact_pages(gfp_t gfp_mask,
+static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask,
 			unsigned int order, int alloc_flags,
 			const struct alloc_context *ac,
 			enum migrate_mode mode, int *contended)
@@ -73,7 +82,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat)
 {
 }
 
-static inline unsigned long compaction_suitable(struct zone *zone, int order,
+static inline enum compact_result compaction_suitable(struct zone *zone, int order,
 					int alloc_flags, int classzone_idx)
 {
 	return COMPACT_SKIPPED;
diff --git a/mm/compaction.c b/mm/compaction.c
index eda3c2244f30..e721d252c5d2 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1229,7 +1229,7 @@ static inline bool is_via_compact_memory(int order)
 	return order == -1;
 }
 
-static int __compact_finished(struct zone *zone, struct compact_control *cc,
+static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc,
 			    const int migratetype)
 {
 	unsigned int order;
@@ -1292,8 +1292,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
 	return COMPACT_NO_SUITABLE_PAGE;
 }
 
-static int compact_finished(struct zone *zone, struct compact_control *cc,
-			    const int migratetype)
+static enum compact_result compact_finished(struct zone *zone,
+			struct compact_control *cc,
+			const int migratetype)
 {
 	int ret;
 
@@ -1312,7 +1313,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
  *   COMPACT_PARTIAL  - If the allocation would succeed without compaction
  *   COMPACT_CONTINUE - If compaction should run now
  */
-static unsigned long __compaction_suitable(struct zone *zone, int order,
+static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
 					int classzone_idx)
 {
@@ -1358,11 +1359,11 @@ static unsigned long __compaction_suitable(struct zone *zone, int order,
 	return COMPACT_CONTINUE;
 }
 
-unsigned long compaction_suitable(struct zone *zone, int order,
+enum compact_result compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
 					int classzone_idx)
 {
-	unsigned long ret;
+	enum compact_result ret;
 
 	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
 	trace_mm_compaction_suitable(zone, order, ret);
@@ -1372,9 +1373,9 @@ unsigned long compaction_suitable(struct zone *zone, int order,
 	return ret;
 }
 
-static int compact_zone(struct zone *zone, struct compact_control *cc)
+static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
 {
-	int ret;
+	enum compact_result ret;
 	unsigned long start_pfn = zone->zone_start_pfn;
 	unsigned long end_pfn = zone_end_pfn(zone);
 	const int migratetype = gfpflags_to_migratetype(cc->gfp_mask);
@@ -1530,11 +1531,11 @@ out:
 	return ret;
 }
 
-static unsigned long compact_zone_order(struct zone *zone, int order,
+static enum compact_result compact_zone_order(struct zone *zone, int order,
 		gfp_t gfp_mask, enum migrate_mode mode, int *contended,
 		unsigned int alloc_flags, int classzone_idx)
 {
-	unsigned long ret;
+	enum compact_result ret;
 	struct compact_control cc = {
 		.nr_freepages = 0,
 		.nr_migratepages = 0,
@@ -1572,7 +1573,7 @@ int sysctl_extfrag_threshold = 500;
  *
  * This is the main entry point for direct page compaction.
  */
-unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, int *contended)
 {
@@ -1580,7 +1581,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 	int may_perform_io = gfp_mask & __GFP_IO;
 	struct zoneref *z;
 	struct zone *zone;
-	int rc = COMPACT_DEFERRED;
+	enum compact_result rc = COMPACT_DEFERRED;
 	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
 
 	*contended = COMPACT_CONTENDED_NONE;
@@ -1594,7 +1595,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 	/* Compact each zone in the list */
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 								ac->nodemask) {
-		int status;
+		enum compact_result status;
 		int zone_contended;
 
 		if (compaction_deferred(zone, order))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index edbdf56b3c9b..ed62c4b90598 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3188,7 +3188,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		enum migrate_mode mode, int *contended_compaction,
 		bool *deferred_compaction)
 {
-	unsigned long compact_result;
+	enum compact_result compact_result;
 	struct page *page;
 
 	if (!order)
-- 
cgit v1.2.3


From c46649deae3f00aa8ba8716f0ddb8eef2dc9532f Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:41 -0700
Subject: mm, compaction: cover all compaction mode in compact_zone

The compiler is complaining after "mm, compaction: change COMPACT_
constants into enum"

  mm/compaction.c: In function `compact_zone':
  mm/compaction.c:1350:2: warning: enumeration value `COMPACT_DEFERRED' not handled in switch [-Wswitch]
    switch (ret) {
    ^
  mm/compaction.c:1350:2: warning: enumeration value `COMPACT_COMPLETE' not handled in switch [-Wswitch]
  mm/compaction.c:1350:2: warning: enumeration value `COMPACT_NO_SUITABLE_PAGE' not handled in switch [-Wswitch]
  mm/compaction.c:1350:2: warning: enumeration value `COMPACT_NOT_SUITABLE_ZONE' not handled in switch [-Wswitch]
  mm/compaction.c:1350:2: warning: enumeration value `COMPACT_CONTENDED' not handled in switch [-Wswitch]

compaction_suitable is allowed to return only COMPACT_PARTIAL,
COMPACT_SKIPPED and COMPACT_CONTINUE so other cases are simply
impossible.  Put a VM_BUG_ON to catch an impossible return value.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index e721d252c5d2..455ecd87f48d 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1383,15 +1383,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 
 	ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
 							cc->classzone_idx);
-	switch (ret) {
-	case COMPACT_PARTIAL:
-	case COMPACT_SKIPPED:
-		/* Compaction is likely to fail */
+	/* Compaction is likely to fail */
+	if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED)
 		return ret;
-	case COMPACT_CONTINUE:
-		/* Fall through to compaction */
-		;
-	}
+
+	/* huh, compaction_suitable is returning something unexpected */
+	VM_BUG_ON(ret != COMPACT_CONTINUE);
 
 	/*
 	 * Clear pageblock skip if there were failures recently and compaction
-- 
cgit v1.2.3


From 1d4746d395975e0ff5103e20ab169d1a95b4ef9e Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:44 -0700
Subject: mm, compaction: distinguish COMPACT_DEFERRED from COMPACT_SKIPPED

try_to_compact_pages() can currently return COMPACT_SKIPPED even when
the compaction is deferred for some zone just because zone DMA is skipped
in 99% of cases due to watermark checks.  This makes COMPACT_DEFERRED
basically unusable for the page allocator as a feedback mechanism.

Make sure we distinguish those two states properly and switch their
ordering in the enum.  This would mean that the COMPACT_SKIPPED will be
returned only when all eligible zones are skipped.

As a result COMPACT_DEFERRED handling for THP in __alloc_pages_slowpath
will be more precise and we would bail out rather than reclaim.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h        | 7 +++++--
 include/trace/events/compaction.h | 2 +-
 mm/compaction.c                   | 8 +++++---
 3 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 706cbf00e919..11f228712ed5 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -4,13 +4,16 @@
 /* Return values for compact_zone() and try_to_compact_pages() */
 /* When adding new states, please adjust include/trace/events/compaction.h */
 enum compact_result {
-	/* compaction didn't start as it was deferred due to past failures */
-	COMPACT_DEFERRED,
 	/*
 	 * compaction didn't start as it was not possible or direct reclaim
 	 * was more suitable
 	 */
 	COMPACT_SKIPPED,
+	/* compaction didn't start as it was deferred due to past failures */
+	COMPACT_DEFERRED,
+	/* compaction not active last round */
+	COMPACT_INACTIVE = COMPACT_DEFERRED,
+
 	/* compaction should continue to another pageblock */
 	COMPACT_CONTINUE,
 	/*
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index e215bf68f521..6ba16c86d7db 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -10,8 +10,8 @@
 #include <trace/events/mmflags.h>
 
 #define COMPACTION_STATUS					\
-	EM( COMPACT_DEFERRED,		"deferred")		\
 	EM( COMPACT_SKIPPED,		"skipped")		\
+	EM( COMPACT_DEFERRED,		"deferred")		\
 	EM( COMPACT_CONTINUE,		"continue")		\
 	EM( COMPACT_PARTIAL,		"partial")		\
 	EM( COMPACT_COMPLETE,		"complete")		\
diff --git a/mm/compaction.c b/mm/compaction.c
index 455ecd87f48d..b2b94474dd28 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1578,7 +1578,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 	int may_perform_io = gfp_mask & __GFP_IO;
 	struct zoneref *z;
 	struct zone *zone;
-	enum compact_result rc = COMPACT_DEFERRED;
+	enum compact_result rc = COMPACT_SKIPPED;
 	int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */
 
 	*contended = COMPACT_CONTENDED_NONE;
@@ -1595,8 +1595,10 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 		enum compact_result status;
 		int zone_contended;
 
-		if (compaction_deferred(zone, order))
+		if (compaction_deferred(zone, order)) {
+			rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
 			continue;
+		}
 
 		status = compact_zone_order(zone, order, gfp_mask, mode,
 				&zone_contended, alloc_flags,
@@ -1667,7 +1669,7 @@ break_loop:
 	 * If at least one zone wasn't deferred or skipped, we report if all
 	 * zones that were tried were lock contended.
 	 */
-	if (rc > COMPACT_SKIPPED && all_zones_contended)
+	if (rc > COMPACT_INACTIVE && all_zones_contended)
 		*contended = COMPACT_CONTENDED_LOCK;
 
 	return rc;
-- 
cgit v1.2.3


From c8f7de0bfae36e8532e5e25a39d15407f02aca78 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:47 -0700
Subject: mm, compaction: distinguish between full and partial COMPACT_COMPLETE

COMPACT_COMPLETE now means that compaction and free scanner met.  This
is not very useful information if somebody just wants to use this
feedback and make any decisions based on that.  The current caller might
be a poor guy who just happened to scan tiny portion of the zone and
that could be the reason no suitable pages were compacted.  Make sure we
distinguish the full and partial zone walks.

Consumers should treat COMPACT_PARTIAL_SKIPPED as a potential success
and be optimistic in retrying.

The existing users of COMPACT_COMPLETE are conservatively changed to use
COMPACT_PARTIAL_SKIPPED as well but some of them should probably be
reconsidered and defer the compaction only for COMPACT_COMPLETE
with the new semantic.

This patch shouldn't introduce any functional changes.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h        | 10 +++++++++-
 include/trace/events/compaction.h |  1 +
 mm/compaction.c                   | 14 +++++++++++---
 mm/internal.h                     |  1 +
 4 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 11f228712ed5..9b37f9d3f7a8 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -21,7 +21,15 @@ enum compact_result {
 	 * pages
 	 */
 	COMPACT_PARTIAL,
-	/* The full zone was compacted */
+	/*
+	 * direct compaction has scanned part of the zone but wasn't successfull
+	 * to compact suitable pages.
+	 */
+	COMPACT_PARTIAL_SKIPPED,
+	/*
+	 * The full zone was compacted scanned but wasn't successfull to compact
+	 * suitable pages.
+	 */
 	COMPACT_COMPLETE,
 	/* For more detailed tracepoint output */
 	COMPACT_NO_SUITABLE_PAGE,
diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h
index 6ba16c86d7db..36e2d6fb1360 100644
--- a/include/trace/events/compaction.h
+++ b/include/trace/events/compaction.h
@@ -14,6 +14,7 @@
 	EM( COMPACT_DEFERRED,		"deferred")		\
 	EM( COMPACT_CONTINUE,		"continue")		\
 	EM( COMPACT_PARTIAL,		"partial")		\
+	EM( COMPACT_PARTIAL_SKIPPED,	"partial_skipped")	\
 	EM( COMPACT_COMPLETE,		"complete")		\
 	EM( COMPACT_NO_SUITABLE_PAGE,	"no_suitable_page")	\
 	EM( COMPACT_NOT_SUITABLE_ZONE,	"not_suitable_zone")	\
diff --git a/mm/compaction.c b/mm/compaction.c
index b2b94474dd28..4af1577adb5c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1252,7 +1252,10 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_
 		if (cc->direct_compaction)
 			zone->compact_blockskip_flush = true;
 
-		return COMPACT_COMPLETE;
+		if (cc->whole_zone)
+			return COMPACT_COMPLETE;
+		else
+			return COMPACT_PARTIAL_SKIPPED;
 	}
 
 	if (is_via_compact_memory(cc->order))
@@ -1413,6 +1416,10 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
 		zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
 		zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
 	}
+
+	if (cc->migrate_pfn == start_pfn)
+		cc->whole_zone = true;
+
 	cc->last_migrated_pfn = 0;
 
 	trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
@@ -1634,7 +1641,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
 			goto break_loop;
 		}
 
-		if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) {
+		if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE ||
+					status == COMPACT_PARTIAL_SKIPPED)) {
 			/*
 			 * We think that allocation won't succeed in this zone
 			 * so we defer compaction there. If it ends up
@@ -1881,7 +1889,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 						cc.classzone_idx, 0)) {
 			success = true;
 			compaction_defer_reset(zone, cc.order, false);
-		} else if (status == COMPACT_COMPLETE) {
+		} else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
 			/*
 			 * We use sync migration mode here, so we defer like
 			 * sync direct compaction does.
diff --git a/mm/internal.h b/mm/internal.h
index 3ac544f1963f..f6f3353b0868 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -174,6 +174,7 @@ struct compact_control {
 	enum migrate_mode mode;		/* Async or sync migration mode */
 	bool ignore_skip_hint;		/* Scan blocks even if marked skip */
 	bool direct_compaction;		/* False from kcompactd or /proc/... */
+	bool whole_zone;		/* Whole zone has been scanned */
 	int order;			/* order a direct compactor needs */
 	const gfp_t gfp_mask;		/* gfp mask of a direct compactor */
 	const unsigned int alloc_flags;	/* alloc flags of a direct compactor */
-- 
cgit v1.2.3


From c5d01d0d18e2ab7a21f0371b00e4d1a06f79cdf5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:56:53 -0700
Subject: mm, compaction: simplify __alloc_pages_direct_compact feedback
 interface

__alloc_pages_direct_compact communicates potential back off by two
variables:
	- deferred_compaction tells that the compaction returned
	  COMPACT_DEFERRED
	- contended_compaction is set when there is a contention on
	  zone->lock resp. zone->lru_lock locks

__alloc_pages_slowpath then backs off for THP allocation requests to
prevent long stalls. This is rather messy and it would be much
cleaner to return a single compact result value and hide all the nasty
details into __alloc_pages_direct_compact.

This patch shouldn't introduce any functional changes.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 67 ++++++++++++++++++++++++++-------------------------------
 1 file changed, 31 insertions(+), 36 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ed62c4b90598..8bcc10616fab 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3185,29 +3185,21 @@ out:
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		enum migrate_mode mode, int *contended_compaction,
-		bool *deferred_compaction)
+		enum migrate_mode mode, enum compact_result *compact_result)
 {
-	enum compact_result compact_result;
 	struct page *page;
+	int contended_compaction;
 
 	if (!order)
 		return NULL;
 
 	current->flags |= PF_MEMALLOC;
-	compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
-						mode, contended_compaction);
+	*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+						mode, &contended_compaction);
 	current->flags &= ~PF_MEMALLOC;
 
-	switch (compact_result) {
-	case COMPACT_DEFERRED:
-		*deferred_compaction = true;
-		/* fall-through */
-	case COMPACT_SKIPPED:
+	if (*compact_result <= COMPACT_INACTIVE)
 		return NULL;
-	default:
-		break;
-	}
 
 	/*
 	 * At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3233,6 +3225,24 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	 */
 	count_vm_event(COMPACTFAIL);
 
+	/*
+	 * In all zones where compaction was attempted (and not
+	 * deferred or skipped), lock contention has been detected.
+	 * For THP allocation we do not want to disrupt the others
+	 * so we fallback to base pages instead.
+	 */
+	if (contended_compaction == COMPACT_CONTENDED_LOCK)
+		*compact_result = COMPACT_CONTENDED;
+
+	/*
+	 * If compaction was aborted due to need_resched(), we do not
+	 * want to further increase allocation latency, unless it is
+	 * khugepaged trying to collapse.
+	 */
+	if (contended_compaction == COMPACT_CONTENDED_SCHED
+		&& !(current->flags & PF_KTHREAD))
+		*compact_result = COMPACT_CONTENDED;
+
 	cond_resched();
 
 	return NULL;
@@ -3241,8 +3251,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
-		enum migrate_mode mode, int *contended_compaction,
-		bool *deferred_compaction)
+		enum migrate_mode mode, enum compact_result *compact_result)
 {
 	return NULL;
 }
@@ -3387,8 +3396,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
-	bool deferred_compaction = false;
-	int contended_compaction = COMPACT_CONTENDED_NONE;
+	enum compact_result compact_result;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3475,8 +3483,7 @@ retry:
 	 */
 	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
 					migration_mode,
-					&contended_compaction,
-					&deferred_compaction);
+					&compact_result);
 	if (page)
 		goto got_pg;
 
@@ -3489,25 +3496,14 @@ retry:
 		 * to heavily disrupt the system, so we fail the allocation
 		 * instead of entering direct reclaim.
 		 */
-		if (deferred_compaction)
-			goto nopage;
-
-		/*
-		 * In all zones where compaction was attempted (and not
-		 * deferred or skipped), lock contention has been detected.
-		 * For THP allocation we do not want to disrupt the others
-		 * so we fallback to base pages instead.
-		 */
-		if (contended_compaction == COMPACT_CONTENDED_LOCK)
+		if (compact_result == COMPACT_DEFERRED)
 			goto nopage;
 
 		/*
-		 * If compaction was aborted due to need_resched(), we do not
-		 * want to further increase allocation latency, unless it is
-		 * khugepaged trying to collapse.
+		 * Compaction is contended so rather back off than cause
+		 * excessive stalls.
 		 */
-		if (contended_compaction == COMPACT_CONTENDED_SCHED
-			&& !(current->flags & PF_KTHREAD))
+		if(compact_result == COMPACT_CONTENDED)
 			goto nopage;
 	}
 
@@ -3555,8 +3551,7 @@ noretry:
 	 */
 	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
 					    ac, migration_mode,
-					    &contended_compaction,
-					    &deferred_compaction);
+					    &compact_result);
 	if (page)
 		goto got_pg;
 nopage:
-- 
cgit v1.2.3


From 0a0337e0d1d134465778a16f5cbea95086e8e9e0 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:00 -0700
Subject: mm, oom: rework oom detection

__alloc_pages_slowpath has traditionally relied on the direct reclaim
and did_some_progress as an indicator that it makes sense to retry
allocation rather than declaring OOM.  shrink_zones had to rely on
zone_reclaimable if shrink_zone didn't make any progress to prevent from
a premature OOM killer invocation - the LRU might be full of dirty or
writeback pages and direct reclaim cannot clean those up.

zone_reclaimable allows to rescan the reclaimable lists several times
and restart if a page is freed.  This is really subtle behavior and it
might lead to a livelock when a single freed page keeps allocator
looping but the current task will not be able to allocate that single
page.  OOM killer would be more appropriate than looping without any
progress for unbounded amount of time.

This patch changes OOM detection logic and pulls it out from shrink_zone
which is too low to be appropriate for any high level decisions such as
OOM which is per zonelist property.  It is __alloc_pages_slowpath which
knows how many attempts have been done and what was the progress so far
therefore it is more appropriate to implement this logic.

The new heuristic is implemented in should_reclaim_retry helper called
from __alloc_pages_slowpath.  It tries to be more deterministic and
easier to follow.  It builds on an assumption that retrying makes sense
only if the currently reclaimable memory + free pages would allow the
current allocation request to succeed (as per __zone_watermark_ok) at
least for one zone in the usable zonelist.

This alone wouldn't be sufficient, though, because the writeback might
get stuck and reclaimable pages might be pinned for a really long time
or even depend on the current allocation context.  Therefore there is a
backoff mechanism implemented which reduces the reclaim target after
each reclaim round without any progress.  This means that we should
eventually converge to only NR_FREE_PAGES as the target and fail on the
wmark check and proceed to OOM.  The backoff is simple and linear with
1/16 of the reclaimable pages for each round without any progress.  We
are optimistic and reset counter for successful reclaim rounds.

Costly high order pages mostly preserve their semantic and those without
__GFP_REPEAT fail right away while those which have the flag set will
back off after the amount of reclaimable pages reaches equivalent of the
requested order.  The only difference is that if there was no progress
during the reclaim we rely on zone watermark check.  This is more
logical thing to do than previous 1<<order attempts which were a result
of zone_reclaimable faking the progress.

[vdavydov@virtuozzo.com: check classzone_idx for shrink_zone]
[hannes@cmpxchg.org: separate the heuristic into should_reclaim_retry]
[rientjes@google.com: use zone_page_state_snapshot for NR_FREE_PAGES]
[rientjes@google.com: shrink_zones doesn't need to return anything]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/swap.h |   1 +
 mm/page_alloc.c      | 100 ++++++++++++++++++++++++++++++++++++++++++++++-----
 mm/vmscan.c          |  25 +++----------
 3 files changed, 97 insertions(+), 29 deletions(-)

(limited to 'mm')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index ad220359f1b0..0af2bb2028fd 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page,
 						struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
+extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8bcc10616fab..fa39efc3a692 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3386,6 +3386,77 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
 	return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE;
 }
 
+/*
+ * Maximum number of reclaim retries without any progress before OOM killer
+ * is consider as the only way to move forward.
+ */
+#define MAX_RECLAIM_RETRIES 16
+
+/*
+ * Checks whether it makes sense to retry the reclaim to make a forward progress
+ * for the given allocation request.
+ * The reclaim feedback represented by did_some_progress (any progress during
+ * the last reclaim round), pages_reclaimed (cumulative number of reclaimed
+ * pages) and no_progress_loops (number of reclaim rounds without any progress
+ * in a row) is considered as well as the reclaimable pages on the applicable
+ * zone list (with a backoff mechanism which is a function of no_progress_loops).
+ *
+ * Returns true if a retry is viable or false to enter the oom path.
+ */
+static inline bool
+should_reclaim_retry(gfp_t gfp_mask, unsigned order,
+		     struct alloc_context *ac, int alloc_flags,
+		     bool did_some_progress, unsigned long pages_reclaimed,
+		     int no_progress_loops)
+{
+	struct zone *zone;
+	struct zoneref *z;
+
+	/*
+	 * Make sure we converge to OOM if we cannot make any progress
+	 * several times in the row.
+	 */
+	if (no_progress_loops > MAX_RECLAIM_RETRIES)
+		return false;
+
+	if (order > PAGE_ALLOC_COSTLY_ORDER) {
+		if (pages_reclaimed >= (1<<order))
+			return false;
+
+		if (did_some_progress)
+			return true;
+	}
+
+	/*
+	 * Keep reclaiming pages while there is a chance this will lead somewhere.
+	 * If none of the target zones can satisfy our allocation request even
+	 * if all reclaimable pages are considered then we are screwed and have
+	 * to go OOM.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+					ac->nodemask) {
+		unsigned long available;
+
+		available = zone_reclaimable_pages(zone);
+		available -= DIV_ROUND_UP(no_progress_loops * available,
+					  MAX_RECLAIM_RETRIES);
+		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+		/*
+		 * Would the allocation succeed if we reclaimed the whole
+		 * available?
+		 */
+		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
+				ac->high_zoneidx, alloc_flags, available)) {
+			/* Wait for some write requests to complete then retry */
+			wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
+			return true;
+		}
+	}
+
+	return false;
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 						struct alloc_context *ac)
@@ -3397,6 +3468,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	enum compact_result compact_result;
+	int no_progress_loops = 0;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3525,23 +3597,35 @@ retry:
 	if (gfp_mask & __GFP_NORETRY)
 		goto noretry;
 
-	/* Keep reclaiming pages as long as there is reasonable progress */
-	pages_reclaimed += did_some_progress;
-	if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) ||
-	    ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) {
-		/* Wait for some write requests to complete then retry */
-		wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50);
-		goto retry;
+	/*
+	 * Do not retry costly high order allocations unless they are
+	 * __GFP_REPEAT
+	 */
+	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
+		goto noretry;
+
+	if (did_some_progress) {
+		no_progress_loops = 0;
+		pages_reclaimed += did_some_progress;
+	} else {
+		no_progress_loops++;
 	}
 
+	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
+				 did_some_progress > 0, pages_reclaimed,
+				 no_progress_loops))
+		goto retry;
+
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
 		goto got_pg;
 
 	/* Retry as long as the OOM killer is making progress */
-	if (did_some_progress)
+	if (did_some_progress) {
+		no_progress_loops = 0;
 		goto retry;
+	}
 
 noretry:
 	/*
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a386454c015a..c4a2f4512fca 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc)
 }
 #endif
 
-static unsigned long zone_reclaimable_pages(struct zone *zone)
+unsigned long zone_reclaimable_pages(struct zone *zone)
 {
 	unsigned long nr;
 
@@ -2507,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_
  *
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
- *
- * Returns true if a zone was reclaimable.
  */
-static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
+static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 {
 	struct zoneref *z;
 	struct zone *zone;
@@ -2518,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	unsigned long nr_soft_scanned;
 	gfp_t orig_mask;
 	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
-	bool reclaimable = false;
 
 	/*
 	 * If the number of buffer_heads in the machine exceeds the maximum
@@ -2583,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 						&nr_soft_scanned);
 			sc->nr_reclaimed += nr_soft_reclaimed;
 			sc->nr_scanned += nr_soft_scanned;
-			if (nr_soft_reclaimed)
-				reclaimable = true;
 			/* need some check for avoid more shrink_zone() */
 		}
 
-		if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx))
-			reclaimable = true;
-
-		if (global_reclaim(sc) &&
-		    !reclaimable && zone_reclaimable(zone))
-			reclaimable = true;
+		shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 	}
 
 	/*
@@ -2601,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	 * promoted it to __GFP_HIGHMEM.
 	 */
 	sc->gfp_mask = orig_mask;
-
-	return reclaimable;
 }
 
 /*
@@ -2627,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 	int initial_priority = sc->priority;
 	unsigned long total_scanned = 0;
 	unsigned long writeback_threshold;
-	bool zones_reclaimable;
 retry:
 	delayacct_freepages_start();
 
@@ -2638,7 +2625,7 @@ retry:
 		vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
 				sc->priority);
 		sc->nr_scanned = 0;
-		zones_reclaimable = shrink_zones(zonelist, sc);
+		shrink_zones(zonelist, sc);
 
 		total_scanned += sc->nr_scanned;
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
@@ -2685,10 +2672,6 @@ retry:
 		goto retry;
 	}
 
-	/* Any of the zones still reclaimable?  Don't OOM. */
-	if (zones_reclaimable)
-		return 1;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From ede37713737834d98ec72ed299a305d53e909f73 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:03 -0700
Subject: mm: throttle on IO only when there are too many dirty and writeback
 pages

wait_iff_congested has been used to throttle allocator before it retried
another round of direct reclaim to allow the writeback to make some
progress and prevent reclaim from looping over dirty/writeback pages
without making any progress.

We used to do congestion_wait before commit 0e093d99763e ("writeback: do
not sleep on the congestion queue if there are no congested BDIs or if
significant congestion is not being encountered in the current zone")
but that led to undesirable stalls and sleeping for the full timeout
even when the BDI wasn't congested.  Hence wait_iff_congested was used
instead.

But it seems that even wait_iff_congested doesn't work as expected.  We
might have a small file LRU list with all pages dirty/writeback and yet
the bdi is not congested so this is just a cond_resched in the end and
can end up triggering premature OOM.

This patch replaces the unconditional wait_iff_congested by
congestion_wait which is executed only if we _know_ that the last round
of direct reclaim didn't make any progress and dirty+writeback pages are
more than a half of the reclaimable pages on the zone which might be
usable for our target allocation.  This shouldn't reintroduce stalls
fixed by 0e093d99763e because congestion_wait is called only when we are
getting hopeless when sleeping is a better choice than OOM with many
pages under IO.

We have to preserve logic introduced by commit 373ccbe59270 ("mm,
vmstat: allow WQ concurrency to discover memory reclaim doesn't make any
progress") into the __alloc_pages_slowpath now that wait_iff_congested
is not used anymore.  As the only remaining user of wait_iff_congested
is shrink_inactive_list we can remove the WQ specific short sleep from
wait_iff_congested because the sleep is needed to be done only once in
the allocation retry cycle.

[mhocko@suse.com: high_zoneidx->ac_classzone_idx to evaluate memory reserves properly]
 Link: http://lkml.kernel.org/r/1463051677-29418-2-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/backing-dev.c | 20 +++-----------------
 mm/page_alloc.c  | 41 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 40 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0c6317b7db38..ed173b8ae8f2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -957,9 +957,8 @@ EXPORT_SYMBOL(congestion_wait);
  * jiffies for either a BDI to exit congestion of the given @sync queue
  * or a write to complete.
  *
- * In the absence of zone congestion, a short sleep or a cond_resched is
- * performed to yield the processor and to allow other subsystems to make
- * a forward progress.
+ * In the absence of zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
  *
  * The return value is 0 if the sleep is for the full timeout. Otherwise,
  * it is the number of jiffies that were still remaining when the function
@@ -979,20 +978,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 	 */
 	if (atomic_read(&nr_wb_congested[sync]) == 0 ||
 	    !test_bit(ZONE_CONGESTED, &zone->flags)) {
-
-		/*
-		 * Memory allocation/reclaim might be called from a WQ
-		 * context and the current implementation of the WQ
-		 * concurrency control doesn't recognize that a particular
-		 * WQ is congested if the worker thread is looping without
-		 * ever sleeping. Therefore we have to do a short sleep
-		 * here rather than calling cond_resched().
-		 */
-		if (current->flags & PF_WQ_WORKER)
-			schedule_timeout_uninterruptible(1);
-		else
-			cond_resched();
-
+		cond_resched();
 		/* In case we scheduled, work out time remaining */
 		ret = timeout - (jiffies - start);
 		if (ret < 0)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fa39efc3a692..f51c302126a1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3436,8 +3436,9 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
 					ac->nodemask) {
 		unsigned long available;
+		unsigned long reclaimable;
 
-		available = zone_reclaimable_pages(zone);
+		available = reclaimable = zone_reclaimable_pages(zone);
 		available -= DIV_ROUND_UP(no_progress_loops * available,
 					  MAX_RECLAIM_RETRIES);
 		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
@@ -3447,9 +3448,41 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		 * available?
 		 */
 		if (__zone_watermark_ok(zone, order, min_wmark_pages(zone),
-				ac->high_zoneidx, alloc_flags, available)) {
-			/* Wait for some write requests to complete then retry */
-			wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50);
+				ac_classzone_idx(ac), alloc_flags, available)) {
+			/*
+			 * If we didn't make any progress and have a lot of
+			 * dirty + writeback pages then we should wait for
+			 * an IO to complete to slow down the reclaim and
+			 * prevent from pre mature OOM
+			 */
+			if (!did_some_progress) {
+				unsigned long writeback;
+				unsigned long dirty;
+
+				writeback = zone_page_state_snapshot(zone,
+								     NR_WRITEBACK);
+				dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY);
+
+				if (2*(writeback + dirty) > reclaimable) {
+					congestion_wait(BLK_RW_ASYNC, HZ/10);
+					return true;
+				}
+			}
+
+			/*
+			 * Memory allocation/reclaim might be called from a WQ
+			 * context and the current implementation of the WQ
+			 * concurrency control doesn't recognize that
+			 * a particular WQ is congested if the worker thread is
+			 * looping without ever sleeping. Therefore we have to
+			 * do a short sleep here rather than calling
+			 * cond_resched().
+			 */
+			if (current->flags & PF_WQ_WORKER)
+				schedule_timeout_uninterruptible(1);
+			else
+				cond_resched();
+
 			return true;
 		}
 	}
-- 
cgit v1.2.3


From 33c2d21438daea807947923377995c73ee8ed3fc Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:06 -0700
Subject: mm, oom: protect !costly allocations some more

should_reclaim_retry will give up retries for higher order allocations
if none of the eligible zones has any requested or higher order pages
available even if we pass the watermark check for order-0.  This is done
because there is no guarantee that the reclaimable and currently free
pages will form the required order.

This can, however, lead to situations where the high-order request (e.g.
order-2 required for the stack allocation during fork) will trigger OOM
too early - e.g.  after the first reclaim/compaction round.  Such a
system would have to be highly fragmented and there is no guarantee
further reclaim/compaction attempts would help but at least make sure
that the compaction was active before we go OOM and keep retrying even
if should_reclaim_retry tells us to oom if

	- the last compaction round backed off or
	- we haven't completed at least MAX_COMPACT_RETRIES active
	  compaction rounds.

The first rule ensures that the very last attempt for compaction was not
ignored while the second guarantees that the compaction has done some
work.  Multiple retries might be needed to prevent other contexts from
occasionally piggybacking and stealing the compacted pages before the
current context manages to retry to allocate them.

compaction_failed() is taken as a final word from the compaction that
the retry doesn't make much sense.  We have to be careful though because
the first compaction round is MIGRATE_ASYNC which is rather weak as it
ignores pages under writeback and gives up too easily in other
situations.  We therefore have to make sure that MIGRATE_SYNC_LIGHT mode
has been used before we give up.  With this logic in place we do not
have to increase the migration mode unconditionally and rather do it
only if the compaction failed for the weaker mode.  A nice side effect
is that the stronger migration mode is used only when really needed so
this has a potential of smaller latencies in some cases.

Please note that the compaction doesn't tell us much about how
successful it was when returning compaction_made_progress so we just
have to blindly trust that another retry is worthwhile and cap the
number to something reasonable to guarantee a convergence.

If the given number of successful retries is not sufficient for a
reasonable workloads we should focus on the collected compaction
tracepoints data and try to address the issue in the compaction code.
If this is not feasible we can increase the retries limit.

[mhocko@suse.com: fix warning]
  Link: http://lkml.kernel.org/r/20160512061636.GA4200@dhcp22.suse.cz
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 78 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f51c302126a1..38ad6dd7cba0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3180,6 +3180,13 @@ out:
 	return page;
 }
 
+
+/*
+ * Maximum number of compaction retries wit a progress before OOM
+ * killer is consider as the only way to move forward.
+ */
+#define MAX_COMPACT_RETRIES 16
+
 #ifdef CONFIG_COMPACTION
 /* Try memory compaction for high-order allocations before reclaim */
 static struct page *
@@ -3247,14 +3254,60 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 	return NULL;
 }
+
+static inline bool
+should_compact_retry(unsigned int order, enum compact_result compact_result,
+		     enum migrate_mode *migrate_mode,
+		     int compaction_retries)
+{
+	if (!order)
+		return false;
+
+	/*
+	 * compaction considers all the zone as desperately out of memory
+	 * so it doesn't really make much sense to retry except when the
+	 * failure could be caused by weak migration mode.
+	 */
+	if (compaction_failed(compact_result)) {
+		if (*migrate_mode == MIGRATE_ASYNC) {
+			*migrate_mode = MIGRATE_SYNC_LIGHT;
+			return true;
+		}
+		return false;
+	}
+
+	/*
+	 * !costly allocations are really important and we have to make sure
+	 * the compaction wasn't deferred or didn't bail out early due to locks
+	 * contention before we go OOM. Still cap the reclaim retry loops with
+	 * progress to prevent from looping forever and potential trashing.
+	 */
+	if (order <= PAGE_ALLOC_COSTLY_ORDER) {
+		if (compaction_withdrawn(compact_result))
+			return true;
+		if (compaction_retries <= MAX_COMPACT_RETRIES)
+			return true;
+	}
+
+	return false;
+}
 #else
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		unsigned int alloc_flags, const struct alloc_context *ac,
 		enum migrate_mode mode, enum compact_result *compact_result)
 {
+	*compact_result = COMPACT_SKIPPED;
 	return NULL;
 }
+
+static inline bool
+should_compact_retry(unsigned int order, enum compact_result compact_result,
+		     enum migrate_mode *migrate_mode,
+		     int compaction_retries)
+{
+	return false;
+}
 #endif /* CONFIG_COMPACTION */
 
 /* Perform direct synchronous page reclaim */
@@ -3501,6 +3554,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	enum compact_result compact_result;
+	int compaction_retries = 0;
 	int no_progress_loops = 0;
 
 	/*
@@ -3612,13 +3666,8 @@ retry:
 			goto nopage;
 	}
 
-	/*
-	 * It can become very expensive to allocate transparent hugepages at
-	 * fault, so use asynchronous memory compaction for THP unless it is
-	 * khugepaged trying to collapse.
-	 */
-	if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD))
-		migration_mode = MIGRATE_SYNC_LIGHT;
+	if (order && compaction_made_progress(compact_result))
+		compaction_retries++;
 
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
@@ -3649,6 +3698,17 @@ retry:
 				 no_progress_loops))
 		goto retry;
 
+	/*
+	 * It doesn't make any sense to retry for the compaction if the order-0
+	 * reclaim is not able to make any progress because the current
+	 * implementation of the compaction depends on the sufficient amount
+	 * of free memory (see __compaction_suitable)
+	 */
+	if (did_some_progress > 0 &&
+			should_compact_retry(order, compact_result,
+				&migration_mode, compaction_retries))
+		goto retry;
+
 	/* Reclaim has failed us, start killing things */
 	page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
 	if (page)
@@ -3662,10 +3722,18 @@ retry:
 
 noretry:
 	/*
-	 * High-order allocations do not necessarily loop after
-	 * direct reclaim and reclaim/compaction depends on compaction
-	 * being called after reclaim so call directly if necessary
+	 * High-order allocations do not necessarily loop after direct reclaim
+	 * and reclaim/compaction depends on compaction being called after
+	 * reclaim so call directly if necessary.
+	 * It can become very expensive to allocate transparent hugepages at
+	 * fault, so use asynchronous memory compaction for THP unless it is
+	 * khugepaged trying to collapse. All other requests should tolerate
+	 * at least light sync migration.
 	 */
+	if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD))
+		migration_mode = MIGRATE_ASYNC;
+	else
+		migration_mode = MIGRATE_SYNC_LIGHT;
 	page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags,
 					    ac, migration_mode,
 					    &compact_result);
-- 
cgit v1.2.3


From 7854ea6c28c6076050e24773eeb78e2925bd7411 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:09 -0700
Subject: mm: consider compaction feedback also for costly allocation

PAGE_ALLOC_COSTLY_ORDER retry logic is mostly handled inside
should_reclaim_retry currently where we decide to not retry after at
least order worth of pages were reclaimed or the watermark check for at
least one zone would succeed after reclaiming all pages if the reclaim
hasn't made any progress.  Compaction feedback is mostly ignored and we
just try to make sure that the compaction did at least something before
giving up.

The first condition was added by a41f24ea9fd6 ("page allocator: smarter
retry of costly-order allocations") and it assumed that lumpy reclaim
could have created a page of the sufficient order.  Lumpy reclaim has
been removed quite some time ago so the assumption doesn't hold anymore.
Remove the check for the number of reclaimed pages and rely on the
compaction feedback solely.  should_reclaim_retry now only makes sure
that we keep retrying reclaim for high order pages only if they are
hidden by watermarks so order-0 reclaim really makes sense.

should_compact_retry now keeps retrying even for the costly allocations.
The number of retries is reduced wrt.  !costly requests because they are
less important and harder to grant and so their pressure shouldn't cause
contention for other requests or cause an over reclaim.  We also do not
reset no_progress_loops for costly requests to make sure we do not keep
reclaiming too aggressively.

This has been tested by running a process which fragments memory:
	- compact memory
	- mmap large portion of the memory (1920M on 2GRAM machine with 2G
	  of swapspace)
	- MADV_DONTNEED single page in PAGE_SIZE*((1UL<<MAX_ORDER)-1)
	  steps until certain amount of memory is freed (250M in my test)
	  and reduce the step to (step / 2) + 1 after reaching the end of
	  the mapping
	- then run a script which populates the page cache 2G (MemTotal)
	  from /dev/zero to a new file
And then tries to allocate
nr_hugepages=$(awk '/MemAvailable/{printf "%d\n", $2/(2*1024)}' /proc/meminfo)
huge pages.

root@test1:~# echo 1 > /proc/sys/vm/overcommit_memory;echo 1 > /proc/sys/vm/compact_memory; ./fragment-mem-and-run /root/alloc_hugepages.sh 1920M 250M
Node 0, zone      DMA     31     28     31     10      2      0      2      1      2      3      1
Node 0, zone    DMA32    437    319    171     50     28     25     20     16     16     14    437

* This is the /proc/buddyinfo after the compaction

Done fragmenting. size=2013265920 freed=262144000
Node 0, zone      DMA    165     48      3      1      2      0      2      2      2      2      0
Node 0, zone    DMA32  35109  14575    185     51     41     12      6      0      0      0      0

* /proc/buddyinfo after memory got fragmented

Executing "/root/alloc_hugepages.sh"
Eating some pagecache
508623+0 records in
508623+0 records out
2083319808 bytes (2.1 GB) copied, 11.7292 s, 178 MB/s
Node 0, zone      DMA      3      5      3      1      2      0      2      2      2      2      0
Node 0, zone    DMA32    111    344    153     20     24     10      3      0      0      0      0

* /proc/buddyinfo after page cache got eaten

Trying to allocate 129
129

* 129 hugepages requested and all of them granted.

Node 0, zone      DMA      3      5      3      1      2      0      2      2      2      2      0
Node 0, zone    DMA32    127     97     30     99     11      6      2      1      4      0      0

* /proc/buddyinfo after hugetlb allocation.

10 runs will behave as follows:
Trying to allocate 130
130
--
Trying to allocate 129
129
--
Trying to allocate 128
128
--
Trying to allocate 129
129
--
Trying to allocate 128
128
--
Trying to allocate 129
129
--
Trying to allocate 132
132
--
Trying to allocate 129
129
--
Trying to allocate 128
128
--
Trying to allocate 129
129

So basically 100% success for all 10 attempts.
Without the patch numbers looked much worse:
Trying to allocate 128
12
--
Trying to allocate 129
14
--
Trying to allocate 129
7
--
Trying to allocate 129
16
--
Trying to allocate 129
30
--
Trying to allocate 129
38
--
Trying to allocate 129
19
--
Trying to allocate 129
37
--
Trying to allocate 129
28
--
Trying to allocate 129
37

Just for completeness the base kernel without oom detection rework looks
as follows:
Trying to allocate 127
30
--
Trying to allocate 129
12
--
Trying to allocate 129
52
--
Trying to allocate 128
32
--
Trying to allocate 129
12
--
Trying to allocate 129
10
--
Trying to allocate 129
32
--
Trying to allocate 128
14
--
Trying to allocate 128
16
--
Trying to allocate 129
8

As we can see the success rate is much more volatile and smaller without
this patch. So the patch not only makes the retry logic for costly
requests more sensible, it also makes the success rate higher.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 63 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 32 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 38ad6dd7cba0..dea406a62e3d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3260,6 +3260,8 @@ should_compact_retry(unsigned int order, enum compact_result compact_result,
 		     enum migrate_mode *migrate_mode,
 		     int compaction_retries)
 {
+	int max_retries = MAX_COMPACT_RETRIES;
+
 	if (!order)
 		return false;
 
@@ -3277,17 +3279,24 @@ should_compact_retry(unsigned int order, enum compact_result compact_result,
 	}
 
 	/*
-	 * !costly allocations are really important and we have to make sure
-	 * the compaction wasn't deferred or didn't bail out early due to locks
-	 * contention before we go OOM. Still cap the reclaim retry loops with
-	 * progress to prevent from looping forever and potential trashing.
+	 * make sure the compaction wasn't deferred or didn't bail out early
+	 * due to locks contention before we declare that we should give up.
 	 */
-	if (order <= PAGE_ALLOC_COSTLY_ORDER) {
-		if (compaction_withdrawn(compact_result))
-			return true;
-		if (compaction_retries <= MAX_COMPACT_RETRIES)
-			return true;
-	}
+	if (compaction_withdrawn(compact_result))
+		return true;
+
+	/*
+	 * !costly requests are much more important than __GFP_REPEAT
+	 * costly ones because they are de facto nofail and invoke OOM
+	 * killer to move on while costly can fail and users are ready
+	 * to cope with that. 1/4 retries is rather arbitrary but we
+	 * would need much more detailed feedback from compaction to
+	 * make a better decision.
+	 */
+	if (order > PAGE_ALLOC_COSTLY_ORDER)
+		max_retries /= 4;
+	if (compaction_retries <= max_retries)
+		return true;
 
 	return false;
 }
@@ -3449,18 +3458,17 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask)
  * Checks whether it makes sense to retry the reclaim to make a forward progress
  * for the given allocation request.
  * The reclaim feedback represented by did_some_progress (any progress during
- * the last reclaim round), pages_reclaimed (cumulative number of reclaimed
- * pages) and no_progress_loops (number of reclaim rounds without any progress
- * in a row) is considered as well as the reclaimable pages on the applicable
- * zone list (with a backoff mechanism which is a function of no_progress_loops).
+ * the last reclaim round) and no_progress_loops (number of reclaim rounds without
+ * any progress in a row) is considered as well as the reclaimable pages on the
+ * applicable zone list (with a backoff mechanism which is a function of
+ * no_progress_loops).
  *
  * Returns true if a retry is viable or false to enter the oom path.
  */
 static inline bool
 should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 		     struct alloc_context *ac, int alloc_flags,
-		     bool did_some_progress, unsigned long pages_reclaimed,
-		     int no_progress_loops)
+		     bool did_some_progress, int no_progress_loops)
 {
 	struct zone *zone;
 	struct zoneref *z;
@@ -3472,14 +3480,6 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
 	if (no_progress_loops > MAX_RECLAIM_RETRIES)
 		return false;
 
-	if (order > PAGE_ALLOC_COSTLY_ORDER) {
-		if (pages_reclaimed >= (1<<order))
-			return false;
-
-		if (did_some_progress)
-			return true;
-	}
-
 	/*
 	 * Keep reclaiming pages while there is a chance this will lead somewhere.
 	 * If none of the target zones can satisfy our allocation request even
@@ -3550,7 +3550,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
 	struct page *page = NULL;
 	unsigned int alloc_flags;
-	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	enum migrate_mode migration_mode = MIGRATE_ASYNC;
 	enum compact_result compact_result;
@@ -3686,16 +3685,18 @@ retry:
 	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT))
 		goto noretry;
 
-	if (did_some_progress) {
+	/*
+	 * Costly allocations might have made a progress but this doesn't mean
+	 * their order will become available due to high fragmentation so
+	 * always increment the no progress counter for them
+	 */
+	if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
 		no_progress_loops = 0;
-		pages_reclaimed += did_some_progress;
-	} else {
+	else
 		no_progress_loops++;
-	}
 
 	if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
-				 did_some_progress > 0, pages_reclaimed,
-				 no_progress_loops))
+				 did_some_progress > 0, no_progress_loops))
 		goto retry;
 
 	/*
-- 
cgit v1.2.3


From 86a294a81f93d6f36d00ec3ff779d36d218f852d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:12 -0700
Subject: mm, oom, compaction: prevent from should_compact_retry looping for
 ever for costly orders

"mm: consider compaction feedback also for costly allocation" has
removed the upper bound for the reclaim/compaction retries based on the
number of reclaimed pages for costly orders.  While this is desirable
the patch did miss a bad interaction between reclaim, compaction and the
retry logic.  The direct reclaim tries to get zones over min watermark
while compaction backs off and returns COMPACT_SKIPPED when all zones
are below low watermark + 1<<order gap.  If we are getting really close
to OOM then __compaction_suitable can keep returning COMPACT_SKIPPED for
a high order request (e.g.  hugetlb order-9) while the reclaim is not
able to release enough pages to get us over low watermark.  The reclaim
is still able to make some progress (usually thrashing over the few
remaining pages) so we are not able to break out from the loop.

I have seen this happening with the same test described in "mm: consider
compaction feedback also for costly allocation" on a swapless system.
The original problem got resolved by "vmscan: consider classzone_idx in
compaction_ready" but it shows how things might go wrong when we
approach the oom event horizon.

The reason why compaction requires being over low rather than min
watermark is not clear to me.  This check was there essentially since
56de7263fcf3 ("mm: compaction: direct compact when a high-order
allocation fails").  It is clearly an implementation detail though and
we shouldn't pull it into the generic retry logic while we should be
able to cope with such eventuality.  The only place in
should_compact_retry where we retry without any upper bound is for
compaction_withdrawn() case.

Introduce compaction_zonelist_suitable function which checks the given
zonelist and returns true only if there is at least one zone which
would unblock __compaction_suitable if more memory got reclaimed.  In
this implementation it checks __compaction_suitable with NR_FREE_PAGES
plus part of the reclaimable memory as the target for the watermark
check.  The reclaimable memory is reduced linearly by the allocation
order.  The idea is that we do not want to reclaim all the remaining
memory for a single allocation request just to unblock
__compaction_suitable, which doesn't guarantee we will make any further
progress.

The new helper is then used if compaction_withdrawn() feedback was
provided so we do not retry if there is no outlook for a further
progress.  !costly requests shouldn't be affected much - e.g.  order-2
pages would require to have at least 64kB on the reclaimable LRUs while
order-9 would need at least 32M which should be enough to not lock up.

[vbabka@suse.cz: fix classzone_idx vs. high_zoneidx usage in compaction_zonelist_suitable]
[akpm@linux-foundation.org: fix it for Mel's mm-page_alloc-remove-field-from-alloc_context.patch]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h |  4 ++++
 include/linux/mmzone.h     |  3 +++
 mm/compaction.c            | 42 +++++++++++++++++++++++++++++++++++++++---
 mm/page_alloc.c            | 23 +++++++++++++----------
 4 files changed, 59 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index 8d8c916fe67a..a58c852a268f 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -142,6 +142,10 @@ static inline bool compaction_withdrawn(enum compact_result result)
 	return false;
 }
 
+
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+					int alloc_flags);
+
 extern int kcompactd_run(int nid);
 extern void kcompactd_stop(int nid);
 extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c60db2096fd8..8dd0333b01dc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -739,6 +739,9 @@ static inline bool is_dev_zone(const struct zone *zone)
 extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+			 int classzone_idx, unsigned int alloc_flags,
+			 long free_pages);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
 		unsigned long mark, int classzone_idx,
 		unsigned int alloc_flags);
diff --git a/mm/compaction.c b/mm/compaction.c
index 4af1577adb5c..d8a20fcf8678 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1318,7 +1318,8 @@ static enum compact_result compact_finished(struct zone *zone,
  */
 static enum compact_result __compaction_suitable(struct zone *zone, int order,
 					unsigned int alloc_flags,
-					int classzone_idx)
+					int classzone_idx,
+					unsigned long wmark_target)
 {
 	int fragindex;
 	unsigned long watermark;
@@ -1341,7 +1342,8 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
 	 * allocated and for a short time, the footprint is higher
 	 */
 	watermark += (2UL << order);
-	if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags))
+	if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+				 alloc_flags, wmark_target))
 		return COMPACT_SKIPPED;
 
 	/*
@@ -1368,7 +1370,8 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 {
 	enum compact_result ret;
 
-	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
+	ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+				    zone_page_state(zone, NR_FREE_PAGES));
 	trace_mm_compaction_suitable(zone, order, ret);
 	if (ret == COMPACT_NOT_SUITABLE_ZONE)
 		ret = COMPACT_SKIPPED;
@@ -1376,6 +1379,39 @@ enum compact_result compaction_suitable(struct zone *zone, int order,
 	return ret;
 }
 
+bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
+		int alloc_flags)
+{
+	struct zone *zone;
+	struct zoneref *z;
+
+	/*
+	 * Make sure at least one zone would pass __compaction_suitable if we continue
+	 * retrying the reclaim.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+					ac->nodemask) {
+		unsigned long available;
+		enum compact_result compact_result;
+
+		/*
+		 * Do not consider all the reclaimable memory because we do not
+		 * want to trash just for a single high order allocation which
+		 * is even not guaranteed to appear even if __compaction_suitable
+		 * is happy about the watermark check.
+		 */
+		available = zone_reclaimable_pages(zone) / order;
+		available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
+		compact_result = __compaction_suitable(zone, order, alloc_flags,
+				ac_classzone_idx(ac), available);
+		if (compact_result != COMPACT_SKIPPED &&
+				compact_result != COMPACT_NOT_SUITABLE_ZONE)
+			return true;
+	}
+
+	return false;
+}
+
 static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
 {
 	enum compact_result ret;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dea406a62e3d..089f760ce64a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2750,10 +2750,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
  * one free page of a suitable size. Checking now avoids taking the zone lock
  * to check in the allocation paths if no pages are free.
  */
-static bool __zone_watermark_ok(struct zone *z, unsigned int order,
-			unsigned long mark, int classzone_idx,
-			unsigned int alloc_flags,
-			long free_pages)
+bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
+			 int classzone_idx, unsigned int alloc_flags,
+			 long free_pages)
 {
 	long min = mark;
 	int o;
@@ -3256,8 +3255,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 }
 
 static inline bool
-should_compact_retry(unsigned int order, enum compact_result compact_result,
-		     enum migrate_mode *migrate_mode,
+should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
+		     enum compact_result compact_result, enum migrate_mode *migrate_mode,
 		     int compaction_retries)
 {
 	int max_retries = MAX_COMPACT_RETRIES;
@@ -3281,9 +3280,11 @@ should_compact_retry(unsigned int order, enum compact_result compact_result,
 	/*
 	 * make sure the compaction wasn't deferred or didn't bail out early
 	 * due to locks contention before we declare that we should give up.
+	 * But do not retry if the given zonelist is not suitable for
+	 * compaction.
 	 */
 	if (compaction_withdrawn(compact_result))
-		return true;
+		return compaction_zonelist_suitable(ac, order, alloc_flags);
 
 	/*
 	 * !costly requests are much more important than __GFP_REPEAT
@@ -3311,7 +3312,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 }
 
 static inline bool
-should_compact_retry(unsigned int order, enum compact_result compact_result,
+should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
+		     enum compact_result compact_result,
 		     enum migrate_mode *migrate_mode,
 		     int compaction_retries)
 {
@@ -3706,8 +3708,9 @@ retry:
 	 * of free memory (see __compaction_suitable)
 	 */
 	if (did_some_progress > 0 &&
-			should_compact_retry(order, compact_result,
-				&migration_mode, compaction_retries))
+			should_compact_retry(ac, order, alloc_flags,
+				compact_result, &migration_mode,
+				compaction_retries))
 		goto retry;
 
 	/* Reclaim has failed us, start killing things */
-- 
cgit v1.2.3


From 31e49bfda18464b9b0cfe42044d5a4be4a0ca0f3 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:15 -0700
Subject: mm, oom: protect !costly allocations some more for !CONFIG_COMPACTION

Joonsoo has reported that he is able to trigger OOM for !costly high
order requests (heavy fork() workload close to the OOM) with the new oom
detection rework.  This is because we rely only on should_reclaim_retry
when the compaction is disabled and it only checks watermarks for the
requested order and so we might trigger OOM when there is a lot of free
memory.

It is not very clear what are the usual workloads when the compaction is
disabled.  Relying on high order allocations heavily without any
mechanism to create those orders except for unbound amount of reclaim is
certainly not a good idea.

To prevent potential regressions let's help this configuration
some.  We have to sacrifice the determinism though because there simply
is none possible here.  should_compact_retry implementation for
!CONFIG_COMPACTION, which was empty so far, will do watermark check for
order-0 on all eligible zones.  This will cause retrying until either
the reclaim cannot make any further progress or all the zones are
depleted even for order-0 pages.  This means that the number of retries
is basically unbounded for !costly orders but that was the case before
the rework as well so this shouldn't regress.

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/1463051677-29418-3-git-send-email-mhocko@kernel.org
Reported-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 089f760ce64a..7e49bc3705b6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3317,6 +3317,24 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
 		     enum migrate_mode *migrate_mode,
 		     int compaction_retries)
 {
+	struct zone *zone;
+	struct zoneref *z;
+
+	if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
+		return false;
+
+	/*
+	 * There are setups with compaction disabled which would prefer to loop
+	 * inside the allocator rather than hit the oom killer prematurely.
+	 * Let's give them a good hope and keep retrying while the order-0
+	 * watermarks are OK.
+	 */
+	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+					ac->nodemask) {
+		if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
+					ac_classzone_idx(ac), alloc_flags))
+			return true;
+	}
 	return false;
 }
 #endif /* CONFIG_COMPACTION */
-- 
cgit v1.2.3


From bb8a4b7fd1266ef888b3a80aa5f266062b224ef4 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:18 -0700
Subject: mm, oom_reaper: hide oom reaped tasks from OOM killer more carefully

Commit 36324a990cf5 ("oom: clear TIF_MEMDIE after oom_reaper managed to
unmap the address space") not only clears TIF_MEMDIE for oom reaped task
but also sets OOM_SCORE_ADJ_MIN for the target task to hide it from the
oom killer.  This works in simple cases but it is not sufficient for
(unlikely) cases where the mm is shared between independent processes
(as they do not share signal struct).  If the mm had only small amount
of memory which could be reaped then another task sharing the mm could
be selected and that wouldn't help to move out from the oom situation.

Introduce MMF_OOM_REAPED mm flag which is checked in oom_badness (same
as OOM_SCORE_ADJ_MIN) and task is skipped if the flag is set.  Set the
flag after __oom_reap_task is done with a task.  This will force
select_bad_process() to ignore all already oom reaped tasks and will
ensure that no such task is sacrificed for its parent.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 1 +
 mm/oom_kill.c         | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 31bd0d97d178..40eabf176ce2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -521,6 +521,7 @@ static inline int get_dumpable(struct mm_struct *mm)
 
 #define MMF_HAS_UPROBES		19	/* has uprobes */
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
+#define MMF_OOM_REAPED		21	/* mm has been already reaped */
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 415f7eb913fa..c0376efa79ec 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
+	/*
+	 * Do not even consider tasks which are explicitly marked oom
+	 * unkillable or have been already oom reaped.
+	 */
 	adj = (long)p->signal->oom_score_adj;
-	if (adj == OOM_SCORE_ADJ_MIN) {
+	if (adj == OOM_SCORE_ADJ_MIN ||
+			test_bit(MMF_OOM_REAPED, &p->mm->flags)) {
 		task_unlock(p);
 		return 0;
 	}
@@ -513,7 +518,7 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 * This task can be safely ignored because we cannot do much more
 	 * to release its memory.
 	 */
-	tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN;
+	set_bit(MMF_OOM_REAPED, &mm->flags);
 out:
 	mmput(mm);
 	return ret;
-- 
cgit v1.2.3


From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 20 May 2016 16:57:21 -0700
Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper
 context

Tetsuo has properly noted that mmput slow path might get blocked waiting
for another party (e.g.  exit_aio waits for an IO).  If that happens the
oom_reaper would be put out of the way and will not be able to process
next oom victim.  We should strive to make this context as reliable
and as independent of other subsystems as possible.

Introduce mmput_async which will perform the slow path from an async
(WQ) context.  This will delay the operation but that shouldn't be a
problem because the oom_reaper has reclaimed the victim's address space
for most cases as much as possible and the remaining context shouldn't
bind too much memory anymore.  The only exception is when mmap_sem
trylock has failed which shouldn't happen too often.

The issue is only theoretical but not impossible.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_types.h |  2 ++
 include/linux/sched.h    |  5 +++++
 kernel/fork.c            | 50 +++++++++++++++++++++++++++++++++---------------
 mm/oom_kill.c            |  8 ++++++--
 4 files changed, 48 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1fda9c99ef95..d553855503e6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/uprobes.h>
 #include <linux/page-flags-layout.h>
+#include <linux/workqueue.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -513,6 +514,7 @@ struct mm_struct {
 #ifdef CONFIG_HUGETLB_PAGE
 	atomic_long_t hugetlb_usage;
 #endif
+	struct work_struct async_put_work;
 };
 
 static inline void mm_init_cpumask(struct mm_struct *mm)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 40eabf176ce2..479e3cade7e9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm)
 
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
+/* same as above but performs the slow path from the async context. Can
+ * be called from the atomic context as well
+ */
+extern void mmput_async(struct mm_struct *);
+
 /* Grab a reference to a task's mm, if it is not already going away */
 extern struct mm_struct *get_task_mm(struct task_struct *task);
 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index 3e8451527cbe..8fbed7194af1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
 
+static inline void __mmput(struct mm_struct *mm)
+{
+	VM_BUG_ON(atomic_read(&mm->mm_users));
+
+	uprobe_clear_state(mm);
+	exit_aio(mm);
+	ksm_exit(mm);
+	khugepaged_exit(mm); /* must run before exit_mmap */
+	exit_mmap(mm);
+	set_mm_exe_file(mm, NULL);
+	if (!list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_del(&mm->mmlist);
+		spin_unlock(&mmlist_lock);
+	}
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+	mmdrop(mm);
+}
+
 /*
  * Decrement the use count and release all resources for an mm.
  */
@@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();
 
+	if (atomic_dec_and_test(&mm->mm_users))
+		__mmput(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+static void mmput_async_fn(struct work_struct *work)
+{
+	struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+	__mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
 	if (atomic_dec_and_test(&mm->mm_users)) {
-		uprobe_clear_state(mm);
-		exit_aio(mm);
-		ksm_exit(mm);
-		khugepaged_exit(mm); /* must run before exit_mmap */
-		exit_mmap(mm);
-		set_mm_exe_file(mm, NULL);
-		if (!list_empty(&mm->mmlist)) {
-			spin_lock(&mmlist_lock);
-			list_del(&mm->mmlist);
-			spin_unlock(&mmlist_lock);
-		}
-		if (mm->binfmt)
-			module_put(mm->binfmt->module);
-		mmdrop(mm);
+		INIT_WORK(&mm->async_put_work, mmput_async_fn);
+		schedule_work(&mm->async_put_work);
 	}
 }
-EXPORT_SYMBOL_GPL(mmput);
 
 /**
  * set_mm_exe_file - change a reference to the mm's executable file
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c0376efa79ec..c0e37dd1422f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
 static struct task_struct *oom_reaper_list;
 static DEFINE_SPINLOCK(oom_reaper_lock);
 
-
 static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
@@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
 out:
-	mmput(mm);
+	/*
+	 * Drop our reference but make sure the mmput slow path is called from a
+	 * different context because we shouldn't risk we get stuck there and
+	 * put the oom_reaper out of the way.
+	 */
+	mmput_async(mm);
 	return ret;
 }
 
-- 
cgit v1.2.3


From f44666b04605d1c7fd94ab90b7ccf633e7eff228 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Fri, 20 May 2016 16:57:27 -0700
Subject: mm,oom: speed up select_bad_process() loop

Since commit 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel
panics"), select_bad_process() is using for_each_process_thread().

Since oom_unkillable_task() scans all threads in the caller's thread
group and oom_task_origin() scans signal_struct of the caller's thread
group, we don't need to call oom_unkillable_task() and oom_task_origin()
on each thread.  Also, since !mm test will be done later at
oom_badness(), we don't need to do !mm test on each thread.  Therefore,
we only need to do TIF_MEMDIE test on each thread.

Although the original code was correct it was quite inefficient because
each thread group was scanned num_threads times, which can be a lot
especially for processes with many threads.  Even though the OOM path is
extremely cold, it is always good to be as efficient as possible when we
are inside rcu_read_lock() - aka unpreemptible context.

If we track number of TIF_MEMDIE threads inside signal_struct, we don't
need to do TIF_MEMDIE test on each thread.  This will allow
select_bad_process() to use for_each_process().

This patch adds a counter to signal_struct for tracking how many
TIF_MEMDIE threads are in a given thread group, and check it at
oom_scan_process_thread() so that select_bad_process() can use
for_each_process() rather than for_each_process_thread().

[mhocko@suse.com: do not blow the signal_struct size]
  Link: http://lkml.kernel.org/r/20160520075035.GF19172@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/201605182230.IDC73435.MVSOHLFOQFOJtF@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 17 ++++++-----------
 2 files changed, 7 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 479e3cade7e9..01fe1bb68754 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -669,6 +669,7 @@ struct signal_struct {
 	atomic_t		sigcnt;
 	atomic_t		live;
 	int			nr_threads;
+	atomic_t oom_victims; /* # of TIF_MEMDIE threads in this thread group */
 	struct list_head	thread_head;
 
 	wait_queue_head_t	wait_chldexit;	/* for wait4() */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index c0e37dd1422f..5bb2f7698ad7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -283,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 	 * This task already has access to memory reserves and is being killed.
 	 * Don't allow any other task to have access to the reserves.
 	 */
-	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (!is_sysrq_oom(oc))
-			return OOM_SCAN_ABORT;
-	}
-	if (!task->mm)
-		return OOM_SCAN_CONTINUE;
+	if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims))
+		return OOM_SCAN_ABORT;
 
 	/*
 	 * If task is allocating a lot of memory and has been marked to be
@@ -307,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc,
 static struct task_struct *select_bad_process(struct oom_control *oc,
 		unsigned int *ppoints, unsigned long totalpages)
 {
-	struct task_struct *g, *p;
+	struct task_struct *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
 
 	rcu_read_lock();
-	for_each_process_thread(g, p) {
+	for_each_process(p) {
 		unsigned int points;
 
 		switch (oom_scan_process_thread(oc, p, totalpages)) {
@@ -331,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc,
 		points = oom_badness(p, NULL, oc->nodemask, totalpages);
 		if (!points || points < chosen_points)
 			continue;
-		/* Prefer thread group leaders for display purposes */
-		if (points == chosen_points && thread_group_leader(chosen))
-			continue;
 
 		chosen = p;
 		chosen_points = points;
@@ -673,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk)
 	/* OOM killer might race with memcg OOM */
 	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
+	atomic_inc(&tsk->signal->oom_victims);
 	/*
 	 * Make sure that the task is woken up from uninterruptible sleep
 	 * if it is frozen because OOM killer wouldn't be able to free
@@ -690,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk)
 {
 	if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE))
 		return;
+	atomic_dec(&tsk->signal->oom_victims);
 
 	if (!atomic_dec_return(&oom_victims))
 		wake_up_all(&oom_victims_wait);
-- 
cgit v1.2.3


From 340a43bed674a70308f196f2a61ec0b01f8a14d9 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Fri, 20 May 2016 16:57:30 -0700
Subject: mm: thp: simplify the implementation of mk_huge_pmd()

The implementation of mk_huge_pmd() is needlessly verbose; it can be
simplified to a single line of code.

Signed-off-by: Yang Shi <yang.shi@linaro.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 66675eed67be..86eafd91c982 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -764,10 +764,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 
 static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot)
 {
-	pmd_t entry;
-	entry = mk_pmd(page, prot);
-	entry = pmd_mkhuge(entry);
-	return entry;
+	return pmd_mkhuge(mk_pmd(page, prot));
 }
 
 static inline struct list_head *page_deferred_list(struct page *page)
-- 
cgit v1.2.3


From 495367c051fb200a42636bdc63be78ca1713a85a Mon Sep 17 00:00:00 2001
From: Chen Yucong <slaoub@gmail.com>
Date: Fri, 20 May 2016 16:57:32 -0700
Subject: mm/memory-failure.c: replace "MCE" with "Memory failure"

HWPoison was specific to some particular x86 platforms, and it is often
seen as a high level machine check handler.  Therefore, 'MCE' is used
as the format prefix of printk().  However, 'PowerNV' has also used
HWPoison for handling memory errors[1], so 'MCE' is no longer a suitable
prefix for memory-failure.c.

Additionally, 'MCE' and 'Memory failure' have different context.  The
former belongs to exception context and the latter belongs to process
context.  Furthermore, HWPoison can also be used for off-lining those
sub-health pages that do not trigger any machine check exception.

This patch aims to replace 'MCE' with a more appropriate prefix.

[1] commit 75eb3d9b60c2 ("powerpc/powernv: Get FSP memory errors
and plumb into memory poison infrastructure.")

Signed-off-by: Chen Yucong <slaoub@gmail.com>
Acked-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory-failure.c | 72 +++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ca5acee53b7a..2fcca6b0e005 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -184,8 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
 	struct siginfo si;
 	int ret;
 
-	pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n",
-	       pfn, t->comm, t->pid);
+	pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
+		pfn, t->comm, t->pid);
 	si.si_signo = SIGBUS;
 	si.si_errno = 0;
 	si.si_addr = (void *)addr;
@@ -208,7 +208,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
 		ret = send_sig_info(SIGBUS, &si, t);  /* synchronous? */
 	}
 	if (ret < 0)
-		pr_info("MCE: Error sending signal to %s:%d: %d\n",
+		pr_info("Memory failure: Error sending signal to %s:%d: %d\n",
 			t->comm, t->pid, ret);
 	return ret;
 }
@@ -289,7 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 	} else {
 		tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC);
 		if (!tk) {
-			pr_err("MCE: Out of memory while machine check handling\n");
+			pr_err("Memory failure: Out of memory while machine check handling\n");
 			return;
 		}
 	}
@@ -303,7 +303,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
 	 * a SIGKILL because the error is not contained anymore.
 	 */
 	if (tk->addr == -EFAULT) {
-		pr_info("MCE: Unable to find user space address %lx in %s\n",
+		pr_info("Memory failure: Unable to find user space address %lx in %s\n",
 			page_to_pfn(p), tsk->comm);
 		tk->addr_valid = 0;
 	}
@@ -334,7 +334,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
 			 * signal and then access the memory. Just kill it.
 			 */
 			if (fail || tk->addr_valid == 0) {
-				pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
+				pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
 				       pfn, tk->tsk->comm, tk->tsk->pid);
 				force_sig(SIGKILL, tk->tsk);
 			}
@@ -347,7 +347,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
 			 */
 			else if (kill_proc(tk->tsk, tk->addr, trapno,
 					      pfn, page, flags) < 0)
-				pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
+				pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n",
 				       pfn, tk->tsk->comm, tk->tsk->pid);
 		}
 		put_task_struct(tk->tsk);
@@ -559,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn)
  */
 static int me_unknown(struct page *p, unsigned long pfn)
 {
-	pr_err("MCE %#lx: Unknown page state\n", pfn);
+	pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
 	return MF_FAILED;
 }
 
@@ -604,11 +604,12 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 	if (mapping->a_ops->error_remove_page) {
 		err = mapping->a_ops->error_remove_page(mapping, p);
 		if (err != 0) {
-			pr_info("MCE %#lx: Failed to punch page: %d\n",
+			pr_info("Memory failure: %#lx: Failed to punch page: %d\n",
 				pfn, err);
 		} else if (page_has_private(p) &&
 				!try_to_release_page(p, GFP_NOIO)) {
-			pr_info("MCE %#lx: failed to release buffers\n", pfn);
+			pr_info("Memory failure: %#lx: failed to release buffers\n",
+				pfn);
 		} else {
 			ret = MF_RECOVERED;
 		}
@@ -620,7 +621,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
 		if (invalidate_inode_page(p))
 			ret = MF_RECOVERED;
 		else
-			pr_info("MCE %#lx: Failed to invalidate\n", pfn);
+			pr_info("Memory failure: %#lx: Failed to invalidate\n",
+				pfn);
 	}
 	return ret;
 }
@@ -833,7 +835,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type,
 {
 	trace_memory_failure_event(pfn, type, result);
 
-	pr_err("MCE %#lx: recovery action for %s: %s\n",
+	pr_err("Memory failure: %#lx: recovery action for %s: %s\n",
 		pfn, action_page_types[type], action_name[result]);
 }
 
@@ -849,7 +851,7 @@ static int page_action(struct page_state *ps, struct page *p,
 	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
 		count--;
 	if (count != 0) {
-		pr_err("MCE %#lx: %s still referenced by %d users\n",
+		pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
 		       pfn, action_page_types[ps->type], count);
 		result = MF_FAILED;
 	}
@@ -882,7 +884,7 @@ int get_hwpoison_page(struct page *page)
 		 * tries to touch the "partially handled" page.
 		 */
 		if (!PageAnon(head)) {
-			pr_err("MCE: %#lx: non anonymous thp\n",
+			pr_err("Memory failure: %#lx: non anonymous thp\n",
 				page_to_pfn(page));
 			return 0;
 		}
@@ -892,7 +894,8 @@ int get_hwpoison_page(struct page *page)
 		if (head == compound_head(page))
 			return 1;
 
-		pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page));
+		pr_info("Memory failure: %#lx cannot catch tail\n",
+			page_to_pfn(page));
 		put_page(head);
 	}
 
@@ -931,12 +934,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 		return SWAP_SUCCESS;
 
 	if (PageKsm(p)) {
-		pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
+		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
 		return SWAP_FAIL;
 	}
 
 	if (PageSwapCache(p)) {
-		pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn);
+		pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n",
+			pfn);
 		ttu |= TTU_IGNORE_HWPOISON;
 	}
 
@@ -954,7 +958,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 		} else {
 			kill = 0;
 			ttu |= TTU_IGNORE_HWPOISON;
-			pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n",
+			pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n",
 				pfn);
 		}
 	}
@@ -972,7 +976,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 
 	ret = try_to_unmap(hpage, ttu);
 	if (ret != SWAP_SUCCESS)
-		pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n",
+		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
 
 	/*
@@ -1040,14 +1044,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		panic("Memory failure from trap %d on page %lx", trapno, pfn);
 
 	if (!pfn_valid(pfn)) {
-		pr_err("MCE %#lx: memory outside kernel control\n", pfn);
+		pr_err("Memory failure: %#lx: memory outside kernel control\n",
+			pfn);
 		return -ENXIO;
 	}
 
 	p = pfn_to_page(pfn);
 	orig_head = hpage = compound_head(p);
 	if (TestSetPageHWPoison(p)) {
-		pr_err("MCE %#lx: already hardware poisoned\n", pfn);
+		pr_err("Memory failure: %#lx: already hardware poisoned\n",
+			pfn);
 		return 0;
 	}
 
@@ -1112,9 +1118,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
 			unlock_page(hpage);
 			if (!PageAnon(hpage))
-				pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+				pr_err("Memory failure: %#lx: non anonymous thp\n",
+					pfn);
 			else
-				pr_err("MCE: %#lx: thp split failed\n", pfn);
+				pr_err("Memory failure: %#lx: thp split failed\n",
+					pfn);
 			if (TestClearPageHWPoison(p))
 				num_poisoned_pages_sub(nr_pages);
 			put_hwpoison_page(p);
@@ -1178,7 +1186,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * unpoison always clear PG_hwpoison inside page lock
 	 */
 	if (!PageHWPoison(p)) {
-		pr_err("MCE %#lx: just unpoisoned\n", pfn);
+		pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
 		num_poisoned_pages_sub(nr_pages);
 		unlock_page(hpage);
 		put_hwpoison_page(hpage);
@@ -1395,25 +1403,25 @@ int unpoison_memory(unsigned long pfn)
 	page = compound_head(p);
 
 	if (!PageHWPoison(p)) {
-		unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+		unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
 				 pfn, &unpoison_rs);
 		return 0;
 	}
 
 	if (page_count(page) > 1) {
-		unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+		unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
 		return 0;
 	}
 
 	if (page_mapped(page)) {
-		unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
 		return 0;
 	}
 
 	if (page_mapping(page)) {
-		unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+		unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
 				 pfn, &unpoison_rs);
 		return 0;
 	}
@@ -1424,7 +1432,7 @@ int unpoison_memory(unsigned long pfn)
 	 * In such case, we yield to memory_failure() and make unpoison fail.
 	 */
 	if (!PageHuge(page) && PageTransHuge(page)) {
-		unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
+		unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
 				 pfn, &unpoison_rs);
 		return 0;
 	}
@@ -1439,13 +1447,13 @@ int unpoison_memory(unsigned long pfn)
 		 * to the end.
 		 */
 		if (PageHuge(page)) {
-			unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+			unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n",
 					 pfn, &unpoison_rs);
 			return 0;
 		}
 		if (TestClearPageHWPoison(p))
 			num_poisoned_pages_dec();
-		unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+		unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
 				 pfn, &unpoison_rs);
 		return 0;
 	}
@@ -1458,7 +1466,7 @@ int unpoison_memory(unsigned long pfn)
 	 * the free buddy page pool.
 	 */
 	if (TestClearPageHWPoison(page)) {
-		unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
 				 pfn, &unpoison_rs);
 		num_poisoned_pages_sub(nr_pages);
 		freeit = 1;
-- 
cgit v1.2.3


From f705ac4b39f30a6a5f8411a42114758f4d4655bc Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov <kuleshovmail@gmail.com>
Date: Fri, 20 May 2016 16:57:35 -0700
Subject: mm/memblock.c: move memblock_{add,reserve}_region into
 memblock_{add,reserve}

memblock_add_region() and memblock_reserve_region() do nothing specific
before the call of memblock_add_range(), only print debug output.

We can do the same in memblock_add() and memblock_reserve() since both
memblock_add_region() and memblock_reserve_region() are not used by
anybody outside of memblock.c and memblock_{add,reserve}() have the same
set of flags and nids.

Since memblock_add_region() and memblock_reserve_region() will be
inlined, there will not be functional changes, but it will improve code
readability a little.

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Acked-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 28 ++++++----------------------
 1 file changed, 6 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index b570dddb4cb9..3b93daa46fc5 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -606,22 +606,14 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
 	return memblock_add_range(&memblock.memory, base, size, nid, 0);
 }
 
-static int __init_memblock memblock_add_region(phys_addr_t base,
-						phys_addr_t size,
-						int nid,
-						unsigned long flags)
+int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
 {
 	memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n",
 		     (unsigned long long)base,
 		     (unsigned long long)base + size - 1,
-		     flags, (void *)_RET_IP_);
-
-	return memblock_add_range(&memblock.memory, base, size, nid, flags);
-}
+		     0UL, (void *)_RET_IP_);
 
-int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
-{
-	return memblock_add_region(base, size, MAX_NUMNODES, 0);
+	return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
 }
 
 /**
@@ -732,22 +724,14 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
 	return memblock_remove_range(&memblock.reserved, base, size);
 }
 
-static int __init_memblock memblock_reserve_region(phys_addr_t base,
-						   phys_addr_t size,
-						   int nid,
-						   unsigned long flags)
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
 {
 	memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n",
 		     (unsigned long long)base,
 		     (unsigned long long)base + size - 1,
-		     flags, (void *)_RET_IP_);
-
-	return memblock_add_range(&memblock.reserved, base, size, nid, flags);
-}
+		     0UL, (void *)_RET_IP_);
 
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
-{
-	return memblock_reserve_region(base, size, MAX_NUMNODES, 0);
+	return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
 }
 
 /**
-- 
cgit v1.2.3


From 80c4bd7a5e4368b680e0aeb57050a1b06eb573d8 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Fri, 20 May 2016 16:57:38 -0700
Subject: mm/vmalloc: keep a separate lazy-free list

When mixing lots of vmallocs and set_memory_*() (which calls
vm_unmap_aliases()) I encountered situations where the performance
degraded severely due to the walking of the entire vmap_area list on
each invocation.

One simple improvement is to add the lazily freed vmap_area to a
separate lockless free list, such that we then avoid having to walk the
full list on each purge.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Roman Pen <r.peniaev@gmail.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Roman Pen <r.peniaev@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Shawn Lin <shawn.lin@rock-chips.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmalloc.h |  3 ++-
 mm/vmalloc.c            | 39 +++++++++++++++++++--------------------
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index d1f1d338af20..957adb741b6f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/init.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <asm/page.h>		/* pgprot_t */
 #include <linux/rbtree.h>
 
@@ -44,7 +45,7 @@ struct vmap_area {
 	unsigned long flags;
 	struct rb_node rb_node;         /* address sorted rbtree */
 	struct list_head list;          /* address sorted list */
-	struct list_head purge_list;    /* "lazy purge" list */
+	struct llist_node purge_list;    /* "lazy purge" list */
 	struct vm_struct *vm;
 	struct rcu_head rcu_head;
 };
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ae7d20b447ff..6e3291882739 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 
 /*** Global kva allocator ***/
 
-#define VM_LAZY_FREE	0x01
-#define VM_LAZY_FREEING	0x02
 #define VM_VM_AREA	0x04
 
 static DEFINE_SPINLOCK(vmap_area_lock);
 /* Export for kexec only */
 LIST_HEAD(vmap_area_list);
+static LLIST_HEAD(vmap_purge_list);
 static struct rb_root vmap_area_root = RB_ROOT;
 
 /* The vmap cache globals are protected by vmap_area_lock */
@@ -601,7 +600,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 					int sync, int force_flush)
 {
 	static DEFINE_SPINLOCK(purge_lock);
-	LIST_HEAD(valist);
+	struct llist_node *valist;
 	struct vmap_area *va;
 	struct vmap_area *n_va;
 	int nr = 0;
@@ -620,20 +619,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 	if (sync)
 		purge_fragmented_blocks_allcpus();
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(va, &vmap_area_list, list) {
-		if (va->flags & VM_LAZY_FREE) {
-			if (va->va_start < *start)
-				*start = va->va_start;
-			if (va->va_end > *end)
-				*end = va->va_end;
-			nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
-			list_add_tail(&va->purge_list, &valist);
-			va->flags |= VM_LAZY_FREEING;
-			va->flags &= ~VM_LAZY_FREE;
-		}
+	valist = llist_del_all(&vmap_purge_list);
+	llist_for_each_entry(va, valist, purge_list) {
+		if (va->va_start < *start)
+			*start = va->va_start;
+		if (va->va_end > *end)
+			*end = va->va_end;
+		nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
 	}
-	rcu_read_unlock();
 
 	if (nr)
 		atomic_sub(nr, &vmap_lazy_nr);
@@ -643,7 +636,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
 
 	if (nr) {
 		spin_lock(&vmap_area_lock);
-		list_for_each_entry_safe(va, n_va, &valist, purge_list)
+		llist_for_each_entry_safe(va, n_va, valist, purge_list)
 			__free_vmap_area(va);
 		spin_unlock(&vmap_area_lock);
 	}
@@ -678,9 +671,15 @@ static void purge_vmap_area_lazy(void)
  */
 static void free_vmap_area_noflush(struct vmap_area *va)
 {
-	va->flags |= VM_LAZY_FREE;
-	atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
-	if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
+	int nr_lazy;
+
+	nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
+				    &vmap_lazy_nr);
+
+	/* After this point, we may free va at any time */
+	llist_add(&va->purge_list, &vmap_purge_list);
+
+	if (unlikely(nr_lazy > lazy_max_pages()))
 		try_purge_vmap_area_lazy();
 }
 
-- 
cgit v1.2.3


From d5957d2fc232a689543bdbed1a5ff8002f0e9843 Mon Sep 17 00:00:00 2001
From: Yongji Xie <xyjxie@linux.vnet.ibm.com>
Date: Fri, 20 May 2016 16:57:41 -0700
Subject: mm: fix incorrect pfn passed to untrack_pfn() in remap_pfn_range()

We use generic hooks in remap_pfn_range() to help archs to track pfnmap
regions.  The code is something like:

  int remap_pfn_range()
  {
	...
	track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
	...
	pfn -= addr >> PAGE_SHIFT;
	...
	untrack_pfn(vma, pfn, PAGE_ALIGN(size));
	...
  }

Here we can easily see that pfn is modified but not restored before
untrack_pfn() is called.  That's incorrect.

There are no known runtime effects - this is from inspection.

Signed-off-by: Yongji Xie <xyjxie@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Matthew Wilcox <matthew.r.wilcox@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: David Hildenbrand <dahi@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 07493e34ab7e..007c72ad03f6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1744,6 +1744,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	unsigned long next;
 	unsigned long end = addr + PAGE_ALIGN(size);
 	struct mm_struct *mm = vma->vm_mm;
+	unsigned long remap_pfn = pfn;
 	int err;
 
 	/*
@@ -1770,7 +1771,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		vma->vm_pgoff = pfn;
 	}
 
-	err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
+	err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
 	if (err)
 		return -EINVAL;
 
@@ -1789,7 +1790,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	} while (pgd++, addr = next, addr != end);
 
 	if (err)
-		untrack_pfn(vma, pfn, PAGE_ALIGN(size));
+		untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
 
 	return err;
 }
-- 
cgit v1.2.3


From f4fcd55841fc9e46daac553b39361572453c2b88 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <koct9i@gmail.com>
Date: Fri, 20 May 2016 16:57:45 -0700
Subject: mm: enable RLIMIT_DATA by default with workaround for valgrind

Since commit 84638335900f ("mm: rework virtual memory accounting")
RLIMIT_DATA limits both brk() and private mmap() but this is disabled by
default because of incompatibility with older versions of valgrind.

Valgrind always sets the limit to zero and fails if RLIMIT_DATA is
enabled.  Fortunately it changes only rlim_cur and keeps rlim_max for
reverting the limit back when needed.

This patch also checks current usage against rlim_max if rlim_cur is
zero.  This is safe because a task can raise rlim_cur up to rlim_max
anyway.  The size of brk is still checked against rlim_cur, so this part
is completely compatible - zero rlim_cur forbids brk() but allows
private mmap().

Link: http://lkml.kernel.org/r/56A28613.5070104@de.ibm.com
Signed-off-by: Konstantin Khlebnikov <koct9i@gmail.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index fba246b8f1a5..b9274a0c82c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -66,7 +66,7 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
 int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
 #endif
 
-static bool ignore_rlimit_data = true;
+static bool ignore_rlimit_data;
 core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
 
 static void unmap_region(struct mm_struct *mm,
@@ -2886,13 +2886,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
 
 	if (is_data_mapping(flags) &&
 	    mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
-		if (ignore_rlimit_data)
-			pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n",
+		/* Workaround for Valgrind */
+		if (rlimit(RLIMIT_DATA) == 0 &&
+		    mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT)
+			return true;
+		if (!ignore_rlimit_data) {
+			pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n",
 				     current->comm, current->pid,
 				     (mm->data_vm + npages) << PAGE_SHIFT,
 				     rlimit(RLIMIT_DATA));
-		else
 			return false;
+		}
 	}
 
 	return true;
-- 
cgit v1.2.3


From 297880f4af4e492ed5084be9397d65a18ade56ee Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 20 May 2016 16:57:50 -0700
Subject: mm, hugetlb_cgroup: round limit_in_bytes down to hugepage size

The page_counter rounds limits down to page size values.  This makes
sense, except in the case of hugetlb_cgroup where it's not possible to
charge partial hugepages.  If the hugetlb_cgroup margin is less than the
hugepage size being charged, it will fail as expected.

Round the hugetlb_cgroup limit down to hugepage size, since it is the
effective limit of the cgroup.

For consistency, round down PAGE_COUNTER_MAX as well when a
hugetlb_cgroup is created: this prevents error reports when a user
cannot restore the value to the kernel default.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Nikolay Borisov <kernel@kyup.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/hugetlb_cgroup.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index d8fb10de0f14..eec1150125b9 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -67,26 +67,42 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
 	return false;
 }
 
+static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
+				struct hugetlb_cgroup *parent_h_cgroup)
+{
+	int idx;
+
+	for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) {
+		struct page_counter *counter = &h_cgroup->hugepage[idx];
+		struct page_counter *parent = NULL;
+		unsigned long limit;
+		int ret;
+
+		if (parent_h_cgroup)
+			parent = &parent_h_cgroup->hugepage[idx];
+		page_counter_init(counter, parent);
+
+		limit = round_down(PAGE_COUNTER_MAX,
+				   1 << huge_page_order(&hstates[idx]));
+		ret = page_counter_limit(counter, limit);
+		VM_BUG_ON(ret);
+	}
+}
+
 static struct cgroup_subsys_state *
 hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
 	struct hugetlb_cgroup *h_cgroup;
-	int idx;
 
 	h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
 	if (!h_cgroup)
 		return ERR_PTR(-ENOMEM);
 
-	if (parent_h_cgroup) {
-		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
-			page_counter_init(&h_cgroup->hugepage[idx],
-					  &parent_h_cgroup->hugepage[idx]);
-	} else {
+	if (!parent_h_cgroup)
 		root_h_cgroup = h_cgroup;
-		for (idx = 0; idx < HUGE_MAX_HSTATE; idx++)
-			page_counter_init(&h_cgroup->hugepage[idx], NULL);
-	}
+
+	hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
 	return &h_cgroup->css;
 }
 
@@ -285,6 +301,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
 		return ret;
 
 	idx = MEMFILE_IDX(of_cft(of)->private);
+	nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx]));
 
 	switch (MEMFILE_ATTR(of_cft(of)->private)) {
 	case RES_LIMIT:
-- 
cgit v1.2.3


From a4a921aa5c7a1ab621e71fd1a4289e76fe230cbd Mon Sep 17 00:00:00 2001
From: Ming Li <mingli199x@qq.com>
Date: Fri, 20 May 2016 16:57:56 -0700
Subject: mm/swap.c: put activate_page_pvecs and other pagevecs together

Put the activate_page_pvecs definition next to those of the other
pagevecs, for clarity.

Signed-off-by: Ming Li <mingli199x@qq.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/swap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index 03aacbcb013f..95916142fc46 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
 static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
 static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
+#ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+#endif
 
 /*
  * This path almost never happens for VM activity - pages are normally
@@ -274,8 +277,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
 }
 
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
-
 static void activate_page_drain(int cpu)
 {
 	struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
-- 
cgit v1.2.3


From 2a138dc7e50bfdc90f9db9b52584ac5564952425 Mon Sep 17 00:00:00 2001
From: Minfei Huang <mnghuan@gmail.com>
Date: Fri, 20 May 2016 16:58:13 -0700
Subject: mm: use existing helper to convert "on"/"off" to boolean

It's more convenient to use the existing helper function to convert the
strings "on"/"off" to a boolean.

Link: http://lkml.kernel.org/r/1461908824-16129-1-git-send-email-mnghuan@gmail.com
Signed-off-by: Minfei Huang <mnghuan@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c  | 9 +--------
 mm/page_poison.c | 8 +-------
 2 files changed, 2 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e49bc3705b6..3f4b69aaa23a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -613,14 +613,7 @@ static int __init early_debug_pagealloc(char *buf)
 {
 	if (!buf)
 		return -EINVAL;
-
-	if (strcmp(buf, "on") == 0)
-		_debug_pagealloc_enabled = true;
-
-	if (strcmp(buf, "off") == 0)
-		_debug_pagealloc_enabled = false;
-
-	return 0;
+	return kstrtobool(buf, &_debug_pagealloc_enabled);
 }
 early_param("debug_pagealloc", early_debug_pagealloc);
 
diff --git a/mm/page_poison.c b/mm/page_poison.c
index 479e7ea2bea6..1eae5fad2446 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -13,13 +13,7 @@ static int early_page_poison_param(char *buf)
 {
 	if (!buf)
 		return -EINVAL;
-
-	if (strcmp(buf, "on") == 0)
-		want_page_poisoning = true;
-	else if (strcmp(buf, "off") == 0)
-		want_page_poisoning = false;
-
-	return 0;
+	return strtobool(buf, &want_page_poisoning);
 }
 early_param("page_poison", early_page_poison_param);
 
-- 
cgit v1.2.3


From 51038171b7ecf0017fbe3e06bd246eaa26f4d2e7 Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 20 May 2016 16:58:18 -0700
Subject: memcg: fix stale mem_cgroup_force_empty() comment

Commit f61c42a7d911 ("memcg: remove tasks/children test from
mem_cgroup_force_empty()") removed memory reparenting from the function.

Fix the function's comment.

Link: http://lkml.kernel.org/r/1462569810-54496-1-git-send-email-gthelen@google.com
Signed-off-by: Greg Thelen <gthelen@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vladimir Davydov <vdavydov@virtuozzo.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d71d387868e6..b3f16ab4b431 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2652,8 +2652,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
 }
 
 /*
- * Reclaims as many pages from the given memcg as possible and moves
- * the rest to the parent.
+ * Reclaims as many pages from the given memcg as possible.
  *
  * Caller is responsible for holding css reference for memcg.
  */
-- 
cgit v1.2.3


From 7b8da4c7f0777489f8690115b5fd7704ac0abb8f Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Fri, 20 May 2016 16:58:21 -0700
Subject: vmstat: get rid of the ugly cpu_stat_off variable

The cpu_stat_off variable is unnecessary since we can check if a
workqueue request is pending otherwise.  Removal of cpu_stat_off makes
it pretty easy for the vmstat shepherd to ensure that the proper things
happen.

Removing the state also removes all races related to it.  Should a
workqueue not be scheduled as needed for vmstat_update then the shepherd
will notice and schedule it as needed.  Should a workqueue be
unnecessarily scheduled then the vmstat updater will disable it.

[akpm@linux-foundation.org: fix indentation, per Michal]
Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1605061306460.17934@east.gentwo.org
Signed-off-by: Christoph Lameter <cl@linux.com>
Cc: Tejun Heo <htejun@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/vmstat.c | 51 ++++++++++-----------------------------------------
 1 file changed, 10 insertions(+), 41 deletions(-)

(limited to 'mm')

diff --git a/mm/vmstat.c b/mm/vmstat.c
index 5b72a8ad2813..77e42ef388c2 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1352,7 +1352,6 @@ static const struct file_operations proc_vmstat_file_operations = {
 static struct workqueue_struct *vmstat_wq;
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
-static cpumask_var_t cpu_stat_off;
 
 #ifdef CONFIG_PROC_FS
 static void refresh_vm_stats(struct work_struct *work)
@@ -1421,24 +1420,10 @@ static void vmstat_update(struct work_struct *w)
 		 * Counters were updated so we expect more updates
 		 * to occur in the future. Keep on running the
 		 * update worker thread.
-		 * If we were marked on cpu_stat_off clear the flag
-		 * so that vmstat_shepherd doesn't schedule us again.
 		 */
-		if (!cpumask_test_and_clear_cpu(smp_processor_id(),
-						cpu_stat_off)) {
-			queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+		queue_delayed_work_on(smp_processor_id(), vmstat_wq,
 				this_cpu_ptr(&vmstat_work),
 				round_jiffies_relative(sysctl_stat_interval));
-		}
-	} else {
-		/*
-		 * We did not update any counters so the app may be in
-		 * a mode where it does not cause counter updates.
-		 * We may be uselessly running vmstat_update.
-		 * Defer the checking for differentials to the
-		 * shepherd thread on a different processor.
-		 */
-		cpumask_set_cpu(smp_processor_id(), cpu_stat_off);
 	}
 }
 
@@ -1470,16 +1455,17 @@ static bool need_update(int cpu)
 	return false;
 }
 
+/*
+ * Switch off vmstat processing and then fold all the remaining differentials
+ * until the diffs stay at zero. The function is used by NOHZ and can only be
+ * invoked when tick processing is not active.
+ */
 void quiet_vmstat(void)
 {
 	if (system_state != SYSTEM_RUNNING)
 		return;
 
-	/*
-	 * If we are already in hands of the shepherd then there
-	 * is nothing for us to do here.
-	 */
-	if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+	if (!delayed_work_pending(this_cpu_ptr(&vmstat_work)))
 		return;
 
 	if (!need_update(smp_processor_id()))
@@ -1494,7 +1480,6 @@ void quiet_vmstat(void)
 	refresh_cpu_vm_stats(false);
 }
 
-
 /*
  * Shepherd worker thread that checks the
  * differentials of processors that have their worker
@@ -1511,20 +1496,11 @@ static void vmstat_shepherd(struct work_struct *w)
 
 	get_online_cpus();
 	/* Check processors whose vmstat worker threads have been disabled */
-	for_each_cpu(cpu, cpu_stat_off) {
+	for_each_online_cpu(cpu) {
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
-		if (need_update(cpu)) {
-			if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-				queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
-		} else {
-			/*
-			 * Cancel the work if quiet_vmstat has put this
-			 * cpu on cpu_stat_off because the work item might
-			 * be still scheduled
-			 */
-			cancel_delayed_work(dw);
-		}
+		if (!delayed_work_pending(dw) && need_update(cpu))
+			queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
 	}
 	put_online_cpus();
 
@@ -1540,10 +1516,6 @@ static void __init start_shepherd_timer(void)
 		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
 			vmstat_update);
 
-	if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
-		BUG();
-	cpumask_copy(cpu_stat_off, cpu_online_mask);
-
 	vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
 	schedule_delayed_work(&shepherd,
 		round_jiffies_relative(sysctl_stat_interval));
@@ -1578,16 +1550,13 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
 	case CPU_ONLINE_FROZEN:
 		refresh_zone_stat_thresholds();
 		node_set_state(cpu_to_node(cpu), N_CPU);
-		cpumask_set_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
-		cpumask_clear_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		cpumask_set_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
-- 
cgit v1.2.3


From d5ee7c3bcca6fe2b4f7a1fdee253250059c110d2 Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <aarcange@redhat.com>
Date: Fri, 20 May 2016 16:58:27 -0700
Subject: mm: thp: split_huge_pmd_address() comment improvement

The comment is partly wrong; this improves it by including the case of
split_huge_pmd_address() being called by try_to_unmap_one() if
TTU_SPLIT_HUGE_PMD is set.

Link: http://lkml.kernel.org/r/1462547040-1737-4-git-send-email-aarcange@redhat.com
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86eafd91c982..1764184c4774 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3023,8 +3023,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
 		return;
 
 	/*
-	 * Caller holds the mmap_sem write mode, so a huge pmd cannot
-	 * materialize from under us.
+	 * Caller holds the mmap_sem write mode or the anon_vma lock,
+	 * so a huge pmd cannot materialize from under us (khugepaged
+	 * holds both the mmap_sem write mode and the anon_vma lock
+	 * write mode).
 	 */
 	__split_huge_pmd(vma, pmd, address, freeze);
 }
-- 
cgit v1.2.3


From 9a001fc19cccdeb9be4c3b89ad089d92df303c44 Mon Sep 17 00:00:00 2001
From: Vitaly Wool <vitalywool@gmail.com>
Date: Fri, 20 May 2016 16:58:30 -0700
Subject: z3fold: the 3-fold allocator for compressed pages

This patch introduces z3fold, a special purpose allocator for storing
compressed pages.  It is designed to store up to three compressed pages
per physical page.  It is a ZBUD derivative which allows for higher
compression ratio keeping the simplicity and determinism of its
predecessor.

This patch comes as a follow-up to the discussions at the Embedded Linux
Conference in San Diego related to the talk [1].  The outcome of these
discussions was that it would be good to have a compressed page
allocator as stable and deterministic as zbud with a higher
compression ratio.

To keep the determinism and simplicity, z3fold, just like zbud, always
stores an integral number of compressed pages per page, but it can store
up to 3 pages unlike zbud which can store at most 2.  Therefore the
compression ratio goes to around 2.6x while zbud's one is around 1.7x.

The patch is based on the latest linux.git tree.

This version has been updated after testing on various simulators (e.g.
ARM Versatile Express, MIPS Malta, x86_64/Haswell) and based on
comments from Dan Streetman [3].

[1] https://openiotelc2016.sched.org/event/6DAC/swapping-and-embedded-compression-relieves-the-pressure-vitaly-wool-softprise-consulting-ou
[2] https://lkml.org/lkml/2016/4/21/799
[3] https://lkml.org/lkml/2016/5/4/852

Link: http://lkml.kernel.org/r/20160509151753.ec3f9fda3c9898d31ff52a32@gmail.com
Signed-off-by: Vitaly Wool <vitalywool@gmail.com>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/vm/z3fold.txt |  26 ++
 mm/Kconfig                  |  12 +-
 mm/Makefile                 |   1 +
 mm/z3fold.c                 | 792 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 830 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/vm/z3fold.txt
 create mode 100644 mm/z3fold.c

(limited to 'mm')

diff --git a/Documentation/vm/z3fold.txt b/Documentation/vm/z3fold.txt
new file mode 100644
index 000000000000..38e4dac810b6
--- /dev/null
+++ b/Documentation/vm/z3fold.txt
@@ -0,0 +1,26 @@
+z3fold
+------
+
+z3fold is a special purpose allocator for storing compressed pages.
+It is designed to store up to three compressed pages per physical page.
+It is a zbud derivative which allows for higher compression
+ratio keeping the simplicity and determinism of its predecessor.
+
+The main differences between z3fold and zbud are:
+* unlike zbud, z3fold allows for up to PAGE_SIZE allocations
+* z3fold can hold up to 3 compressed pages in its page
+* z3fold doesn't export any API itself and is thus intended to be used
+  via the zpool API.
+
+To keep the determinism and simplicity, z3fold, just like zbud, always
+stores an integral number of compressed pages per page, but it can store
+up to 3 pages unlike zbud which can store at most 2. Therefore the
+compression ratio goes to around 2.7x while zbud's one is around 1.7x.
+
+Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not
+return a dereferenceable pointer. Instead, it returns an unsigned long
+handle which encodes actual location of the allocated object.
+
+Keeping effective compression ratio close to zsmalloc's, z3fold doesn't
+depend on MMU enabled and provides more predictable reclaim behavior
+which makes it a better fit for small and response-critical systems.
diff --git a/mm/Kconfig b/mm/Kconfig
index b0432b71137d..1a6a28ebcb8b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -567,7 +567,7 @@ config ZPOOL
 	  zsmalloc.
 
 config ZBUD
-	tristate "Low density storage for compressed pages"
+	tristate "Low (Up to 2x) density storage for compressed pages"
 	default n
 	help
 	  A special purpose allocator for storing compressed pages.
@@ -576,6 +576,16 @@ config ZBUD
 	  deterministic reclaim properties that make it preferable to a higher
 	  density approach when reclaim will be used.
 
+config Z3FOLD
+	tristate "Up to 3x density storage for compressed pages"
+	depends on ZPOOL
+	default n
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It is designed to store up to three compressed pages per physical
+	  page. It is a ZBUD derivative so the simplicity and determinism are
+	  still there.
+
 config ZSMALLOC
 	tristate "Memory allocator for compressed pages"
 	depends on MMU
diff --git a/mm/Makefile b/mm/Makefile
index deb467edca2d..78c6f7dedb83 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL)	+= zpool.o
 obj-$(CONFIG_ZBUD)	+= zbud.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
+obj-$(CONFIG_Z3FOLD)	+= z3fold.o
 obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
 obj-$(CONFIG_CMA)	+= cma.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
diff --git a/mm/z3fold.c b/mm/z3fold.c
new file mode 100644
index 000000000000..34917d55d311
--- /dev/null
+++ b/mm/z3fold.c
@@ -0,0 +1,792 @@
+/*
+ * z3fold.c
+ *
+ * Author: Vitaly Wool <vitaly.wool@konsulko.com>
+ * Copyright (C) 2016, Sony Mobile Communications Inc.
+ *
+ * This implementation is based on zbud written by Seth Jennings.
+ *
+ * z3fold is a special purpose allocator for storing compressed pages. It
+ * can store up to three compressed pages per page which improves the
+ * compression ratio of zbud while retaining its main concepts (e. g. always
+ * storing an integral number of objects per page) and simplicity.
+ * It still has simple and deterministic reclaim properties that make it
+ * preferable to a higher density approach (with no requirement on integral
+ * number of object per page) when reclaim is used.
+ *
+ * As in zbud, pages are divided into "chunks".  The size of the chunks is
+ * fixed at compile time and is determined by NCHUNKS_ORDER below.
+ *
+ * z3fold doesn't export any API and is meant to be used via zpool API.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zpool.h>
+
+/*****************
+ * Structures
+*****************/
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation.  It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk
+ * in allocated page is occupied by z3fold header, NCHUNKS will be calculated
+ * to 63 which shows the max number of free chunks in z3fold page, also there
+ * will be 63 freelists per pool.
+ */
+#define NCHUNKS_ORDER	6
+
+#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)
+
+#define BUDDY_MASK	((1 << NCHUNKS_ORDER) - 1)
+
+struct z3fold_pool;
+struct z3fold_ops {
+	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+/**
+ * struct z3fold_pool - stores metadata for each z3fold pool
+ * @lock:	protects all pool fields and first|last_chunk fields of any
+ *		z3fold page in the pool
+ * @unbuddied:	array of lists tracking z3fold pages that contain at most 2
+ *		buddies; the list each z3fold page is added to depends on the
+ *		size of its free region.
+ * @buddied:	list tracking the z3fold pages that contain 3 buddies;
+ *		these z3fold pages are full
+ * @lru:	list tracking the z3fold pages in LRU order by most recently
+ *		added buddy.
+ * @pages_nr:	number of z3fold pages in the pool.
+ * @ops:	pointer to a structure of user defined operations specified at
+ *		pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular z3fold pool.
+ */
+struct z3fold_pool {
+	spinlock_t lock;
+	struct list_head unbuddied[NCHUNKS];
+	struct list_head buddied;
+	struct list_head lru;
+	u64 pages_nr;
+	const struct z3fold_ops *ops;
+	struct zpool *zpool;
+	const struct zpool_ops *zpool_ops;
+};
+
+enum buddy {
+	HEADLESS = 0,
+	FIRST,
+	MIDDLE,
+	LAST,
+	BUDDIES_MAX
+};
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying the first chunk of each
+ *			z3fold page, except for HEADLESS pages
+ * @buddy:	links the z3fold page into the relevant list in the pool
+ * @first_chunks:	the size of the first buddy in chunks, 0 if free
+ * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
+ * @last_chunks:	the size of the last buddy in chunks, 0 if free
+ * @first_num:		the starting number (for the first handle)
+ */
+struct z3fold_header {
+	struct list_head buddy;
+	unsigned short first_chunks;
+	unsigned short middle_chunks;
+	unsigned short last_chunks;
+	unsigned short start_middle;
+	unsigned short first_num:NCHUNKS_ORDER;
+};
+
+/*
+ * Internal z3fold page flags
+ */
+enum z3fold_page_flags {
+	UNDER_RECLAIM = 0,
+	PAGE_HEADLESS,
+	MIDDLE_CHUNK_MAPPED,
+};
+
+/*****************
+ * Helpers
+*****************/
+
+/* Converts an allocation size in bytes to size in z3fold chunks */
+static int size_to_chunks(size_t size)
+{
+	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+/* Initializes the z3fold header of a newly allocated z3fold page */
+static struct z3fold_header *init_z3fold_page(struct page *page)
+{
+	struct z3fold_header *zhdr = page_address(page);
+
+	INIT_LIST_HEAD(&page->lru);
+	clear_bit(UNDER_RECLAIM, &page->private);
+	clear_bit(PAGE_HEADLESS, &page->private);
+	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+
+	zhdr->first_chunks = 0;
+	zhdr->middle_chunks = 0;
+	zhdr->last_chunks = 0;
+	zhdr->first_num = 0;
+	zhdr->start_middle = 0;
+	INIT_LIST_HEAD(&zhdr->buddy);
+	return zhdr;
+}
+
+/* Frees the page that backs the given z3fold header */
+static void free_z3fold_page(struct z3fold_header *zhdr)
+{
+	__free_page(virt_to_page(zhdr));
+}
+
+/*
+ * Encodes the handle of a particular buddy within a z3fold page
+ * Pool lock should be held as this function accesses first_num
+ */
+static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
+{
+	unsigned long handle;
+
+	handle = (unsigned long)zhdr;
+	if (bud != HEADLESS)
+		handle += (bud + zhdr->first_num) & BUDDY_MASK;
+	return handle;
+}
+
+/* Returns the z3fold page where a given handle is stored */
+static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+{
+	return (struct z3fold_header *)(handle & PAGE_MASK);
+}
+
+/* Returns buddy number */
+static enum buddy handle_to_buddy(unsigned long handle)
+{
+	struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
+	return (handle - zhdr->first_num) & BUDDY_MASK;
+}
+
+/*
+ * Returns the number of free chunks in a z3fold page.
+ * NB: can't be used with HEADLESS pages.
+ */
+static int num_free_chunks(struct z3fold_header *zhdr)
+{
+	int nfree;
+	/*
+	 * If there is a middle object, pick up the bigger free space
+	 * either before or after it. Otherwise just subtract the number
+	 * of chunks occupied by the first and the last objects.
+	 */
+	if (zhdr->middle_chunks != 0) {
+		int nfree_before = zhdr->first_chunks ?
+			0 : zhdr->start_middle - 1;
+		int nfree_after = zhdr->last_chunks ?
+			0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks;
+		nfree = max(nfree_before, nfree_after);
+	} else
+		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
+	return nfree;
+}
+
+/*****************
+ * API Functions
+*****************/
+/**
+ * z3fold_create_pool() - create a new z3fold pool
+ * @gfp:	gfp flags when allocating the z3fold pool structure
+ * @ops:	user-defined operations for the z3fold pool
+ *
+ * Return: pointer to the new z3fold pool or NULL if the metadata allocation
+ * failed.
+ */
+static struct z3fold_pool *z3fold_create_pool(gfp_t gfp,
+		const struct z3fold_ops *ops)
+{
+	struct z3fold_pool *pool;
+	int i;
+
+	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
+	if (!pool)
+		return NULL;
+	spin_lock_init(&pool->lock);
+	for_each_unbuddied_list(i, 0)
+		INIT_LIST_HEAD(&pool->unbuddied[i]);
+	INIT_LIST_HEAD(&pool->buddied);
+	INIT_LIST_HEAD(&pool->lru);
+	pool->pages_nr = 0;
+	pool->ops = ops;
+	return pool;
+}
+
+/**
+ * z3fold_destroy_pool() - destroys an existing z3fold pool
+ * @pool:	the z3fold pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+static void z3fold_destroy_pool(struct z3fold_pool *pool)
+{
+	kfree(pool);
+}
+
+/* Has to be called with lock held */
+static int z3fold_compact_page(struct z3fold_header *zhdr)
+{
+	struct page *page = virt_to_page(zhdr);
+	void *beg = zhdr;
+
+
+	if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) &&
+	    zhdr->middle_chunks != 0 &&
+	    zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+		memmove(beg + ZHDR_SIZE_ALIGNED,
+			beg + (zhdr->start_middle << CHUNK_SHIFT),
+			zhdr->middle_chunks << CHUNK_SHIFT);
+		zhdr->first_chunks = zhdr->middle_chunks;
+		zhdr->middle_chunks = 0;
+		zhdr->start_middle = 0;
+		zhdr->first_num++;
+		return 1;
+	}
+	return 0;
+}
+
+/**
+ * z3fold_alloc() - allocates a region of a given size
+ * @pool:	z3fold pool from which to allocate
+ * @size:	size in bytes of the desired allocation
+ * @gfp:	gfp flags used if the pool needs to grow
+ * @handle:	handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request.  A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as z3fold pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid, -ENOSPC if size exceeds PAGE_SIZE, or -ENOMEM
+ * if the pool was unable to allocate a new page.
+ */
+static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	int chunks = 0, i, freechunks;
+	struct z3fold_header *zhdr = NULL;
+	enum buddy bud;
+	struct page *page;
+
+	if (!size || (gfp & __GFP_HIGHMEM))
+		return -EINVAL;
+
+	if (size > PAGE_SIZE)
+		return -ENOSPC;
+
+	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
+		bud = HEADLESS;
+	else {
+		chunks = size_to_chunks(size);
+		spin_lock(&pool->lock);
+
+		/* First, try to find an unbuddied z3fold page. */
+		zhdr = NULL;
+		for_each_unbuddied_list(i, chunks) {
+			if (!list_empty(&pool->unbuddied[i])) {
+				zhdr = list_first_entry(&pool->unbuddied[i],
+						struct z3fold_header, buddy);
+				page = virt_to_page(zhdr);
+				if (zhdr->first_chunks == 0) {
+					if (zhdr->middle_chunks != 0 &&
+					    chunks >= zhdr->start_middle)
+						bud = LAST;
+					else
+						bud = FIRST;
+				} else if (zhdr->last_chunks == 0)
+					bud = LAST;
+				else if (zhdr->middle_chunks == 0)
+					bud = MIDDLE;
+				else {
+					pr_err("No free chunks in unbuddied\n");
+					WARN_ON(1);
+					continue;
+				}
+				list_del(&zhdr->buddy);
+				goto found;
+			}
+		}
+		bud = FIRST;
+		spin_unlock(&pool->lock);
+	}
+
+	/* Couldn't find unbuddied z3fold page, create new one */
+	page = alloc_page(gfp);
+	if (!page)
+		return -ENOMEM;
+	spin_lock(&pool->lock);
+	pool->pages_nr++;
+	zhdr = init_z3fold_page(page);
+
+	if (bud == HEADLESS) {
+		set_bit(PAGE_HEADLESS, &page->private);
+		goto headless;
+	}
+
+found:
+	if (bud == FIRST)
+		zhdr->first_chunks = chunks;
+	else if (bud == LAST)
+		zhdr->last_chunks = chunks;
+	else {
+		zhdr->middle_chunks = chunks;
+		zhdr->start_middle = zhdr->first_chunks + 1;
+	}
+
+	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+			zhdr->middle_chunks == 0) {
+		/* Add to unbuddied list */
+		freechunks = num_free_chunks(zhdr);
+		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+	} else {
+		/* Add to buddied list */
+		list_add(&zhdr->buddy, &pool->buddied);
+	}
+
+headless:
+	/* Add/move z3fold page to beginning of LRU */
+	if (!list_empty(&page->lru))
+		list_del(&page->lru);
+
+	list_add(&page->lru, &pool->lru);
+
+	*handle = encode_handle(zhdr, bud);
+	spin_unlock(&pool->lock);
+
+	return 0;
+}
+
+/**
+ * z3fold_free() - frees the allocation associated with the given handle
+ * @pool:	pool in which the allocation resided
+ * @handle:	handle associated with the allocation returned by z3fold_alloc()
+ *
+ * In the case that the z3fold page in which the allocation resides is under
+ * reclaim, as indicated by the UNDER_RECLAIM bit being set in page->private,
+ * this function only sets the first|middle|last_chunks to 0.  The page is
+ * freed once all buddies are evicted (see z3fold_reclaim_page() below).
+ */
+static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
+{
+	struct z3fold_header *zhdr;
+	int freechunks;
+	struct page *page;
+	enum buddy bud;
+
+	spin_lock(&pool->lock);
+	zhdr = handle_to_z3fold_header(handle);
+	page = virt_to_page(zhdr);
+
+	if (test_bit(PAGE_HEADLESS, &page->private)) {
+		/* HEADLESS page stored */
+		bud = HEADLESS;
+	} else {
+		bud = (handle - zhdr->first_num) & BUDDY_MASK;
+
+		switch (bud) {
+		case FIRST:
+			zhdr->first_chunks = 0;
+			break;
+		case MIDDLE:
+			zhdr->middle_chunks = 0;
+			zhdr->start_middle = 0;
+			break;
+		case LAST:
+			zhdr->last_chunks = 0;
+			break;
+		default:
+			pr_err("%s: unknown bud %d\n", __func__, bud);
+			WARN_ON(1);
+			spin_unlock(&pool->lock);
+			return;
+		}
+	}
+
+	if (test_bit(UNDER_RECLAIM, &page->private)) {
+		/* z3fold page is under reclaim, reclaim will free */
+		spin_unlock(&pool->lock);
+		return;
+	}
+
+	if (bud != HEADLESS) {
+		/* Remove from existing buddy list */
+		list_del(&zhdr->buddy);
+	}
+
+	if (bud == HEADLESS ||
+	    (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 &&
+			zhdr->last_chunks == 0)) {
+		/* z3fold page is empty, free */
+		list_del(&page->lru);
+		clear_bit(PAGE_HEADLESS, &page->private);
+		free_z3fold_page(zhdr);
+		pool->pages_nr--;
+	} else {
+		z3fold_compact_page(zhdr);
+		/* Add to the unbuddied list */
+		freechunks = num_free_chunks(zhdr);
+		list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+	}
+
+	spin_unlock(&pool->lock);
+}
+
+/**
+ * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool:	pool from which a page will attempt to be evicted
+ * @retries:	number of pages on the LRU list for which eviction will
+ *		be attempted before failing
+ *
+ * z3fold reclaim is different from normal system reclaim in that it is done
+ * from the bottom, up. This is because only the bottom layer, z3fold, has
+ * information on how the allocations are organized within each z3fold page.
+ * This has the potential to create interesting locking situations between
+ * z3fold and the user, however.
+ *
+ * To avoid these, this is how z3fold_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
+ * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
+ * call the user-defined eviction handler with the pool and handle as
+ * arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
+ * appropriate list and try the next z3fold page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the UNDER_RECLAIM bit being set in page->private.
+ *
+ * If all buddies in the z3fold page are successfully evicted, then the
+ * z3fold page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict, retries is 0 or an eviction handler is not registered,
+ * -EAGAIN if the retry limit was hit.
+ */
+static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
+{
+	int i, ret = 0, freechunks;
+	struct z3fold_header *zhdr;
+	struct page *page;
+	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
+
+	spin_lock(&pool->lock);
+	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+			retries == 0) {
+		spin_unlock(&pool->lock);
+		return -EINVAL;
+	}
+	for (i = 0; i < retries; i++) {
+		page = list_last_entry(&pool->lru, struct page, lru);
+		list_del(&page->lru);
+
+		/* Protect z3fold page against free */
+		set_bit(UNDER_RECLAIM, &page->private);
+		zhdr = page_address(page);
+		if (!test_bit(PAGE_HEADLESS, &page->private)) {
+			list_del(&zhdr->buddy);
+			/*
+			 * We need to encode the handles before unlocking,
+			 * since we can race with free that will set
+			 * (first|middle|last)_chunks to 0
+			 */
+			first_handle = 0;
+			last_handle = 0;
+			middle_handle = 0;
+			if (zhdr->first_chunks)
+				first_handle = encode_handle(zhdr, FIRST);
+			if (zhdr->middle_chunks)
+				middle_handle = encode_handle(zhdr, MIDDLE);
+			if (zhdr->last_chunks)
+				last_handle = encode_handle(zhdr, LAST);
+		} else {
+			first_handle = encode_handle(zhdr, HEADLESS);
+			last_handle = middle_handle = 0;
+		}
+
+		spin_unlock(&pool->lock);
+
+		/* Issue the eviction callback(s) */
+		if (middle_handle) {
+			ret = pool->ops->evict(pool, middle_handle);
+			if (ret)
+				goto next;
+		}
+		if (first_handle) {
+			ret = pool->ops->evict(pool, first_handle);
+			if (ret)
+				goto next;
+		}
+		if (last_handle) {
+			ret = pool->ops->evict(pool, last_handle);
+			if (ret)
+				goto next;
+		}
+next:
+		spin_lock(&pool->lock);
+		clear_bit(UNDER_RECLAIM, &page->private);
+		if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) ||
+		    (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 &&
+		     zhdr->middle_chunks == 0)) {
+			/*
+			 * All buddies are now free, free the z3fold page and
+			 * return success.
+			 */
+			clear_bit(PAGE_HEADLESS, &page->private);
+			free_z3fold_page(zhdr);
+			pool->pages_nr--;
+			spin_unlock(&pool->lock);
+			return 0;
+		} else if (zhdr->first_chunks != 0 &&
+			   zhdr->last_chunks != 0 && zhdr->middle_chunks != 0) {
+			/* Full, add to buddied list */
+			list_add(&zhdr->buddy, &pool->buddied);
+		} else if (!test_bit(PAGE_HEADLESS, &page->private)) {
+			z3fold_compact_page(zhdr);
+			/* add to unbuddied list */
+			freechunks = num_free_chunks(zhdr);
+			list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+		}
+
+		/* add to beginning of LRU */
+		list_add(&page->lru, &pool->lru);
+	}
+	spin_unlock(&pool->lock);
+	return -EAGAIN;
+}
+
+/**
+ * z3fold_map() - maps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be mapped
+ *
+ * Extracts the buddy number from handle and constructs the pointer to the
+ * correct starting chunk within the page.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
+{
+	struct z3fold_header *zhdr;
+	struct page *page;
+	void *addr;
+	enum buddy buddy;
+
+	spin_lock(&pool->lock);
+	zhdr = handle_to_z3fold_header(handle);
+	addr = zhdr;
+	page = virt_to_page(zhdr);
+
+	if (test_bit(PAGE_HEADLESS, &page->private))
+		goto out;
+
+	buddy = handle_to_buddy(handle);
+	switch (buddy) {
+	case FIRST:
+		addr += ZHDR_SIZE_ALIGNED;
+		break;
+	case MIDDLE:
+		addr += zhdr->start_middle << CHUNK_SHIFT;
+		set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+		break;
+	case LAST:
+		addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+		break;
+	default:
+		pr_err("unknown buddy id %d\n", buddy);
+		WARN_ON(1);
+		addr = NULL;
+		break;
+	}
+out:
+	spin_unlock(&pool->lock);
+	return addr;
+}
+
+/**
+ * z3fold_unmap() - unmaps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be unmapped
+ */
+static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
+{
+	struct z3fold_header *zhdr;
+	struct page *page;
+	enum buddy buddy;
+
+	spin_lock(&pool->lock);
+	zhdr = handle_to_z3fold_header(handle);
+	page = virt_to_page(zhdr);
+
+	if (test_bit(PAGE_HEADLESS, &page->private)) {
+		spin_unlock(&pool->lock);
+		return;
+	}
+
+	buddy = handle_to_buddy(handle);
+	if (buddy == MIDDLE)
+		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+	spin_unlock(&pool->lock);
+}
+
+/**
+ * z3fold_get_pool_size() - gets the z3fold pool size in pages
+ * @pool:	pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool.  The pool lock need not be
+ * taken to access pages_nr.
+ */
+static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
+{
+	return pool->pages_nr;
+}
+
+/*****************
+ * zpool
+ ****************/
+
+static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
+{
+	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
+		return pool->zpool_ops->evict(pool->zpool, handle);
+	else
+		return -ENOENT;
+}
+
+static const struct z3fold_ops z3fold_zpool_ops = {
+	.evict =	z3fold_zpool_evict
+};
+
+static void *z3fold_zpool_create(const char *name, gfp_t gfp,
+			       const struct zpool_ops *zpool_ops,
+			       struct zpool *zpool)
+{
+	struct z3fold_pool *pool;
+
+	pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL);
+	if (pool) {
+		pool->zpool = zpool;
+		pool->zpool_ops = zpool_ops;
+	}
+	return pool;
+}
+
+static void z3fold_zpool_destroy(void *pool)
+{
+	z3fold_destroy_pool(pool);
+}
+
+static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
+			unsigned long *handle)
+{
+	return z3fold_alloc(pool, size, gfp, handle);
+}
+static void z3fold_zpool_free(void *pool, unsigned long handle)
+{
+	z3fold_free(pool, handle);
+}
+
+static int z3fold_zpool_shrink(void *pool, unsigned int pages,
+			unsigned int *reclaimed)
+{
+	unsigned int total = 0;
+	int ret = -EINVAL;
+
+	while (total < pages) {
+		ret = z3fold_reclaim_page(pool, 8);
+		if (ret < 0)
+			break;
+		total++;
+	}
+
+	if (reclaimed)
+		*reclaimed = total;
+
+	return ret;
+}
+
+static void *z3fold_zpool_map(void *pool, unsigned long handle,
+			enum zpool_mapmode mm)
+{
+	return z3fold_map(pool, handle);
+}
+static void z3fold_zpool_unmap(void *pool, unsigned long handle)
+{
+	z3fold_unmap(pool, handle);
+}
+
+static u64 z3fold_zpool_total_size(void *pool)
+{
+	return z3fold_get_pool_size(pool) * PAGE_SIZE;
+}
+
+static struct zpool_driver z3fold_zpool_driver = {
+	.type =		"z3fold",
+	.owner =	THIS_MODULE,
+	.create =	z3fold_zpool_create,
+	.destroy =	z3fold_zpool_destroy,
+	.malloc =	z3fold_zpool_malloc,
+	.free =		z3fold_zpool_free,
+	.shrink =	z3fold_zpool_shrink,
+	.map =		z3fold_zpool_map,
+	.unmap =	z3fold_zpool_unmap,
+	.total_size =	z3fold_zpool_total_size,
+};
+
+MODULE_ALIAS("zpool-z3fold");
+
+static int __init init_z3fold(void)
+{
+	/* Make sure the z3fold header will fit in one chunk */
+	BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED);
+	zpool_register_driver(&z3fold_zpool_driver);
+
+	return 0;
+}
+
+static void __exit exit_z3fold(void)
+{
+	zpool_unregister_driver(&z3fold_zpool_driver);
+}
+
+module_init(init_z3fold);
+module_exit(exit_z3fold);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
+MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
-- 
cgit v1.2.3


From cd33a76b0f2805fb0d6a05a2d53933f3817ccc9b Mon Sep 17 00:00:00 2001
From: Richard Leitner <dev@g0hl1n.net>
Date: Fri, 20 May 2016 16:58:33 -0700
Subject: mm/memblock.c: remove unnecessary always-true comparison

Comparing a u64 variable with >= 0 is always true, so the comparison can
be removed.  This issue was detected using the -Wtype-limits gcc flag.

This patch fixes the following type-limits warning:

  mm/memblock.c: In function `__next_reserved_mem_region':
  mm/memblock.c:843:11: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits]
    if (*idx >= 0 && *idx < type->cnt) {

Link: http://lkml.kernel.org/r/20160510103625.3a7f8f32@g0hl1n.net
Signed-off-by: Richard Leitner <dev@g0hl1n.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memblock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memblock.c b/mm/memblock.c
index 3b93daa46fc5..ac1248933b31 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -824,7 +824,7 @@ void __init_memblock __next_reserved_mem_region(u64 *idx,
 {
 	struct memblock_type *type = &memblock.reserved;
 
-	if (*idx >= 0 && *idx < type->cnt) {
+	if (*idx < type->cnt) {
 		struct memblock_region *r = &type->regions[*idx];
 		phys_addr_t base = r->base;
 		phys_addr_t size = r->size;
-- 
cgit v1.2.3


From 4b50bcc7eda4d3cc9e3f2a0aa60e590fedf728c5 Mon Sep 17 00:00:00 2001
From: Stefan Bader <stefan.bader@canonical.com>
Date: Fri, 20 May 2016 16:58:38 -0700
Subject: mm: use phys_addr_t for reserve_bootmem_region() arguments

Since commit 92923ca3aace ("mm: meminit: only set page reserved in the
memblock region") the reserved bit is set on reserved memblock regions.
However start and end address are passed as unsigned long.  This is only
32bit on i386, so it can end up marking the wrong pages reserved for
ranges at 4GB and above.

This was observed on a 32bit Xen dom0 which was booted with initial
memory set to a value below 4G but allowing to balloon in memory
(dom0_mem=1024M for example).  This would define a reserved bootmem
region for the additional memory (for example on a 8GB system there was
a reserved region covering the 4GB-8GB range).  But since the addresses
were passed on as unsigned long, this was actually marking all pages
from 0 to 4GB as reserved.

Fixes: 92923ca3aacef63 ("mm: meminit: only set page reserved in the memblock region")
Link: http://lkml.kernel.org/r/1463491221-10573-1-git-send-email-stefan.bader@canonical.com
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Cc: <stable@vger.kernel.org>	[4.2+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/page_alloc.c    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 65d18a45b8e8..fbdb9d40847f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1763,7 +1763,7 @@ extern void free_highmem_page(struct page *page);
 extern void adjust_managed_page_count(struct page *page, long count);
 extern void mem_init_print_info(const char *str);
 
-extern void reserve_bootmem_region(unsigned long start, unsigned long end);
+extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end);
 
 /* Free the reserved page into the buddy system, so it gets managed. */
 static inline void __free_reserved_page(struct page *page)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3f4b69aaa23a..2dd1ba4e70cc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1205,7 +1205,7 @@ static inline void init_reserved_page(unsigned long pfn)
  * marks the pages PageReserved. The remaining valid pages are later
  * sent to the buddy page allocator.
  */
-void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
+void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 {
 	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long end_pfn = PFN_UP(end);
-- 
cgit v1.2.3


From 5c0a85fad949212b3e059692deecdeed74ae7ec7 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 20 May 2016 16:58:41 -0700
Subject: mm: make faultaround produce old ptes

Currently, faultaround code produces young pte.  This can screw up
vmscan behaviour[1], as it makes vmscan think that these pages are hot
and not push them out on first round.

During sparse file access faultaround gets more pages mapped and all of
them are young.  Under memory pressure, this makes vmscan swap out anon
pages instead, or to drop other page cache pages which otherwise stay
resident.

Modify faultaround to produce old ptes, so they can easily be reclaimed
under memory pressure.

This can to some extent defeat the purpose of faultaround on machines
without a hardware accessed bit, as it will not help us reduce the
number of minor page faults.

We may want to disable faultaround on such machines altogether, but
that's a subject for a separate patchset.

Minchan:
 "I tested 512M mmap sequential word read test on non-HW access bit
  system (i.e., ARM) and confirmed it doesn't increase minor fault any
  more.

  old: 4096 fault_around
  minor fault: 131291
  elapsed time: 6747645 usec

  new: 65536 fault_around
  minor fault: 131291
  elapsed time: 6709263 usec

  0.56% benefit"

[1] https://lkml.kernel.org/r/1460992636-711-1-git-send-email-vinmenon@codeaurora.org

Link: http://lkml.kernel.org/r/1463488366-47723-1-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Tested-by: Minchan Kim <minchan@kernel.org>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vinayak Menon <vinmenon@codeaurora.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/filemap.c       |  2 +-
 mm/memory.c        | 23 ++++++++++++++++++-----
 3 files changed, 20 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fbdb9d40847f..f223ac26b5d9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 }
 
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-		struct page *page, pte_t *pte, bool write, bool anon);
+		struct page *page, pte_t *pte, bool write, bool anon, bool old);
 #endif
 
 /*
diff --git a/mm/filemap.c b/mm/filemap.c
index 8f4859989f1b..b418405903bc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2191,7 +2191,7 @@ repeat:
 		if (file->f_ra.mmap_miss > 0)
 			file->f_ra.mmap_miss--;
 		addr = address + (page->index - vmf->pgoff) * PAGE_SIZE;
-		do_set_pte(vma, addr, page, pte, false, false);
+		do_set_pte(vma, addr, page, pte, false, false, true);
 		unlock_page(page);
 		goto next;
 unlock:
diff --git a/mm/memory.c b/mm/memory.c
index 007c72ad03f6..f29e5ab0342d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2876,7 +2876,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
  * vm_ops->map_pages.
  */
 void do_set_pte(struct vm_area_struct *vma, unsigned long address,
-		struct page *page, pte_t *pte, bool write, bool anon)
+		struct page *page, pte_t *pte, bool write, bool anon, bool old)
 {
 	pte_t entry;
 
@@ -2884,6 +2884,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+	if (old)
+		entry = pte_mkold(entry);
 	if (anon) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, address, false);
@@ -3021,9 +3023,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
 		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-		do_fault_around(vma, address, pte, pgoff, flags);
 		if (!pte_same(*pte, orig_pte))
 			goto unlock_out;
+		do_fault_around(vma, address, pte, pgoff, flags);
+		/* Check if the fault is handled by faultaround */
+		if (!pte_same(*pte, orig_pte)) {
+			/*
+			 * Faultaround produces old ptes, but the pte we've
+			 * handled the fault for should be young.
+			 */
+			pte_t entry = pte_mkyoung(*pte);
+			if (ptep_set_access_flags(vma, address, pte, entry, 0))
+				update_mmu_cache(vma, address, pte);
+			goto unlock_out;
+		}
 		pte_unmap_unlock(pte, ptl);
 	}
 
@@ -3038,7 +3051,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(fault_page);
 		return ret;
 	}
-	do_set_pte(vma, address, fault_page, pte, false, false);
+	do_set_pte(vma, address, fault_page, pte, false, false, false);
 	unlock_page(fault_page);
 unlock_out:
 	pte_unmap_unlock(pte, ptl);
@@ -3090,7 +3103,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		}
 		goto uncharge_out;
 	}
-	do_set_pte(vma, address, new_page, pte, true, true);
+	do_set_pte(vma, address, new_page, pte, true, true, false);
 	mem_cgroup_commit_charge(new_page, memcg, false, false);
 	lru_cache_add_active_or_unevictable(new_page, vma);
 	pte_unmap_unlock(pte, ptl);
@@ -3147,7 +3160,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(fault_page);
 		return ret;
 	}
-	do_set_pte(vma, address, fault_page, pte, true, false);
+	do_set_pte(vma, address, fault_page, pte, true, false, false);
 	pte_unmap_unlock(pte, ptl);
 
 	if (set_page_dirty(fault_page))
-- 
cgit v1.2.3


From d0834a6c2c5b0c76cfb806bd7dba6556d8b4edbb Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 20 May 2016 16:58:44 -0700
Subject: mm: disable fault around on emulated access bit architecture

fault_around aims to reduce minor faults of file-backed pages via
speculative ahead pte mapping and relying on readahead logic.  However,
on non-HW access bit architecture the benefit is highly limited because
they should emulate the young bit with minor faults for reclaim's page
aging algorithm.  IOW, we cannot reduce minor faults on those
architectures.

I did a quick test on my ARM machine.

512M file mmap sequential every word read on eSATA drive 4 times.
stddev is stable.

  = fault_around 4096 =
  elapsed time(usec): 6747645

  = fault_around 65536 =
  elapsed time(usec): 6709263

  0.5% gain.

Even when I tested it with eMMC there is no gain because I guess with
slow storage the major fault is the dominant factor.

Also, fault_around has the side effect of shrinking slab more
aggressively and causes higher vmpressure, so if such speculation fails,
it can evict slab more which can result in page I/O (e.g., inode cache).
In the end, it would make void any benefit of fault_around.

So let's make the default "disabled" on those architectures.

Link: http://lkml.kernel.org/r/20160518014229.GB21538@bbox
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index f29e5ab0342d..a1b93d9e4449 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2899,8 +2899,16 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	update_mmu_cache(vma, address, pte);
 }
 
+/*
+ * If the architecture emulates the "accessed" or "young" bit without HW
+ * support, there is not much gain from fault_around.
+ */
 static unsigned long fault_around_bytes __read_mostly =
+#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
+	PAGE_SIZE;
+#else
 	rounddown_pow_of_two(65536);
+#endif
 
 #ifdef CONFIG_DEBUG_FS
 static int fault_around_bytes_get(void *data, u64 *val)
-- 
cgit v1.2.3


From 29b52de182acf50f85a8284ad39104d84c9bbf57 Mon Sep 17 00:00:00 2001
From: "seokhoon.yoon" <iamyooon@gmail.com>
Date: Fri, 20 May 2016 16:58:47 -0700
Subject: mm, kasan: fix to call kasan_free_pages() after poisoning page

When CONFIG_PAGE_POISONING and CONFIG_KASAN are both enabled,
free_pages_prepare()'s code flow is as follows.

  1)kmemcheck_free_shadow()
  2)kasan_free_pages()
    - marks the shadow bytes of the page as freed
  3)kernel_poison_pages()
  3.1) kasan checks whether the access to the page is valid
    ---> error occurs: kasan thinks it is an invalid access
  3.2) poison page
  4)kernel_map_pages()

So kasan_free_pages() should be called after poisoning the page.

Link: http://lkml.kernel.org/r/1463220405-7455-1-git-send-email-iamyooon@gmail.com
Signed-off-by: seokhoon.yoon <iamyooon@gmail.com>
Cc: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2dd1ba4e70cc..383b14b4f61d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -993,7 +993,6 @@ static __always_inline bool free_pages_prepare(struct page *page,
 
 	trace_mm_page_free(page, order);
 	kmemcheck_free_shadow(page, order);
-	kasan_free_pages(page, order);
 
 	/*
 	 * Check tail pages before head page information is cleared to
@@ -1035,6 +1034,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
 	arch_free_page(page, order);
 	kernel_poison_pages(page, 1 << order, 0);
 	kernel_map_pages(page, 1 << order, 0);
+	kasan_free_pages(page, order);
 
 	return true;
 }
-- 
cgit v1.2.3


From e570f56cccd215db68e50870ee74b7d9c0022109 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Date: Fri, 20 May 2016 16:58:50 -0700
Subject: mm: check_new_page_bad() directly returns in __PG_HWPOISON case

Currently we check page->flags twice for "HWPoisoned" case of
check_new_page_bad(), which can cause a race with unpoisoning.

This race unnecessarily taints kernel with "BUG: Bad page state".
check_new_page_bad() is the only caller of bad_page() which is
interested in __PG_HWPOISON, so let's move the hwpoison related code in
bad_page() to it.

Link: http://lkml.kernel.org/r/20160518100949.GA17299@hori1.linux.bs1.fc.nec.co.jp
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page_alloc.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 383b14b4f61d..f8f3bfc435ee 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -522,12 +522,6 @@ static void bad_page(struct page *page, const char *reason,
 	static unsigned long nr_shown;
 	static unsigned long nr_unshown;
 
-	/* Don't complain about poisoned pages */
-	if (PageHWPoison(page)) {
-		page_mapcount_reset(page); /* remove PageBuddy */
-		return;
-	}
-
 	/*
 	 * Allow a burst of 60 reports, then keep quiet for that minute;
 	 * or allow a steady drip of one report per second.
@@ -1654,6 +1648,9 @@ static void check_new_page_bad(struct page *page)
 	if (unlikely(page->flags & __PG_HWPOISON)) {
 		bad_reason = "HWPoisoned (hardware-corrupted)";
 		bad_flags = __PG_HWPOISON;
+		/* Don't complain about hwpoisoned pages */
+		page_mapcount_reset(page); /* remove PageBuddy */
+		return;
 	}
 	if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
 		bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
-- 
cgit v1.2.3


From a53eaff8c1192bb5bdfda5deb484bc8f415c5dfd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 20 May 2016 16:58:53 -0700
Subject: MM: increase safety margin provided by PF_LESS_THROTTLE

When nfsd is exporting a filesystem over NFS which is then NFS-mounted
on the local machine there is a risk of deadlock.  This happens when
there are lots of dirty pages in the NFS filesystem and they cause NFSD
to be throttled, either in throttle_vm_writeout() or in
balance_dirty_pages().

To avoid this problem the PF_LESS_THROTTLE flag is set for NFSD threads
and it provides a 25% increase to the limits that affect NFSD.  Any
process writing to an NFS filesystem will be throttled well before the
number of dirty NFS pages reaches the limit imposed on NFSD, so NFSD
will not deadlock on pages that it needs to write out.  At least it
shouldn't.

All processes are allowed a small excess margin to avoid performing too
many calculations: ratelimit_pages.

ratelimit_pages is set so that if a thread on every CPU uses the entire
margin, the total will only go 3% over the limit, and this is much less
than the 25% bonus that PF_LESS_THROTTLE provides, so this margin
shouldn't be a problem.  But it is.

The "total memory" that these 3% and 25% are calculated against are not
really total memory but are "global_dirtyable_memory()" which doesn't
include anonymous memory, just free memory and page-cache memory.

The "ratelimit_pages" number is based on whatever the
global_dirtyable_memory was on the last CPU hot-plug, which might not be
what you expect, but is probably close to the total freeable memory.

The throttle threshold uses the global_dirtyable_memory at the moment
when the throttling happens, which could be much less than at the last
CPU hotplug.  So if lots of anonymous memory has been allocated, thus
pushing out lots of page-cache pages, then NFSD might end up being
throttled due to dirty NFS pages because the "25%" bonus it gets is
calculated against a rather small amount of dirtyable memory, while the
"3%" margin that other processes are allowed to dirty without penalty is
calculated against a much larger number.

To remove this possibility of deadlock we need to make sure that the
margin granted to PF_LESS_THROTTLE exceeds that rate-limit margin.
Simply adding ratelimit_pages isn't enough as that should be multiplied
by the number of cpus.

So add "global_wb_domain.dirty_limit / 32" as that more accurately
reflects the current total over-shoot margin.  This ensures that the
number of dirty NFS pages never gets so high that nfsd will be throttled
waiting for them to be written.

Link: http://lkml.kernel.org/r/87futgowwv.fsf@notabene.neil.brown.name
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/page-writeback.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3b88795ab46e..b9956fdee8f5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -411,8 +411,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
 		bg_thresh = thresh / 2;
 	tsk = current;
 	if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
-		bg_thresh += bg_thresh / 4;
-		thresh += thresh / 4;
+		bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
+		thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
 	}
 	dtc->thresh = thresh;
 	dtc->bg_thresh = bg_thresh;
-- 
cgit v1.2.3


From f0508977787b3bcfd54e82111ab50d6245b9f4df Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 20 May 2016 16:58:56 -0700
Subject: mm, thp: khugepaged should scan when sleep value is written

If a large value is written to scan_sleep_millisecs, for example, that
period must lapse before khugepaged will wake up for periodic
collapsing.

If this value is tuned to 1 day, for example, and then re-tuned to its
default 10s, khugepaged will still wait for a day before scanning again.

This patch causes khugepaged to wake up immediately when the value is
changed and then sleep until that value is rewritten or the new value
lapses.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1605181453200.4786@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/huge_memory.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1764184c4774..41ef7547e822 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,6 +89,7 @@ static unsigned int khugepaged_full_scans;
 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
 /* during fragmentation poll the hugepage allocator once every minute */
 static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
+static unsigned long khugepaged_sleep_expire;
 static struct task_struct *khugepaged_thread __read_mostly;
 static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
@@ -467,6 +468,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
 		return -EINVAL;
 
 	khugepaged_scan_sleep_millisecs = msecs;
+	khugepaged_sleep_expire = 0;
 	wake_up_interruptible(&khugepaged_wait);
 
 	return count;
@@ -494,6 +496,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
 		return -EINVAL;
 
 	khugepaged_alloc_sleep_millisecs = msecs;
+	khugepaged_sleep_expire = 0;
 	wake_up_interruptible(&khugepaged_wait);
 
 	return count;
@@ -2791,15 +2794,25 @@ static void khugepaged_do_scan(void)
 		put_page(hpage);
 }
 
+static bool khugepaged_should_wakeup(void)
+{
+	return kthread_should_stop() ||
+	       time_after_eq(jiffies, khugepaged_sleep_expire);
+}
+
 static void khugepaged_wait_work(void)
 {
 	if (khugepaged_has_work()) {
-		if (!khugepaged_scan_sleep_millisecs)
+		const unsigned long scan_sleep_jiffies =
+			msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
+
+		if (!scan_sleep_jiffies)
 			return;
 
+		khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
 		wait_event_freezable_timeout(khugepaged_wait,
-					     kthread_should_stop(),
-			msecs_to_jiffies(khugepaged_scan_sleep_millisecs));
+					     khugepaged_should_wakeup(),
+					     scan_sleep_jiffies);
 		return;
 	}
 
-- 
cgit v1.2.3


From 6cd9dc3e75078ef646076fa63adfb9b85ced0b66 Mon Sep 17 00:00:00 2001
From: Chen Feng <puck.chen@hisilicon.com>
Date: Fri, 20 May 2016 16:59:02 -0700
Subject: mm/compaction.c: fix zoneindex in kcompactd()

While testing kcompactd on my platform, which has 3G of memory in only a
DMA zone, I found that kcompactd never wakes up.  It seems the zone index
has already had 1 subtracted from it, so the loop condition here should
be <=.

This fixes a regression where kswapd could previously compact, but
kcompactd could not.  It is not a crash fix, though.

[akpm@linux-foundation.org: fix kcompactd_do_work() as well, per Hugh]
Link: http://lkml.kernel.org/r/1463659121-84124-1-git-send-email-puck.chen@hisilicon.com
Fixes: accf62422b3a ("mm, kswapd: replace kswapd compaction with waking up kcompactd")
Signed-off-by: Chen Feng <puck.chen@hisilicon.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zhuangluan Su <suzhuangluan@hisilicon.com>
Cc: Yiping Xu <xuyiping@hisilicon.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/compaction.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/compaction.c b/mm/compaction.c
index d8a20fcf8678..1427366ad673 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1862,7 +1862,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
 	struct zone *zone;
 	enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
 
-	for (zoneid = 0; zoneid < classzone_idx; zoneid++) {
+	for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
 		zone = &pgdat->node_zones[zoneid];
 
 		if (!populated_zone(zone))
@@ -1897,7 +1897,7 @@ static void kcompactd_do_work(pg_data_t *pgdat)
 							cc.classzone_idx);
 	count_vm_event(KCOMPACTD_WAKE);
 
-	for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) {
+	for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
 		int status;
 
 		zone = &pgdat->node_zones[zoneid];
-- 
cgit v1.2.3


From dfef2ef4027b1304149a65dc33794eab65e8a3bf Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Fri, 20 May 2016 16:59:05 -0700
Subject: mm, migrate: increment fail count on ENOMEM

If page migration fails due to -ENOMEM, nr_failed should still be
incremented for proper statistics.

This was encountered recently when all page migration vmstats showed 0,
from which it was inferred that migrate_pages() was never called, although
in reality the first page migration failed because compaction_alloc()
failed to find a migration target.

This patch increments nr_failed so the vmstat is properly accounted on
ENOMEM.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1605191510230.32658@chino.kir.corp.google.com
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/migrate.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/migrate.c b/mm/migrate.c
index 53ab6398e7a2..9baf41c877ff 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1171,6 +1171,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
 
 			switch(rc) {
 			case -ENOMEM:
+				nr_failed++;
 				goto out;
 			case -EAGAIN:
 				retry++;
-- 
cgit v1.2.3


From 55834c59098d0c5a97b0f3247e55832b67facdcf Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Fri, 20 May 2016 16:59:11 -0700
Subject: mm: kasan: initial memory quarantine implementation

Quarantine isolates freed objects in a separate queue.  The objects are
returned to the allocator later, which helps to detect use-after-free
errors.

When the object is freed, its state changes from KASAN_STATE_ALLOC to
KASAN_STATE_QUARANTINE.  The object is poisoned and put into quarantine
instead of being returned to the allocator, therefore every subsequent
access to that object triggers a KASAN error, and the error handler is
able to say where the object has been allocated and deallocated.

When it's time for the object to leave quarantine, its state becomes
KASAN_STATE_FREE and it's returned to the allocator.  From now on the
allocator may reuse it for another allocation.  Before that happens,
it's still possible to detect a use-after-free on that object (it
retains the allocation/deallocation stacks).

When the allocator reuses this object, the shadow is unpoisoned and old
allocation/deallocation stacks are wiped.  Therefore a use of this
object, even an incorrect one, won't trigger ASan warning.

Without the quarantine, it's not guaranteed that the objects aren't
reused immediately, that's why the probability of catching a
use-after-free is lower than with quarantine in place.

Quarantine isolates freed objects in a separate queue.  The objects are
returned to the allocator later, which helps to detect use-after-free
errors.

Freed objects are first added to per-cpu quarantine queues.  When a
cache is destroyed or memory shrinking is requested, the objects are
moved into the global quarantine queue.  Whenever a kmalloc call allows
memory reclaiming, the oldest objects are popped out of the global queue
until the total size of objects in quarantine is less than 3/4 of the
maximum quarantine size (which is a fraction of installed physical
memory).

As long as an object remains in the quarantine, KASAN is able to report
accesses to it, so the chance of reporting a use-after-free is
increased.  Once the object leaves quarantine, the allocator may reuse
it, in which case the object is unpoisoned and KASAN can't detect
incorrect accesses to it.

Right now quarantine support is only enabled in SLAB allocator.
Unification of KASAN features in SLAB and SLUB will be done later.

This patch is based on the "mm: kasan: quarantine" patch originally
prepared by Dmitry Chernenkov.  A number of improvements have been
suggested by Andrey Ryabinin.

[glider@google.com: v9]
  Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrey Konovalov <adech.fo@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kasan.h |  13 ++-
 mm/kasan/Makefile     |   1 +
 mm/kasan/kasan.c      |  57 ++++++++--
 mm/kasan/kasan.h      |  21 +++-
 mm/kasan/quarantine.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/kasan/report.c     |   1 +
 mm/mempool.c          |   2 +-
 mm/slab.c             |  12 ++-
 mm/slab.h             |   2 +
 mm/slab_common.c      |   2 +
 10 files changed, 387 insertions(+), 15 deletions(-)
 create mode 100644 mm/kasan/quarantine.c

(limited to 'mm')

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 737371b56044..611927f5870d 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -50,6 +50,8 @@ void kasan_free_pages(struct page *page, unsigned int order);
 
 void kasan_cache_create(struct kmem_cache *cache, size_t *size,
 			unsigned long *flags);
+void kasan_cache_shrink(struct kmem_cache *cache);
+void kasan_cache_destroy(struct kmem_cache *cache);
 
 void kasan_poison_slab(struct page *page);
 void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -63,7 +65,8 @@ void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
 void kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
 
 void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
-void kasan_slab_free(struct kmem_cache *s, void *object);
+bool kasan_slab_free(struct kmem_cache *s, void *object);
+void kasan_poison_slab_free(struct kmem_cache *s, void *object);
 
 struct kasan_cache {
 	int alloc_meta_offset;
@@ -88,6 +91,8 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
 static inline void kasan_cache_create(struct kmem_cache *cache,
 				      size_t *size,
 				      unsigned long *flags) {}
+static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
+static inline void kasan_cache_destroy(struct kmem_cache *cache) {}
 
 static inline void kasan_poison_slab(struct page *page) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
@@ -105,7 +110,11 @@ static inline void kasan_krealloc(const void *object, size_t new_size,
 
 static inline void kasan_slab_alloc(struct kmem_cache *s, void *object,
 				   gfp_t flags) {}
-static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object)
+{
+	return false;
+}
+static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {}
 
 static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
 static inline void kasan_free_shadow(const struct vm_struct *vm) {}
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 131daadf40e4..1548749a3d45 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg
 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
 
 obj-y := kasan.o report.o kasan_init.o
+obj-$(CONFIG_SLAB) += quarantine.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 38f1dd79acdb..8df666bb23be 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -388,6 +388,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size,
 }
 #endif
 
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+	quarantine_remove_cache(cache);
+}
+
+void kasan_cache_destroy(struct kmem_cache *cache)
+{
+	quarantine_remove_cache(cache);
+}
+
 void kasan_poison_slab(struct page *page)
 {
 	kasan_poison_shadow(page_address(page),
@@ -482,7 +492,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
 	kasan_kmalloc(cache, object, cache->object_size, flags);
 }
 
-void kasan_slab_free(struct kmem_cache *cache, void *object)
+void kasan_poison_slab_free(struct kmem_cache *cache, void *object)
 {
 	unsigned long size = cache->object_size;
 	unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
@@ -491,18 +501,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object)
 	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
 		return;
 
+	kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+bool kasan_slab_free(struct kmem_cache *cache, void *object)
+{
 #ifdef CONFIG_SLAB
-	if (cache->flags & SLAB_KASAN) {
-		struct kasan_free_meta *free_info =
-			get_free_info(cache, object);
+	/* RCU slabs could be legally used after free within the RCU period */
+	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+		return false;
+
+	if (likely(cache->flags & SLAB_KASAN)) {
 		struct kasan_alloc_meta *alloc_info =
 			get_alloc_info(cache, object);
-		alloc_info->state = KASAN_STATE_FREE;
-		set_track(&free_info->track, GFP_NOWAIT);
+		struct kasan_free_meta *free_info =
+			get_free_info(cache, object);
+
+		switch (alloc_info->state) {
+		case KASAN_STATE_ALLOC:
+			alloc_info->state = KASAN_STATE_QUARANTINE;
+			quarantine_put(free_info, cache);
+			set_track(&free_info->track, GFP_NOWAIT);
+			kasan_poison_slab_free(cache, object);
+			return true;
+		case KASAN_STATE_QUARANTINE:
+		case KASAN_STATE_FREE:
+			pr_err("Double free");
+			dump_stack();
+			break;
+		default:
+			break;
+		}
 	}
+	return false;
+#else
+	kasan_poison_slab_free(cache, object);
+	return false;
 #endif
-
-	kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
 }
 
 void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
@@ -511,6 +546,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
 	unsigned long redzone_start;
 	unsigned long redzone_end;
 
+	if (flags & __GFP_RECLAIM)
+		quarantine_reduce();
+
 	if (unlikely(object == NULL))
 		return;
 
@@ -541,6 +579,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
 	unsigned long redzone_start;
 	unsigned long redzone_end;
 
+	if (flags & __GFP_RECLAIM)
+		quarantine_reduce();
+
 	if (unlikely(ptr == NULL))
 		return;
 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 30a2f0ba0e09..7f7ac51d7faf 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -62,6 +62,7 @@ struct kasan_global {
 enum kasan_state {
 	KASAN_STATE_INIT,
 	KASAN_STATE_ALLOC,
+	KASAN_STATE_QUARANTINE,
 	KASAN_STATE_FREE
 };
 
@@ -79,9 +80,14 @@ struct kasan_alloc_meta {
 	u32 reserved;
 };
 
+struct qlist_node {
+	struct qlist_node *next;
+};
 struct kasan_free_meta {
-	/* Allocator freelist pointer, unused by KASAN. */
-	void **freelist;
+	/* This field is used while the object is in the quarantine.
+	 * Otherwise it might be used for the allocator freelist.
+	 */
+	struct qlist_node quarantine_link;
 	struct kasan_track track;
 };
 
@@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void)
 void kasan_report(unsigned long addr, size_t size,
 		bool is_write, unsigned long ip);
 
+#ifdef CONFIG_SLAB
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+void quarantine_reduce(void);
+void quarantine_remove_cache(struct kmem_cache *cache);
+#else
+static inline void quarantine_put(struct kasan_free_meta *info,
+				struct kmem_cache *cache) { }
+static inline void quarantine_reduce(void) { }
+static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
+#endif
+
 #endif
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
new file mode 100644
index 000000000000..4973505a9bdd
--- /dev/null
+++ b/mm/kasan/quarantine.c
@@ -0,0 +1,291 @@
+/*
+ * KASAN quarantine.
+ *
+ * Author: Alexander Potapenko <glider@google.com>
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * Based on code by Dmitry Chernenkov.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/hash.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/shrinker.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "../slab.h"
+#include "kasan.h"
+
+/* Data structure and operations for quarantine queues. */
+
+/*
+ * Each queue is a singly-linked list, which also stores the total size of
+ * objects inside of it.
+ */
+struct qlist_head {
+	struct qlist_node *head;
+	struct qlist_node *tail;
+	size_t bytes;
+};
+
+#define QLIST_INIT { NULL, NULL, 0 }
+
+static bool qlist_empty(struct qlist_head *q)
+{
+	return !q->head;
+}
+
+static void qlist_init(struct qlist_head *q)
+{
+	q->head = q->tail = NULL;
+	q->bytes = 0;
+}
+
+static void qlist_put(struct qlist_head *q, struct qlist_node *qlink,
+		size_t size)
+{
+	if (unlikely(qlist_empty(q)))
+		q->head = qlink;
+	else
+		q->tail->next = qlink;
+	q->tail = qlink;
+	qlink->next = NULL;
+	q->bytes += size;
+}
+
+static void qlist_move_all(struct qlist_head *from, struct qlist_head *to)
+{
+	if (unlikely(qlist_empty(from)))
+		return;
+
+	if (qlist_empty(to)) {
+		*to = *from;
+		qlist_init(from);
+		return;
+	}
+
+	to->tail->next = from->head;
+	to->tail = from->tail;
+	to->bytes += from->bytes;
+
+	qlist_init(from);
+}
+
+static void qlist_move(struct qlist_head *from, struct qlist_node *last,
+		struct qlist_head *to, size_t size)
+{
+	if (unlikely(last == from->tail)) {
+		qlist_move_all(from, to);
+		return;
+	}
+	if (qlist_empty(to))
+		to->head = from->head;
+	else
+		to->tail->next = from->head;
+	to->tail = last;
+	from->head = last->next;
+	last->next = NULL;
+	from->bytes -= size;
+	to->bytes += size;
+}
+
+
+/*
+ * The object quarantine consists of per-cpu queues and a global queue,
+ * guarded by quarantine_lock.
+ */
+static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine);
+
+static struct qlist_head global_quarantine;
+static DEFINE_SPINLOCK(quarantine_lock);
+
+/* Maximum size of the global queue. */
+static unsigned long quarantine_size;
+
+/*
+ * The fraction of physical memory the quarantine is allowed to occupy.
+ * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep
+ * the ratio low to avoid OOM.
+ */
+#define QUARANTINE_FRACTION 32
+
+#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4)
+#define QUARANTINE_PERCPU_SIZE (1 << 20)
+
+static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
+{
+	return virt_to_head_page(qlink)->slab_cache;
+}
+
+static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+	struct kasan_free_meta *free_info =
+		container_of(qlink, struct kasan_free_meta,
+			     quarantine_link);
+
+	return ((void *)free_info) - cache->kasan_info.free_meta_offset;
+}
+
+static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
+{
+	void *object = qlink_to_object(qlink, cache);
+	struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	alloc_info->state = KASAN_STATE_FREE;
+	___cache_free(cache, object, _THIS_IP_);
+	local_irq_restore(flags);
+}
+
+static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
+{
+	struct qlist_node *qlink;
+
+	if (unlikely(qlist_empty(q)))
+		return;
+
+	qlink = q->head;
+	while (qlink) {
+		struct kmem_cache *obj_cache =
+			cache ? cache :	qlink_to_cache(qlink);
+		struct qlist_node *next = qlink->next;
+
+		qlink_free(qlink, obj_cache);
+		qlink = next;
+	}
+	qlist_init(q);
+}
+
+void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+{
+	unsigned long flags;
+	struct qlist_head *q;
+	struct qlist_head temp = QLIST_INIT;
+
+	local_irq_save(flags);
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	qlist_put(q, &info->quarantine_link, cache->size);
+	if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE))
+		qlist_move_all(q, &temp);
+
+	local_irq_restore(flags);
+
+	if (unlikely(!qlist_empty(&temp))) {
+		spin_lock_irqsave(&quarantine_lock, flags);
+		qlist_move_all(&temp, &global_quarantine);
+		spin_unlock_irqrestore(&quarantine_lock, flags);
+	}
+}
+
+void quarantine_reduce(void)
+{
+	size_t new_quarantine_size;
+	unsigned long flags;
+	struct qlist_head to_free = QLIST_INIT;
+	size_t size_to_free = 0;
+	struct qlist_node *last;
+
+	if (likely(READ_ONCE(global_quarantine.bytes) <=
+		   READ_ONCE(quarantine_size)))
+		return;
+
+	spin_lock_irqsave(&quarantine_lock, flags);
+
+	/*
+	 * Update quarantine size in case of hotplug. Allocate a fraction of
+	 * the installed memory to quarantine minus per-cpu queue limits.
+	 */
+	new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+		QUARANTINE_FRACTION;
+	new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus();
+	WRITE_ONCE(quarantine_size, new_quarantine_size);
+
+	last = global_quarantine.head;
+	while (last) {
+		struct kmem_cache *cache = qlink_to_cache(last);
+
+		size_to_free += cache->size;
+		if (!last->next || size_to_free >
+		    global_quarantine.bytes - QUARANTINE_LOW_SIZE)
+			break;
+		last = last->next;
+	}
+	qlist_move(&global_quarantine, last, &to_free, size_to_free);
+
+	spin_unlock_irqrestore(&quarantine_lock, flags);
+
+	qlist_free_all(&to_free, NULL);
+}
+
+/*
+ * Move every object belonging to @cache from @from to @to.  Objects of
+ * other caches stay on @from in their original order.  @from is emptied
+ * and rebuilt during the walk, which keeps its head, tail and byte count
+ * consistent however many nodes are removed.
+ */
+static void qlist_move_cache(struct qlist_head *from,
+				   struct qlist_head *to,
+				   struct kmem_cache *cache)
+{
+	struct qlist_node *curr;
+
+	if (unlikely(qlist_empty(from)))
+		return;
+
+	curr = from->head;
+	qlist_init(from);
+	while (curr) {
+		/* qlist_put() resets ->next, so save the successor first. */
+		struct qlist_node *next = curr->next;
+		struct kmem_cache *obj_cache = qlink_to_cache(curr);
+
+		if (obj_cache == cache)
+			qlist_put(to, curr, obj_cache->size);
+		else
+			qlist_put(from, curr, obj_cache->size);
+
+		curr = next;
+	}
+}
+
+static void per_cpu_remove_cache(void *arg)
+{
+	struct kmem_cache *cache = arg;
+	struct qlist_head to_free = QLIST_INIT;
+	struct qlist_head *q;
+
+	q = this_cpu_ptr(&cpu_quarantine);
+	qlist_move_cache(q, &to_free, cache);
+	qlist_free_all(&to_free, cache);
+}
+
+void quarantine_remove_cache(struct kmem_cache *cache)
+{
+	unsigned long flags;
+	struct qlist_head to_free = QLIST_INIT;
+
+	on_each_cpu(per_cpu_remove_cache, cache, 1);
+
+	spin_lock_irqsave(&quarantine_lock, flags);
+	qlist_move_cache(&global_quarantine, &to_free, cache);
+	spin_unlock_irqrestore(&quarantine_lock, flags);
+
+	qlist_free_all(&to_free, cache);
+}
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 60869a5a0124..b3c122ddd454 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page,
 		print_track(&alloc_info->track);
 		break;
 	case KASAN_STATE_FREE:
+	case KASAN_STATE_QUARANTINE:
 		pr_err("Object freed, allocated with size %u bytes\n",
 		       alloc_info->alloc_size);
 		free_info = get_free_info(cache, object);
diff --git a/mm/mempool.c b/mm/mempool.c
index 9b7a14a791cc..9e075f829d0d 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element)
 static void kasan_poison_element(mempool_t *pool, void *element)
 {
 	if (pool->alloc == mempool_alloc_slab)
-		kasan_slab_free(pool->pool_data, element);
+		kasan_poison_slab_free(pool->pool_data, element);
 	if (pool->alloc == mempool_kmalloc)
 		kasan_kfree(element);
 	if (pool->alloc == mempool_alloc_pages)
diff --git a/mm/slab.c b/mm/slab.c
index c11bf5007952..28864c022430 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3547,9 +3547,17 @@ free_done:
 static inline void __cache_free(struct kmem_cache *cachep, void *objp,
 				unsigned long caller)
 {
-	struct array_cache *ac = cpu_cache_get(cachep);
+	/* Put the object into the quarantine, don't touch it for now. */
+	if (kasan_slab_free(cachep, objp))
+		return;
+
+	___cache_free(cachep, objp, caller);
+}
 
-	kasan_slab_free(cachep, objp);
+void ___cache_free(struct kmem_cache *cachep, void *objp,
+		unsigned long caller)
+{
+	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
 	kmemleak_free_recursive(objp, cachep->flags);
diff --git a/mm/slab.h b/mm/slab.h
index 5969769fbee6..dedb1a920fb8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos);
 void slab_stop(struct seq_file *m, void *p);
 int memcg_slab_show(struct seq_file *m, void *p);
 
+void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr);
+
 #endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3239bfd758e6..a65dad7fdcd1 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
 	get_online_cpus();
 	get_online_mems();
 
+	kasan_cache_destroy(s);
 	mutex_lock(&slab_mutex);
 
 	s->refcount--;
@@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 
 	get_online_cpus();
 	get_online_mems();
+	kasan_cache_shrink(cachep);
 	ret = __kmem_cache_shrink(cachep, false);
 	put_online_mems();
 	put_online_cpus();
-- 
cgit v1.2.3


From 4ebb31a42ffa03912447fe1aabbdb28242f909ba Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Fri, 20 May 2016 16:59:14 -0700
Subject: mm, kasan: don't call kasan_krealloc() from ksize().

Instead of calling kasan_krealloc(), which replaces the memory
allocation stack ID (if stack depot is used), just unpoison the whole
memory chunk.

Signed-off-by: Alexander Potapenko <glider@google.com>
Acked-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Andrey Konovalov <adech.fo@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Konstantin Serebryany <kcc@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/slab.c | 2 +-
 mm/slub.c | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 28864c022430..cc8bbc1e6bc9 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4501,7 +4501,7 @@ size_t ksize(const void *objp)
 	/* We assume that ksize callers could use the whole allocated area,
 	 * so we need to unpoison this area.
 	 */
-	kasan_krealloc(objp, size, GFP_NOWAIT);
+	kasan_unpoison_shadow(objp, size);
 
 	return size;
 }
diff --git a/mm/slub.c b/mm/slub.c
index cf1faa4d3992..825ff4505336 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3635,8 +3635,9 @@ size_t ksize(const void *object)
 {
 	size_t size = __ksize(object);
 	/* We assume that ksize callers could use whole allocated area,
-	   so we need unpoison this area. */
-	kasan_krealloc(object, size, GFP_NOWAIT);
+	 * so we need to unpoison this area.
+	 */
+	kasan_unpoison_shadow(object, size);
 	return size;
 }
 EXPORT_SYMBOL(ksize);
-- 
cgit v1.2.3


From 936bb4bbbb832f81055328b84e5afe1fc7246a8d Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 20 May 2016 16:59:20 -0700
Subject: mm/kasan: print name of mem[set,cpy,move]() caller in report

When a bogus memory access happens in mem[set,cpy,move](), it's usually
the caller's fault.  So don't blame mem[set,cpy,move]() in the bug report;
blame the caller instead.

Before:
  BUG: KASAN: out-of-bounds access in memset+0x23/0x40 at <address>
After:
  BUG: KASAN: out-of-bounds access in <memset_caller> at <address>

Link: http://lkml.kernel.org/r/1462538722-1574-2-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/kasan.c | 64 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 30 deletions(-)

(limited to 'mm')

diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 8df666bb23be..e5beb40d97b1 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -273,32 +273,36 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
 	return memory_is_poisoned_n(addr, size);
 }
 
-
-static __always_inline void check_memory_region(unsigned long addr,
-						size_t size, bool write)
+static __always_inline void check_memory_region_inline(unsigned long addr,
+						size_t size, bool write,
+						unsigned long ret_ip)
 {
 	if (unlikely(size == 0))
 		return;
 
 	if (unlikely((void *)addr <
 		kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
-		kasan_report(addr, size, write, _RET_IP_);
+		kasan_report(addr, size, write, ret_ip);
 		return;
 	}
 
 	if (likely(!memory_is_poisoned(addr, size)))
 		return;
 
-	kasan_report(addr, size, write, _RET_IP_);
+	kasan_report(addr, size, write, ret_ip);
 }
 
-void __asan_loadN(unsigned long addr, size_t size);
-void __asan_storeN(unsigned long addr, size_t size);
+static void check_memory_region(unsigned long addr,
+				size_t size, bool write,
+				unsigned long ret_ip)
+{
+	check_memory_region_inline(addr, size, write, ret_ip);
+}
 
 #undef memset
 void *memset(void *addr, int c, size_t len)
 {
-	__asan_storeN((unsigned long)addr, len);
+	check_memory_region((unsigned long)addr, len, true, _RET_IP_);
 
 	return __memset(addr, c, len);
 }
@@ -306,8 +310,8 @@ void *memset(void *addr, int c, size_t len)
 #undef memmove
 void *memmove(void *dest, const void *src, size_t len)
 {
-	__asan_loadN((unsigned long)src, len);
-	__asan_storeN((unsigned long)dest, len);
+	check_memory_region((unsigned long)src, len, false, _RET_IP_);
+	check_memory_region((unsigned long)dest, len, true, _RET_IP_);
 
 	return __memmove(dest, src, len);
 }
@@ -315,8 +319,8 @@ void *memmove(void *dest, const void *src, size_t len)
 #undef memcpy
 void *memcpy(void *dest, const void *src, size_t len)
 {
-	__asan_loadN((unsigned long)src, len);
-	__asan_storeN((unsigned long)dest, len);
+	check_memory_region((unsigned long)src, len, false, _RET_IP_);
+	check_memory_region((unsigned long)dest, len, true, _RET_IP_);
 
 	return __memcpy(dest, src, len);
 }
@@ -690,22 +694,22 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size)
 }
 EXPORT_SYMBOL(__asan_unregister_globals);
 
-#define DEFINE_ASAN_LOAD_STORE(size)				\
-	void __asan_load##size(unsigned long addr)		\
-	{							\
-		check_memory_region(addr, size, false);		\
-	}							\
-	EXPORT_SYMBOL(__asan_load##size);			\
-	__alias(__asan_load##size)				\
-	void __asan_load##size##_noabort(unsigned long);	\
-	EXPORT_SYMBOL(__asan_load##size##_noabort);		\
-	void __asan_store##size(unsigned long addr)		\
-	{							\
-		check_memory_region(addr, size, true);		\
-	}							\
-	EXPORT_SYMBOL(__asan_store##size);			\
-	__alias(__asan_store##size)				\
-	void __asan_store##size##_noabort(unsigned long);	\
+#define DEFINE_ASAN_LOAD_STORE(size)					\
+	void __asan_load##size(unsigned long addr)			\
+	{								\
+		check_memory_region_inline(addr, size, false, _RET_IP_);\
+	}								\
+	EXPORT_SYMBOL(__asan_load##size);				\
+	__alias(__asan_load##size)					\
+	void __asan_load##size##_noabort(unsigned long);		\
+	EXPORT_SYMBOL(__asan_load##size##_noabort);			\
+	void __asan_store##size(unsigned long addr)			\
+	{								\
+		check_memory_region_inline(addr, size, true, _RET_IP_);	\
+	}								\
+	EXPORT_SYMBOL(__asan_store##size);				\
+	__alias(__asan_store##size)					\
+	void __asan_store##size##_noabort(unsigned long);		\
 	EXPORT_SYMBOL(__asan_store##size##_noabort)
 
 DEFINE_ASAN_LOAD_STORE(1);
@@ -716,7 +720,7 @@ DEFINE_ASAN_LOAD_STORE(16);
 
 void __asan_loadN(unsigned long addr, size_t size)
 {
-	check_memory_region(addr, size, false);
+	check_memory_region(addr, size, false, _RET_IP_);
 }
 EXPORT_SYMBOL(__asan_loadN);
 
@@ -726,7 +730,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort);
 
 void __asan_storeN(unsigned long addr, size_t size)
 {
-	check_memory_region(addr, size, true);
+	check_memory_region(addr, size, true, _RET_IP_);
 }
 EXPORT_SYMBOL(__asan_storeN);
 
-- 
cgit v1.2.3


From 64f8ebaf115bcddc4aaa902f981c57ba6506bc42 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Fri, 20 May 2016 16:59:28 -0700
Subject: mm/kasan: add API to check memory regions

Memory accesses coded in assembly won't be seen by KASAN, as the compiler
can instrument only C code.  Add the kasan_check_[read,write]() API, which
is going to be used to check a given memory range.

Link: http://lkml.kernel.org/r/1462538722-1574-3-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Acked-by: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS                  |  2 +-
 include/linux/kasan-checks.h | 12 ++++++++++++
 mm/kasan/kasan.c             | 12 ++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kasan-checks.h

(limited to 'mm')

diff --git a/MAINTAINERS b/MAINTAINERS
index 374ffa2d81b7..8b92445561b6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6242,7 +6242,7 @@ S:	Maintained
 F:	arch/*/include/asm/kasan.h
 F:	arch/*/mm/kasan_init*
 F:	Documentation/kasan.txt
-F:	include/linux/kasan.h
+F:	include/linux/kasan*.h
 F:	lib/test_kasan.c
 F:	mm/kasan/
 F:	scripts/Makefile.kasan
diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h
new file mode 100644
index 000000000000..b7f8aced7870
--- /dev/null
+++ b/include/linux/kasan-checks.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_KASAN_CHECKS_H
+#define _LINUX_KASAN_CHECKS_H
+
+#ifdef CONFIG_KASAN
+void kasan_check_read(const void *p, unsigned int size);
+void kasan_check_write(const void *p, unsigned int size);
+#else
+static inline void kasan_check_read(const void *p, unsigned int size) { }
+static inline void kasan_check_write(const void *p, unsigned int size) { }
+#endif
+
+#endif
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index e5beb40d97b1..18b6a2b8d183 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -299,6 +299,18 @@ static void check_memory_region(unsigned long addr,
 	check_memory_region_inline(addr, size, write, ret_ip);
 }
 
+void kasan_check_read(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_read);
+
+void kasan_check_write(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
+
 #undef memset
 void *memset(void *addr, int c, size_t len)
 {
-- 
cgit v1.2.3


From a42094676f076534bf4998625456fe0bb99c1f1e Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 20 May 2016 16:59:36 -0700
Subject: zsmalloc: use first_page rather than page

Clean up function parameter "struct page".  Many functions of zsmalloc
expect that the page parameter is "first_page", so use "first_page" rather
than "page" for code readability.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 62 ++++++++++++++++++++++++++++++-----------------------------
 1 file changed, 32 insertions(+), 30 deletions(-)

(limited to 'mm')

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index fe47fbba995a..c3e55a4bcc78 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -413,26 +413,28 @@ static int is_last_page(struct page *page)
 	return PagePrivate2(page);
 }
 
-static void get_zspage_mapping(struct page *page, unsigned int *class_idx,
+static void get_zspage_mapping(struct page *first_page,
+				unsigned int *class_idx,
 				enum fullness_group *fullness)
 {
 	unsigned long m;
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
-	m = (unsigned long)page->mapping;
+	m = (unsigned long)first_page->mapping;
 	*fullness = m & FULLNESS_MASK;
 	*class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK;
 }
 
-static void set_zspage_mapping(struct page *page, unsigned int class_idx,
+static void set_zspage_mapping(struct page *first_page,
+				unsigned int class_idx,
 				enum fullness_group fullness)
 {
 	unsigned long m;
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
 	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
 			(fullness & FULLNESS_MASK);
-	page->mapping = (struct address_space *)m;
+	first_page->mapping = (struct address_space *)m;
 }
 
 /*
@@ -625,14 +627,14 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
  * the pool (not yet implemented). This function returns fullness
  * status of the given page.
  */
-static enum fullness_group get_fullness_group(struct page *page)
+static enum fullness_group get_fullness_group(struct page *first_page)
 {
 	int inuse, max_objects;
 	enum fullness_group fg;
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
-	inuse = page->inuse;
-	max_objects = page->objects;
+	inuse = first_page->inuse;
+	max_objects = first_page->objects;
 
 	if (inuse == 0)
 		fg = ZS_EMPTY;
@@ -652,12 +654,12 @@ static enum fullness_group get_fullness_group(struct page *page)
  * have. This functions inserts the given zspage into the freelist
  * identified by <class, fullness_group>.
  */
-static void insert_zspage(struct page *page, struct size_class *class,
+static void insert_zspage(struct page *first_page, struct size_class *class,
 				enum fullness_group fullness)
 {
 	struct page **head;
 
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
 	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
 		return;
@@ -667,7 +669,7 @@ static void insert_zspage(struct page *page, struct size_class *class,
 
 	head = &class->fullness_list[fullness];
 	if (!*head) {
-		*head = page;
+		*head = first_page;
 		return;
 	}
 
@@ -675,21 +677,21 @@ static void insert_zspage(struct page *page, struct size_class *class,
 	 * We want to see more ZS_FULL pages and less almost
 	 * empty/full. Put pages with higher ->inuse first.
 	 */
-	list_add_tail(&page->lru, &(*head)->lru);
-	if (page->inuse >= (*head)->inuse)
-		*head = page;
+	list_add_tail(&first_page->lru, &(*head)->lru);
+	if (first_page->inuse >= (*head)->inuse)
+		*head = first_page;
 }
 
 /*
  * This function removes the given zspage from the freelist identified
  * by <class, fullness_group>.
  */
-static void remove_zspage(struct page *page, struct size_class *class,
+static void remove_zspage(struct page *first_page, struct size_class *class,
 				enum fullness_group fullness)
 {
 	struct page **head;
 
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
 	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
 		return;
@@ -698,11 +700,11 @@ static void remove_zspage(struct page *page, struct size_class *class,
 	BUG_ON(!*head);
 	if (list_empty(&(*head)->lru))
 		*head = NULL;
-	else if (*head == page)
+	else if (*head == first_page)
 		*head = (struct page *)list_entry((*head)->lru.next,
 					struct page, lru);
 
-	list_del_init(&page->lru);
+	list_del_init(&first_page->lru);
 	zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ?
 			CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1);
 }
@@ -717,21 +719,21 @@ static void remove_zspage(struct page *page, struct size_class *class,
  * fullness group.
  */
 static enum fullness_group fix_fullness_group(struct size_class *class,
-						struct page *page)
+						struct page *first_page)
 {
 	int class_idx;
 	enum fullness_group currfg, newfg;
 
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
-	get_zspage_mapping(page, &class_idx, &currfg);
-	newfg = get_fullness_group(page);
+	get_zspage_mapping(first_page, &class_idx, &currfg);
+	newfg = get_fullness_group(first_page);
 	if (newfg == currfg)
 		goto out;
 
-	remove_zspage(page, class, currfg);
-	insert_zspage(page, class, newfg);
-	set_zspage_mapping(page, class_idx, newfg);
+	remove_zspage(first_page, class, currfg);
+	insert_zspage(first_page, class, newfg);
+	set_zspage_mapping(first_page, class_idx, newfg);
 
 out:
 	return newfg;
@@ -1234,11 +1236,11 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 	return true;
 }
 
-static bool zspage_full(struct page *page)
+static bool zspage_full(struct page *first_page)
 {
-	BUG_ON(!is_first_page(page));
+	BUG_ON(!is_first_page(first_page));
 
-	return page->inuse == page->objects;
+	return first_page->inuse == first_page->objects;
 }
 
 unsigned long zs_get_total_pages(struct zs_pool *pool)
-- 
cgit v1.2.3


From 830e4bc5baa9fda5d45257e9a3dbb3555c6c180e Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 20 May 2016 16:59:39 -0700
Subject: zsmalloc: clean up many BUG_ON

There are many BUG_ONs in zsmalloc.c, which is not recommended, so change
them to alternatives.

Normal rule is as follows:

1. avoid BUG_ON if possible. Instead, use VM_BUG_ON or VM_BUG_ON_PAGE

2. use VM_BUG_ON_PAGE if we need to see struct page's fields

3. use those assertions in primitive functions so higher functions can
   rely on the assertion in the primitive function.

4. Don't use an assertion if the following instruction can trigger an Oops

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 42 +++++++++++++++---------------------------
 1 file changed, 15 insertions(+), 27 deletions(-)

(limited to 'mm')

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c3e55a4bcc78..dfe684cc3a03 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -418,7 +418,7 @@ static void get_zspage_mapping(struct page *first_page,
 				enum fullness_group *fullness)
 {
 	unsigned long m;
-	BUG_ON(!is_first_page(first_page));
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
 	m = (unsigned long)first_page->mapping;
 	*fullness = m & FULLNESS_MASK;
@@ -430,7 +430,7 @@ static void set_zspage_mapping(struct page *first_page,
 				enum fullness_group fullness)
 {
 	unsigned long m;
-	BUG_ON(!is_first_page(first_page));
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
 	m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) |
 			(fullness & FULLNESS_MASK);
@@ -631,7 +631,8 @@ static enum fullness_group get_fullness_group(struct page *first_page)
 {
 	int inuse, max_objects;
 	enum fullness_group fg;
-	BUG_ON(!is_first_page(first_page));
+
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
 	inuse = first_page->inuse;
 	max_objects = first_page->objects;
@@ -659,7 +660,7 @@ static void insert_zspage(struct page *first_page, struct size_class *class,
 {
 	struct page **head;
 
-	BUG_ON(!is_first_page(first_page));
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
 	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
 		return;
@@ -691,13 +692,13 @@ static void remove_zspage(struct page *first_page, struct size_class *class,
 {
 	struct page **head;
 
-	BUG_ON(!is_first_page(first_page));
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
 	if (fullness >= _ZS_NR_FULLNESS_GROUPS)
 		return;
 
 	head = &class->fullness_list[fullness];
-	BUG_ON(!*head);
+	VM_BUG_ON_PAGE(!*head, first_page);
 	if (list_empty(&(*head)->lru))
 		*head = NULL;
 	else if (*head == first_page)
@@ -724,8 +725,6 @@ static enum fullness_group fix_fullness_group(struct size_class *class,
 	int class_idx;
 	enum fullness_group currfg, newfg;
 
-	BUG_ON(!is_first_page(first_page));
-
 	get_zspage_mapping(first_page, &class_idx, &currfg);
 	newfg = get_fullness_group(first_page);
 	if (newfg == currfg)
@@ -811,7 +810,7 @@ static void *location_to_obj(struct page *page, unsigned long obj_idx)
 	unsigned long obj;
 
 	if (!page) {
-		BUG_ON(obj_idx);
+		VM_BUG_ON(obj_idx);
 		return NULL;
 	}
 
@@ -844,7 +843,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page,
 			void *obj)
 {
 	if (class->huge) {
-		VM_BUG_ON(!is_first_page(page));
+		VM_BUG_ON_PAGE(!is_first_page(page), page);
 		return page_private(page);
 	} else
 		return *(unsigned long *)obj;
@@ -894,8 +893,8 @@ static void free_zspage(struct page *first_page)
 {
 	struct page *nextp, *tmp, *head_extra;
 
-	BUG_ON(!is_first_page(first_page));
-	BUG_ON(first_page->inuse);
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+	VM_BUG_ON_PAGE(first_page->inuse, first_page);
 
 	head_extra = (struct page *)page_private(first_page);
 
@@ -921,7 +920,8 @@ static void init_zspage(struct page *first_page, struct size_class *class)
 	unsigned long off = 0;
 	struct page *page = first_page;
 
-	BUG_ON(!is_first_page(first_page));
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
+
 	while (page) {
 		struct page *next_page;
 		struct link_free *link;
@@ -1238,7 +1238,7 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
 
 static bool zspage_full(struct page *first_page)
 {
-	BUG_ON(!is_first_page(first_page));
+	VM_BUG_ON_PAGE(!is_first_page(first_page), first_page);
 
 	return first_page->inuse == first_page->objects;
 }
@@ -1276,14 +1276,12 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	struct page *pages[2];
 	void *ret;
 
-	BUG_ON(!handle);
-
 	/*
 	 * Because we use per-cpu mapping areas shared among the
 	 * pools/users, we can't allow mapping in interrupt context
 	 * because it can corrupt another users mappings.
 	 */
-	BUG_ON(in_interrupt());
+	WARN_ON_ONCE(in_interrupt());
 
 	/* From now on, migration cannot move the object */
 	pin_tag(handle);
@@ -1327,8 +1325,6 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 	struct size_class *class;
 	struct mapping_area *area;
 
-	BUG_ON(!handle);
-
 	obj = handle_to_obj(handle);
 	obj_to_location(obj, &page, &obj_idx);
 	get_zspage_mapping(get_first_page(page), &class_idx, &fg);
@@ -1448,8 +1444,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class,
 	unsigned long f_objidx, f_offset;
 	void *vaddr;
 
-	BUG_ON(!obj);
-
 	obj &= ~OBJ_ALLOCATED_TAG;
 	obj_to_location(obj, &f_page, &f_objidx);
 	first_page = get_first_page(f_page);
@@ -1549,7 +1543,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
 			kunmap_atomic(d_addr);
 			kunmap_atomic(s_addr);
 			s_page = get_next_page(s_page);
-			BUG_ON(!s_page);
 			s_addr = kmap_atomic(s_page);
 			d_addr = kmap_atomic(d_page);
 			s_size = class->size - written;
@@ -1559,7 +1552,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
 		if (d_off >= PAGE_SIZE) {
 			kunmap_atomic(d_addr);
 			d_page = get_next_page(d_page);
-			BUG_ON(!d_page);
 			d_addr = kmap_atomic(d_page);
 			d_size = class->size - written;
 			d_off = 0;
@@ -1694,8 +1686,6 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
 {
 	enum fullness_group fullness;
 
-	BUG_ON(!is_first_page(first_page));
-
 	fullness = get_fullness_group(first_page);
 	insert_zspage(first_page, class, fullness);
 	set_zspage_mapping(first_page, class->index, fullness);
@@ -1759,8 +1749,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class)
 	spin_lock(&class->lock);
 	while ((src_page = isolate_source_page(class))) {
 
-		BUG_ON(!is_first_page(src_page));
-
 		if (!zs_can_compact(class))
 			break;
 
-- 
cgit v1.2.3


From 251cbb951b831acd8451d75b40696834f07c29c5 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 20 May 2016 16:59:42 -0700
Subject: zsmalloc: reorder function parameters

Clean up function parameter ordering so that the higher-level data
structure comes first.

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 50 ++++++++++++++++++++++++++------------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

(limited to 'mm')

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index dfe684cc3a03..18535abfad40 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -569,7 +569,7 @@ static const struct file_operations zs_stat_size_ops = {
 	.release        = single_release,
 };
 
-static int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static int zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
 	struct dentry *entry;
 
@@ -609,7 +609,7 @@ static void __exit zs_stat_exit(void)
 {
 }
 
-static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool)
+static inline int zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
 	return 0;
 }
@@ -655,8 +655,9 @@ static enum fullness_group get_fullness_group(struct page *first_page)
  * have. This functions inserts the given zspage into the freelist
  * identified by <class, fullness_group>.
  */
-static void insert_zspage(struct page *first_page, struct size_class *class,
-				enum fullness_group fullness)
+static void insert_zspage(struct size_class *class,
+				enum fullness_group fullness,
+				struct page *first_page)
 {
 	struct page **head;
 
@@ -687,8 +688,9 @@ static void insert_zspage(struct page *first_page, struct size_class *class,
  * This function removes the given zspage from the freelist identified
  * by <class, fullness_group>.
  */
-static void remove_zspage(struct page *first_page, struct size_class *class,
-				enum fullness_group fullness)
+static void remove_zspage(struct size_class *class,
+				enum fullness_group fullness,
+				struct page *first_page)
 {
 	struct page **head;
 
@@ -730,8 +732,8 @@ static enum fullness_group fix_fullness_group(struct size_class *class,
 	if (newfg == currfg)
 		goto out;
 
-	remove_zspage(first_page, class, currfg);
-	insert_zspage(first_page, class, newfg);
+	remove_zspage(class, currfg, first_page);
+	insert_zspage(class, newfg, first_page);
 	set_zspage_mapping(first_page, class_idx, newfg);
 
 out:
@@ -915,7 +917,7 @@ static void free_zspage(struct page *first_page)
 }
 
 /* Initialize a newly allocated zspage */
-static void init_zspage(struct page *first_page, struct size_class *class)
+static void init_zspage(struct size_class *class, struct page *first_page)
 {
 	unsigned long off = 0;
 	struct page *page = first_page;
@@ -1003,7 +1005,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags)
 		prev_page = page;
 	}
 
-	init_zspage(first_page, class);
+	init_zspage(class, first_page);
 
 	first_page->freelist = location_to_obj(first_page, 0);
 	/* Maximum number of objects we can store in this zspage */
@@ -1348,8 +1350,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 }
 EXPORT_SYMBOL_GPL(zs_unmap_object);
 
-static unsigned long obj_malloc(struct page *first_page,
-		struct size_class *class, unsigned long handle)
+static unsigned long obj_malloc(struct size_class *class,
+				struct page *first_page, unsigned long handle)
 {
 	unsigned long obj;
 	struct link_free *link;
@@ -1426,7 +1428,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 				class->size, class->pages_per_zspage));
 	}
 
-	obj = obj_malloc(first_page, class, handle);
+	obj = obj_malloc(class, first_page, handle);
 	/* Now move the zspage to another fullness group, if required */
 	fix_fullness_group(class, first_page);
 	record_obj(handle, obj);
@@ -1499,8 +1501,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 }
 EXPORT_SYMBOL_GPL(zs_free);
 
-static void zs_object_copy(unsigned long dst, unsigned long src,
-				struct size_class *class)
+static void zs_object_copy(struct size_class *class, unsigned long dst,
+				unsigned long src)
 {
 	struct page *s_page, *d_page;
 	unsigned long s_objidx, d_objidx;
@@ -1566,8 +1568,8 @@ static void zs_object_copy(unsigned long dst, unsigned long src,
  * Find alloced object in zspage from index object and
  * return handle.
  */
-static unsigned long find_alloced_obj(struct page *page, int index,
-					struct size_class *class)
+static unsigned long find_alloced_obj(struct size_class *class,
+					struct page *page, int index)
 {
 	unsigned long head;
 	int offset = 0;
@@ -1617,7 +1619,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 	int ret = 0;
 
 	while (1) {
-		handle = find_alloced_obj(s_page, index, class);
+		handle = find_alloced_obj(class, s_page, index);
 		if (!handle) {
 			s_page = get_next_page(s_page);
 			if (!s_page)
@@ -1634,8 +1636,8 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 		}
 
 		used_obj = handle_to_obj(handle);
-		free_obj = obj_malloc(d_page, class, handle);
-		zs_object_copy(free_obj, used_obj, class);
+		free_obj = obj_malloc(class, d_page, handle);
+		zs_object_copy(class, free_obj, used_obj);
 		index++;
 		/*
 		 * record_obj updates handle's value to free_obj and it will
@@ -1664,7 +1666,7 @@ static struct page *isolate_target_page(struct size_class *class)
 	for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) {
 		page = class->fullness_list[i];
 		if (page) {
-			remove_zspage(page, class, i);
+			remove_zspage(class, i, page);
 			break;
 		}
 	}
@@ -1687,7 +1689,7 @@ static enum fullness_group putback_zspage(struct zs_pool *pool,
 	enum fullness_group fullness;
 
 	fullness = get_fullness_group(first_page);
-	insert_zspage(first_page, class, fullness);
+	insert_zspage(class, fullness, first_page);
 	set_zspage_mapping(first_page, class->index, fullness);
 
 	if (fullness == ZS_EMPTY) {
@@ -1712,7 +1714,7 @@ static struct page *isolate_source_page(struct size_class *class)
 		if (!page)
 			continue;
 
-		remove_zspage(page, class, i);
+		remove_zspage(class, i, page);
 		break;
 	}
 
@@ -1949,7 +1951,7 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 
 	pool->flags = flags;
 
-	if (zs_pool_stat_create(name, pool))
+	if (zs_pool_stat_create(pool, name))
 		goto err;
 
 	/*
-- 
cgit v1.2.3


From 1ee4716585ed80b7917ba3c5aa38e5e0d677d583 Mon Sep 17 00:00:00 2001
From: Minchan Kim <minchan@kernel.org>
Date: Fri, 20 May 2016 16:59:45 -0700
Subject: zsmalloc: remove unused pool param in obj_free

Let's remove unused pool param in obj_free

Signed-off-by: Minchan Kim <minchan@kernel.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 18535abfad40..ae288c9f7156 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1438,8 +1438,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 }
 EXPORT_SYMBOL_GPL(zs_malloc);
 
-static void obj_free(struct zs_pool *pool, struct size_class *class,
-			unsigned long obj)
+static void obj_free(struct size_class *class, unsigned long obj)
 {
 	struct link_free *link;
 	struct page *first_page, *f_page;
@@ -1485,7 +1484,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	class = pool->size_class[class_idx];
 
 	spin_lock(&class->lock);
-	obj_free(pool, class, obj);
+	obj_free(class, obj);
 	fullness = fix_fullness_group(class, first_page);
 	if (fullness == ZS_EMPTY) {
 		zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
@@ -1648,7 +1647,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
 		free_obj |= BIT(HANDLE_PIN_BIT);
 		record_obj(handle, free_obj);
 		unpin_tag(handle);
-		obj_free(pool, class, used_obj);
+		obj_free(class, used_obj);
 	}
 
 	/* Remember last position in this iteration */
-- 
cgit v1.2.3


From d0d8da2dc49dfdfe1d788eaf4d55eb5d4964d926 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Fri, 20 May 2016 16:59:48 -0700
Subject: zsmalloc: require GFP in zs_malloc()

Pass GFP flags to zs_malloc() instead of using a fixed mask supplied to
zs_create_pool(), so we can be more flexible, but, more importantly, we
need this to switch zram to per-cpu compression streams -- zram will try
to allocate handle with preemption disabled in a fast path and switch to
a slow path (using different gfp mask) if the fast one has failed.

Apart from that, this also aligns the zs_malloc() interface with zpool/zbud.

[sergey.senozhatsky@gmail.com: pass GFP flags to zs_malloc() instead of using a fixed mask]
  Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish
Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish
Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/block/zram/zram_drv.c |  4 ++--
 include/linux/zsmalloc.h      |  4 ++--
 mm/zsmalloc.c                 | 24 +++++++++++++-----------
 3 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 370c2f76016d..b09acdb753ee 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -514,7 +514,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize)
 		goto out_error;
 	}
 
-	meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM);
+	meta->mem_pool = zs_create_pool(pool_name);
 	if (!meta->mem_pool) {
 		pr_err("Error creating memory pool\n");
 		goto out_error;
@@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 			src = uncmem;
 	}
 
-	handle = zs_malloc(meta->mem_pool, clen);
+	handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM);
 	if (!handle) {
 		pr_err("Error allocating memory for compressed page: %u, size=%zu\n",
 			index, clen);
diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 34eb16098a33..57a8e98f2708 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -41,10 +41,10 @@ struct zs_pool_stats {
 
 struct zs_pool;
 
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags);
+struct zs_pool *zs_create_pool(const char *name);
 void zs_destroy_pool(struct zs_pool *pool);
 
-unsigned long zs_malloc(struct zs_pool *pool, size_t size);
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags);
 void zs_free(struct zs_pool *pool, unsigned long obj);
 
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index ae288c9f7156..aba39a291523 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -247,7 +247,6 @@ struct zs_pool {
 	struct size_class **size_class;
 	struct kmem_cache *handle_cachep;
 
-	gfp_t flags;	/* allocation flags used when growing pool */
 	atomic_long_t pages_allocated;
 
 	struct zs_pool_stats stats;
@@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool)
 	kmem_cache_destroy(pool->handle_cachep);
 }
 
-static unsigned long alloc_handle(struct zs_pool *pool)
+static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp)
 {
 	return (unsigned long)kmem_cache_alloc(pool->handle_cachep,
-		pool->flags & ~__GFP_HIGHMEM);
+			gfp & ~__GFP_HIGHMEM);
 }
 
 static void free_handle(struct zs_pool *pool, unsigned long handle)
@@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp,
 			     const struct zpool_ops *zpool_ops,
 			     struct zpool *zpool)
 {
-	return zs_create_pool(name, gfp);
+	/*
+	 * Ignore global gfp flags: zs_malloc() may be invoked from
+	 * different contexts and its caller must provide a valid
+	 * gfp mask.
+	 */
+	return zs_create_pool(name);
 }
 
 static void zs_zpool_destroy(void *pool)
@@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool)
 static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp,
 			unsigned long *handle)
 {
-	*handle = zs_malloc(pool, size);
+	*handle = zs_malloc(pool, size, gfp);
 	return *handle ? 0 : -1;
 }
 static void zs_zpool_free(void *pool, unsigned long handle)
@@ -1391,7 +1395,7 @@ static unsigned long obj_malloc(struct size_class *class,
  * otherwise 0.
  * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail.
  */
-unsigned long zs_malloc(struct zs_pool *pool, size_t size)
+unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 {
 	unsigned long handle, obj;
 	struct size_class *class;
@@ -1400,7 +1404,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 	if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE))
 		return 0;
 
-	handle = alloc_handle(pool);
+	handle = alloc_handle(pool, gfp);
 	if (!handle)
 		return 0;
 
@@ -1413,7 +1417,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
 
 	if (!first_page) {
 		spin_unlock(&class->lock);
-		first_page = alloc_zspage(class, pool->flags);
+		first_page = alloc_zspage(class, gfp);
 		if (unlikely(!first_page)) {
 			free_handle(pool, handle);
 			return 0;
@@ -1878,7 +1882,7 @@ static int zs_register_shrinker(struct zs_pool *pool)
  * On success, a pointer to the newly created pool is returned,
  * otherwise NULL.
  */
-struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
+struct zs_pool *zs_create_pool(const char *name)
 {
 	int i;
 	struct zs_pool *pool;
@@ -1948,8 +1952,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags)
 		prev_class = class;
 	}
 
-	pool->flags = flags;
-
 	if (zs_pool_stat_create(pool, name))
 		goto err;
 
-- 
cgit v1.2.3


From 200867af4dedfe7cb707f96773684de1d1fd21e6 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Fri, 20 May 2016 16:59:54 -0700
Subject: mm/zswap: use workqueue to destroy pool

Add a work_struct to struct zswap_pool, and change __zswap_pool_empty to
use the workqueue instead of using call_rcu().

When zswap destroys a pool no longer in use, it uses call_rcu() to
perform the destruction/freeing.  Since that executes in softirq
context, it must not sleep.  However, actually destroying the pool
involves freeing the per-cpu compressors (which requires locking the
cpu_add_remove_lock mutex) and freeing the zpool, for which the
implementation may sleep (e.g.  zsmalloc calls kmem_cache_destroy, which
locks the slab_mutex).  So if either mutex is currently taken, or any
other part of the compressor or zpool implementation sleeps, it will
result in a BUG().

It's not easy to reproduce this when changing zswap's params normally.
In testing with a loaded system, this does not fail:

  $ cd /sys/module/zswap/parameters
  $ echo lz4 > compressor ; echo zsmalloc > zpool

nor does this:

  $ while true ; do
  > echo lzo > compressor ; echo zbud > zpool
  > sleep 1
  > echo lz4 > compressor ; echo zsmalloc > zpool
  > sleep 1
  > done

although it's still possible either of those might fail, depending on
whether anything else besides zswap has locked the mutexes.

However, changing a parameter with no delay immediately causes the
"scheduling while atomic" BUG:

  $ while true ; do
  > echo lzo > compressor ; echo lz4 > compressor
  > done

This is essentially the same as Yu Zhao's proposed patch to zsmalloc,
but moved to zswap, to cover compressor and zpool freeing.

Fixes: f1c54846ee45 ("zswap: dynamic pool creation")
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Reported-by: Yu Zhao <yuzhao@google.com>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Minchan Kim <minchan@kernel.org>
Cc: Dan Streetman <dan.streetman@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zswap.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/zswap.c b/mm/zswap.c
index de0f119b1780..275b22cc8df4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -117,7 +117,7 @@ struct zswap_pool {
 	struct crypto_comp * __percpu *tfm;
 	struct kref kref;
 	struct list_head list;
-	struct rcu_head rcu_head;
+	struct work_struct work;
 	struct notifier_block notifier;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
 };
@@ -658,9 +658,11 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool)
 	return kref_get_unless_zero(&pool->kref);
 }
 
-static void __zswap_pool_release(struct rcu_head *head)
+static void __zswap_pool_release(struct work_struct *work)
 {
-	struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);
+	struct zswap_pool *pool = container_of(work, typeof(*pool), work);
+
+	synchronize_rcu();
 
 	/* nobody should have been able to get a kref... */
 	WARN_ON(kref_get_unless_zero(&pool->kref));
@@ -680,7 +682,9 @@ static void __zswap_pool_empty(struct kref *kref)
 	WARN_ON(pool == zswap_pool_current());
 
 	list_del_rcu(&pool->list);
-	call_rcu(&pool->rcu_head, __zswap_pool_release);
+
+	INIT_WORK(&pool->work, __zswap_pool_release);
+	schedule_work(&pool->work);
 
 	spin_unlock(&zswap_pools_lock);
 }
-- 
cgit v1.2.3


From d34f615720d17c49b6779f6fcd5cb7eb82231a38 Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Fri, 20 May 2016 16:59:56 -0700
Subject: mm/zsmalloc: don't fail if can't create debugfs info

Change the return type of zs_pool_stat_create() to void, and remove the
logic to abort pool creation if the stat debugfs dir/file could not be
created.

The debugfs stat file is for debugging/information only, and doesn't
affect operation of zsmalloc; there is no reason to abort creating the
pool if the stat file can't be created.  This was seen with zswap, which
used the same name for all pool creations, which caused zsmalloc to fail
to create a second pool for zswap if CONFIG_ZSMALLOC_STAT was enabled.

Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Reviewed-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Dan Streetman <dan.streetman@canonical.com>
Cc: Minchan Kim <minchan@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index aba39a291523..72698db958e7 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -573,17 +573,17 @@ static const struct file_operations zs_stat_size_ops = {
 	.release        = single_release,
 };
 
-static int zs_pool_stat_create(struct zs_pool *pool, const char *name)
+static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
 	struct dentry *entry;
 
 	if (!zs_stat_root)
-		return -ENODEV;
+		return;
 
 	entry = debugfs_create_dir(name, zs_stat_root);
 	if (!entry) {
 		pr_warn("debugfs dir <%s> creation failed\n", name);
-		return -ENOMEM;
+		return;
 	}
 	pool->stat_dentry = entry;
 
@@ -592,10 +592,8 @@ static int zs_pool_stat_create(struct zs_pool *pool, const char *name)
 	if (!entry) {
 		pr_warn("%s: debugfs file entry <%s> creation failed\n",
 				name, "classes");
-		return -ENOMEM;
+		return;
 	}
-
-	return 0;
 }
 
 static void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -613,9 +611,8 @@ static void __exit zs_stat_exit(void)
 {
 }
 
-static inline int zs_pool_stat_create(struct zs_pool *pool, const char *name)
+static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
-	return 0;
 }
 
 static inline void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -623,7 +620,6 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool)
 }
 #endif
 
-
 /*
  * For each size class, zspages are divided into different groups
  * depending on how "full" they are. This was done so that we could
@@ -1952,8 +1948,8 @@ struct zs_pool *zs_create_pool(const char *name)
 		prev_class = class;
 	}
 
-	if (zs_pool_stat_create(pool, name))
-		goto err;
+	/* debug only, don't abort if it fails */
+	zs_pool_stat_create(pool, name);
 
 	/*
 	 * Not critical, we still can use the pool
-- 
cgit v1.2.3


From 57578c2ea2cb2e0d362a9212ac83cf90221d4883 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:01:54 -0700
Subject: radix-tree: introduce CONFIG_RADIX_TREE_MULTIORDER

I've been receiving increasingly concerned notes from 0day about how
much my recent changes have been bloating the radix tree.  Make it
happier by only including multiorder support if
CONFIG_TRANSPARENT_HUGEPAGE is set.

This is an independent Kconfig option, so other radix tree users can
also set it if they have a need.

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/Kconfig                             |  3 +++
 lib/radix-tree.c                        | 26 ++++++++++++++++++--------
 mm/Kconfig                              |  1 +
 tools/testing/radix-tree/linux/kernel.h |  1 +
 4 files changed, 23 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/lib/Kconfig b/lib/Kconfig
index 61d55bd0ed89..d79909dc01ec 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -362,6 +362,9 @@ config INTERVAL_TREE
 
 	  for more information.
 
+config RADIX_TREE_MULTIORDER
+	bool
+
 config ASSOCIATIVE_ARRAY
 	bool
 	help
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 1624c4117961..799f341977d0 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -484,6 +484,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 		slot = node->slots[offset];
 	}
 
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
 	/* Insert pointers to the canonical entry */
 	if ((shift - order) > 0) {
 		int i, n = 1 << (shift - order);
@@ -499,6 +500,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
 			node->count++;
 		}
 	}
+#endif
 
 	if (nodep)
 		*nodep = node;
@@ -1469,6 +1471,20 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 	return deleted;
 }
 
+static inline void delete_sibling_entries(struct radix_tree_node *node,
+					void *ptr, unsigned offset)
+{
+#ifdef CONFIG_RADIX_TREE_MULTIORDER
+	int i;
+	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
+		if (node->slots[offset + i] != ptr)
+			break;
+		node->slots[offset + i] = NULL;
+		node->count--;
+	}
+#endif
+}
+
 /**
  *	radix_tree_delete_item    -    delete an item from a radix tree
  *	@root:		radix tree root
@@ -1484,7 +1500,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 			     unsigned long index, void *item)
 {
 	struct radix_tree_node *node;
-	unsigned int offset, i;
+	unsigned int offset;
 	void **slot;
 	void *entry;
 	int tag;
@@ -1513,13 +1529,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 			radix_tree_tag_clear(root, index, tag);
 	}
 
-	/* Delete any sibling slots pointing to this slot */
-	for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) {
-		if (node->slots[offset + i] != ptr_to_indirect(slot))
-			break;
-		node->slots[offset + i] = NULL;
-		node->count--;
-	}
+	delete_sibling_entries(node, ptr_to_indirect(slot), offset);
 	node->slots[offset] = NULL;
 	node->count--;
 
diff --git a/mm/Kconfig b/mm/Kconfig
index 1a6a28ebcb8b..2664c118b5d2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -404,6 +404,7 @@ config TRANSPARENT_HUGEPAGE
 	bool "Transparent Hugepage Support"
 	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select COMPACTION
+	select RADIX_TREE_MULTIORDER
 	help
 	  Transparent Hugepages allows the kernel to use huge pages and
 	  huge tlb transparently to the applications whenever possible.
diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h
index 31fe2c77d7ae..8ea0ed450810 100644
--- a/tools/testing/radix-tree/linux/kernel.h
+++ b/tools/testing/radix-tree/linux/kernel.h
@@ -9,6 +9,7 @@
 
 #include "../../include/linux/compiler.h"
 
+#define CONFIG_RADIX_TREE_MULTIORDER
 #define CONFIG_SHMEM
 #define CONFIG_SWAP
 
-- 
cgit v1.2.3


From d604c324524bf61c68182bb27db64656a78fe911 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <willy@linux.intel.com>
Date: Fri, 20 May 2016 17:03:45 -0700
Subject: radix-tree: introduce radix_tree_replace_clear_tags()

In addition to replacing the entry, we also clear all associated tags.
This is really a one-off special for page_cache_tree_delete() which had
far too much detailed knowledge about how the radix tree works.

For efficiency, factor node_tag_clear() out of radix_tree_tag_clear().
It can be used by radix_tree_delete_item() as well as
radix_tree_replace_clear_tags().

Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Kirill Shutemov <kirill.shutemov@linux.intel.com>
Cc: Jan Kara <jack@suse.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/radix-tree.h |  9 ++++--
 lib/radix-tree.c           | 76 ++++++++++++++++++++++++++++------------------
 mm/filemap.c               | 23 ++------------
 3 files changed, 56 insertions(+), 52 deletions(-)

(limited to 'mm')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index bad63105e37e..11c8e7cc3920 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -281,9 +281,12 @@ bool __radix_tree_delete_node(struct radix_tree_root *root,
 			      struct radix_tree_node *node);
 void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *);
 void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-unsigned int
-radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
-			unsigned long first_index, unsigned int max_items);
+struct radix_tree_node *radix_tree_replace_clear_tags(
+				struct radix_tree_root *root,
+				unsigned long index, void *entry);
+unsigned int radix_tree_gang_lookup(struct radix_tree_root *root,
+			void **results, unsigned long first_index,
+			unsigned int max_items);
 unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root,
 			void ***results, unsigned long *indices,
 			unsigned long first_index, unsigned int max_items);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 9d9b4b9af4b6..c7114d233b38 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -740,6 +740,26 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 }
 EXPORT_SYMBOL(radix_tree_tag_set);
 
+static void node_tag_clear(struct radix_tree_root *root,
+				struct radix_tree_node *node,
+				unsigned int tag, unsigned int offset)
+{
+	while (node) {
+		if (!tag_get(node, tag, offset))
+			return;
+		tag_clear(node, tag, offset);
+		if (any_tag_set(node, tag))
+			return;
+
+		offset = node->offset;
+		node = node->parent;
+	}
+
+	/* clear the root's tag bit */
+	if (root_tag_get(root, tag))
+		root_tag_clear(root, tag);
+}
+
 /**
  *	radix_tree_tag_clear - clear a tag on a radix tree node
  *	@root:		radix tree root
@@ -776,28 +796,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		offset = radix_tree_descend(parent, &node, offset);
 	}
 
-	if (node == NULL)
-		goto out;
+	if (node)
+		node_tag_clear(root, parent, tag, offset);
 
-	index >>= shift;
-
-	while (parent) {
-		if (!tag_get(parent, tag, offset))
-			goto out;
-		tag_clear(parent, tag, offset);
-		if (any_tag_set(parent, tag))
-			goto out;
-
-		index >>= RADIX_TREE_MAP_SHIFT;
-		offset = index & RADIX_TREE_MAP_MASK;
-		parent = parent->parent;
-	}
-
-	/* clear the root's tag bit */
-	if (root_tag_get(root, tag))
-		root_tag_clear(root, tag);
-
-out:
 	return node;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
@@ -1525,14 +1526,9 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 
 	offset = get_slot_offset(node, slot);
 
-	/*
-	 * Clear all tags associated with the item to be deleted.
-	 * This way of doing it would be inefficient, but seldom is any set.
-	 */
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (tag_get(node, tag, offset))
-			radix_tree_tag_clear(root, index, tag);
-	}
+	/* Clear all tags associated with the item to be deleted.  */
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		node_tag_clear(root, node, tag, offset);
 
 	delete_sibling_entries(node, node_to_entry(slot), offset);
 	node->slots[offset] = NULL;
@@ -1559,6 +1555,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
+struct radix_tree_node *radix_tree_replace_clear_tags(
+			struct radix_tree_root *root,
+			unsigned long index, void *entry)
+{
+	struct radix_tree_node *node;
+	void **slot;
+
+	__radix_tree_lookup(root, index, &node, &slot);
+
+	if (node) {
+		unsigned int tag, offset = get_slot_offset(node, slot);
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+			node_tag_clear(root, node, tag, offset);
+	} else {
+		/* Clear root node tags */
+		root->gfp_mask &= __GFP_BITS_MASK;
+	}
+
+	radix_tree_replace_slot(slot, entry);
+	return node;
+}
+
 /**
  *	radix_tree_tagged - test whether any items in the tree are tagged
  *	@root:		radix tree root
diff --git a/mm/filemap.c b/mm/filemap.c
index b418405903bc..9665b1d4f318 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 				   struct page *page, void *shadow)
 {
 	struct radix_tree_node *node;
-	unsigned long index;
-	unsigned int offset;
-	unsigned int tag;
-	void **slot;
 
 	VM_BUG_ON(!PageLocked(page));
 
-	__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+	node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index,
+								shadow);
 
 	if (shadow) {
 		mapping->nrexceptional++;
@@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	}
 	mapping->nrpages--;
 
-	if (!node) {
-		/* Clear direct pointer tags in root node */
-		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
-		radix_tree_replace_slot(slot, shadow);
+	if (!node)
 		return;
-	}
-
-	/* Clear tree tags for the removed page */
-	index = page->index;
-	offset = index & RADIX_TREE_MAP_MASK;
-	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
-		if (test_bit(offset, node->tags[tag]))
-			radix_tree_tag_clear(&mapping->page_tree, index, tag);
-	}
 
-	/* Delete page, swap shadow entry */
-	radix_tree_replace_slot(slot, shadow);
 	workingset_node_pages_dec(node);
 	if (shadow)
 		workingset_node_shadows_inc(node);
-- 
cgit v1.2.3


From dee410792419aaa8bc3e3b35d2ccb6515835916d Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Sat, 14 May 2016 12:20:44 -0700
Subject: /dev/dax, core: file operations and dax-mmap

The "Device DAX" core enables dax mappings of performance / feature
differentiated memory.  An open mapping or file handle keeps the backing
struct device live, but new mappings are only possible while the device
is enabled.   Faults are handled under rcu_read_lock to synchronize
with the enabled state of the device.

Similar to the filesystem-dax case the backing memory may optionally
have struct page entries.  However, unlike fs-dax there is no support
for private mappings, or mappings that are not backed by media (see
use of zero-page in fs-dax).

Mappings are always guaranteed to match the alignment of the dax_region.
If the dax_region is configured to have a 2MB alignment, all mappings
are guaranteed to be backed by a pmd entry.  Contrast this determinism
with the fs-dax case where pmd mappings are opportunistic.  If userspace
attempts to force a misaligned mapping, the driver will fail the mmap
attempt.  See check_vma() for other scenarios that are rejected,
like MAP_PRIVATE mappings.

Cc: Hannes Reinecke <hare@suse.de>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
 drivers/dax/Kconfig |   1 +
 drivers/dax/dax.c   | 322 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 mm/huge_memory.c    |   1 +
 mm/hugetlb.c        |   1 +
 4 files changed, 325 insertions(+)

(limited to 'mm')

diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index 86ffbaa891ad..cedab7572de3 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,6 +1,7 @@
 menuconfig DEV_DAX
 	tristate "DAX: direct access to differentiated memory"
 	default m if NVDIMM_DAX
+	depends on TRANSPARENT_HUGEPAGE
 	help
 	  Support raw access to differentiated (persistence, bandwidth,
 	  latency...) memory via an mmap(2) capable character
diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
index 4c22a40f2335..b891a129b275 100644
--- a/drivers/dax/dax.c
+++ b/drivers/dax/dax.c
@@ -49,6 +49,7 @@ struct dax_region {
  * @region - parent region
  * @dev - device backing the character device
  * @kref - enable this data to be tracked in filp->private_data
+ * @alive - !alive + rcu grace period == no new mappings can be established
  * @id - child id in the region
  * @num_resources - number of physical address extents in this device
  * @res - array of physical address ranges
@@ -57,6 +58,7 @@ struct dax_dev {
 	struct dax_region *region;
 	struct device *dev;
 	struct kref kref;
+	bool alive;
 	int id;
 	int num_resources;
 	struct resource res[0];
@@ -150,6 +152,16 @@ static void unregister_dax_dev(void *_dev)
 
 	dev_dbg(dev, "%s\n", __func__);
 
+	/*
+	 * Note, rcu is not protecting the liveness of dax_dev, rcu is
+	 * ensuring that any fault handlers that might have seen
+	 * dax_dev->alive == true, have completed.  Any fault handlers
+	 * that start after synchronize_rcu() has started will abort
+	 * upon seeing dax_dev->alive == false.
+	 */
+	dax_dev->alive = false;
+	synchronize_rcu();
+
 	get_device(dev);
 	device_unregister(dev);
 	ida_simple_remove(&dax_region->ida, dax_dev->id);
@@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 	memcpy(dax_dev->res, res, sizeof(*res) * count);
 	dax_dev->num_resources = count;
 	kref_init(&dax_dev->kref);
+	dax_dev->alive = true;
 	dax_dev->region = dax_region;
 	kref_get(&dax_region->kref);
 
@@ -217,9 +230,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
 }
 EXPORT_SYMBOL_GPL(devm_create_dax_dev);
 
+/* return an unmapped area aligned to the dax region specified alignment */
+static unsigned long dax_dev_get_unmapped_area(struct file *filp,
+		unsigned long addr, unsigned long len, unsigned long pgoff,
+		unsigned long flags)
+{
+	unsigned long off, off_end, off_align, len_align, addr_align, align;
+	struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
+	struct dax_region *dax_region;
+
+	if (!dax_dev || addr)
+		goto out;
+
+	dax_region = dax_dev->region;
+	align = dax_region->align;
+	off = pgoff << PAGE_SHIFT;
+	off_end = off + len;
+	off_align = round_up(off, align);
+
+	if ((off_end <= off_align) || ((off_end - off_align) < align))
+		goto out;
+
+	len_align = len + align;
+	if ((off + len_align) < off)
+		goto out;
+
+	addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
+			pgoff, flags);
+	if (!IS_ERR_VALUE(addr_align)) {
+		addr_align += (off - addr_align) & (align - 1);
+		return addr_align;
+	}
+ out:
+	return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
+}
+
+static int __match_devt(struct device *dev, const void *data)
+{
+	const dev_t *devt = data;
+
+	return dev->devt == *devt;
+}
+
+static struct device *dax_dev_find(dev_t dev_t)
+{
+	return class_find_device(dax_class, NULL, &dev_t, __match_devt);
+}
+
+static int dax_dev_open(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = NULL;
+	struct device *dev;
+
+	dev = dax_dev_find(inode->i_rdev);
+	if (!dev)
+		return -ENXIO;
+
+	device_lock(dev);
+	dax_dev = dev_get_drvdata(dev);
+	if (dax_dev) {
+		dev_dbg(dev, "%s\n", __func__);
+		filp->private_data = dax_dev;
+		kref_get(&dax_dev->kref);
+		inode->i_flags = S_DAX;
+	}
+	device_unlock(dev);
+
+	if (!dax_dev) {
+		put_device(dev);
+		return -ENXIO;
+	}
+	return 0;
+}
+
+static int dax_dev_release(struct inode *inode, struct file *filp)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	struct device *dev = dax_dev->dev;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+	put_device(dev);
+
+	return 0;
+}
+
+static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		const char *func)
+{
+	struct dax_region *dax_region = dax_dev->region;
+	struct device *dev = dax_dev->dev;
+	unsigned long mask;
+
+	if (!dax_dev->alive)
+		return -ENXIO;
+
+	/* prevent private / writable mappings from being established */
+	if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
+		dev_info(dev, "%s: %s: fail, attempted private mapping\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	mask = dax_region->align - 1;
+	if (vma->vm_start & mask || vma->vm_end & mask) {
+		dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
+				current->comm, func, vma->vm_start, vma->vm_end,
+				mask);
+		return -EINVAL;
+	}
+
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
+			&& (vma->vm_flags & VM_DONTCOPY) == 0) {
+		dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	if (!vma_is_dax(vma)) {
+		dev_info(dev, "%s: %s: fail, vma is not DAX capable\n",
+				current->comm, func);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
+		unsigned long size)
+{
+	struct resource *res;
+	phys_addr_t phys;
+	int i;
+
+	for (i = 0; i < dax_dev->num_resources; i++) {
+		res = &dax_dev->res[i];
+		phys = pgoff * PAGE_SIZE + res->start;
+		if (phys >= res->start && phys <= res->end)
+			break;
+		pgoff -= PHYS_PFN(resource_size(res));
+	}
+
+	if (i < dax_dev->num_resources) {
+		res = &dax_dev->res[i];
+		if (phys + size - 1 <= res->end)
+			return phys;
+	}
+
+	return -1;
+}
+
+static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	unsigned long vaddr = (unsigned long) vmf->virtual_address;
+	struct device *dev = dax_dev->dev;
+	struct dax_region *dax_region;
+	int rc = VM_FAULT_SIGBUS;
+	phys_addr_t phys;
+	pfn_t pfn;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dax_dev->region;
+	if (dax_region->align > PAGE_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+				vmf->pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	rc = vm_insert_mixed(vma, vaddr, pfn);
+
+	if (rc == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (rc < 0 && rc != -EBUSY)
+		return VM_FAULT_SIGBUS;
+
+	return VM_FAULT_NOPAGE;
+}
+
+static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (vmf->flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+	rcu_read_lock();
+	rc = __dax_dev_fault(dax_dev, vma, vmf);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static int __dax_dev_pmd_fault(struct dax_dev *dax_dev,
+		struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd,
+		unsigned int flags)
+{
+	unsigned long pmd_addr = addr & PMD_MASK;
+	struct device *dev = dax_dev->dev;
+	struct dax_region *dax_region;
+	phys_addr_t phys;
+	pgoff_t pgoff;
+	pfn_t pfn;
+
+	if (check_vma(dax_dev, vma, __func__))
+		return VM_FAULT_SIGBUS;
+
+	dax_region = dax_dev->region;
+	if (dax_region->align > PMD_SIZE) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	/* dax pmd mappings require pfn_t_devmap() */
+	if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) {
+		dev_dbg(dev, "%s: alignment > fault size\n", __func__);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pgoff = linear_page_index(vma, pmd_addr);
+	phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE);
+	if (phys == -1) {
+		dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
+				pgoff);
+		return VM_FAULT_SIGBUS;
+	}
+
+	pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
+
+	return vmf_insert_pfn_pmd(vma, addr, pmd, pfn,
+			flags & FAULT_FLAG_WRITE);
+}
+
+static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned int flags)
+{
+	int rc;
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
+			current->comm, (flags & FAULT_FLAG_WRITE)
+			? "write" : "read", vma->vm_start, vma->vm_end);
+
+	rcu_read_lock();
+	rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags);
+	rcu_read_unlock();
+
+	return rc;
+}
+
+static void dax_dev_vm_open(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	kref_get(&dax_dev->kref);
+}
+
+static void dax_dev_vm_close(struct vm_area_struct *vma)
+{
+	struct file *filp = vma->vm_file;
+	struct dax_dev *dax_dev = filp->private_data;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+	dax_dev_put(dax_dev);
+}
+
+static const struct vm_operations_struct dax_dev_vm_ops = {
+	.fault = dax_dev_fault,
+	.pmd_fault = dax_dev_pmd_fault,
+	.open = dax_dev_vm_open,
+	.close = dax_dev_vm_close,
+};
+
+static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct dax_dev *dax_dev = filp->private_data;
+	int rc;
+
+	dev_dbg(dax_dev->dev, "%s\n", __func__);
+
+	rc = check_vma(dax_dev, vma, __func__);
+	if (rc)
+		return rc;
+
+	kref_get(&dax_dev->kref);
+	vma->vm_ops = &dax_dev_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	return 0;
+
+}
+
 static const struct file_operations dax_fops = {
 	.llseek = noop_llseek,
 	.owner = THIS_MODULE,
+	.open = dax_dev_open,
+	.release = dax_dev_release,
+	.get_unmapped_area = dax_dev_get_unmapped_area,
+	.mmap = dax_dev_mmap,
 };
 
 static int __init dax_init(void)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 86f9f8b82f8e..52ea012d8a80 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 	insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write);
 	return VM_FAULT_NOPAGE;
 }
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
 static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 19d0d08b396f..b14e98129b07 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -624,6 +624,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
 {
 	return vma_hugecache_offset(hstate_vma(vma), vma, address);
 }
+EXPORT_SYMBOL_GPL(linear_hugepage_index);
 
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
-- 
cgit v1.2.3


From bd28b14591b98f696bc9f94c5ba2e598ca487dfd Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 22 May 2016 17:21:27 -0700
Subject: x86: remove more uaccess_32.h complexity

I'm looking at trying to possibly merge the 32-bit and 64-bit versions
of the x86 uaccess.h implementation, but first this needs to be cleaned
up.

For example, the 32-bit version of "__copy_from_user_inatomic()" is
mostly the special cases for the constant size, and it's actually almost
never relevant.  Most users aren't actually using a constant size
anyway, and the few cases that do small constant copies are better off
just using __get_user() instead.

So get rid of the unnecessary complexity.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess_32.h | 26 --------------------------
 kernel/events/uprobes.c           |  3 +--
 kernel/futex.c                    |  2 +-
 mm/maccess.c                      |  3 +--
 4 files changed, 3 insertions(+), 31 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 537cc883ea29..4b32da24faaf 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -65,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n)
 static __always_inline unsigned long
 __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
 {
-	/* Avoid zeroing the tail if the copy fails..
-	 * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
-	 * but as the zeroing behaviour is only significant when n is not
-	 * constant, that shouldn't be a problem.
-	 */
-	if (__builtin_constant_p(n)) {
-		unsigned long ret;
-
-		switch (n) {
-		case 1:
-			__uaccess_begin();
-			__get_user_size(*(u8 *)to, from, 1, ret, 1);
-			__uaccess_end();
-			return ret;
-		case 2:
-			__uaccess_begin();
-			__get_user_size(*(u16 *)to, from, 2, ret, 2);
-			__uaccess_end();
-			return ret;
-		case 4:
-			__uaccess_begin();
-			__get_user_size(*(u32 *)to, from, 4, ret, 4);
-			__uaccess_end();
-			return ret;
-		}
-	}
 	return __copy_from_user_ll_nozero(to, from, n);
 }
 
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 7edc95edfaee..c01f733ff2e1 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1694,8 +1694,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	int result;
 
 	pagefault_disable();
-	result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
-							sizeof(opcode));
+	result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr);
 	pagefault_enable();
 
 	if (likely(result == 0))
diff --git a/kernel/futex.c b/kernel/futex.c
index c20f06f38ef3..ee25f5ba4aca 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -729,7 +729,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	int ret;
 
 	pagefault_disable();
-	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+	ret = __get_user(*dest, from);
 	pagefault_enable();
 
 	return ret ? -EFAULT : 0;
diff --git a/mm/maccess.c b/mm/maccess.c
index d159b1c96e48..78f9274dd49d 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
 	pagefault_disable();
 
 	do {
-		ret = __copy_from_user_inatomic(dst++,
-						(const void __user __force *)src++, 1);
+		ret = __get_user(*dst++, (const char __user __force *)src++);
 	} while (dst[-1] && ret == 0 && src - unsafe_addr < count);
 
 	dst[-1] = '\0';
-- 
cgit v1.2.3


From 1383399d7be029281997889df23150fa6c16be6e Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Mon, 23 May 2016 16:22:29 -0700
Subject: mm: memcontrol: fix possible css ref leak on oom

mem_cgroup_oom may be invoked multiple times while a process is handling
a page fault, in which case current->memcg_in_oom will be overwritten
leaking the previously taken css reference.

Link: http://lkml.kernel.org/r/1464019330-7579-1-git-send-email-vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b3f16ab4b431..cf428d7b9a03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1604,7 +1604,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	if (!current->memcg_may_oom)
+	if (!current->memcg_may_oom || current->memcg_in_oom)
 		return;
 	/*
 	 * We are in the middle of the charge context here, so we
-- 
cgit v1.2.3


From dc0ef0df7b6a90892ec41933212ac701152a254c Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:27 -0700
Subject: mm: make mmap_sem for write waits killable for mm syscalls

This is follow-up work for oom_reaper [1].  As the async OOM killing
depends on mmap_sem for read we would really appreciate it if a holder
for write didn't stand in the way.  This patchset is changing many of
down_write calls to be killable to help those cases when the writer is
blocked and waiting for readers to release the lock and so help
__oom_reap_task to process the oom victim.

Most of the patches are really trivial because the lock is held from
shallow syscall paths where we can return EINTR trivially and allow the
current task to die (note that EINTR will never get to the userspace as
the task has fatal signal pending).  Others seem to be easy as well as
the callers are already handling fatal errors and bail and return to
userspace which should be sufficient to handle the failure gracefully.
I am not familiar with all those code paths so a deeper review is really
appreciated.

As this work is touching more areas which are not directly connected I
have tried to keep the CC list as small as possible and people who I
believed would be familiar are CCed only to the specific patches (all
should have received the cover though).

This patchset is based on linux-next and it depends on
down_write_killable for rw_semaphores which got merged into tip
locking/rwsem branch and it is merged into this next tree.  I guess it
would be easiest to route these patches via mmotm because of the
dependency on the tip tree but if respective maintainers prefer other
way I have no objections.

I haven't covered all the down_write(mm->mmap_sem) instances here

  $ git grep "down_write(.*\<mmap_sem\>)" next/master | wc -l
  98
  $ git grep "down_write(.*\<mmap_sem\>)" | wc -l
  62

I have tried to cover those which should be relatively easy to review in
this series because this alone should be a nice improvement.  Other
places can be changed on top.

[0] http://lkml.kernel.org/r/1456752417-9626-1-git-send-email-mhocko@kernel.org
[1] http://lkml.kernel.org/r/1452094975-551-1-git-send-email-mhocko@kernel.org
[2] http://lkml.kernel.org/r/1456750705-7141-1-git-send-email-mhocko@kernel.org

This patch (of 18):

This is the first step in making mmap_sem write waiters killable.  It
focuses on the trivial ones which are taking the lock early after
entering the syscall and they are not changing state before.

Therefore it is very easy to change them to use down_write_killable and
immediately return with -EINTR.  This will allow the waiter to pass away
without blocking the mmap_sem which might be required to make a forward
progress.  E.g.  the oom reaper will need the lock for reading to
dismantle the OOM victim address space.

The only tricky function in this patch is vm_mmap_pgoff which has many
call sites via vm_mmap.  To reduce the risk keep vm_mmap with the
original non-killable semantic for now.

vm_munmap callers do not bother checking the return value so open code
it into the munmap syscall path for now for simplicity.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@suse.de>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/internal.h |  5 +++--
 mm/madvise.c  |  8 +++++---
 mm/mlock.c    | 16 ++++++++++------
 mm/mmap.c     | 27 +++++++++++++++++++++++----
 mm/mprotect.c |  3 ++-
 mm/mremap.c   |  3 ++-
 mm/nommu.c    |  2 +-
 mm/util.c     | 12 +++++++++---
 8 files changed, 55 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/mm/internal.h b/mm/internal.h
index f6f3353b0868..bff7fd702331 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -442,9 +442,10 @@ extern u64 hwpoison_filter_flags_value;
 extern u64 hwpoison_filter_memcg;
 extern u32 hwpoison_filter_enable;
 
-extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long,
-        unsigned long, unsigned long);
+        unsigned long, unsigned long,
+        bool);
 
 extern void set_pageblock_order(void);
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
diff --git a/mm/madvise.c b/mm/madvise.c
index 07427d3fcead..93fb63e88b5e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -707,10 +707,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 		return error;
 
 	write = madvise_need_mmap_write(behavior);
-	if (write)
-		down_write(&current->mm->mmap_sem);
-	else
+	if (write) {
+		if (down_write_killable(&current->mm->mmap_sem))
+			return -EINTR;
+	} else {
 		down_read(&current->mm->mmap_sem);
+	}
 
 	/*
 	 * If the interval [start,end) covers some unmapped address
diff --git a/mm/mlock.c b/mm/mlock.c
index 96f001041928..ef8dc9f395c4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -617,7 +617,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
 	return error;
 }
 
-static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
+static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
 {
 	unsigned long locked;
 	unsigned long lock_limit;
@@ -635,7 +635,8 @@ static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
 	lock_limit >>= PAGE_SHIFT;
 	locked = len >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
+	if (down_write_killable(&current->mm->mmap_sem))
+		return -EINTR;
 
 	locked += current->mm->locked_vm;
 
@@ -678,7 +679,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
 	len = PAGE_ALIGN(len + (offset_in_page(start)));
 	start &= PAGE_MASK;
 
-	down_write(&current->mm->mmap_sem);
+	if (down_write_killable(&current->mm->mmap_sem))
+		return -EINTR;
 	ret = apply_vma_lock_flags(start, len, 0);
 	up_write(&current->mm->mmap_sem);
 
@@ -748,9 +750,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
 	lock_limit = rlimit(RLIMIT_MEMLOCK);
 	lock_limit >>= PAGE_SHIFT;
 
-	ret = -ENOMEM;
-	down_write(&current->mm->mmap_sem);
+	if (down_write_killable(&current->mm->mmap_sem))
+		return -EINTR;
 
+	ret = -ENOMEM;
 	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
 	    capable(CAP_IPC_LOCK))
 		ret = apply_mlockall_flags(flags);
@@ -765,7 +768,8 @@ SYSCALL_DEFINE0(munlockall)
 {
 	int ret;
 
-	down_write(&current->mm->mmap_sem);
+	if (down_write_killable(&current->mm->mmap_sem))
+		return -EINTR;
 	ret = apply_mlockall_flags(0);
 	up_write(&current->mm->mmap_sem);
 	return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index b9274a0c82c9..11e1f2ca72af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -178,7 +178,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	unsigned long min_brk;
 	bool populate;
 
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
 
 #ifdef CONFIG_COMPAT_BRK
 	/*
@@ -1332,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
 out_fput:
 	if (file)
 		fput(file);
@@ -2493,6 +2494,10 @@ int vm_munmap(unsigned long start, size_t len)
 	int ret;
 	struct mm_struct *mm = current->mm;
 
+	/*
+	 * XXX convert to down_write_killable as soon as all users are able
+	 * to handle the error.
+	 */
 	down_write(&mm->mmap_sem);
 	ret = do_munmap(mm, start, len);
 	up_write(&mm->mmap_sem);
@@ -2502,8 +2507,15 @@ EXPORT_SYMBOL(vm_munmap);
 
 SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 {
+	int ret;
+	struct mm_struct *mm = current->mm;
+
 	profile_munmap(addr);
-	return vm_munmap(addr, len);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+	ret = do_munmap(mm, addr, len);
+	up_write(&mm->mmap_sem);
+	return ret;
 }
 
 
@@ -2535,7 +2547,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
 		return ret;
 
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+
 	vma = find_vma(mm, start);
 
 	if (!vma || !(vma->vm_flags & VM_SHARED))
@@ -2700,6 +2714,11 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 	unsigned long ret;
 	bool populate;
 
+	/*
+	 * XXX not all users are chcecking the return value, convert
+	 * to down_write_killable after they are able to cope with
+	 * error
+	 */
 	down_write(&mm->mmap_sem);
 	ret = do_brk(addr, len);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b650c5412f58..5019a1ef2848 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -379,7 +379,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 
 	reqprot = prot;
 
-	down_write(&current->mm->mmap_sem);
+	if (down_write_killable(&current->mm->mmap_sem))
+		return -EINTR;
 
 	vma = find_vma(current->mm, start);
 	error = -ENOMEM;
diff --git a/mm/mremap.c b/mm/mremap.c
index 9dc499977924..1f157adfdaf9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -503,7 +503,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	if (!new_len)
 		return ret;
 
-	down_write(&current->mm->mmap_sem);
+	if (down_write_killable(&current->mm->mmap_sem))
+		return -EINTR;
 
 	if (flags & MREMAP_FIXED) {
 		ret = mremap_to(addr, old_len, new_addr, new_len,
diff --git a/mm/nommu.c b/mm/nommu.c
index c8bd59a03c71..b74512746aae 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
 
 	if (file)
 		fput(file);
diff --git a/mm/util.c b/mm/util.c
index 8a1b3a1fb595..03b237746850 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
-	unsigned long flag, unsigned long pgoff)
+	unsigned long flag, unsigned long pgoff, bool killable)
 {
 	unsigned long ret;
 	struct mm_struct *mm = current->mm;
@@ -297,7 +297,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
-		down_write(&mm->mmap_sem);
+		if (killable) {
+			if (down_write_killable(&mm->mmap_sem))
+				return -EINTR;
+		} else {
+			down_write(&mm->mmap_sem);
+		}
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
 				    &populate);
 		up_write(&mm->mmap_sem);
@@ -307,6 +312,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	return ret;
 }
 
+/* XXX are all callers checking an error */
 unsigned long vm_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long offset)
@@ -316,7 +322,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 	if (unlikely(offset_in_page(offset)))
 		return -EINVAL;
 
-	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false);
 }
 EXPORT_SYMBOL(vm_mmap);
 
-- 
cgit v1.2.3


From 9fbeb5ab59a2b2a09cca2eb68283e7a090d4b98d Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:30 -0700
Subject: mm: make vm_mmap killable

All the callers of vm_mmap seem to check for the failure already and
bail out in one way or another on the error which means that we can
change it to use killable version of vm_mmap_pgoff and return -EINTR if
the current task gets killed while waiting for mmap_sem.  This also
means that vm_mmap_pgoff can be killable by default and drop the
additional parameter.

This will help in the OOM conditions when the oom victim might be stuck
waiting for the mmap_sem for write which in turn can block oom_reaper
which relies on the mmap_sem for read to make a forward progress and
reclaim the address space of the victim.

Please note that load_elf_binary is ignoring vm_mmap error for
current->personality & MMAP_PAGE_ZERO case but that shouldn't be a
problem because the address is not used anywhere and we never return to
the userspace if we got killed.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h |  2 +-
 mm/internal.h      |  3 +--
 mm/mmap.c          |  2 +-
 mm/nommu.c         |  2 +-
 mm/util.c          | 13 ++++---------
 5 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b530c99e8e81..d5eb8dddd7c0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2013,7 +2013,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 /* These take the mm semaphore themselves */
 extern unsigned long vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
-extern unsigned long vm_mmap(struct file *, unsigned long,
+extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
         unsigned long, unsigned long);
 
diff --git a/mm/internal.h b/mm/internal.h
index bff7fd702331..a37e5b6f9d25 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -444,8 +444,7 @@ extern u32 hwpoison_filter_enable;
 
 extern unsigned long  __must_check vm_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long,
-        unsigned long, unsigned long,
-        bool);
+        unsigned long, unsigned long);
 
 extern void set_pageblock_order(void);
 unsigned long reclaim_clean_pages_from_list(struct zone *zone,
diff --git a/mm/mmap.c b/mm/mmap.c
index 11e1f2ca72af..420088682d4a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1333,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 out_fput:
 	if (file)
 		fput(file);
diff --git a/mm/nommu.c b/mm/nommu.c
index b74512746aae..c8bd59a03c71 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
 	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
+	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
 	if (file)
 		fput(file);
diff --git a/mm/util.c b/mm/util.c
index 03b237746850..917e0e3d0f8e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
-	unsigned long flag, unsigned long pgoff, bool killable)
+	unsigned long flag, unsigned long pgoff)
 {
 	unsigned long ret;
 	struct mm_struct *mm = current->mm;
@@ -297,12 +297,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
-		if (killable) {
-			if (down_write_killable(&mm->mmap_sem))
-				return -EINTR;
-		} else {
-			down_write(&mm->mmap_sem);
-		}
+		if (down_write_killable(&mm->mmap_sem))
+			return -EINTR;
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
 				    &populate);
 		up_write(&mm->mmap_sem);
@@ -312,7 +308,6 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	return ret;
 }
 
-/* XXX are all callers checking an error */
 unsigned long vm_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
 	unsigned long flag, unsigned long offset)
@@ -322,7 +317,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
 	if (unlikely(offset_in_page(offset)))
 		return -EINVAL;
 
-	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false);
+	return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
 }
 EXPORT_SYMBOL(vm_mmap);
 
-- 
cgit v1.2.3


From ae7987835643e470cb220e6685bd36d92179ef9c Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:33 -0700
Subject: mm: make vm_munmap killable

Almost all current users of vm_munmap are ignoring the return value and
so they do not handle potential error.  This means that some VMAs might
stay behind.  This patch doesn't try to solve those potential problems.
Quite the contrary, it adds a new failure mode by using down_write_killable
in vm_munmap.  This should be safer than other failure modes, though,
because the process is guaranteed to die as soon as it leaves the kernel
and exit_mmap will clean the whole address space.

This will help in the OOM conditions when the oom victim might be stuck
waiting for the mmap_sem for write which in turn can block oom_reaper
which relies on the mmap_sem for read to make a forward progress and
reclaim the address space of the victim.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/mmap.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 420088682d4a..ca292a7c2b68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2494,11 +2494,9 @@ int vm_munmap(unsigned long start, size_t len)
 	int ret;
 	struct mm_struct *mm = current->mm;
 
-	/*
-	 * XXX convert to down_write_killable as soon as all users are able
-	 * to handle the error.
-	 */
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+
 	ret = do_munmap(mm, start, len);
 	up_write(&mm->mmap_sem);
 	return ret;
-- 
cgit v1.2.3


From 2d6c928241add2848e4eebfce407e95164229976 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Mon, 23 May 2016 16:25:42 -0700
Subject: mm: make vm_brk killable

Now that all the callers handle vm_brk failure we can change it to wait
for mmap_sem killably so that oom_reaper does not get blocked just because
vm_brk gets blocked behind mmap_sem readers.

Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 2 +-
 mm/mmap.c          | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d5eb8dddd7c0..2835d598d258 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2011,7 +2011,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
 /* These take the mm semaphore themselves */
-extern unsigned long vm_brk(unsigned long, unsigned long);
+extern unsigned long __must_check vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
diff --git a/mm/mmap.c b/mm/mmap.c
index ca292a7c2b68..d3d9a94ca031 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2712,12 +2712,9 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 	unsigned long ret;
 	bool populate;
 
-	/*
-	 * XXX not all users are chcecking the return value, convert
-	 * to down_write_killable after they are able to cope with
-	 * error
-	 */
-	down_write(&mm->mmap_sem);
+	if (down_write_killable(&mm->mmap_sem))
+		return -EINTR;
+
 	ret = do_brk(addr, len);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
-- 
cgit v1.2.3


From 957949243bac5dc25e2a651f17059f54f184913e Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Thu, 26 May 2016 15:16:08 -0700
Subject: mm: make CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on !FLATMEM
 explicitly

Per the suggestion from Michal Hocko [1], DEFERRED_STRUCT_PAGE_INIT
requires some ordering wrt other initialization operations, e.g.
page_ext_init has to happen after the whole memmap is initialized
properly.

For SPARSEMEM this requires waiting for page_alloc_init_late.  Other
memory models (e.g.  flatmem) might have different initialization
layouts (page_ext_init_flatmem).  Currently DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG which in turn

	depends on SPARSEMEM || X86_64_ACPI_NUMA
	depends on ARCH_ENABLE_MEMORY_HOTPLUG

and X86_64_ACPI_NUMA depends on NUMA which in turn disables the FLATMEM
memory model:

config ARCH_FLATMEM_ENABLE
	def_bool y
	depends on X86_32 && !NUMA

so FLATMEM is ruled out via a dependency maze.  Be explicit and disable
FLATMEM for DEFERRED_STRUCT_PAGE_INIT so that we do not reintroduce
subtle initialization bugs.

[1] http://lkml.kernel.org/r/20160523073157.GD2278@dhcp22.suse.cz

Link: http://lkml.kernel.org/r/1464027356-32282-1-git-send-email-yang.shi@linaro.org
Signed-off-by: Yang Shi <yang.shi@linaro.org>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 2664c118b5d2..22fa8189e4fc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -649,6 +649,7 @@ config DEFERRED_STRUCT_PAGE_INIT
 	default n
 	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	depends on MEMORY_HOTPLUG
+	depends on !FLATMEM
 	help
 	  Ordinarily all struct pages are initialised during early boot in a
 	  single thread. On very large machines this can take a considerable
-- 
cgit v1.2.3


From 9725759a96efb1ce56a1b93455ac0ab1901c5327 Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin <aryabinin@virtuozzo.com>
Date: Thu, 26 May 2016 15:16:11 -0700
Subject: mm: kasan: remove unused 'reserved' field from struct
 kasan_alloc_meta

Commit cd11016e5f52 ("mm, kasan: stackdepot implementation.  Enable
stackdepot for SLAB") added 'reserved' field, but never used it.

Link: http://lkml.kernel.org/r/1464021054-2307-1-git-send-email-aryabinin@virtuozzo.com
Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/kasan/kasan.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 7f7ac51d7faf..fb87923552ef 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -77,7 +77,6 @@ struct kasan_alloc_meta {
 	struct kasan_track track;
 	u32 state : 2;	/* enum kasan_state */
 	u32 alloc_size : 30;
-	u32 reserved;
 };
 
 struct qlist_node {
-- 
cgit v1.2.3


From 1ebab2db065e99eed9ab2304d3b7ad25c9568612 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Thu, 26 May 2016 15:16:19 -0700
Subject: memcg: fix mem_cgroup_out_of_memory() return value.

mem_cgroup_out_of_memory() is returning "true" if it finds a TIF_MEMDIE
task after an eligible task was found, "false" if it found a TIF_MEMDIE
task before an eligible task is found.

This difference confuses memory_max_write() which checks the return
value of mem_cgroup_out_of_memory().  Since memory_max_write() wants to
continue looping, mem_cgroup_out_of_memory() should return "true" in
this case.

This patch sets a dummy pointer in order to return "true".

Link: http://lkml.kernel.org/r/1463753327-5170-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cf428d7b9a03..f6477a9dbe7a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1302,6 +1302,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				mem_cgroup_iter_break(memcg, iter);
 				if (chosen)
 					put_task_struct(chosen);
+				/* Set a dummy value to return "true". */
+				chosen = (void *) 1;
 				goto unlock;
 			case OOM_SCAN_OK:
 				break;
-- 
cgit v1.2.3


From 4abaac9b733ea44fcf0d561ec1813e0394e61c9d Mon Sep 17 00:00:00 2001
From: Dan Streetman <ddstreet@ieee.org>
Date: Thu, 26 May 2016 15:16:27 -0700
Subject: update "mm/zsmalloc: don't fail if can't create debugfs info"

Some updates to commit d34f615720d1 ("mm/zsmalloc: don't fail if can't
create debugfs info"):

 - add pr_warn to all stat failure cases
 - do not prevent module loading on stat failure

Link: http://lkml.kernel.org/r/1463671123-5479-1-git-send-email-ddstreet@ieee.org
Signed-off-by: Dan Streetman <ddstreet@ieee.org>
Reviewed-by: Ganesh Mahendran <opensource.ganesh@gmail.com>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Cc: Dan Streetman <dan.streetman@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/zsmalloc.c | 35 ++++++++++++++++-------------------
 1 file changed, 16 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 72698db958e7..b6d4f258cb53 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -45,6 +45,8 @@
  *
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
@@ -483,16 +485,16 @@ static inline unsigned long zs_stat_get(struct size_class *class,
 
 #ifdef CONFIG_ZSMALLOC_STAT
 
-static int __init zs_stat_init(void)
+static void __init zs_stat_init(void)
 {
-	if (!debugfs_initialized())
-		return -ENODEV;
+	if (!debugfs_initialized()) {
+		pr_warn("debugfs not available, stat dir not created\n");
+		return;
+	}
 
 	zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
 	if (!zs_stat_root)
-		return -ENOMEM;
-
-	return 0;
+		pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
 }
 
 static void __exit zs_stat_exit(void)
@@ -577,8 +579,10 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
 	struct dentry *entry;
 
-	if (!zs_stat_root)
+	if (!zs_stat_root) {
+		pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
 		return;
+	}
 
 	entry = debugfs_create_dir(name, zs_stat_root);
 	if (!entry) {
@@ -592,7 +596,8 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 	if (!entry) {
 		pr_warn("%s: debugfs file entry <%s> creation failed\n",
 				name, "classes");
-		return;
+		debugfs_remove_recursive(pool->stat_dentry);
+		pool->stat_dentry = NULL;
 	}
 }
 
@@ -602,9 +607,8 @@ static void zs_pool_stat_destroy(struct zs_pool *pool)
 }
 
 #else /* CONFIG_ZSMALLOC_STAT */
-static int __init zs_stat_init(void)
+static void __init zs_stat_init(void)
 {
-	return 0;
 }
 
 static void __exit zs_stat_exit(void)
@@ -2011,17 +2015,10 @@ static int __init zs_init(void)
 	zpool_register_driver(&zs_zpool_driver);
 #endif
 
-	ret = zs_stat_init();
-	if (ret) {
-		pr_err("zs stat initialization failed\n");
-		goto stat_fail;
-	}
+	zs_stat_init();
+
 	return 0;
 
-stat_fail:
-#ifdef CONFIG_ZPOOL
-	zpool_unregister_driver(&zs_zpool_driver);
-#endif
 notifier_fail:
 	zs_unregister_cpu_notifier();
 
-- 
cgit v1.2.3


From 5930122683dff58f0846b0f0405b4bd598a3ba6a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 27 May 2016 10:19:30 -0400
Subject: switch xattr_handler->set() to passing dentry and inode separately

preparation for similar switch in ->setxattr() (see the next commit for
rationale).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/9p/acl.c                  |  6 +++---
 fs/9p/xattr.c                |  5 +++--
 fs/btrfs/xattr.c             | 12 +++++-------
 fs/ceph/xattr.c              |  7 ++++---
 fs/cifs/xattr.c              |  9 +++++----
 fs/ext2/xattr_security.c     |  7 ++++---
 fs/ext2/xattr_trusted.c      |  7 ++++---
 fs/ext2/xattr_user.c         |  9 +++++----
 fs/ext4/xattr_security.c     |  7 ++++---
 fs/ext4/xattr_trusted.c      |  7 ++++---
 fs/ext4/xattr_user.c         |  9 +++++----
 fs/f2fs/xattr.c              | 12 ++++++------
 fs/gfs2/xattr.c              |  6 +++---
 fs/hfsplus/xattr.c           | 12 ++++++------
 fs/hfsplus/xattr.h           |  2 +-
 fs/hfsplus/xattr_security.c  |  7 ++++---
 fs/hfsplus/xattr_trusted.c   |  7 ++++---
 fs/hfsplus/xattr_user.c      |  7 ++++---
 fs/jffs2/security.c          |  7 ++++---
 fs/jffs2/xattr_trusted.c     |  7 ++++---
 fs/jffs2/xattr_user.c        |  7 ++++---
 fs/jfs/xattr.c               | 14 ++++++--------
 fs/nfs/nfs4proc.c            | 19 +++++++++----------
 fs/ocfs2/xattr.c             | 23 +++++++++++++----------
 fs/orangefs/xattr.c          | 10 ++++++----
 fs/posix_acl.c               |  6 +++---
 fs/reiserfs/xattr_security.c |  9 +++++----
 fs/reiserfs/xattr_trusted.c  |  9 +++++----
 fs/reiserfs/xattr_user.c     |  9 +++++----
 fs/ubifs/xattr.c             |  7 +++----
 fs/xattr.c                   |  6 ++++--
 fs/xfs/xfs_xattr.c           |  9 +++++----
 include/linux/xattr.h        |  4 ++--
 mm/shmem.c                   |  7 ++++---
 34 files changed, 156 insertions(+), 135 deletions(-)

(limited to 'mm')

diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index eb3589edf485..0576eaeb60b9 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
-			      struct dentry *dentry, const char *name,
-			      const void *value, size_t size, int flags)
+			      struct dentry *dentry, struct inode *inode,
+			      const char *name, const void *value,
+			      size_t size, int flags)
 {
 	int retval;
 	struct posix_acl *acl;
 	struct v9fs_session_info *v9ses;
-	struct inode *inode = d_inode(dentry);
 
 	v9ses = v9fs_dentry2v9ses(dentry);
 	/*
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 18c62bae9591..a6bd349bab23 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  const void *value, size_t size, int flags)
+				  struct dentry *dentry, struct inode *inode,
+				  const char *name, const void *value,
+				  size_t size, int flags)
 {
 	const char *full_name = xattr_full_name(handler, name);
 
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3bfb252206c7..d1a177a3dbe8 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *buffer, size_t size,
-				   int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *buffer,
+				   size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	name = xattr_full_name(handler, name);
 	return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
 }
 
 static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
-					struct dentry *dentry,
+					struct dentry *unused, struct inode *inode,
 					const char *name, const void *value,
 					size_t size, int flags)
 {
 	name = xattr_full_name(handler, name);
-	return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
+	return btrfs_set_prop(inode, name, value, size, flags);
 }
 
 static const struct xattr_handler btrfs_security_xattr_handler = {
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 0d66722c6a52..2baa6939dfe6 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1051,12 +1051,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler,
 }
 
 static int ceph_set_xattr_handler(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  const void *value, size_t size, int flags)
+				  struct dentry *unused, struct inode *inode,
+				  const char *name, const void *value,
+				  size_t size, int flags)
 {
 	if (!ceph_is_valid_xattr(name))
 		return -EOPNOTSUPP;
-	return __ceph_setxattr(d_inode(dentry), name, value, size, flags);
+	return __ceph_setxattr(inode, name, value, size, flags);
 }
 
 const struct xattr_handler ceph_other_xattr_handler = {
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index c8b77aa24a1d..5e23f64c0804 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -39,8 +39,9 @@
 enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT };
 
 static int cifs_xattr_set(const struct xattr_handler *handler,
-			  struct dentry *dentry, const char *name,
-			  const void *value, size_t size, int flags)
+			  struct dentry *dentry, struct inode *inode,
+			  const char *name, const void *value,
+			  size_t size, int flags)
 {
 	int rc = -EOPNOTSUPP;
 	unsigned int xid;
@@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler,
 			if (value &&
 			    pTcon->ses->server->ops->set_acl)
 				rc = pTcon->ses->server->ops->set_acl(pacl,
-						size, d_inode(dentry),
+						size, inode,
 						full_path, CIFS_ACL_DACL);
 			else
 				rc = -EOPNOTSUPP;
 			if (rc == 0) /* force revalidate of the inode */
-				CIFS_I(d_inode(dentry))->time = 0;
+				CIFS_I(inode)->time = 0;
 			kfree(pacl);
 		}
 #endif /* CONFIG_CIFS_ACL */
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 7fd3b867ce65..7b9e9c1842d5 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_security_set(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
+			struct dentry *unused, struct inode *inode,
+			const char *name, const void *value,
+			size_t size, int flags)
 {
-	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
+	return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name,
 			      value, size, flags);
 }
 
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 0f85705ff519..65049b71af13 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_trusted_set(const struct xattr_handler *handler,
-		       struct dentry *dentry, const char *name,
-		       const void *value, size_t size, int flags)
+		       struct dentry *unused, struct inode *inode,
+		       const char *name, const void *value,
+		       size_t size, int flags)
 {
-	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
+	return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name,
 			      value, size, flags);
 }
 
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 1fafd27037cc..fb2f992ae763 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext2_xattr_user_set(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    const void *value, size_t size, int flags)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, const void *value,
+		    size_t size, int flags)
 {
-	if (!test_opt(dentry->d_sb, XATTR_USER))
+	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
 
-	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER,
+	return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER,
 			      name, value, size, flags);
 }
 
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 123a7d010efe..a8921112030d 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_security_set(const struct xattr_handler *handler,
-			struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags)
+			struct dentry *unused, struct inode *inode,
+			const char *name, const void *value,
+			size_t size, int flags)
 {
-	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY,
 			      name, value, size, flags);
 }
 
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 60652fa24cbc..c7765c735714 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_trusted_set(const struct xattr_handler *handler,
-		       struct dentry *dentry, const char *name,
-		       const void *value, size_t size, int flags)
+		       struct dentry *unused, struct inode *inode,
+		       const char *name, const void *value,
+		       size_t size, int flags)
 {
-	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
 			      name, value, size, flags);
 }
 
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 17a446ffecd3..ca20e423034b 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
 
 static int
 ext4_xattr_user_set(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    const void *value, size_t size, int flags)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, const void *value,
+		    size_t size, int flags)
 {
-	if (!test_opt(dentry->d_sb, XATTR_USER))
+	if (!test_opt(inode->i_sb, XATTR_USER))
 		return -EOPNOTSUPP;
-	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER,
 			      name, value, size, flags);
 }
 
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 00ea56797258..e3decae3acfb 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
-		struct dentry *dentry, const char *name, const void *value,
+		struct dentry *unused, struct inode *inode,
+		const char *name, const void *value,
 		size_t size, int flags)
 {
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
+	struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb);
 
 	switch (handler->flags) {
 	case F2FS_XATTR_INDEX_USER:
@@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
 	default:
 		return -EINVAL;
 	}
-	return f2fs_setxattr(d_inode(dentry), handler->flags, name,
+	return f2fs_setxattr(inode, handler->flags, name,
 					value, size, NULL, flags);
 }
 
@@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
 }
 
 static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
-		struct dentry *dentry, const char *name, const void *value,
+		struct dentry *unused, struct inode *inode,
+		const char *name, const void *value,
 		size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 	if (value == NULL)
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index f42ab53bd30d..3a2853504084 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
 }
 
 static int gfs2_xattr_set(const struct xattr_handler *handler,
-			  struct dentry *dentry, const char *name,
-			  const void *value, size_t size, int flags)
+			  struct dentry *unused, struct inode *inode,
+			  const char *name, const void *value,
+			  size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
 	int ret;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 4f118d282a7a..d37bb88dc746 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len)
 	return len;
 }
 
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
 		     const void *value, size_t size, int flags,
 		     const char *prefix, size_t prefixlen)
 {
@@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 		return -ENOMEM;
 	strcpy(xattr_name, prefix);
 	strcpy(xattr_name + prefixlen, name);
-	res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size,
-				 flags);
+	res = __hfsplus_setxattr(inode, xattr_name, value, size, flags);
 	kfree(xattr_name);
 	return res;
 }
@@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
-				struct dentry *dentry, const char *name,
-				const void *buffer, size_t size, int flags)
+				struct dentry *unused, struct inode *inode,
+				const char *name, const void *buffer,
+				size_t size, int flags)
 {
 	/*
 	 * Don't allow setting properly prefixed attributes
@@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
 	 * creates), so we pass the name through unmodified (after
 	 * ensuring it doesn't conflict with another namespace).
 	 */
-	return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
+	return __hfsplus_setxattr(inode, name, buffer, size, flags);
 }
 
 const struct xattr_handler hfsplus_xattr_osx_handler = {
diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h
index d04ba6f58df2..68f6b539371f 100644
--- a/fs/hfsplus/xattr.h
+++ b/fs/hfsplus/xattr.h
@@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[];
 int __hfsplus_setxattr(struct inode *inode, const char *name,
 			const void *value, size_t size, int flags);
 
-int hfsplus_setxattr(struct dentry *dentry, const char *name,
+int hfsplus_setxattr(struct inode *inode, const char *name,
 				   const void *value, size_t size, int flags,
 				   const char *prefix, size_t prefixlen);
 
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index ae2ca8c2e335..37b3efa733ef 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_security_setxattr(const struct xattr_handler *handler,
-				     struct dentry *dentry, const char *name,
-				     const void *buffer, size_t size, int flags)
+				     struct dentry *unused, struct inode *inode,
+				     const char *name, const void *buffer,
+				     size_t size, int flags)
 {
-	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+	return hfsplus_setxattr(inode, name, buffer, size, flags,
 				XATTR_SECURITY_PREFIX,
 				XATTR_SECURITY_PREFIX_LEN);
 }
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index eae2947060aa..94519d6c627d 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
-				    struct dentry *dentry, const char *name,
-				    const void *buffer, size_t size, int flags)
+				    struct dentry *unused, struct inode *inode,
+				    const char *name, const void *buffer,
+				    size_t size, int flags)
 {
-	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+	return hfsplus_setxattr(inode, name, buffer, size, flags,
 				XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
 }
 
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 3c9eec3e4c7b..fae6c0ea0030 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int hfsplus_user_setxattr(const struct xattr_handler *handler,
-				 struct dentry *dentry, const char *name,
-				 const void *buffer, size_t size, int flags)
+				 struct dentry *unused, struct inode *inode,
+				 const char *name, const void *buffer,
+				 size_t size, int flags)
 {
-	return hfsplus_setxattr(dentry, name, buffer, size, flags,
+	return hfsplus_setxattr(inode, name, buffer, size, flags,
 				XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index 3ed9a4b49778..c2332e30f218 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_security_setxattr(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *buffer, size_t size, int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *buffer,
+				   size_t size, int flags)
 {
-	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY,
 				 name, buffer, size, flags);
 }
 
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index 4ebecff1d922..5d6030826c52 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
-				  struct dentry *dentry, const char *name,
-				  const void *buffer, size_t size, int flags)
+				  struct dentry *unused, struct inode *inode,
+				  const char *name, const void *buffer,
+				  size_t size, int flags)
 {
-	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED,
 				 name, buffer, size, flags);
 }
 
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index bce249e1b277..9d027b4abcf9 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler,
 }
 
 static int jffs2_user_setxattr(const struct xattr_handler *handler,
-			       struct dentry *dentry, const char *name,
-			       const void *buffer, size_t size, int flags)
+			       struct dentry *unused, struct inode *inode,
+			       const char *name, const void *buffer,
+			       size_t size, int flags)
 {
-	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
+	return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER,
 				 name, buffer, size, flags);
 }
 
diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c
index beb182b503b3..0bf3c33aedff 100644
--- a/fs/jfs/xattr.c
+++ b/fs/jfs/xattr.c
@@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set(const struct xattr_handler *handler,
-			 struct dentry *dentry, const char *name,
-			 const void *value, size_t size, int flags)
+			 struct dentry *unused, struct inode *inode,
+			 const char *name, const void *value,
+			 size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	name = xattr_full_name(handler, name);
 	return __jfs_xattr_set(inode, name, value, size, flags);
 }
@@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler,
 }
 
 static int jfs_xattr_set_os2(const struct xattr_handler *handler,
-			     struct dentry *dentry, const char *name,
-			     const void *value, size_t size, int flags)
+			     struct dentry *unused, struct inode *inode,
+			     const char *name, const void *value,
+			     size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	if (is_known_namespace(name))
 		return -EOPNOTSUPP;
 	return __jfs_xattr_set(inode, name, value, size, flags);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 084e8570da18..2e802ec47b8a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4993,12 +4993,11 @@ static int nfs4_do_set_security_label(struct inode *inode,
 }
 
 static int
-nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen)
+nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
 {
 	struct nfs4_label ilabel, *olabel = NULL;
 	struct nfs_fattr fattr;
 	struct rpc_cred *cred;
-	struct inode *inode = d_inode(dentry);
 	int status;
 
 	if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL))
@@ -6255,11 +6254,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
 #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
 
 static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *key,
-				   const void *buf, size_t buflen,
-				   int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *key, const void *buf,
+				   size_t buflen, int flags)
 {
-	return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
+	return nfs4_proc_set_acl(inode, buf, buflen);
 }
 
 static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
@@ -6277,12 +6276,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
 
 static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
-				     struct dentry *dentry, const char *key,
-				     const void *buf, size_t buflen,
-				     int flags)
+				     struct dentry *unused, struct inode *inode,
+				     const char *key, const void *buf,
+				     size_t buflen, int flags)
 {
 	if (security_ismaclabel(key))
-		return nfs4_set_security_label(dentry, buf, buflen);
+		return nfs4_set_security_label(inode, buf, buflen);
 
 	return -EOPNOTSUPP;
 }
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ad16995c9e7a..d2053853951e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
-				    struct dentry *dentry, const char *name,
-				    const void *value, size_t size, int flags)
+				    struct dentry *unused, struct inode *inode,
+				    const char *name, const void *value,
+				    size_t size, int flags)
 {
-	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY,
 			       name, value, size, flags);
 }
 
@@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *value, size_t size, int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
 {
-	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED,
 			       name, value, size, flags);
 }
 
@@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
 }
 
 static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
-				struct dentry *dentry, const char *name,
-				const void *value, size_t size, int flags)
+				struct dentry *unused, struct inode *inode,
+				const char *name, const void *value,
+				size_t size, int flags)
 {
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
 		return -EOPNOTSUPP;
 
-	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER,
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER,
 			       name, value, size, flags);
 }
 
diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c
index 99c19545752c..5893ddde0e4b 100644
--- a/fs/orangefs/xattr.c
+++ b/fs/orangefs/xattr.c
@@ -448,13 +448,14 @@ out_unlock:
 }
 
 static int orangefs_xattr_set_default(const struct xattr_handler *handler,
-				      struct dentry *dentry,
+				      struct dentry *unused,
+				      struct inode *inode,
 				      const char *name,
 				      const void *buffer,
 				      size_t size,
 				      int flags)
 {
-	return orangefs_inode_setxattr(dentry->d_inode,
+	return orangefs_inode_setxattr(inode,
 				    ORANGEFS_XATTR_NAME_DEFAULT_PREFIX,
 				    name,
 				    buffer,
@@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler,
 }
 
 static int orangefs_xattr_set_trusted(const struct xattr_handler *handler,
-				     struct dentry *dentry,
+				     struct dentry *unused,
+				     struct inode *inode,
 				     const char *name,
 				     const void *buffer,
 				     size_t size,
 				     int flags)
 {
-	return orangefs_inode_setxattr(dentry->d_inode,
+	return orangefs_inode_setxattr(inode,
 				    ORANGEFS_XATTR_NAME_TRUSTED_PREFIX,
 				    name,
 				    buffer,
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 2c60f17e7d92..8a4a266beff3 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
 
 static int
 posix_acl_xattr_set(const struct xattr_handler *handler,
-		    struct dentry *dentry, const char *name,
-		    const void *value, size_t size, int flags)
+		    struct dentry *unused, struct inode *inode,
+		    const char *name, const void *value,
+		    size_t size, int flags)
 {
-	struct inode *inode = d_backing_inode(dentry);
 	struct posix_acl *acl = NULL;
 	int ret;
 
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 86aeb9dd805a..e4cbb7719906 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-security_set(const struct xattr_handler *handler, struct dentry *dentry,
-	     const char *name, const void *buffer, size_t size, int flags)
+security_set(const struct xattr_handler *handler, struct dentry *unused,
+	     struct inode *inode, const char *name, const void *buffer,
+	     size_t size, int flags)
 {
-	if (IS_PRIVATE(d_inode(dentry)))
+	if (IS_PRIVATE(inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(d_inode(dentry),
+	return reiserfs_xattr_set(inode,
 				  xattr_full_name(handler, name),
 				  buffer, size, flags);
 }
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index 31837f031f59..f15a5f9e84ce 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
-	    const char *name, const void *buffer, size_t size, int flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *unused,
+	    struct inode *inode, const char *name, const void *buffer,
+	    size_t size, int flags)
 {
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
+	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode))
 		return -EPERM;
 
-	return reiserfs_xattr_set(d_inode(dentry),
+	return reiserfs_xattr_set(inode,
 				  xattr_full_name(handler, name),
 				  buffer, size, flags);
 }
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index f7c39731684b..dc59df43b2db 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused,
 }
 
 static int
-user_set(const struct xattr_handler *handler, struct dentry *dentry,
-	 const char *name, const void *buffer, size_t size, int flags)
+user_set(const struct xattr_handler *handler, struct dentry *unused,
+	 struct inode *inode, const char *name, const void *buffer,
+	 size_t size, int flags)
 {
-	if (!reiserfs_xattrs_user(dentry->d_sb))
+	if (!reiserfs_xattrs_user(inode->i_sb))
 		return -EOPNOTSUPP;
-	return reiserfs_xattr_set(d_inode(dentry),
+	return reiserfs_xattr_set(inode,
 				  xattr_full_name(handler, name),
 				  buffer, size, flags);
 }
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 6c277eb6aef9..b5fc27969e9d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler,
 }
 
 static int ubifs_xattr_set(const struct xattr_handler *handler,
-			   struct dentry *dentry, const char *name,
-			   const void *value, size_t size, int flags)
+			   struct dentry *dentry, struct inode *inode,
+			   const char *name, const void *value,
+			   size_t size, int flags)
 {
-	struct inode *inode = d_inode(dentry);
-
 	dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
 		name, inode->i_ino, dentry, size);
 
diff --git a/fs/xattr.c b/fs/xattr.c
index fc81e771488a..b16d07889700 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -754,7 +754,8 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
-	return handler->set(handler, dentry, name, value, size, flags);
+	return handler->set(handler, dentry, d_inode(dentry), name, value,
+			    size, flags);
 }
 
 /*
@@ -769,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name)
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (IS_ERR(handler))
 		return PTR_ERR(handler);
-	return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
+	return handler->set(handler, dentry, d_inode(dentry), name, NULL,
+			    0, XATTR_REPLACE);
 }
 
 EXPORT_SYMBOL(generic_getxattr);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index d111f691f313..2773b155cb56 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -74,11 +74,12 @@ xfs_forget_acl(
 }
 
 static int
-xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
-		const char *name, const void *value, size_t size, int flags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused,
+		struct inode *inode, const char *name, const void *value,
+		size_t size, int flags)
 {
 	int			xflags = handler->flags;
-	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
+	struct xfs_inode	*ip = XFS_I(inode);
 	int			error;
 
 	/* Convert Linux syscall to XFS internal ATTR flags */
@@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
 	error = xfs_attr_set(ip, (unsigned char *)name,
 				(void *)value, size, xflags);
 	if (!error)
-		xfs_forget_acl(d_inode(dentry), name, xflags);
+		xfs_forget_acl(inode, name, xflags);
 
 	return error;
 }
diff --git a/include/linux/xattr.h b/include/linux/xattr.h
index 1cc4c578deb9..76beb206741a 100644
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -33,8 +33,8 @@ struct xattr_handler {
 		   struct inode *inode, const char *name, void *buffer,
 		   size_t size);
 	int (*set)(const struct xattr_handler *, struct dentry *dentry,
-		   const char *name, const void *buffer, size_t size,
-		   int flags);
+		   struct inode *inode, const char *name, const void *buffer,
+		   size_t size, int flags);
 };
 
 const char *xattr_full_name(const struct xattr_handler *, const char *);
diff --git a/mm/shmem.c b/mm/shmem.c
index e418a995427d..a36144909b28 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2645,10 +2645,11 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
 }
 
 static int shmem_xattr_handler_set(const struct xattr_handler *handler,
-				   struct dentry *dentry, const char *name,
-				   const void *value, size_t size, int flags)
+				   struct dentry *unused, struct inode *inode,
+				   const char *name, const void *value,
+				   size_t size, int flags)
 {
-	struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	name = xattr_full_name(handler, name);
 	return simple_xattr_set(&info->xattrs, name, value, size, flags);
-- 
cgit v1.2.3


From edd9f7230f591b7988533b1cafb07f3c03555f19 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Fri, 27 May 2016 14:27:21 -0700
Subject: mm: oom: do not reap task if there are live threads in threadgroup

If the current process is exiting, we don't invoke oom killer, instead
we give it access to memory reserves and try to reap its mm in case
nobody is going to use it.  There's a mistake in the code performing
this check - we just ignore any process of the same thread group no
matter if it is exiting or not - see try_oom_reaper.  Fix it.

Link: http://lkml.kernel.org/r/1464087628-7318-1-git-send-email-vdavydov@virtuozzo.com
Fixes: 3ef22dfff239 ("oom, oom_reaper: try to reap tasks which skip regular OOM killer path")
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5bb2f7698ad7..326dd14938f0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -611,8 +611,6 @@ void try_oom_reaper(struct task_struct *tsk)
 
 			if (!process_shares_mm(p, mm))
 				continue;
-			if (same_thread_group(p, tsk))
-				continue;
 			if (fatal_signal_pending(p))
 				continue;
 
-- 
cgit v1.2.3


From fe53ca54270a757f0a28ee6bf3a54d952b550ed0 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Fri, 27 May 2016 14:27:30 -0700
Subject: mm: use early_pfn_to_nid in page_ext_init

page_ext_init() checks suitable pages with pfn_to_nid(), but
pfn_to_nid() depends on memmap which will not be setup fully until
page_alloc_init_late() is done.  Use early_pfn_to_nid() instead of
pfn_to_nid() so that page extension could be still used early even
though CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled and catch early page
allocation call sites.

Suggested by Joonsoo Kim [1], this fix basically undoes the change
introduced by commit b8f1a75d61d840 ("mm: call page_ext_init() after all
struct pages are initialized") and fixes the same problem with a better
approach.

[1] http://lkml.kernel.org/r/CAAmzW4OUmyPwQjvd7QUfc6W1Aic__TyAuH80MLRZNMxKy0-wPQ@mail.gmail.com

Link: http://lkml.kernel.org/r/1464198689-23458-1-git-send-email-yang.shi@linaro.org
Signed-off-by: Yang Shi <yang.shi@linaro.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/main.c   | 3 +--
 mm/page_ext.c | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/init/main.c b/init/main.c
index bc0f9e0bcf22..4c17fda5c2ff 100644
--- a/init/main.c
+++ b/init/main.c
@@ -607,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
+	page_ext_init();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
@@ -1003,8 +1004,6 @@ static noinline void __init kernel_init_freeable(void)
 	sched_init_smp();
 
 	page_alloc_init_late();
-	/* Initialize page ext after all struct pages are initializaed */
-	page_ext_init();
 
 	do_basic_setup();
 
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2d864e64f7fe..44a4c029c8e7 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -390,8 +390,10 @@ void __init page_ext_init(void)
 			 * We know some arch can have a nodes layout such as
 			 * -------------pfn-------------->
 			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 *
+			 * Take into account DEFERRED_STRUCT_PAGE_INIT.
 			 */
-			if (pfn_to_nid(pfn) != nid)
+			if (early_pfn_to_nid(pfn) != nid)
 				continue;
 			if (init_section_page_ext(pfn, nid))
 				goto oom;
-- 
cgit v1.2.3


From f65e91df25aa426289cbcb580ca3183e24979fb1 Mon Sep 17 00:00:00 2001
From: Yang Shi <yang.shi@linaro.org>
Date: Fri, 27 May 2016 14:27:32 -0700
Subject: mm: use early_pfn_to_nid in register_page_bootmem_info_node

register_page_bootmem_info_node() is invoked in mem_init(), so it will
be called before page_alloc_init_late() if DEFERRED_STRUCT_PAGE_INIT is
enabled.  But, pfn_to_nid() depends on memmap which won't be fully setup
until page_alloc_init_late() is done, so replace pfn_to_nid() by
early_pfn_to_nid().

Link: http://lkml.kernel.org/r/1464210007-30930-1-git-send-email-yang.shi@linaro.org
Signed-off-by: Yang Shi <yang.shi@linaro.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index caf2a14c37ad..b8ee0806415f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,7 +300,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 		 * multiple nodes we check that this pfn does not already
 		 * reside in some other nodes.
 		 */
-		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
 			register_page_bootmem_info_section(pfn);
 	}
 }
-- 
cgit v1.2.3


From e2fe14564d3316d1625ed20bf1083995f4960893 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Fri, 27 May 2016 14:27:35 -0700
Subject: oom_reaper: close race with exiting task

Tetsuo has reported:
  Out of memory: Kill process 443 (oleg's-test) score 855 or sacrifice child
  Killed process 443 (oleg's-test) total-vm:493248kB, anon-rss:423880kB, file-rss:4kB, shmem-rss:0kB
  sh invoked oom-killer: gfp_mask=0x24201ca(GFP_HIGHUSER_MOVABLE|__GFP_COLD), order=0, oom_score_adj=0
  sh cpuset=/ mems_allowed=0
  CPU: 2 PID: 1 Comm: sh Not tainted 4.6.0-rc7+ #51
  Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013
  Call Trace:
    dump_stack+0x85/0xc8
    dump_header+0x5b/0x394
  oom_reaper: reaped process 443 (oleg's-test), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB

In other words:

  __oom_reap_task		exit_mm
    atomic_inc_not_zero
				  tsk->mm = NULL
				  mmput
				    atomic_dec_and_test # > 0
				  exit_oom_victim # New victim will be
						  # selected
				<OOM killer invoked>
				  # no TIF_MEMDIE task so we can select a new one
    unmap_page_range # to release the memory

The race exists even without the oom_reaper because anybody who pins the
address space and gets preempted might race with exit_mm but oom_reaper
made this race more probable.

We can address the oom_reaper part by using oom_lock for __oom_reap_task
because this would guarantee that a new oom victim will not be selected
if the oom reaper might race with the exit path.  This doesn't solve the
original issue, though, because somebody else still might be pinning
mm_users and so __mmput won't be called to release the memory but that
is not really reliably solvable because the task will get away from the
oom sight as soon as it is unhashed from the task_list and so we cannot
guarantee a new victim won't be selected.

[akpm@linux-foundation.org: fix use of unused `mm', Per Stephen]
[akpm@linux-foundation.org: coding-style fixes]
Fixes: aac453635549 ("mm, oom: introduce oom reaper")
Link: http://lkml.kernel.org/r/1464271493-20008-1-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/oom_kill.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 326dd14938f0..dfb1ab61fb23 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -443,12 +443,28 @@ static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
+	struct mm_struct *mm = NULL;
 	struct task_struct *p;
 	struct zap_details details = {.check_swap_entries = true,
 				      .ignore_dirty = true};
 	bool ret = true;
 
+	/*
+	 * We have to make sure to not race with the victim exit path
+	 * and cause premature new oom victim selection:
+	 * __oom_reap_task		exit_mm
+	 *   atomic_inc_not_zero
+	 *				  mmput
+	 *				    atomic_dec_and_test
+	 *				  exit_oom_victim
+	 *				[...]
+	 *				out_of_memory
+	 *				  select_bad_process
+	 *				    # no TIF_MEMDIE task selects new victim
+	 *  unmap_page_range # frees some memory
+	 */
+	mutex_lock(&oom_lock);
+
 	/*
 	 * Make sure we find the associated mm_struct even when the particular
 	 * thread has already terminated and cleared its mm.
@@ -457,19 +473,19 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	p = find_lock_task_mm(tsk);
 	if (!p)
-		return true;
+		goto unlock_oom;
 
 	mm = p->mm;
 	if (!atomic_inc_not_zero(&mm->mm_users)) {
 		task_unlock(p);
-		return true;
+		goto unlock_oom;
 	}
 
 	task_unlock(p);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
-		goto out;
+		goto unlock_oom;
 	}
 
 	tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -511,13 +527,15 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 * to release its memory.
 	 */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
-out:
+unlock_oom:
+	mutex_unlock(&oom_lock);
 	/*
 	 * Drop our reference but make sure the mmput slow path is called from a
 	 * different context because we shouldn't risk we get stuck there and
 	 * put the oom_reaper out of the way.
 	 */
-	mmput_async(mm);
+	if (mm)
+		mmput_async(mm);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 0798d3c022dc63eb0ec02b511e1f76ca8411ef8e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Fri, 27 May 2016 14:27:38 -0700
Subject: mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()

If page_move_anon_rmap() is refiling a pmd-splitted THP mapped in a tail
page from a pte, the "address" must be THP aligned in order for the
page->index bugcheck to pass in the CONFIG_DEBUG_VM=y builds.

Link: http://lkml.kernel.org/r/1464253620-106404-1-git-send-email-kirill.shutemov@linux.intel.com
Fixes: 6d0a07edd17c ("mm: thp: calculate the mapcount correctly for THP pages during WP faults")
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Tested-by: Mika Westerberg <mika.westerberg@linux.intel.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: <stable@vger.kernel.org>        [4.5]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/rmap.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/rmap.c b/mm/rmap.c
index 8a839935b18c..0ea5d9071b32 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1098,6 +1098,8 @@ void page_move_anon_rmap(struct page *page,
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_VMA(!anon_vma, vma);
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page))
+		address &= HPAGE_PMD_MASK;
 	VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
-- 
cgit v1.2.3


From badbda53e505089062e194c614e6f23450bc98b2 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 27 May 2016 14:27:41 -0700
Subject: mm/cma: silence warnings due to max() usage

pageblock_order can be (at least) an unsigned int or an unsigned long
depending on the kernel config and architecture, so use max_t(unsigned
long, ...) when comparing it.

fixes these warnings:

In file included from include/asm-generic/bug.h:13:0,
                 from arch/powerpc/include/asm/bug.h:127,
                 from include/linux/bug.h:4,
                 from include/linux/mmdebug.h:4,
                 from include/linux/mm.h:8,
                 from include/linux/memblock.h:18,
                 from mm/cma.c:28:
mm/cma.c: In function 'cma_init_reserved_mem':
include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast
  (void) (&_max1 == &_max2);                   ^
mm/cma.c:186:27: note: in expansion of macro 'max'
  alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
                           ^
mm/cma.c: In function 'cma_declare_contiguous':
include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast
  (void) (&_max1 == &_max2);                   ^
include/linux/kernel.h:747:9: note: in definition of macro 'max'
  typeof(y) _max2 = (y);            ^
mm/cma.c:270:29: note: in expansion of macro 'max'
   (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
                             ^
include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast
  (void) (&_max1 == &_max2);                   ^
include/linux/kernel.h:747:21: note: in definition of macro 'max'
  typeof(y) _max2 = (y);                        ^
mm/cma.c:270:29: note: in expansion of macro 'max'
   (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
                             ^

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20160526150748.5be38a4f@canb.auug.org.au
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/cma.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/cma.c b/mm/cma.c
index ea506eb18cd6..bd0e1412475e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 		return -EINVAL;
 
 	/* ensure minimal alignment required by mm core */
-	alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+	alignment = PAGE_SIZE <<
+			max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
 
 	/* alignment should be aligned with order_per_bit */
 	if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
@@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	 * migratetype page by page allocator's buddy algorithm. In the case,
 	 * you couldn't get a contiguous memory, which is not what we want.
 	 */
-	alignment = max(alignment,
-		(phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+	alignment = max(alignment,  (phys_addr_t)PAGE_SIZE <<
+			  max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
 	base = ALIGN(base, alignment);
 	size = ALIGN(size, alignment);
 	limit &= ~(alignment - 1);
-- 
cgit v1.2.3


From cbedbac3e66121ddbac363776c23119f8eaeefda Mon Sep 17 00:00:00 2001
From: Li RongQing <roy.qing.li@gmail.com>
Date: Fri, 27 May 2016 14:27:43 -0700
Subject: mm/memcontrol.c: fix the margin computation in mem_cgroup_margin()

mem_cgroup_margin() might return (memory.limit - memory_count) when the
memsw.limit is in excess.  This doesn't happen usually because we do not
allow excess on hard limits and (memory.limit <= memsw.limit), but
__GFP_NOFAIL charges can force the charge and cause the excess when no
memory is really swappable (swap is full or no anonymous memory is
left).

[mhocko@suse.com: rewrote changelog]
  Link: http://lkml.kernel.org/r/20160525155122.GK20132@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/1464068266-27736-1-git-send-email-roy.qing.li@gmail.com
Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
Acked-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f6477a9dbe7a..485c688a7fa8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1108,6 +1108,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 		limit = READ_ONCE(memcg->memsw.limit);
 		if (count <= limit)
 			margin = min(margin, limit - count);
+		else
+			margin = 0;
 	}
 
 	return margin;
-- 
cgit v1.2.3


From 7cf7806ce1e30f1691cf340f70b807acbdf419ef Mon Sep 17 00:00:00 2001
From: Li RongQing <roy.qing.li@gmail.com>
Date: Fri, 27 May 2016 14:27:46 -0700
Subject: mm/memcontrol.c: move comments for get_mctgt_type() to proper
 position

Move the comments for get_mctgt_type() to be before get_mctgt_type()
implementation.

Link: http://lkml.kernel.org/r/1463644638-7446-1-git-send-email-roy.qing.li@gmail.com
Signed-off-by: Li RongQing <roy.qing.li@gmail.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/memcontrol.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 485c688a7fa8..925b431f3f03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4309,24 +4309,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	return 0;
 }
 
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Returns
- *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
- *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- *     move charge. if @target is not NULL, the page is stored in target->page
- *     with extra refcnt got(Callers should handle it).
- *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
- *     target for charge migration. if @target is not NULL, the entry is stored
- *     in target->ent.
- *
- * Called with pte lock held.
- */
 union mc_target {
 	struct page	*page;
 	swp_entry_t	ent;
@@ -4515,6 +4497,25 @@ out:
 	return ret;
 }
 
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. if @target is not NULL, the page is stored in target->page
+ *     with extra refcnt got(Callers should handle it).
+ *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ *     target for charge migration. if @target is not NULL, the entry is stored
+ *     in target->ent.
+ *
+ * Called with pte lock held.
+ */
+
 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {
-- 
cgit v1.2.3


From 11e685672a0861ce136cc4e7f6fdd11e5390b1fa Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Fri, 27 May 2016 14:27:49 -0700
Subject: mm: disable DEFERRED_STRUCT_PAGE_INIT on !NO_BOOTMEM

When we have !NO_BOOTMEM, the deferred page struct initialization
doesn't work well because the pages reserved in bootmem are released to
the page allocator unconditionally.  It causes memory corruption and
system crash eventually.

As Mel suggested, the bootmem is retiring slowly.  We fix the issue by
simply hiding DEFERRED_STRUCT_PAGE_INIT when bootmem is enabled.

Link: http://lkml.kernel.org/r/1460602170-5821-1-git-send-email-gwshan@linux.vnet.ibm.com
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index 22fa8189e4fc..3e2daef3c946 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -648,7 +648,7 @@ config DEFERRED_STRUCT_PAGE_INIT
 	bool "Defer initialisation of struct pages to kthreads"
 	default n
 	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-	depends on MEMORY_HOTPLUG
+	depends on NO_BOOTMEM && MEMORY_HOTPLUG
 	depends on !FLATMEM
 	help
 	  Ordinarily all struct pages are initialised during early boot in a
-- 
cgit v1.2.3


From 7ded384a12688c2a86b618da16bc87713404dfcc Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 27 May 2016 15:23:32 -0700
Subject: mm: fix section mismatch warning

The register_page_bootmem_info_node() function needs to be marked __init
in order to avoid a new warning introduced by commit f65e91df25aa ("mm:
use early_pfn_to_nid in register_page_bootmem_info_node").

Otherwise you'll get a warning about how a non-init function calls
early_pfn_to_nid (which is __meminit)

Cc: Yang Shi <yang.shi@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 2 +-
 mm/memory_hotplug.c            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index 20d8a5d4d133..5145620ba48a 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -182,7 +182,7 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat)
 #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */
 
 #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE
-extern void register_page_bootmem_info_node(struct pglist_data *pgdat);
+extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat);
 #else
 static inline void register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b8ee0806415f..e3cbdcaff2a5 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -263,7 +263,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn)
 }
 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
 
-void register_page_bootmem_info_node(struct pglist_data *pgdat)
+void __init register_page_bootmem_info_node(struct pglist_data *pgdat)
 {
 	unsigned long i, pfn, end_pfn, nr_pages;
 	int node = pgdat->node_id;
-- 
cgit v1.2.3


From 5d22fc25d4fc8096d2d7df27ea1893d4e055e764 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 27 May 2016 15:57:31 -0700
Subject: mm: remove more IS_ERR_VALUE abuses

The do_brk() and vm_brk() return value was "unsigned long" and returned
the starting address on success, and an error value on failure.  The
reasons are entirely historical, and go back to it basically behaving
like the mmap() interface does.

However, nobody actually wanted that interface, and it causes totally
pointless IS_ERR_VALUE() confusion.

What every single caller actually wants is just the simpler integer
return of zero for success and negative error number on failure.

So just convert to that much clearer and more common calling convention,
and get rid of all the IS_ERR_VALUE() uses wrt vm_brk().

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/ia32/ia32_aout.c | 17 ++++++++---------
 fs/binfmt_aout.c          | 18 +++++++-----------
 fs/binfmt_elf.c           | 11 +++++------
 include/linux/mm.h        |  2 +-
 mm/mmap.c                 | 16 ++++++++--------
 mm/nommu.c                |  2 +-
 6 files changed, 30 insertions(+), 36 deletions(-)

(limited to 'mm')

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index f5e737ff0022..cb26f18d43af 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -116,12 +116,12 @@ static struct linux_binfmt aout_format = {
 	.min_coredump	= PAGE_SIZE
 };
 
-static unsigned long set_brk(unsigned long start, unsigned long end)
+static int set_brk(unsigned long start, unsigned long end)
 {
 	start = PAGE_ALIGN(start);
 	end = PAGE_ALIGN(end);
 	if (end <= start)
-		return start;
+		return 0;
 	return vm_brk(start, end - start);
 }
 
@@ -321,7 +321,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
 		error = vm_brk(text_addr & PAGE_MASK, map_size);
 
-		if (error != (text_addr & PAGE_MASK))
+		if (error)
 			return error;
 
 		error = read_code(bprm->file, text_addr, 32,
@@ -350,7 +350,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
 		if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) {
 			error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-			if (IS_ERR_VALUE(error))
+			if (error)
 				return error;
 
 			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
@@ -378,7 +378,7 @@ static int load_aout_binary(struct linux_binprm *bprm)
 
 beyond_if:
 	error = set_brk(current->mm->start_brk, current->mm->brk);
-	if (IS_ERR_VALUE(error))
+	if (error)
 		return error;
 
 	set_binfmt(&aout_format);
@@ -441,7 +441,7 @@ static int load_aout_library(struct file *file)
 		}
 #endif
 		retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-		if (IS_ERR_VALUE(retval))
+		if (retval)
 			goto out;
 
 		read_code(file, start_addr, N_TXTOFF(ex),
@@ -461,9 +461,8 @@ static int load_aout_library(struct file *file)
 	len = PAGE_ALIGN(ex.a_text + ex.a_data);
 	bss = ex.a_text + ex.a_data + ex.a_bss;
 	if (bss > len) {
-		error = vm_brk(start_addr + len, bss - len);
-		retval = error;
-		if (error != start_addr + len)
+		retval = vm_brk(start_addr + len, bss - len);
+		if (retval)
 			goto out;
 	}
 	retval = 0;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 2fab9f130e51..64b331ae3428 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -127,11 +127,8 @@ static int set_brk(unsigned long start, unsigned long end)
 {
 	start = PAGE_ALIGN(start);
 	end = PAGE_ALIGN(end);
-	if (end > start) {
-		unsigned long addr;
-		addr = vm_brk(start, end - start);
-		if (BAD_ADDR(addr))
-			return addr;
+	if (end > start)
+		return vm_brk(start, end - start);
 	}
 	return 0;
 }
@@ -275,7 +272,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		map_size = ex.a_text+ex.a_data;
 #endif
 		error = vm_brk(text_addr & PAGE_MASK, map_size);
-		if (error != (text_addr & PAGE_MASK))
+		if (error)
 			return error;
 
 		error = read_code(bprm->file, text_addr, pos,
@@ -298,7 +295,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
 
 		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
 			error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-			if (IS_ERR_VALUE(error))
+			if (error)
 				return error;
 
 			read_code(bprm->file, N_TXTADDR(ex), fd_offset,
@@ -382,7 +379,7 @@ static int load_aout_library(struct file *file)
 			       file);
 		}
 		retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-		if (IS_ERR_VALUE(retval))
+		if (retval)
 			goto out;
 
 		read_code(file, start_addr, N_TXTOFF(ex),
@@ -402,9 +399,8 @@ static int load_aout_library(struct file *file)
 	len = PAGE_ALIGN(ex.a_text + ex.a_data);
 	bss = ex.a_text + ex.a_data + ex.a_bss;
 	if (bss > len) {
-		error = vm_brk(start_addr + len, bss - len);
-		retval = error;
-		if (error != start_addr + len)
+		retval = vm_brk(start_addr + len, bss - len);
+		if (retval)
 			goto out;
 	}
 	retval = 0;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 938fc4ede764..e158b22ef32f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end)
 	start = ELF_PAGEALIGN(start);
 	end = ELF_PAGEALIGN(end);
 	if (end > start) {
-		unsigned long addr;
-		addr = vm_brk(start, end - start);
-		if (BAD_ADDR(addr))
-			return addr;
+		int error = vm_brk(start, end - start);
+		if (error)
+			return error;
 	}
 	current->mm->start_brk = current->mm->brk = end;
 	return 0;
@@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
 		/* Map the last of the bss segment */
 		error = vm_brk(elf_bss, last_bss - elf_bss);
-		if (BAD_ADDR(error))
+		if (error)
 			goto out;
 	}
 
@@ -1178,7 +1177,7 @@ static int load_elf_library(struct file *file)
 	bss = eppnt->p_memsz + eppnt->p_vaddr;
 	if (bss > len) {
 		error = vm_brk(len, bss - len);
-		if (BAD_ADDR(error))
+		if (error)
 			goto out_free_ph;
 	}
 	error = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a00ec816233a..5df5feb49575 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2018,7 +2018,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {}
 #endif
 
 /* These take the mm semaphore themselves */
-extern unsigned long __must_check vm_brk(unsigned long, unsigned long);
+extern int __must_check vm_brk(unsigned long, unsigned long);
 extern int vm_munmap(unsigned long, size_t);
 extern unsigned long __must_check vm_mmap(struct file *, unsigned long,
         unsigned long, unsigned long,
diff --git a/mm/mmap.c b/mm/mmap.c
index d3d9a94ca031..de2c1769cc68 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -168,7 +168,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	return next;
 }
 
-static unsigned long do_brk(unsigned long addr, unsigned long len);
+static int do_brk(unsigned long addr, unsigned long len);
 
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
@@ -224,7 +224,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+	if (do_brk(oldbrk, newbrk-oldbrk) < 0)
 		goto out;
 
 set_brk:
@@ -2625,7 +2625,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static unsigned long do_brk(unsigned long addr, unsigned long len)
+static int do_brk(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -2636,7 +2636,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 
 	len = PAGE_ALIGN(len);
 	if (!len)
-		return addr;
+		return 0;
 
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
@@ -2703,13 +2703,13 @@ out:
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
 	vma->vm_flags |= VM_SOFTDIRTY;
-	return addr;
+	return 0;
 }
 
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
 {
 	struct mm_struct *mm = current->mm;
-	unsigned long ret;
+	int ret;
 	bool populate;
 
 	if (down_write_killable(&mm->mmap_sem))
@@ -2718,7 +2718,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 	ret = do_brk(addr, len);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
-	if (populate)
+	if (populate && !ret)
 		mm_populate(addr, len);
 	return ret;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index c8bd59a03c71..c2e58880207f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1682,7 +1682,7 @@ void exit_mmap(struct mm_struct *mm)
 	}
 }
 
-unsigned long vm_brk(unsigned long addr, unsigned long len)
+int vm_brk(unsigned long addr, unsigned long len)
 {
 	return -ENOMEM;
 }
-- 
cgit v1.2.3