From b296821a7c42fa58baa17513b2b7b30ae66f3336 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 10 Apr 2016 20:48:24 -0400 Subject: xattr_handler: pass dentry and inode as separate arguments of ->get() ... and do not assume they are already attached to each other Signed-off-by: Al Viro --- mm/shmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 9428c51ab2d6..00d5d025eece 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2646,10 +2646,10 @@ static int shmem_initxattrs(struct inode *inode, } static int shmem_xattr_handler_get(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_get(&info->xattrs, name, buffer, size); -- cgit v1.2.3 From 8efd755ac2fe262d4c8d5c9bbe054bb67dae93da Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2016 11:39:12 +0200 Subject: mm/mmu_context, sched/core: Fix mmu_context.h assumption Some architectures (such as Alpha) rely on include/linux/sched.h definitions in their mmu_context.h files. So include sched.h before mmu_context.h. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- mm/mmu_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmu_context.c b/mm/mmu_context.c index f802c2d216a7..6f4d27c5bb32 100644 --- a/mm/mmu_context.c +++ b/mm/mmu_context.c @@ -4,9 +4,9 @@ */ #include +#include #include #include -#include #include -- cgit v1.2.3 From c64fb5c7448c1a0cfa163f126df3c112b6ca3e97 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Apr 2016 08:51:55 -0700 Subject: filemap: remove pos variables in generic_file_read_iter Just use ki_pos directly to make everyones life easier. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- mm/filemap.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index f2479af09da9..5885925cdb5b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1838,8 +1838,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; ssize_t retval = 0; - loff_t *ppos = &iocb->ki_pos; - loff_t pos = *ppos; size_t count = iov_iter_count(iter); if (!count) @@ -1851,15 +1849,16 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t size; size = i_size_read(inode); - retval = filemap_write_and_wait_range(mapping, pos, - pos + count - 1); + retval = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count - 1); if (!retval) { struct iov_iter data = *iter; - retval = mapping->a_ops->direct_IO(iocb, &data, pos); + retval = mapping->a_ops->direct_IO(iocb, &data, + iocb->ki_pos); } if (retval > 0) { - *ppos = pos + retval; + iocb->ki_pos += retval; iov_iter_advance(iter, retval); } @@ -1872,14 +1871,14 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || IS_DAX(inode)) { file_accessed(file); goto out; } } - retval = do_generic_file_read(file, ppos, iter, retval); + retval = do_generic_file_read(file, &iocb->ki_pos, iter, retval); out: return retval; } -- cgit v1.2.3 From 1af5bb491fbb41c8dab9d728a92758dd6a28afd4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Apr 2016 08:51:56 -0700 Subject: filemap: remove the pos argument to generic_file_direct_write Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/btrfs/file.c | 9 ++++----- fs/fuse/file.c | 2 +- include/linux/fs.h | 2 +- mm/filemap.c | 5 +++-- 4 files changed, 9 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8d7b5a45c005..6c376311a9d7 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1703,18 +1703,17 @@ again: return num_written ? num_written : ret; } -static ssize_t __btrfs_direct_write(struct kiocb *iocb, - struct iov_iter *from, - loff_t pos) +static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + loff_t pos = iocb->ki_pos; ssize_t written; ssize_t written_buffered; loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from, pos); + written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -1832,7 +1831,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { - num_written = __btrfs_direct_write(iocb, from, pos); + num_written = __btrfs_direct_write(iocb, from); } else { num_written = __btrfs_buffered_write(file, from, pos); if (num_written > 0) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 719924d6c706..7e8c4603d43a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1186,7 +1186,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_DIRECT) { loff_t pos = iocb->ki_pos; - written = generic_file_direct_write(iocb, from, pos); + written = generic_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 70e61b58baaf..e9eaa2074061 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2703,7 +2703,7 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *); extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *); extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *); -extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *, loff_t); +extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *); extern ssize_t generic_perform_write(struct file *, struct iov_iter *, loff_t); ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos); diff --git a/mm/filemap.c b/mm/filemap.c index 5885925cdb5b..e7108c31346d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2499,11 +2499,12 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, EXPORT_SYMBOL(pagecache_write_end); ssize_t -generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; ssize_t written; size_t write_len; pgoff_t end; @@ -2717,7 +2718,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_DIRECT) { loff_t pos, endbyte; - written = generic_file_direct_write(iocb, from, iocb->ki_pos); + written = generic_file_direct_write(iocb, from); /* * If the write stopped short of completing, fall back to * buffered writes. Some filesystems do this for writes to -- cgit v1.2.3 From c8b8e32d700fe943a935e435ae251364d016c497 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Apr 2016 08:51:58 -0700 Subject: direct-io: eliminate the offset argument to ->direct_IO Including blkdev_direct_IO and dax_do_io. It has to be ki_pos to actually work, so eliminate the superflous argument. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- Documentation/filesystems/vfs.txt | 2 +- drivers/staging/lustre/lustre/llite/rw26.c | 4 ++-- fs/9p/vfs_addr.c | 3 ++- fs/affs/file.c | 5 +++-- fs/block_dev.c | 6 +++--- fs/btrfs/inode.c | 6 +++--- fs/ceph/addr.c | 3 +-- fs/cifs/file.c | 2 +- fs/dax.c | 4 ++-- fs/direct-io.c | 7 ++++--- fs/exofs/inode.c | 3 +-- fs/ext2/inode.c | 8 ++++---- fs/ext4/ext4.h | 3 +-- fs/ext4/indirect.c | 12 ++++++------ fs/ext4/inode.c | 18 +++++++++--------- fs/f2fs/data.c | 6 +++--- fs/fat/inode.c | 6 +++--- fs/fuse/file.c | 3 ++- fs/gfs2/aops.c | 6 +++--- fs/hfs/inode.c | 7 +++---- fs/hfsplus/inode.c | 7 +++---- fs/jfs/inode.c | 7 +++---- fs/nfs/direct.c | 17 +++++++---------- fs/nfs/file.c | 2 +- fs/nilfs2/inode.c | 4 ++-- fs/ocfs2/aops.c | 9 ++++----- fs/reiserfs/inode.c | 7 +++---- fs/udf/file.c | 3 +-- fs/udf/inode.c | 7 +++---- fs/xfs/xfs_aops.c | 7 +++---- fs/xfs/xfs_file.c | 2 +- include/linux/dax.h | 2 +- include/linux/fs.h | 9 ++++----- include/linux/nfs_fs.h | 5 ++--- mm/filemap.c | 5 ++--- mm/page_io.c | 2 +- 37 files changed, 99 insertions(+), 112 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 619af9bfdcb3..75eea7ce3d7c 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -194,7 +194,7 @@ prototypes: void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); - int (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset); + int (*direct_IO)(struct kiocb *, struct iov_iter *iter); int (*migratepage)(struct address_space *, struct page *, struct page *); int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, unsigned long, unsigned long); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 4164bd6397a2..c61a223ef3ff 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -591,7 +591,7 @@ struct address_space_operations { void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, int); void (*freepage)(struct page *); - ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset); + ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* migrate the contents of a page to the specified target */ int (*migratepage) (struct page *, struct page *); int (*launder_page) (struct page *); diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index 69aa15e8e3ef..0c3459c1a518 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -358,14 +358,14 @@ static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, */ #define MAX_DIO_SIZE ((KMALLOC_MAX_SIZE / sizeof(struct brw_page) * \ PAGE_SIZE) & ~(DT_MAX_BRW_SIZE - 1)) -static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, - loff_t file_offset) +static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter) { struct lu_env *env; struct cl_io *io; struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct ccc_object *obj = cl_inode2ccc(inode); + loff_t file_offset = iocb->ki_pos; ssize_t count = iov_iter_count(iter); ssize_t tot_bytes = 0, result = 0; struct ll_inode_info *lli = ll_i2info(inode); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index ac9225e86bf3..c37fb9c08970 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -245,9 +245,10 @@ static int v9fs_launder_page(struct page *page) * */ static ssize_t -v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; + loff_t pos = iocb->ki_pos; ssize_t n; int err = 0; if (iov_iter_rw(iter) == WRITE) { diff --git a/fs/affs/file.c b/fs/affs/file.c index 0cde550050e8..0deec9cc2362 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -389,12 +389,13 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) } static ssize_t -affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; if (iov_iter_rw(iter) == WRITE) { @@ -404,7 +405,7 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) return 0; } - ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, affs_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) affs_write_failed(mapping, offset + count); return ret; diff --git a/fs/block_dev.c b/fs/block_dev.c index 20a2c02b77c4..9e1f3fe25753 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -162,15 +162,15 @@ static struct inode *bdev_file_inode(struct file *file) } static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, + return dax_do_io(iocb, inode, iter, blkdev_get_block, NULL, DIO_SKIP_DIO_COUNT); - return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, + return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2aaba58b4856..352d4e1dc985 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8541,13 +8541,13 @@ out: return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_data dio_data = { 0 }; + loff_t offset = iocb->ki_pos; size_t count = 0; int flags = 0; bool wakeup = true; @@ -8607,7 +8607,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ret = __blockdev_direct_IO(iocb, inode, BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iter, offset, btrfs_get_blocks_direct, NULL, + iter, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4801571f51cb..43098cd9602b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1292,8 +1292,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * intercept O_DIRECT reads and writes early, this function should * never get called. */ -static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos) +static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter) { WARN_ON(1); return -EINVAL; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c03d0744648b..cb070aa88e57 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3854,7 +3854,7 @@ void cifs_oplock_break(struct work_struct *work) * Direct IO is not yet supported in the cached mode. */ static ssize_t -cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter) { /* * FIXME diff --git a/fs/dax.c b/fs/dax.c index 75ba46d82a76..0dbe4e0f16fe 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -244,7 +244,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, * @iocb: The control block for this I/O * @inode: The file which the I/O is directed at * @iter: The addresses to do I/O from or to - * @pos: The file offset where the I/O starts * @get_block: The filesystem method used to translate file offsets to blocks * @end_io: A filesystem callback for I/O completion * @flags: See below @@ -257,11 +256,12 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, * is in progress. */ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, loff_t pos, get_block_t get_block, + struct iov_iter *iter, get_block_t get_block, dio_iodone_t end_io, int flags) { struct buffer_head bh; ssize_t retval = -EINVAL; + loff_t pos = iocb->ki_pos; loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); diff --git a/fs/direct-io.c b/fs/direct-io.c index 472037732daf..8949d3e35756 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1113,7 +1113,7 @@ static inline int drop_refcount(struct dio *dio) static inline ssize_t do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, - loff_t offset, get_block_t get_block, dio_iodone_t end_io, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); @@ -1121,6 +1121,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, unsigned blocksize_mask = (1 << blkbits) - 1; ssize_t retval = -EINVAL; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; loff_t end = offset + count; struct dio *dio; struct dio_submit sdio = { 0, }; @@ -1328,7 +1329,7 @@ out: ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, - loff_t offset, get_block_t get_block, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags) { @@ -1344,7 +1345,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, prefetch(bdev->bd_queue); prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); - return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block, + return do_blockdev_direct_IO(iocb, inode, bdev, iter, get_block, end_io, submit_io, flags); } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 49e1bd00b4ec..9dc4c6dbf3c9 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -960,8 +960,7 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, /* TODO: Should be easy enough to do proprly */ -static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { return 0; } diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6bd58e6ff038..b675610391b8 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -854,20 +854,20 @@ static sector_t ext2_bmap(struct address_space *mapping, sector_t block) } static ssize_t -ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL, + ret = dax_do_io(iocb, inode, iter, ext2_get_block, NULL, DIO_LOCKING); else - ret = blockdev_direct_IO(iocb, inode, iter, offset, - ext2_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); return ret; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 349afebe21ee..72f4c9e00e97 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2581,8 +2581,7 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset); +extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3027fa681de5..627b7e8f9ef3 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -659,12 +659,12 @@ out: * crashes then stale disk data _may_ be exposed inside the file. But current * VFS code falls back into buffered path in that case so we are safe. */ -ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct ext4_inode_info *ei = EXT4_I(inode); + loff_t offset = iocb->ki_pos; handle_t *handle; ssize_t ret; int orphan = 0; @@ -707,21 +707,21 @@ retry: goto locked; } if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); else ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, ext4_dio_get_block, + ext4_dio_get_block, NULL, NULL, 0); inode_dio_end(inode); } else { locked: if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, + ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, DIO_LOCKING); else - ret = blockdev_direct_IO(iocb, inode, iter, offset, + ret = blockdev_direct_IO(iocb, inode, iter, ext4_dio_get_block); if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 981a1fc30eaa..79b298d397b4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3334,12 +3334,12 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * if the machine crashes during the write. * */ -static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; ssize_t ret; + loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(iter); int overwrite = 0; get_block_t *get_block_func = NULL; @@ -3348,7 +3348,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* Use the old path for reads and writes beyond i_size. */ if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(iocb, iter, offset); + return ext4_ind_direct_IO(iocb, iter); BUG_ON(iocb->private == NULL); @@ -3400,11 +3400,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, get_block_func, + ret = dax_do_io(iocb, inode, iter, get_block_func, ext4_end_io_dio, dio_flags); else ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, offset, + inode->i_sb->s_bdev, iter, get_block_func, ext4_end_io_dio, NULL, dio_flags); @@ -3431,12 +3431,12 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } -static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; #ifdef CONFIG_EXT4_FS_ENCRYPTION @@ -3456,9 +3456,9 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(iocb, iter, offset); + ret = ext4_ext_direct_IO(iocb, iter); else - ret = ext4_ind_direct_IO(iocb, iter, offset); + ret = ext4_ind_direct_IO(iocb, iter); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); return ret; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 53fec0872e60..a4c5da5bfe1e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1655,12 +1655,12 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, return 0; } -static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; int err; err = check_direct_IO(inode, iter, offset); @@ -1672,7 +1672,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); + err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio); if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 226281068a46..3bcf57925dca 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -244,13 +244,13 @@ static int fat_write_end(struct file *file, struct address_space *mapping, return err; } -static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; ssize_t ret; if (iov_iter_rw(iter) == WRITE) { @@ -272,7 +272,7 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). */ - ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, fat_get_block); if (ret < 0 && iov_iter_rw(iter) == WRITE) fat_write_failed(mapping, offset + count); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7e8c4603d43a..02279073bf64 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2837,7 +2837,7 @@ static inline loff_t fuse_round_up(loff_t off) } static ssize_t -fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { DECLARE_COMPLETION_ONSTACK(wait); ssize_t ret = 0; @@ -2848,6 +2848,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct inode *inode; loff_t i_size; size_t count = iov_iter_count(iter); + loff_t offset = iocb->ki_pos; struct fuse_io_priv *io; bool is_sync = is_sync_kiocb(iocb); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1bbbee945f46..8524c0e322fc 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -1042,13 +1042,13 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset) -static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct address_space *mapping = inode->i_mapping; struct gfs2_inode *ip = GFS2_I(inode); + loff_t offset = iocb->ki_pos; struct gfs2_holder gh; int rv; @@ -1099,7 +1099,7 @@ static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, gfs2_get_block_direct, NULL, NULL, 0); + gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index cb1e5faa2fb7..c82331a9cf9b 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -124,8 +124,7 @@ static int hfs_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -133,7 +132,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, hfs_get_block); /* * In case of error extending write may have instantiated a few @@ -141,7 +140,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if (end > isize) hfs_write_failed(mapping, end); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index b28f39865c3a..2ad34a5eb5ad 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -122,8 +122,7 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -131,7 +130,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, hfsplus_get_block); /* * In case of error extending write may have instantiated a few @@ -139,7 +138,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if (end > isize) hfsplus_write_failed(mapping, end); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 9d9bae63ae2a..f6a2a78121b0 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -332,8 +332,7 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, jfs_get_block); } -static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -341,7 +340,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, jfs_get_block); /* * In case of error extending write may have instantiated a few @@ -349,7 +348,7 @@ static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if (end > isize) jfs_write_failed(mapping, end); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index c93826e4a8c6..346b5d85ce92 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -250,7 +250,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, * shunt off direct read and write requests before the VFS gets them, * so this method is only ever called for swap. */ -ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -261,7 +261,7 @@ ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (iov_iter_rw(iter) == READ) - return nfs_file_direct_read(iocb, iter, pos); + return nfs_file_direct_read(iocb, iter); return nfs_file_direct_write(iocb, iter); } @@ -545,7 +545,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_read - file direct read operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers into which to read data - * @pos: byte offset in file where reading starts * * We use this function for direct reads instead of calling * generic_file_aio_read() in order to avoid gfar's check to see if @@ -561,8 +560,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * client must read the updated atime from the server back into its * cache. */ -ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos) +ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -574,7 +572,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); + file, count, (long long) iocb->ki_pos); result = 0; if (!count) @@ -594,7 +592,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, dreq->inode = inode; dreq->bytes_left = count; - dreq->io_start = pos; + dreq->io_start = iocb->ki_pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); if (IS_ERR(l_ctx)) { @@ -606,14 +604,14 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, dreq->iocb = iocb; NFS_I(inode)->read_io += count; - result = nfs_direct_read_schedule_iovec(dreq, iter, pos); + result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); if (result > 0) - iocb->ki_pos = pos + result; + iocb->ki_pos += result; } nfs_direct_req_release(dreq); @@ -969,7 +967,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * nfs_file_direct_write - file direct write operation for NFS files * @iocb: target I/O control block * @iter: vector of user buffers from which to write data - * @pos: byte offset in file where writing starts * * We use this function for direct writes instead of calling * generic_file_aio_write() in order to avoid taking the inode diff --git a/fs/nfs/file.c b/fs/nfs/file.c index be01095b97ae..717a8d6af52d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -164,7 +164,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_flags & IOCB_DIRECT) - return nfs_file_direct_read(iocb, to, iocb->ki_pos); + return nfs_file_direct_read(iocb, to); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 534631358b13..cfebcd2fc7f3 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -305,7 +305,7 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, } static ssize_t -nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) +nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); @@ -313,7 +313,7 @@ nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) return 0; /* Needs synchronization with the cleaner */ - return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); + return blockdev_direct_IO(iocb, inode, iter, nilfs_get_block); } const struct address_space_operations nilfs_aops = { diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ad1577348a92..6c66c62d4a7e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2423,13 +2423,11 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, return 0; } -static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file)->i_mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - loff_t end = offset + iter->count; get_block_t *get_block; /* @@ -2440,7 +2438,8 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; /* Fallback to buffered I/O if we do not support append dio. */ - if (end > i_size_read(inode) && !ocfs2_supports_append_dio(osb)) + if (iocb->ki_pos + iter->count > i_size_read(inode) && + !ocfs2_supports_append_dio(osb)) return 0; if (iov_iter_rw(iter) == READ) @@ -2449,7 +2448,7 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, get_block = ocfs2_dio_get_block; return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, offset, get_block, + iter, get_block, ocfs2_dio_end_io, NULL, 0); } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d5c2e9c865de..825455d3e4ba 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3279,15 +3279,14 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) * We thank Mingming Cao for helping us understand in great detail what * to do in this section of the code. */ -static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, + ret = blockdev_direct_IO(iocb, inode, iter, reiserfs_get_blocks_direct_io); /* @@ -3296,7 +3295,7 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, */ if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); - loff_t end = offset + count; + loff_t end = iocb->ki_pos + count; if ((end > isize) && inode_newsize_ok(inode, isize) == 0) { truncate_setsize(inode, isize); diff --git a/fs/udf/file.c b/fs/udf/file.c index 877ba1c9b461..7ab8d8196e90 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -99,8 +99,7 @@ static int udf_adinicb_write_begin(struct file *file, return 0; } -static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* Fallback to buffered I/O. */ return 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 2dc461eeb415..f323aff740ef 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -214,8 +214,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, return ret; } -static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -223,9 +222,9 @@ static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) - udf_write_failed(mapping, offset + count); + udf_write_failed(mapping, iocb->ki_pos + count); return ret; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e49b2406d15d..c535887c60a8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1406,8 +1406,7 @@ xfs_end_io_direct_write( STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, - struct iov_iter *iter, - loff_t offset) + struct iov_iter *iter) { struct inode *inode = iocb->ki_filp->f_mapping->host; dio_iodone_t *endio = NULL; @@ -1420,12 +1419,12 @@ xfs_vm_direct_IO( } if (IS_DAX(inode)) { - return dax_do_io(iocb, inode, iter, offset, + return dax_do_io(iocb, inode, iter, xfs_get_blocks_direct, endio, 0); } bdev = xfs_find_bdev_for_inode(inode); - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + return __blockdev_direct_IO(iocb, inode, bdev, iter, xfs_get_blocks_direct, endio, NULL, flags); } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 92f72fb05497..5de047ab2411 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -794,7 +794,7 @@ xfs_file_dio_aio_write( trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); data = *from; - ret = mapping->a_ops->direct_IO(iocb, &data, iocb->ki_pos); + ret = mapping->a_ops->direct_IO(iocb, &data); /* see generic_file_direct_write() for why this is necessary */ if (mapping->nrpages) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 636dd59ab505..982a6c4a62f3 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -5,7 +5,7 @@ #include #include -ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t, +ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, get_block_t, dio_iodone_t, int flags); int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size); int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); diff --git a/include/linux/fs.h b/include/linux/fs.h index e9eaa2074061..e6b2de159736 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -394,7 +394,7 @@ struct address_space_operations { void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); - ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter, loff_t offset); + ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a page to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. @@ -2766,18 +2766,17 @@ void dio_end_io(struct bio *bio, int error); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, - loff_t offset, get_block_t get_block, + get_block_t get_block, dio_iodone_t end_io, dio_submit_t submit_io, int flags); static inline ssize_t blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, loff_t offset, + struct iov_iter *iter, get_block_t get_block) { return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, - offset, get_block, NULL, NULL, - DIO_LOCKING | DIO_SKIP_HOLES); + get_block, NULL, NULL, DIO_LOCKING | DIO_SKIP_HOLES); } #endif diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 67300f8e5f2f..cede8f6a7e2d 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -445,10 +445,9 @@ static inline struct rpc_cred *nfs_file_cred(struct file *file) /* * linux/fs/nfs/direct.c */ -extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *, loff_t); +extern ssize_t nfs_direct_IO(struct kiocb *, struct iov_iter *); extern ssize_t nfs_file_direct_read(struct kiocb *iocb, - struct iov_iter *iter, - loff_t pos); + struct iov_iter *iter); extern ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter); diff --git a/mm/filemap.c b/mm/filemap.c index e7108c31346d..cb36db9f4107 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1853,8 +1853,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) iocb->ki_pos + count - 1); if (!retval) { struct iov_iter data = *iter; - retval = mapping->a_ops->direct_IO(iocb, &data, - iocb->ki_pos); + retval = mapping->a_ops->direct_IO(iocb, &data); } if (retval > 0) { @@ -2538,7 +2537,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) } data = *from; - written = mapping->a_ops->direct_IO(iocb, &data, pos); + written = mapping->a_ops->direct_IO(iocb, &data); /* * Finally, try again to invalidate clean pages which might have been diff --git a/mm/page_io.c b/mm/page_io.c index cd92e3d67a32..89275601d399 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -279,7 +279,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, set_page_writeback(page); unlock_page(page); - ret = mapping->a_ops->direct_IO(&kiocb, &from, kiocb.ki_pos); + ret = mapping->a_ops->direct_IO(&kiocb, &from); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; -- cgit v1.2.3 From dde0c2e79848298cc25621ad080d47f94dbd7cce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Apr 2016 08:52:00 -0700 Subject: fs: add IOCB_SYNC and IOCB_DSYNC This will allow us to do per-I/O sync file writes, as required by a lot of fileservers or storage targets. XXX: Will need a few additional audits for O_DSYNC Signed-off-by: Al Viro --- fs/block_dev.c | 2 +- fs/btrfs/file.c | 2 +- fs/cifs/file.c | 2 +- fs/direct-io.c | 2 +- fs/ext4/file.c | 2 +- fs/f2fs/file.c | 2 +- fs/gfs2/file.c | 5 ++++- fs/nfs/direct.c | 2 +- fs/ntfs/file.c | 2 +- fs/udf/file.c | 2 +- fs/xfs/xfs_file.c | 2 +- include/linux/fs.h | 14 ++++++++++---- mm/filemap.c | 2 +- 13 files changed, 25 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/fs/block_dev.c b/fs/block_dev.c index 9e1f3fe25753..d8dc3512e927 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1662,7 +1662,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); if (ret > 0) { ssize_t err; - err = generic_write_sync(file, iocb->ki_pos - ret, ret); + err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6c376311a9d7..35ce146cceec 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1852,7 +1852,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, BTRFS_I(inode)->last_sub_trans = root->log_transid; spin_unlock(&BTRFS_I(inode)->lock); if (num_written > 0) { - err = generic_write_sync(file, pos, num_written); + err = generic_write_sync(iocb, pos, num_written); if (err < 0) num_written = err; } diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cb070aa88e57..b22b68ccfbe5 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2688,7 +2688,7 @@ out: inode_unlock(inode); if (rc > 0) { - ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); + ssize_t err = generic_write_sync(iocb, iocb->ki_pos - rc, rc); if (err < 0) rc = err; } diff --git a/fs/direct-io.c b/fs/direct-io.c index c61314b84b01..f7bcc0193dee 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -268,7 +268,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (dio->rw & WRITE) { int err; - err = generic_write_sync(dio->iocb->ki_filp, offset, + err = generic_write_sync(dio->iocb, offset, transferred); if (err < 0 && ret > 0) ret = err; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index fa2208bae2e1..1417e129be51 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -172,7 +172,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) { ssize_t err; - err = generic_write_sync(file, iocb->ki_pos - ret, ret); + err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 443e07705c2a..51ed8388e66c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1885,7 +1885,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) { ssize_t err; - err = generic_write_sync(file, iocb->ki_pos - ret, ret); + err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 208efc70ad49..5a7d69609309 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -895,7 +895,10 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t mark_inode_dirty(inode); } - return generic_write_sync(file, pos, count); + if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) + return vfs_fsync_range(file, pos, pos + count - 1, + (file->f_flags & __O_SYNC) ? 0 : 1); + return 0; out_trans_fail: gfs2_inplace_release(ip); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 346b5d85ce92..be86de9a77d7 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -1054,7 +1054,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); - generic_write_sync(file, pos, result); + generic_write_sync(iocb, pos, result); } } nfs_direct_req_release(dreq); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 91117ada8528..10dc38cc02bb 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1953,7 +1953,7 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) current->backing_dev_info = NULL; inode_unlock(vi); if (likely(written > 0)) { - err = generic_write_sync(file, iocb->ki_pos, written); + err = generic_write_sync(iocb, iocb->ki_pos, written); if (err < 0) written = 0; } diff --git a/fs/udf/file.c b/fs/udf/file.c index 7ab8d8196e90..8e3d1ae53b11 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -152,7 +152,7 @@ out: if (retval > 0) { mark_inode_dirty(inode); - err = generic_write_sync(file, iocb->ki_pos - retval, retval); + err = generic_write_sync(iocb, iocb->ki_pos - retval, retval); if (err < 0) retval = err; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 5de047ab2411..b5d70e77195d 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -908,7 +908,7 @@ xfs_file_write_iter( XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(file, iocb->ki_pos - ret, ret); + err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } diff --git a/include/linux/fs.h b/include/linux/fs.h index e6b2de159736..310ca1ed9293 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -323,6 +323,8 @@ struct writeback_control; #define IOCB_APPEND (1 << 1) #define IOCB_DIRECT (1 << 2) #define IOCB_HIPRI (1 << 3) +#define IOCB_DSYNC (1 << 4) +#define IOCB_SYNC (1 << 5) struct kiocb { struct file *ki_filp; @@ -2485,12 +2487,12 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); -static inline int generic_write_sync(struct file *file, loff_t pos, loff_t count) +static inline int generic_write_sync(struct kiocb *iocb, loff_t pos, loff_t count) { - if (!(file->f_flags & O_DSYNC) && !IS_SYNC(file->f_mapping->host)) + if (!(iocb->ki_flags & IOCB_DSYNC)) return 0; - return vfs_fsync_range(file, pos, pos + count - 1, - (file->f_flags & __O_SYNC) ? 0 : 1); + return vfs_fsync_range(iocb->ki_filp, pos, pos + count - 1, + (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); } extern void emergency_sync(void); extern void emergency_remount(void); @@ -2942,6 +2944,10 @@ static inline int iocb_flags(struct file *file) res |= IOCB_APPEND; if (io_is_direct(file)) res |= IOCB_DIRECT; + if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host)) + res |= IOCB_DSYNC; + if (file->f_flags & __O_SYNC) + res |= IOCB_SYNC; return res; } diff --git a/mm/filemap.c b/mm/filemap.c index cb36db9f4107..8345d6d3436a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2794,7 +2794,7 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret > 0) { ssize_t err; - err = generic_write_sync(file, iocb->ki_pos - ret, ret); + err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); if (err < 0) ret = err; } -- cgit v1.2.3 From e259221763a40403d5bb232209998e8c45804ab8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Apr 2016 08:52:01 -0700 Subject: fs: simplify the generic_write_sync prototype The kiocb already has the new position, so use that. The only interesting case is AIO, where we currently don't bother updating ki_pos. We're about to free the kiocb after we're done, so we might as well update it to make everyone's life simpler. While we're at it also return the bytes written argument passed in if we were successful so that the boilerplate error switch code in the callers can go away. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/block_dev.c | 8 ++------ fs/btrfs/file.c | 7 ++----- fs/cifs/file.c | 7 ++----- fs/direct-io.c | 17 +++++++++-------- fs/ext4/file.c | 9 ++------- fs/f2fs/file.c | 9 ++------- fs/nfs/direct.c | 4 +++- fs/ntfs/file.c | 7 ++----- fs/udf/file.c | 4 +--- fs/xfs/xfs_file.c | 6 +----- include/linux/fs.h | 24 ++++++++++++++++++------ mm/filemap.c | 9 ++------- 12 files changed, 46 insertions(+), 65 deletions(-) (limited to 'mm') diff --git a/fs/block_dev.c b/fs/block_dev.c index d8dc3512e927..a063d4d8ac39 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1660,12 +1660,8 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); - if (ret > 0) { - ssize_t err; - err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); blk_finish_plug(&plug); return ret; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 35ce146cceec..ea9f10bb089c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1851,11 +1851,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; spin_unlock(&BTRFS_I(inode)->lock); - if (num_written > 0) { - err = generic_write_sync(iocb, pos, num_written); - if (err < 0) - num_written = err; - } + if (num_written > 0) + num_written = generic_write_sync(iocb, num_written); if (sync) atomic_dec(&BTRFS_I(inode)->sync_writers); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b22b68ccfbe5..9b51d4936a29 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2687,11 +2687,8 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) out: inode_unlock(inode); - if (rc > 0) { - ssize_t err = generic_write_sync(iocb, iocb->ki_pos - rc, rc); - if (err < 0) - rc = err; - } + if (rc > 0) + rc = generic_write_sync(iocb, rc); up_read(&cinode->lock_sem); return rc; } diff --git a/fs/direct-io.c b/fs/direct-io.c index f7bcc0193dee..3bf3f20f8ecc 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -256,6 +256,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) if (dio->end_io) { int err; + // XXX: ki_pos?? err = dio->end_io(dio->iocb, offset, ret, dio->private); if (err) ret = err; @@ -265,15 +266,15 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) inode_dio_end(dio->inode); if (is_async) { - if (dio->rw & WRITE) { - int err; - - err = generic_write_sync(dio->iocb, offset, - transferred); - if (err < 0 && ret > 0) - ret = err; - } + /* + * generic_write_sync expects ki_pos to have been updated + * already, but the submission path only does this for + * synchronous I/O. + */ + dio->iocb->ki_pos += transferred; + if (dio->rw & WRITE) + ret = generic_write_sync(dio->iocb, transferred); dio->iocb->ki_complete(dio->iocb, ret, 0); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1417e129be51..00ff6912adb3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -169,13 +169,8 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); if (o_direct) blk_finish_plug(&plug); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 51ed8388e66c..28f75a1fe4a7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1882,13 +1882,8 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } inode_unlock(inode); - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index be86de9a77d7..0b9fca040b0c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -1054,7 +1054,9 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); - generic_write_sync(iocb, pos, result); + + /* XXX: should check the generic_write_sync retval */ + generic_write_sync(iocb, result); } } nfs_direct_req_release(dreq); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 10dc38cc02bb..5622ed5a201e 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1952,12 +1952,9 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; inode_unlock(vi); - if (likely(written > 0)) { - err = generic_write_sync(iocb, iocb->ki_pos, written); - if (err < 0) - written = 0; - } iocb->ki_pos += written; + if (likely(written > 0)) + written = generic_write_sync(iocb, written); return written ? written : err; } diff --git a/fs/udf/file.c b/fs/udf/file.c index 8e3d1ae53b11..632570617327 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -152,9 +152,7 @@ out: if (retval > 0) { mark_inode_dirty(inode); - err = generic_write_sync(iocb, iocb->ki_pos - retval, retval); - if (err < 0) - retval = err; + retval = generic_write_sync(iocb, retval); } return retval; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index b5d70e77195d..cd3540997d65 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -903,14 +903,10 @@ xfs_file_write_iter( ret = xfs_file_buffered_aio_write(iocb, from); if (ret > 0) { - ssize_t err; - XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); /* Handle various SYNC-type writes */ - err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; + ret = generic_write_sync(iocb, ret); } return ret; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 310ca1ed9293..f6a8ed864651 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2487,13 +2487,25 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync); extern int vfs_fsync(struct file *file, int datasync); -static inline int generic_write_sync(struct kiocb *iocb, loff_t pos, loff_t count) -{ - if (!(iocb->ki_flags & IOCB_DSYNC)) - return 0; - return vfs_fsync_range(iocb->ki_filp, pos, pos + count - 1, - (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); + +/* + * Sync the bytes written if this was a synchronous write. Expect ki_pos + * to already be updated for the write, and will return either the amount + * of bytes passed in, or an error if syncing the file failed. + */ +static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) +{ + if (iocb->ki_flags & IOCB_DSYNC) { + int ret = vfs_fsync_range(iocb->ki_filp, + iocb->ki_pos - count, iocb->ki_pos - 1, + (iocb->ki_flags & IOCB_SYNC) ? 0 : 1); + if (ret) + return ret; + } + + return count; } + extern void emergency_sync(void); extern void emergency_remount(void); #ifdef CONFIG_BLOCK diff --git a/mm/filemap.c b/mm/filemap.c index 8345d6d3436a..182b21825255 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2791,13 +2791,8 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); - if (ret > 0) { - ssize_t err; - - err = generic_write_sync(iocb, iocb->ki_pos - ret, ret); - if (err < 0) - ret = err; - } + if (ret > 0) + ret = generic_write_sync(iocb, ret); return ret; } EXPORT_SYMBOL(generic_file_write_iter); -- cgit v1.2.3 From 84e710da2a1dfacfc87f604869a4d22df91ce6cd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Apr 2016 00:58:55 -0400 Subject: parallel lookups machinery, part 2 We'll need to verify that there's neither a hashed nor in-lookup dentry with desired parent/name before adding to in-lookup set. One possible solution would be to hold the parent's ->d_lock through both checks, but while the in-lookup set is relatively small at any time, dcache is not. And holding the parent's ->d_lock through something like __d_lookup_rcu() would suck too badly. So we leave the parent's ->d_lock alone, which means that we watch out for the following scenario: * we verify that there's no hashed match * existing in-lookup match gets hashed by another process * we verify that there's no in-lookup matches and decide that everything's fine. Solution: per-directory kinda-sorta seqlock, bumped around the times we hash something that used to be in-lookup or move (and hash) something in place of in-lookup. Then the above would turn into * read the counter * do dcache lookup * if no matches found, check for in-lookup matches * if there had been none of those either, check if the counter has changed; repeat if it has. The "kinda-sorta" part is due to the fact that we don't have much spare space in inode. There is a spare word (shared with i_bdev/i_cdev/i_pipe), so the counter part is not a problem, but spinlock is a different story. We could use the parent's ->d_lock, and it would be less painful in terms of contention, for __d_add() it would be rather inconvenient to grab; we could do that (using lock_parent()), but... Fortunately, we can get serialization on the counter itself, and it might be a good idea in general; we can use cmpxchg() in a loop to get from even to odd and smp_store_release() from odd to even. This commit adds the counter and updating logics; the readers will be added in the next commit. Signed-off-by: Al Viro --- Documentation/filesystems/porting | 8 ++++++++ fs/dcache.c | 34 ++++++++++++++++++++++++++++++++-- fs/inode.c | 1 + include/linux/fs.h | 1 + mm/shmem.c | 3 ++- 5 files changed, 44 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 57bb3754a027..8810e2367fe6 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -531,3 +531,11 @@ in your dentry operations instead. dentry might be yet to be attached to inode, so do _not_ use its ->d_inode in the instances. Rationale: !@#!@# security_d_instantiate() needs to be called before we attach dentry to inode. +-- +[mandatory] + symlinks are no longer the only inodes that do *not* have i_bdev/i_cdev/ + i_pipe/i_link union zeroed out at inode eviction. As the result, you can't + assume that non-NULL value in ->i_nlink at ->destroy_inode() implies that + it's a symlink. Checking ->i_mode is really needed now. In-tree we had + to fix shmem_destroy_callback() that used to take that kind of shortcut; + watch out, since that shortcut is no longer valid. diff --git a/fs/dcache.c b/fs/dcache.c index 0f1d93866e69..10988f7e5a23 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2364,6 +2364,22 @@ void d_rehash(struct dentry * entry) } EXPORT_SYMBOL(d_rehash); +static inline unsigned start_dir_add(struct inode *dir) +{ + + for (;;) { + unsigned n = dir->i_dir_seq; + if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + return n; + cpu_relax(); + } +} + +static inline void end_dir_add(struct inode *dir, unsigned n) +{ + smp_store_release(&dir->i_dir_seq, n + 2); +} + void __d_lookup_done(struct dentry *dentry) { dentry->d_flags &= ~DCACHE_PAR_LOOKUP; @@ -2375,9 +2391,14 @@ EXPORT_SYMBOL(__d_lookup_done); static inline void __d_add(struct dentry *dentry, struct inode *inode) { + struct inode *dir = NULL; + unsigned n; spin_lock(&dentry->d_lock); - if (unlikely(d_in_lookup(dentry))) + if (unlikely(d_in_lookup(dentry))) { + dir = dentry->d_parent->d_inode; + n = start_dir_add(dir); __d_lookup_done(dentry); + } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); @@ -2387,6 +2408,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) __fsnotify_d_instantiate(dentry); } _d_rehash(dentry); + if (dir) + end_dir_add(dir, n); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); @@ -2616,6 +2639,8 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { + struct inode *dir = NULL; + unsigned n; if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2623,8 +2648,11 @@ static void __d_move(struct dentry *dentry, struct dentry *target, BUG_ON(d_ancestor(target, dentry)); dentry_lock_for_move(dentry, target); - if (unlikely(d_in_lookup(target))) + if (unlikely(d_in_lookup(target))) { + dir = target->d_parent->d_inode; + n = start_dir_add(dir); __d_lookup_done(target); + } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); @@ -2674,6 +2702,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); + if (dir) + end_dir_add(dir, n); dentry_unlock_for_move(dentry, target); } diff --git a/fs/inode.c b/fs/inode.c index 4202aac99464..4b884f73214e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -151,6 +151,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; + inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6d0fa9174a24..00cecc5a2f75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -684,6 +684,7 @@ struct inode { struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; + unsigned i_dir_seq; }; __u32 i_generation; diff --git a/mm/shmem.c b/mm/shmem.c index 4640699b209b..e684a9140228 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3123,7 +3123,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kfree(inode->i_link); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -- cgit v1.2.3 From 44f43e99fe70833058482d183e99fdfd11220996 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 9 May 2016 16:28:49 -0700 Subject: zsmalloc: fix zs_can_compact() integer overflow zs_can_compact() has two race conditions in its core calculation: unsigned long obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - zs_stat_get(class, OBJ_USED); 1) classes are not locked, so the numbers of allocated and used objects can change by the concurrent ops happening on other CPUs 2) shrinker invokes it from preemptible context Depending on the circumstances, thus, OBJ_ALLOCATED can become less than OBJ_USED, which can result in either very high or negative `total_scan' value calculated later in do_shrink_slab(). do_shrink_slab() has some logic to prevent those cases: vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-64 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 vmscan: shrink_slab: zs_shrinker_scan+0x0/0x28 [zsmalloc] negative objects to delete nr=-62 However, due to the way `total_scan' is calculated, not every shrinker->count_objects() overflow can be spotted and handled. To demonstrate the latter, I added some debugging code to do_shrink_slab() (x86_64) and the results were: vmscan: OVERFLOW: shrinker->count_objects() == -1 [18446744073709551615] vmscan: but total_scan > 0: 92679974445502 vmscan: resulting total_scan: 92679974445502 [..] vmscan: OVERFLOW: shrinker->count_objects() == -1 [18446744073709551615] vmscan: but total_scan > 0: 22634041808232578 vmscan: resulting total_scan: 22634041808232578 Even though shrinker->count_objects() has returned an overflowed value, the resulting `total_scan' is positive, and, what is more worrisome, it is insanely huge. This value is getting used later on in shrinker->scan_objects() loop: while (total_scan >= batch_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; cond_resched(); } `total_scan >= batch_size' is true for a very-very long time and 'total_scan >= freeable' is also true for quite some time, because `freeable < 0' and `total_scan' is large enough, for example, 22634041808232578. The only break condition, in the given scheme of things, is shrinker->scan_objects() == SHRINK_STOP test, which is a bit too weak to rely on, especially in heavy zsmalloc-usage scenarios. To fix the issue, take a pool stat snapshot and use it instead of racy zs_stat_get() calls. Link: http://lkml.kernel.org/r/20160509140052.3389-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: [4.3+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e72efb109fde..fe47fbba995a 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1735,10 +1735,13 @@ static struct page *isolate_source_page(struct size_class *class) static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; + unsigned long obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); + unsigned long obj_used = zs_stat_get(class, OBJ_USED); - obj_wasted = zs_stat_get(class, OBJ_ALLOCATED) - - zs_stat_get(class, OBJ_USED); + if (obj_allocated <= obj_used) + return 0; + obj_wasted = obj_allocated - obj_used; obj_wasted /= get_maxobj_per_zspage(class->size, class->pages_per_zspage); -- cgit v1.2.3 From 7496fea9a6bf644afe360af795b121a77635b37d Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Thu, 12 May 2016 15:42:21 -0700 Subject: ksm: fix conflict between mmput and scan_get_next_rmap_item A concurrency issue about KSM in the function scan_get_next_rmap_item. task A (ksmd): |task B (the mm's task): | mm = slot->mm; | down_read(&mm->mmap_sem); | | ... | | spin_lock(&ksm_mmlist_lock); | | ksm_scan.mm_slot go to the next slot; | | spin_unlock(&ksm_mmlist_lock); | |mmput() -> | ksm_exit(): | |spin_lock(&ksm_mmlist_lock); |if (mm_slot && ksm_scan.mm_slot != mm_slot) { | if (!mm_slot->rmap_list) { | easy_to_free = 1; | ... | |if (easy_to_free) { | mmdrop(mm); | ... | |So this mm_struct may be freed in the mmput(). | up_read(&mm->mmap_sem); | As we can see above, the ksmd thread may access a mm_struct that already been freed to the kmem_cache. Suppose a fork will get this mm_struct from the kmem_cache, the ksmd thread then call up_read(&mm->mmap_sem), will cause mmap_sem.count to become -1. As suggested by Andrea Arcangeli, unmerge_and_remove_all_rmap_items has the same SMP race condition, so fix it too. My prev fix in function scan_get_next_rmap_item will introduce a different SMP race condition, so just invert the up_read/spin_unlock order as Andrea Arcangeli said. Link: http://lkml.kernel.org/r/1462708815-31301-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Zhou Chengming Suggested-by: Andrea Arcangeli Reviewed-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Geliang Tang Cc: Minchan Kim Cc: Hanjun Guo Cc: Ding Tianhong Cc: Li Bin Cc: Zhen Lei Cc: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index b99e828172f6..4786b4150f62 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -783,6 +783,7 @@ static int unmerge_and_remove_all_rmap_items(void) } remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + up_read(&mm->mmap_sem); spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, @@ -794,12 +795,9 @@ static int unmerge_and_remove_all_rmap_items(void) free_mm_slot(mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); - up_read(&mm->mmap_sem); mmdrop(mm); - } else { + } else spin_unlock(&ksm_mmlist_lock); - up_read(&mm->mmap_sem); - } } /* Clean up stable nodes, but don't worry if some are still busy */ @@ -1663,8 +1661,15 @@ next_mm: up_read(&mm->mmap_sem); mmdrop(mm); } else { - spin_unlock(&ksm_mmlist_lock); up_read(&mm->mmap_sem); + /* + * up_read(&mm->mmap_sem) first because after + * spin_unlock(&ksm_mmlist_lock) run, the "mm" may + * already have been freed under us by __ksm_exit() + * because the "mm_slot" is still hashed and + * ksm_scan.mm_slot doesn't point to it anymore. + */ + spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ -- cgit v1.2.3 From 6d0a07edd17cfc12fdc1f36de8072fa17cc3666f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 12 May 2016 15:42:25 -0700 Subject: mm: thp: calculate the mapcount correctly for THP pages during WP faults This will provide fully accuracy to the mapcount calculation in the write protect faults, so page pinning will not get broken by false positive copy-on-writes. total_mapcount() isn't the right calculation needed in reuse_swap_page(), so this introduces a page_trans_huge_mapcount() that is effectively the full accurate return value for page_mapcount() if dealing with Transparent Hugepages, however we only use the page_trans_huge_mapcount() during COW faults where it strictly needed, due to its higher runtime cost. This also provide at practical zero cost the total_mapcount information which is needed to know if we can still relocate the page anon_vma to the local vma. If page_trans_huge_mapcount() returns 1 we can reuse the page no matter if it's a pte or a pmd_trans_huge triggering the fault, but we can only relocate the page anon_vma to the local vma->anon_vma if we're sure it's only this "vma" mapping the whole THP physical range. Kirill A. Shutemov discovered the problem with moving the page anon_vma to the local vma->anon_vma in a previous version of this patch and another problem in the way page_move_anon_rmap() was called. Andrew Morton discovered that CONFIG_SWAP=n wouldn't build in a previous version, because reuse_swap_page must be a macro to call page_trans_huge_mapcount from swap.h, so this uses a macro again instead of an inline function. With this change at least it's a less dangerous usage than it was before, because "page" is used only once now, while with the previous code reuse_swap_page(page++) would have called page_mapcount on page+1 and it would have increased page twice instead of just once. Dean Luick noticed an uninitialized variable that could result in a rmap inefficiency for the non-THP case in a previous version. Mike Marciniszyn said: : Our RDMA tests are seeing an issue with memory locking that bisects to : commit 61f5d698cc97 ("mm: re-enable THP") : : The test program registers two rather large MRs (512M) and RDMA : writes data to a passive peer using the first and RDMA reads it back : into the second MR and compares that data. The sizes are chosen randomly : between 0 and 1024 bytes. : : The test will get through a few (<= 4 iterations) and then gets a : compare error. : : Tracing indicates the kernel logical addresses associated with the individual : pages at registration ARE correct , the data in the "RDMA read response only" : packets ARE correct. : : The "corruption" occurs when the packet crosse two pages that are not physically : contiguous. The second page reads back as zero in the program. : : It looks like the user VA at the point of the compare error no longer points to : the same physical address as was registered. : : This patch totally resolves the issue! Link: http://lkml.kernel.org/r/1462547040-1737-2-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reviewed-by: "Kirill A. Shutemov" Reviewed-by: Dean Luick Tested-by: Alex Williamson Tested-by: Mike Marciniszyn Tested-by: Josh Collier Cc: Marc Haber Cc: [4.5] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++ include/linux/swap.h | 6 ++--- mm/huge_memory.c | 71 +++++++++++++++++++++++++++++++++++++++++++++------- mm/memory.c | 22 ++++++++++------ mm/swapfile.c | 13 +++++----- 5 files changed, 95 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 864d7221de84..8f468e0d2534 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -500,11 +500,20 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); +int page_trans_huge_mapcount(struct page *page, int *total_mapcount); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } +static inline int page_trans_huge_mapcount(struct page *page, + int *total_mapcount) +{ + int mapcount = page_mapcount(page); + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; +} #endif static inline struct page *virt_to_head_page(const void *x) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0a4cd4703f40..ad220359f1b0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -418,7 +418,7 @@ extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); -extern int reuse_swap_page(struct page *); +extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -513,8 +513,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page) \ - (!PageTransCompound(page) && page_mapcount(page) == 1) +#define reuse_swap_page(page, total_mapcount) \ + (page_trans_huge_mapcount(page, total_mapcount) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f7daa7de8f48..b49ee126d4d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1298,15 +1298,9 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page); /* * We can only reuse the page if nobody else maps the huge page or it's - * part. We can do it by checking page_mapcount() on each sub-page, but - * it's expensive. - * The cheaper way is to check page_count() to be equal 1: every - * mapcount takes page reference reference, so this way we can - * guarantee, that the PMD is the only mapping. - * This can give false negative if somebody pinned the page, but that's - * fine. + * part. */ - if (page_mapcount(page) == 1 && page_count(page) == 1) { + if (page_trans_huge_mapcount(page, NULL) == 1) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -2079,7 +2073,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (pte_write(pteval)) { writable = true; } else { - if (PageSwapCache(page) && !reuse_swap_page(page)) { + if (PageSwapCache(page) && + !reuse_swap_page(page, NULL)) { unlock_page(page); result = SCAN_SWAP_CACHE_PAGE; goto out; @@ -3222,6 +3217,64 @@ int total_mapcount(struct page *page) return ret; } +/* + * This calculates accurately how many mappings a transparent hugepage + * has (unlike page_mapcount() which isn't fully accurate). This full + * accuracy is primarily needed to know if copy-on-write faults can + * reuse the page and change the mapping to read-write instead of + * copying them. At the same time this returns the total_mapcount too. + * + * The function returns the highest mapcount any one of the subpages + * has. If the return value is one, even if different processes are + * mapping different subpages of the transparent hugepage, they can + * all reuse it, because each process is reusing a different subpage. + * + * The total_mapcount is instead counting all virtual mappings of the + * subpages. If the total_mapcount is equal to "one", it tells the + * caller all mappings belong to the same "mm" and in turn the + * anon_vma of the transparent hugepage can become the vma->anon_vma + * local one as no other process may be mapping any of the subpages. + * + * It would be more accurate to replace page_mapcount() with + * page_trans_huge_mapcount(), however we only use + * page_trans_huge_mapcount() in the copy-on-write faults where we + * need full accuracy to avoid breaking page pinning, because + * page_trans_huge_mapcount() is slower than page_mapcount(). + */ +int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +{ + int i, ret, _total_mapcount, mapcount; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + return mapcount; + } + + page = compound_head(page); + + _total_mapcount = ret = 0; + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + ret = max(ret, mapcount); + _total_mapcount += mapcount; + } + if (PageDoubleMap(page)) { + ret -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + ret += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + return ret; +} + /* * This function splits huge page into normal pages. @page can point to any * subpage of huge page to split. Split doesn't change the position of @page. diff --git a/mm/memory.c b/mm/memory.c index 52c218e2b724..07493e34ab7e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2373,6 +2373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * not dirty accountable. */ if (PageAnon(old_page) && !PageKsm(old_page)) { + int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); pte_unmap_unlock(page_table, ptl); @@ -2387,13 +2388,18 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } put_page(old_page); } - if (reuse_swap_page(old_page)) { - /* - * The page is all ours. Move it to our anon_vma so - * the rmap code will not search our parent or siblings. - * Protected against the rmap code by the page lock. - */ - page_move_anon_rmap(old_page, vma, address); + if (reuse_swap_page(old_page, &total_mapcount)) { + if (total_mapcount == 1) { + /* + * The page is all ours. Move it to + * our anon_vma so the rmap code will + * not search our parent or siblings. + * Protected against the rmap code by + * the page lock. + */ + page_move_anon_rmap(compound_head(old_page), + vma, address); + } unlock_page(old_page); return wp_page_reuse(mm, vma, address, page_table, ptl, orig_pte, old_page, 0, 0); @@ -2617,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { + if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; diff --git a/mm/swapfile.c b/mm/swapfile.c index 83874eced5bf..031713ab40ce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -922,18 +922,19 @@ out: * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. + * + * NOTE: total_mapcount should not be relied upon by the caller if + * reuse_swap_page() returns false, but it may be always overwritten + * (see the other implementation for CONFIG_SWAP=n). */ -int reuse_swap_page(struct page *page) +bool reuse_swap_page(struct page *page, int *total_mapcount) { int count; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) - return 0; - /* The page is part of THP and cannot be reused */ - if (PageTransCompound(page)) - return 0; - count = page_mapcount(page); + return false; + count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); if (count == 1 && !PageWriteback(page)) { -- cgit v1.2.3 From 4f622938a5e2b7f1374ffb1e5fc212744898f513 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:17 +0200 Subject: dax: Allow DAX code to replace exceptional entries Currently we forbid page_cache_tree_insert() to replace exceptional radix tree entries for DAX inodes. However to make DAX faults race free we will lock radix tree entries and when hole is created, we need to replace such locked radix tree entry with a hole page. So modify page_cache_tree_insert() to allow that. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- include/linux/dax.h | 1 + mm/filemap.c | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/dax.h b/include/linux/dax.h index 70600b63083f..aa148937bb3f 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -3,6 +3,7 @@ #include #include +#include #include /* We use lowest available exceptional entry bit for locking */ diff --git a/mm/filemap.c b/mm/filemap.c index f2479af09da9..dfe55c2cfb34 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -597,14 +597,21 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!radix_tree_exceptional_entry(p)) return -EEXIST; - if (WARN_ON(dax_mapping(mapping))) - return -EINVAL; - - if (shadowp) - *shadowp = p; mapping->nrexceptional--; - if (node) - workingset_node_shadows_dec(node); + if (!dax_mapping(mapping)) { + if (shadowp) + *shadowp = p; + if (node) + workingset_node_shadows_dec(node); + } else { + /* DAX can replace empty locked entry with a hole */ + WARN_ON_ONCE(p != + (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK)); + /* DAX accounts exceptional entries as normal pages */ + if (node) + workingset_node_pages_dec(node); + } } radix_tree_replace_slot(slot, page); mapping->nrpages++; -- cgit v1.2.3 From ac401cc782429cc8560ce4840b1405d603740917 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:18 +0200 Subject: dax: New fault locking Currently DAX page fault locking is racy. CPU0 (write fault) CPU1 (read fault) __dax_fault() __dax_fault() get_block(inode, block, &bh, 0) -> not mapped get_block(inode, block, &bh, 0) -> not mapped if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) get_block(inode, block, &bh, 1) -> allocates blocks if (page) -> no if (!buffer_mapped(&bh)) if (vmf->flags & FAULT_FLAG_WRITE) { } else { dax_load_hole(); } dax_insert_mapping() And we are in a situation where we fail in dax_radix_entry() with -EIO. Another problem with the current DAX page fault locking is that there is no race-free way to clear dirty tag in the radix tree. We can always end up with clean radix tree and dirty data in CPU cache. We fix the first problem by introducing locking of exceptional radix tree entries in DAX mappings acting very similarly to page lock and thus synchronizing properly faults against the same mapping index. The same lock can later be used to avoid races when clearing radix tree dirty tag. Reviewed-by: NeilBrown Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- fs/dax.c | 553 ++++++++++++++++++++++++++++++++++++++-------------- include/linux/dax.h | 3 + mm/filemap.c | 9 +- mm/truncate.c | 62 +++--- 4 files changed, 447 insertions(+), 180 deletions(-) (limited to 'mm') diff --git a/fs/dax.c b/fs/dax.c index 351afd3cf8be..f43c3d806fb6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -46,6 +46,30 @@ RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE) | \ RADIX_TREE_EXCEPTIONAL_ENTRY)) +/* We choose 4096 entries - same as per-zone page wait tables */ +#define DAX_WAIT_TABLE_BITS 12 +#define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) + +wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; + +static int __init init_dax_wait_table(void) +{ + int i; + + for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) + init_waitqueue_head(wait_table + i); + return 0; +} +fs_initcall(init_dax_wait_table); + +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index) +{ + unsigned long hash = hash_long((unsigned long)mapping ^ index, + DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -267,6 +291,263 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, } EXPORT_SYMBOL_GPL(dax_do_io); +/* + * DAX radix tree locking + */ +struct exceptional_entry_key { + struct address_space *mapping; + unsigned long index; +}; + +struct wait_exceptional_entry_queue { + wait_queue_t wait; + struct exceptional_entry_key key; +}; + +static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, + int sync, void *keyp) +{ + struct exceptional_entry_key *key = keyp; + struct wait_exceptional_entry_queue *ewait = + container_of(wait, struct wait_exceptional_entry_queue, wait); + + if (key->mapping != ewait->key.mapping || + key->index != ewait->key.index) + return 0; + return autoremove_wake_function(wait, mode, sync, NULL); +} + +/* + * Check whether the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline int slot_locked(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + return entry & RADIX_DAX_ENTRY_LOCK; +} + +/* + * Mark the given slot is locked. The function must be called with + * mapping->tree_lock held + */ +static inline void *lock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry |= RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Mark the given slot is unlocked. The function must be called with + * mapping->tree_lock held + */ +static inline void *unlock_slot(struct address_space *mapping, void **slot) +{ + unsigned long entry = (unsigned long) + radix_tree_deref_slot_protected(slot, &mapping->tree_lock); + + entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; + radix_tree_replace_slot(slot, (void *)entry); + return (void *)entry; +} + +/* + * Lookup entry in radix tree, wait for it to become unlocked if it is + * exceptional entry and return it. The caller must call + * put_unlocked_mapping_entry() when he decided not to lock the entry or + * put_locked_mapping_entry() when he locked the entry and now wants to + * unlock it. + * + * The function must be called with mapping->tree_lock held. + */ +static void *get_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void ***slotp) +{ + void *ret, **slot; + struct wait_exceptional_entry_queue ewait; + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + init_wait(&ewait.wait); + ewait.wait.func = wake_exceptional_entry_func; + ewait.key.mapping = mapping; + ewait.key.index = index; + + for (;;) { + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + &slot); + if (!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot)) { + if (slotp) + *slotp = slot; + return ret; + } + prepare_to_wait_exclusive(wq, &ewait.wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&mapping->tree_lock); + schedule(); + finish_wait(wq, &ewait.wait); + spin_lock_irq(&mapping->tree_lock); + } +} + +/* + * Find radix tree entry at given index. If it points to a page, return with + * the page locked. If it points to the exceptional entry, return with the + * radix tree entry locked. If the radix tree doesn't contain given index, + * create empty exceptional entry for the index and return with it locked. + * + * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For + * persistent memory the benefit is doubtful. We can add that later if we can + * show it helps. + */ +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + +restart: + spin_lock_irq(&mapping->tree_lock); + ret = get_unlocked_mapping_entry(mapping, index, &slot); + /* No entry for given index? Make sure radix tree is big enough. */ + if (!ret) { + int err; + + spin_unlock_irq(&mapping->tree_lock); + err = radix_tree_preload( + mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); + if (err) + return ERR_PTR(err); + ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | + RADIX_DAX_ENTRY_LOCK); + spin_lock_irq(&mapping->tree_lock); + err = radix_tree_insert(&mapping->page_tree, index, ret); + radix_tree_preload_end(); + if (err) { + spin_unlock_irq(&mapping->tree_lock); + /* Someone already created the entry? */ + if (err == -EEXIST) + goto restart; + return ERR_PTR(err); + } + /* Good, we have inserted empty locked entry into the tree. */ + mapping->nrexceptional++; + spin_unlock_irq(&mapping->tree_lock); + return ret; + } + /* Normal page in radix tree? */ + if (!radix_tree_exceptional_entry(ret)) { + struct page *page = ret; + + get_page(page); + spin_unlock_irq(&mapping->tree_lock); + lock_page(page); + /* Page got truncated? Retry... */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto restart; + } + return page; + } + ret = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all) +{ + wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) { + struct exceptional_entry_key key; + + key.mapping = mapping; + key.index = index; + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + } +} + +static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *ret, **slot; + + spin_lock_irq(&mapping->tree_lock); + ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. + */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, false); +} + +/* + * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree + * entry to get unlocked before deleting it. + */ +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + void *entry; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + /* + * This gets called from truncate / punch_hole path. As such, the caller + * must hold locks protecting against concurrent modifications of the + * radix tree (usually fs-private i_mmap_sem for writing). Since the + * caller has seen exceptional entry for this index, we better find it + * at that index as well... + */ + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { + spin_unlock_irq(&mapping->tree_lock); + return 0; + } + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, true); + + return 1; +} + /* * The user has performed a load from a hole in the file. Allocating * a new page in the file would cause excessive storage usage for @@ -275,15 +556,24 @@ EXPORT_SYMBOL_GPL(dax_do_io); * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, struct page *page, - struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - if (!page) - page = find_or_create_page(mapping, vmf->pgoff, - GFP_KERNEL | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; + struct page *page; + + /* Hole page already exists? Return it... */ + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + return VM_FAULT_LOCKED; + } + /* This will replace locked radix tree entry with a hole page */ + page = find_or_create_page(mapping, vmf->pgoff, + vmf->gfp_mask | __GFP_ZERO); + if (!page) { + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + return VM_FAULT_OOM; + } vmf->page = page; return VM_FAULT_LOCKED; } @@ -307,77 +597,72 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } -#define NO_SECTOR -1 #define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) -static int dax_radix_entry(struct address_space *mapping, pgoff_t index, - sector_t sector, bool pmd_entry, bool dirty) +static void *dax_insert_mapping_entry(struct address_space *mapping, + struct vm_fault *vmf, + void *entry, sector_t sector) { struct radix_tree_root *page_tree = &mapping->page_tree; - pgoff_t pmd_index = DAX_PMD_INDEX(index); - int type, error = 0; - void *entry; + int error = 0; + bool hole_fill = false; + void *new_entry; + pgoff_t index = vmf->pgoff; - WARN_ON_ONCE(pmd_entry && !dirty); - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - spin_lock_irq(&mapping->tree_lock); - - entry = radix_tree_lookup(page_tree, pmd_index); - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { - index = pmd_index; - goto dirty; + /* Replacing hole page with block mapping? */ + if (!radix_tree_exceptional_entry(entry)) { + hole_fill = true; + /* + * Unmap the page now before we remove it from page cache below. + * The page is locked so it cannot be faulted in again. + */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); + error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); + if (error) + return ERR_PTR(error); } - entry = radix_tree_lookup(page_tree, index); - if (entry) { - type = RADIX_DAX_TYPE(entry); - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && - type != RADIX_DAX_PMD)) { - error = -EIO; + spin_lock_irq(&mapping->tree_lock); + new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | + RADIX_DAX_ENTRY_LOCK); + if (hole_fill) { + __delete_from_page_cache(entry, NULL); + /* Drop pagecache reference */ + put_page(entry); + error = radix_tree_insert(page_tree, index, new_entry); + if (error) { + new_entry = ERR_PTR(error); goto unlock; } + mapping->nrexceptional++; + } else { + void **slot; + void *ret; - if (!pmd_entry || type == RADIX_DAX_PMD) - goto dirty; - - /* - * We only insert dirty PMD entries into the radix tree. This - * means we don't need to worry about removing a dirty PTE - * entry and inserting a clean PMD entry, thus reducing the - * range we would flush with a follow-up fsync/msync call. - */ - radix_tree_delete(&mapping->page_tree, index); - mapping->nrexceptional--; - } - - if (sector == NO_SECTOR) { - /* - * This can happen during correct operation if our pfn_mkwrite - * fault raced against a hole punch operation. If this - * happens the pte that was hole punched will have been - * unmapped and the radix tree entry will have been removed by - * the time we are called, but the call will still happen. We - * will return all the way up to wp_pfn_shared(), where the - * pte_same() check will fail, eventually causing page fault - * to be retried by the CPU. - */ - goto unlock; + ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + WARN_ON_ONCE(ret != entry); + radix_tree_replace_slot(slot, new_entry); } - - error = radix_tree_insert(page_tree, index, - RADIX_DAX_ENTRY(sector, pmd_entry)); - if (error) - goto unlock; - - mapping->nrexceptional++; - dirty: - if (dirty) + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); unlock: spin_unlock_irq(&mapping->tree_lock); - return error; + if (hole_fill) { + radix_tree_preload_end(); + /* + * We don't need hole page anymore, it has been replaced with + * locked radix tree entry now. + */ + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(entry); + unlock_page(entry); + put_page(entry); + } + return new_entry; } static int dax_writeback_one(struct block_device *bdev, @@ -503,17 +788,19 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, +static int dax_insert_mapping(struct address_space *mapping, + struct buffer_head *bh, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = (unsigned long)vmf->virtual_address; - struct address_space *mapping = inode->i_mapping; struct block_device *bdev = bh->b_bdev; struct blk_dax_ctl dax = { - .sector = to_sector(bh, inode), + .sector = to_sector(bh, mapping->host), .size = bh->b_size, }; int error; + void *ret; + void *entry = *entryp; i_mmap_lock_read(mapping); @@ -523,16 +810,16 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); - error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, - vmf->flags & FAULT_FLAG_WRITE); - if (error) + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + if (IS_ERR(ret)) { + error = PTR_ERR(ret); goto out; + } + *entryp = ret; error = vm_insert_mixed(vma, vaddr, dax.pfn); - out: i_mmap_unlock_read(mapping); - return error; } @@ -552,7 +839,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct page *page; + void *entry; struct buffer_head bh; unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned blkbits = inode->i_blkbits; @@ -561,6 +848,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, int error; int major = 0; + /* + * Check whether offset isn't beyond end of file now. Caller is supposed + * to hold locks serializing us with truncate / punch hole so this is + * a reliable test. + */ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) return VM_FAULT_SIGBUS; @@ -570,40 +862,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; - repeat: - page = find_get_page(mapping, vmf->pgoff); - if (page) { - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { - put_page(page); - return VM_FAULT_RETRY; - } - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } + entry = grab_mapping_entry(mapping, vmf->pgoff); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto out; } error = get_block(inode, block, &bh, 0); if (!error && (bh.b_size < PAGE_SIZE)) error = -EIO; /* fs corruption? */ if (error) - goto unlock_page; - - if (!buffer_mapped(&bh) && !vmf->cow_page) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_page; - } else { - return dax_load_hole(mapping, page, vmf); - } - } + goto unlock_entry; if (vmf->cow_page) { struct page *new_page = vmf->cow_page; @@ -612,30 +881,37 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, else clear_user_highpage(new_page, vaddr); if (error) - goto unlock_page; - vmf->page = page; - if (!page) + goto unlock_entry; + if (!radix_tree_exceptional_entry(entry)) { + vmf->page = entry; + } else { + unlock_mapping_entry(mapping, vmf->pgoff); i_mmap_lock_read(mapping); + vmf->page = NULL; + } return VM_FAULT_LOCKED; } - /* Check we didn't race with a read fault installing a new page */ - if (!page && major) - page = find_lock_page(mapping, vmf->pgoff); - - if (page) { - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - delete_from_page_cache(page); - unlock_page(page); - put_page(page); - page = NULL; + if (!buffer_mapped(&bh)) { + if (vmf->flags & FAULT_FLAG_WRITE) { + error = get_block(inode, block, &bh, 1); + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); + major = VM_FAULT_MAJOR; + if (!error && (bh.b_size < PAGE_SIZE)) + error = -EIO; + if (error) + goto unlock_entry; + } else { + return dax_load_hole(mapping, entry, vmf); + } } /* Filesystem should not return unwritten buffers to us! */ WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(inode, &bh, vma, vmf); - + error = dax_insert_mapping(mapping, &bh, &entry, vma, vmf); + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) return VM_FAULT_OOM | major; @@ -643,13 +919,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if ((error < 0) && (error != -EBUSY)) return VM_FAULT_SIGBUS | major; return VM_FAULT_NOPAGE | major; - - unlock_page: - if (page) { - unlock_page(page); - put_page(page); - } - goto out; } EXPORT_SYMBOL(__dax_fault); @@ -718,7 +987,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int error, result = 0; + int result = 0; bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ @@ -865,13 +1134,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, * the write to insert a dirty entry. */ if (write) { - error = dax_radix_entry(mapping, pgoff, dax.sector, - true, true); - if (error) { - dax_pmd_dbg(&bh, address, - "PMD radix insertion failed"); - goto fallback; - } + /* + * We should insert radix-tree entry and dirty it here. + * For now this is broken... + */ } dev_dbg(part_to_dev(bdev->bd_part), @@ -931,23 +1197,18 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; - int error; - - /* - * We pass NO_SECTOR to dax_radix_entry() because we expect that a - * RADIX_DAX_PTE entry already exists in the radix tree from a - * previous call to __dax_fault(). We just want to look up that PTE - * entry using vmf->pgoff and make sure the dirty tag is set. This - * saves us from having to make a call to get_block() here to look - * up the sector. - */ - error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, - true); + struct address_space *mapping = file->f_mapping; + void *entry; + pgoff_t index = vmf->pgoff; - if (error == -ENOMEM) - return VM_FAULT_OOM; - if (error) - return VM_FAULT_SIGBUS; + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + put_unlocked_mapping_entry(mapping, index, entry); +out: + spin_unlock_irq(&mapping->tree_lock); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); diff --git a/include/linux/dax.h b/include/linux/dax.h index aa148937bb3f..756625c6d0dd 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -15,6 +15,9 @@ int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t); int dax_truncate_page(struct inode *, loff_t from, get_block_t); int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); int __dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t); +int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); +void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, bool wake_all); #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); diff --git a/mm/filemap.c b/mm/filemap.c index dfe55c2cfb34..7b9a4b180cae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -160,13 +160,15 @@ static void page_cache_tree_delete(struct address_space *mapping, return; /* - * Track node that only contains shadow entries. + * Track node that only contains shadow entries. DAX mappings contain + * no shadow entries and may contain other exceptional entries so skip + * those. * * Avoid acquiring the list_lru lock if already tracked. The * list_empty() test is safe as node->private_list is * protected by mapping->tree_lock. */ - if (!workingset_node_pages(node) && + if (!dax_mapping(mapping) && !workingset_node_pages(node) && list_empty(&node->private_list)) { node->private_data = mapping; list_lru_add(&workingset_shadow_nodes, &node->private_list); @@ -611,6 +613,9 @@ static int page_cache_tree_insert(struct address_space *mapping, /* DAX accounts exceptional entries as normal pages */ if (node) workingset_node_pages_dec(node); + /* Wakeup waiters for exceptional entry lock */ + dax_wake_mapping_entry_waiter(mapping, page->index, + false); } } radix_tree_replace_slot(slot, page); diff --git a/mm/truncate.c b/mm/truncate.c index b00272810871..4064f8f53daa 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -34,40 +34,38 @@ static void clear_exceptional_entry(struct address_space *mapping, if (shmem_mapping(mapping)) return; - spin_lock_irq(&mapping->tree_lock); - if (dax_mapping(mapping)) { - if (radix_tree_delete_item(&mapping->page_tree, index, entry)) - mapping->nrexceptional--; - } else { - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + dax_delete_mapping_entry(mapping, index); + return; } + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } -- cgit v1.2.3 From bc2466e4257369d0ebee2b6265070d323343fa72 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:19 +0200 Subject: dax: Use radix tree entry lock to protect cow faults When doing cow faults, we cannot directly fill in PTE as we do for other faults as we rely on generic code to do proper accounting of the cowed page. We also have no page to lock to protect against races with truncate as other faults have and we need the protection to extend until the moment generic code inserts cowed page into PTE thus at that point we have no protection of fs-specific i_mmap_sem. So far we relied on using i_mmap_lock for the protection however that is completely special to cow faults. To make fault locking more uniform use DAX entry lock instead. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- fs/dax.c | 12 +++++------- include/linux/dax.h | 7 +++++++ include/linux/mm.h | 7 +++++++ mm/memory.c | 38 ++++++++++++++++++-------------------- 4 files changed, 37 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/fs/dax.c b/fs/dax.c index f43c3d806fb6..be74635e05a6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -478,7 +478,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, } } -static void unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) { void *ret, **slot; @@ -501,7 +501,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, unlock_page(entry); put_page(entry); } else { - unlock_mapping_entry(mapping, index); + dax_unlock_mapping_entry(mapping, index); } } @@ -884,12 +884,10 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, goto unlock_entry; if (!radix_tree_exceptional_entry(entry)) { vmf->page = entry; - } else { - unlock_mapping_entry(mapping, vmf->pgoff); - i_mmap_lock_read(mapping); - vmf->page = NULL; + return VM_FAULT_LOCKED; } - return VM_FAULT_LOCKED; + vmf->entry = entry; + return VM_FAULT_DAX_LOCKED; } if (!buffer_mapped(&bh)) { diff --git a/include/linux/dax.h b/include/linux/dax.h index 756625c6d0dd..7bf12277c006 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -21,6 +21,7 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); +void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length); #else @@ -29,6 +30,12 @@ static inline struct page *read_dax_sector(struct block_device *bdev, { return ERR_PTR(-ENXIO); } +/* Shouldn't ever be called when dax is disabled. */ +static inline void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + BUG(); +} static inline int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { diff --git a/include/linux/mm.h b/include/linux/mm.h index a55e5be0894f..0ef9dc720ec3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -299,6 +299,12 @@ struct vm_fault { * is set (which is also implied by * VM_FAULT_ERROR). */ + void *entry; /* ->fault handler can alternatively + * return locked DAX entry. In that + * case handler should return + * VM_FAULT_DAX_LOCKED and fill in + * entry here. + */ /* for ->map_pages() only */ pgoff_t max_pgoff; /* map pages for offset from pgoff till * max_pgoff inclusive */ @@ -1084,6 +1090,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ +#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ diff --git a/mm/memory.c b/mm/memory.c index 93897f23cc11..f09cdb8d48fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -2785,7 +2786,8 @@ oom: */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, - struct page *cow_page, struct page **page) + struct page *cow_page, struct page **page, + void **entry) { struct vm_fault vmf; int ret; @@ -2800,8 +2802,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - if (!vmf.page) - goto out; + if (ret & VM_FAULT_DAX_LOCKED) { + *entry = vmf.entry; + return ret; + } if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) @@ -2815,7 +2819,6 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, else VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); - out: *page = vmf.page; return ret; } @@ -2987,7 +2990,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(pte, ptl); } - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3010,6 +3013,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page, *new_page; + void *fault_entry; struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; @@ -3027,26 +3031,24 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; } - ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page, + &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; - if (fault_page) + if (!(ret & VM_FAULT_DAX_LOCKED)) copy_user_highpage(new_page, fault_page, address, vma); __SetPageUptodate(new_page); pte = pte_offset_map_lock(mm, pmd, address, &ptl); if (unlikely(!pte_same(*pte, orig_pte))) { pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. - */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, + pgoff); } goto uncharge_out; } @@ -3054,15 +3056,11 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); - if (fault_page) { + if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); } else { - /* - * The fault handler has no page to lock, so it holds - * i_mmap_lock for read to protect against truncate. - */ - i_mmap_unlock_read(vma->vm_file->f_mapping); + dax_unlock_mapping_entry(vma->vm_file->f_mapping, pgoff); } return ret; uncharge_out: @@ -3082,7 +3080,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; - ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page); + ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; -- cgit v1.2.3 From 4d9a2c8746671efbb0c27d3ae28c7474597a7aad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 12 May 2016 18:29:20 +0200 Subject: dax: Remove i_mmap_lock protection Currently faults are protected against truncate by filesystem specific i_mmap_sem and page lock in case of hole page. Cow faults are protected DAX radix tree entry locking. So there's no need for i_mmap_lock in DAX code. Remove it. Reviewed-by: Ross Zwisler Signed-off-by: Jan Kara Signed-off-by: Ross Zwisler --- fs/dax.c | 24 +++++------------------- mm/memory.c | 2 -- 2 files changed, 5 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/fs/dax.c b/fs/dax.c index be74635e05a6..6dbe6021cab7 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -798,29 +798,19 @@ static int dax_insert_mapping(struct address_space *mapping, .sector = to_sector(bh, mapping->host), .size = bh->b_size, }; - int error; void *ret; void *entry = *entryp; - i_mmap_lock_read(mapping); - - if (dax_map_atomic(bdev, &dax) < 0) { - error = PTR_ERR(dax.addr); - goto out; - } + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); - if (IS_ERR(ret)) { - error = PTR_ERR(ret); - goto out; - } + if (IS_ERR(ret)) + return PTR_ERR(ret); *entryp = ret; - error = vm_insert_mixed(vma, vaddr, dax.pfn); - out: - i_mmap_unlock_read(mapping); - return error; + return vm_insert_mixed(vma, vaddr, dax.pfn); } /** @@ -1058,8 +1048,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, truncate_pagecache_range(inode, lstart, lend); } - i_mmap_lock_read(mapping); - if (!write && !buffer_mapped(&bh)) { spinlock_t *ptl; pmd_t entry; @@ -1148,8 +1136,6 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } out: - i_mmap_unlock_read(mapping); - return result; fallback: diff --git a/mm/memory.c b/mm/memory.c index f09cdb8d48fa..06f552504e79 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2453,8 +2453,6 @@ void unmap_mapping_range(struct address_space *mapping, if (details.last_index < details.first_index) details.last_index = ULONG_MAX; - - /* DAX uses i_mmap_lock to serialise file truncate vs page fault */ i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) unmap_mapping_range_tree(&mapping->i_mmap, &details); -- cgit v1.2.3 From 18726ca8b34bbfb3ab5a1c0a52a5d8dd392466ed Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:02 -0700 Subject: mm/slab: fix the theoretical race by holding proper lock While processing concurrent allocation, SLAB could be contended a lot because it did a lots of work with holding a lock. This patchset try to reduce the number of critical section to reduce lock contention. Major changes are lockless decision to allocate more slab and lockless cpu cache refill from the newly allocated slab. Below is the result of concurrent allocation/free in slab allocation benchmark made by Christoph a long time ago. I make the output simpler. The number shows cycle count during alloc/free respectively so less is better. * Before Kmalloc N*alloc N*free(32): Average=365/806 Kmalloc N*alloc N*free(64): Average=452/690 Kmalloc N*alloc N*free(128): Average=736/886 Kmalloc N*alloc N*free(256): Average=1167/985 Kmalloc N*alloc N*free(512): Average=2088/1125 Kmalloc N*alloc N*free(1024): Average=4115/1184 Kmalloc N*alloc N*free(2048): Average=8451/1748 Kmalloc N*alloc N*free(4096): Average=16024/2048 * After Kmalloc N*alloc N*free(32): Average=344/792 Kmalloc N*alloc N*free(64): Average=347/882 Kmalloc N*alloc N*free(128): Average=390/959 Kmalloc N*alloc N*free(256): Average=393/1067 Kmalloc N*alloc N*free(512): Average=683/1229 Kmalloc N*alloc N*free(1024): Average=1295/1325 Kmalloc N*alloc N*free(2048): Average=2513/1664 Kmalloc N*alloc N*free(4096): Average=4742/2172 It shows that performance improves greatly (roughly more than 50%) for the object class whose size is more than 128 bytes. This patch (of 11): If we don't hold neither the slab_mutex nor the node lock, node's shared array cache could be freed and re-populated. If __kmem_cache_shrink() is called at the same time, it will call drain_array() with n->shared without holding node lock so problem can happen. This patch fix the situation by holding the node lock before trying to drain the shared array. In addition, add a debug check to confirm that n->shared access race doesn't exist. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 68 ++++++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 17e2848979c5..3f1cc1ca4d88 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2180,6 +2180,11 @@ static void check_irq_on(void) BUG_ON(irqs_disabled()); } +static void check_mutex_acquired(void) +{ + BUG_ON(!mutex_is_locked(&slab_mutex)); +} + static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP @@ -2199,13 +2204,27 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) #else #define check_irq_off() do { } while(0) #define check_irq_on() do { } while(0) +#define check_mutex_acquired() do { } while(0) #define check_spinlock_acquired(x) do { } while(0) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, - int force, int node); +static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac, + int node, bool free_all, struct list_head *list) +{ + int tofree; + + if (!ac || !ac->avail) + return; + + tofree = free_all ? ac->avail : (ac->limit + 4) / 5; + if (tofree > ac->avail) + tofree = (ac->avail + 1) / 2; + + free_block(cachep, ac->entry, tofree, node, list); + ac->avail -= tofree; + memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); +} static void do_drain(void *arg) { @@ -2229,6 +2248,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) { struct kmem_cache_node *n; int node; + LIST_HEAD(list); on_each_cpu(do_drain, cachep, 1); check_irq_on(); @@ -2236,8 +2256,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep) if (n->alien) drain_alien_cache(cachep, n->alien); - for_each_kmem_cache_node(cachep, node, n) - drain_array(cachep, n, n->shared, 1, node); + for_each_kmem_cache_node(cachep, node, n) { + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, n->shared, node, true, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); + } } /* @@ -3869,29 +3894,26 @@ skip_setup: * if drain_array() is used on the shared array. */ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, - struct array_cache *ac, int force, int node) + struct array_cache *ac, int node) { LIST_HEAD(list); - int tofree; + + /* ac from n->shared can be freed if we don't hold the slab_mutex. */ + check_mutex_acquired(); if (!ac || !ac->avail) return; - if (ac->touched && !force) { + + if (ac->touched) { ac->touched = 0; - } else { - spin_lock_irq(&n->list_lock); - if (ac->avail) { - tofree = force ? ac->avail : (ac->limit + 4) / 5; - if (tofree > ac->avail) - tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node, &list); - ac->avail -= tofree; - memmove(ac->entry, &(ac->entry[tofree]), - sizeof(void *) * ac->avail); - } - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); + return; } + + spin_lock_irq(&n->list_lock); + drain_array_locked(cachep, ac, node, false, &list); + spin_unlock_irq(&n->list_lock); + + slabs_destroy(cachep, &list); } /** @@ -3929,7 +3951,7 @@ static void cache_reap(struct work_struct *w) reap_alien(searchp, n); - drain_array(searchp, n, cpu_cache_get(searchp), 0, node); + drain_array(searchp, n, cpu_cache_get(searchp), node); /* * These are racy checks but it does not matter @@ -3940,7 +3962,7 @@ static void cache_reap(struct work_struct *w) n->next_reap = jiffies + REAPTIMEOUT_NODE; - drain_array(searchp, n, n->shared, 0, node); + drain_array(searchp, n, n->shared, node); if (n->free_touched) n->free_touched = 0; -- cgit v1.2.3 From 8888177ea116d4d14ca0a2ba054d02f35b0dae29 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:05 -0700 Subject: mm/slab: remove BAD_ALIEN_MAGIC again Initial attemp to remove BAD_ALIEN_MAGIC is once reverted by 'commit edcad2509550 ("Revert "slab: remove BAD_ALIEN_MAGIC"")' because it causes a problem on m68k which has many node but !CONFIG_NUMA. In this case, although alien cache isn't used at all but to cope with some initialization path, garbage value is used and that is BAD_ALIEN_MAGIC. Now, this patch set use_alien_caches to 0 when !CONFIG_NUMA, there is no initialization path problem so we don't need BAD_ALIEN_MAGIC at all. So remove it. Signed-off-by: Joonsoo Kim Tested-by: Geert Uytterhoeven Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3f1cc1ca4d88..f36d3493f49f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -421,8 +421,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -644,7 +642,7 @@ static int transfer_objects(struct array_cache *to, static inline struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - return (struct alien_cache **)BAD_ALIEN_MAGIC; + return NULL; } static inline void free_alien_cache(struct alien_cache **ac_ptr) @@ -1212,7 +1210,7 @@ void __init kmem_cache_init(void) sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; - if (num_possible_nodes() == 1) + if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) use_alien_caches = 0; for (i = 0; i < NUM_INIT_LISTS; i++) -- cgit v1.2.3 From a5aa63a5f7352aa8991f64d46854dcb8d3788d55 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:08 -0700 Subject: mm/slab: drain the free slab as much as possible slabs_tofree() implies freeing all free slab. We can do it with just providing INT_MAX. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f36d3493f49f..a998d35599a3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -895,12 +895,6 @@ static int init_cache_node_node(int node) return 0; } -static inline int slabs_tofree(struct kmem_cache *cachep, - struct kmem_cache_node *n) -{ - return (n->free_objects + cachep->num - 1) / cachep->num; -} - static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -965,7 +959,7 @@ free_slab: n = get_node(cachep, node); if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); } } @@ -1117,7 +1111,7 @@ static int __meminit drain_cache_node_node(int node) if (!n) continue; - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); if (!list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial)) { @@ -2311,7 +2305,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) check_irq_on(); for_each_kmem_cache_node(cachep, node, n) { - drain_freelist(cachep, n, slabs_tofree(cachep, n)); + drain_freelist(cachep, n, INT_MAX); ret += !list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial); -- cgit v1.2.3 From ded0ecf61118930988f0943e741056c8fd5d439c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:11 -0700 Subject: mm/slab: factor out kmem_cache_node initialization code It can be reused on other place, so factor out it. Following patch will use it. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 74 ++++++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index a998d35599a3..9bef33bc4daa 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -848,6 +848,46 @@ static inline gfp_t gfp_exact_node(gfp_t flags) } #endif +static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) +{ + struct kmem_cache_node *n; + + /* + * Set up the kmem_cache_node for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + n = get_node(cachep, node); + if (n) { + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + + cachep->num; + spin_unlock_irq(&n->list_lock); + + return 0; + } + + n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); + if (!n) + return -ENOMEM; + + kmem_cache_node_init(n); + n->next_reap = jiffies + REAPTIMEOUT_NODE + + ((unsigned long)cachep) % REAPTIMEOUT_NODE; + + n->free_limit = + (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; + + /* + * The kmem_cache_nodes don't come and go as CPUs + * come and go. slab_mutex is sufficient + * protection here. + */ + cachep->node[node] = n; + + return 0; +} + /* * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node @@ -859,39 +899,15 @@ static inline gfp_t gfp_exact_node(gfp_t flags) */ static int init_cache_node_node(int node) { + int ret; struct kmem_cache *cachep; - struct kmem_cache_node *n; - const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { - /* - * Set up the kmem_cache_node for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - n = get_node(cachep, node); - if (!n) { - n = kmalloc_node(memsize, GFP_KERNEL, node); - if (!n) - return -ENOMEM; - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - - /* - * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex is sufficient - * protection here. - */ - cachep->node[node] = n; - } - - spin_lock_irq(&n->list_lock); - n->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + ret = init_cache_node(cachep, node, GFP_KERNEL); + if (ret) + return ret; } + return 0; } -- cgit v1.2.3 From c3d332b6b2c11ddda9cce3e2f3135b68929d4b82 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:14 -0700 Subject: mm/slab: clean-up kmem_cache_node setup There are mostly same code for setting up kmem_cache_node either in cpuup_prepare() or alloc_kmem_cache_node(). Factor out and clean-up them. Signed-off-by: Joonsoo Kim Tested-by: Nishanth Menon Tested-by: Jon Hunter Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 168 +++++++++++++++++++++++++------------------------------------- 1 file changed, 68 insertions(+), 100 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 9bef33bc4daa..f1db679c2b5d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -911,6 +911,63 @@ static int init_cache_node_node(int node) return 0; } +static int setup_kmem_cache_node(struct kmem_cache *cachep, + int node, gfp_t gfp, bool force_change) +{ + int ret = -ENOMEM; + struct kmem_cache_node *n; + struct array_cache *old_shared = NULL; + struct array_cache *new_shared = NULL; + struct alien_cache **new_alien = NULL; + LIST_HEAD(list); + + if (use_alien_caches) { + new_alien = alloc_alien_cache(node, cachep->limit, gfp); + if (!new_alien) + goto fail; + } + + if (cachep->shared) { + new_shared = alloc_arraycache(node, + cachep->shared * cachep->batchcount, 0xbaadf00d, gfp); + if (!new_shared) + goto fail; + } + + ret = init_cache_node(cachep, node, gfp); + if (ret) + goto fail; + + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + if (n->shared && force_change) { + free_block(cachep, n->shared->entry, + n->shared->avail, node, &list); + n->shared->avail = 0; + } + + if (!n->shared || force_change) { + old_shared = n->shared; + n->shared = new_shared; + new_shared = NULL; + } + + if (!n->alien) { + n->alien = new_alien; + new_alien = NULL; + } + + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); + +fail: + kfree(old_shared); + kfree(new_shared); + free_alien_cache(new_alien); + + return ret; +} + static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -982,7 +1039,6 @@ free_slab: static int cpuup_prepare(long cpu) { struct kmem_cache *cachep; - struct kmem_cache_node *n = NULL; int node = cpu_to_mem(cpu); int err; @@ -1001,44 +1057,9 @@ static int cpuup_prepare(long cpu) * array caches */ list_for_each_entry(cachep, &slab_caches, list) { - struct array_cache *shared = NULL; - struct alien_cache **alien = NULL; - - if (cachep->shared) { - shared = alloc_arraycache(node, - cachep->shared * cachep->batchcount, - 0xbaadf00d, GFP_KERNEL); - if (!shared) - goto bad; - } - if (use_alien_caches) { - alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL); - if (!alien) { - kfree(shared); - goto bad; - } - } - n = get_node(cachep, node); - BUG_ON(!n); - - spin_lock_irq(&n->list_lock); - if (!n->shared) { - /* - * We are serialised from CPU_DEAD or - * CPU_UP_CANCELLED by the cpucontrol lock - */ - n->shared = shared; - shared = NULL; - } -#ifdef CONFIG_NUMA - if (!n->alien) { - n->alien = alien; - alien = NULL; - } -#endif - spin_unlock_irq(&n->list_lock); - kfree(shared); - free_alien_cache(alien); + err = setup_kmem_cache_node(cachep, node, GFP_KERNEL, false); + if (err) + goto bad; } return 0; @@ -3678,72 +3699,19 @@ EXPORT_SYMBOL(kfree); /* * This initializes kmem_cache_node or resizes various caches for all nodes. */ -static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) +static int setup_kmem_cache_nodes(struct kmem_cache *cachep, gfp_t gfp) { + int ret; int node; struct kmem_cache_node *n; - struct array_cache *new_shared; - struct alien_cache **new_alien = NULL; for_each_online_node(node) { - - if (use_alien_caches) { - new_alien = alloc_alien_cache(node, cachep->limit, gfp); - if (!new_alien) - goto fail; - } - - new_shared = NULL; - if (cachep->shared) { - new_shared = alloc_arraycache(node, - cachep->shared*cachep->batchcount, - 0xbaadf00d, gfp); - if (!new_shared) { - free_alien_cache(new_alien); - goto fail; - } - } - - n = get_node(cachep, node); - if (n) { - struct array_cache *shared = n->shared; - LIST_HEAD(list); - - spin_lock_irq(&n->list_lock); - - if (shared) - free_block(cachep, shared->entry, - shared->avail, node, &list); - - n->shared = new_shared; - if (!n->alien) { - n->alien = new_alien; - new_alien = NULL; - } - n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); - slabs_destroy(cachep, &list); - kfree(shared); - free_alien_cache(new_alien); - continue; - } - n = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - if (!n) { - free_alien_cache(new_alien); - kfree(new_shared); + ret = setup_kmem_cache_node(cachep, node, gfp, true); + if (ret) goto fail; - } - kmem_cache_node_init(n); - n->next_reap = jiffies + REAPTIMEOUT_NODE + - ((unsigned long)cachep) % REAPTIMEOUT_NODE; - n->shared = new_shared; - n->alien = new_alien; - n->free_limit = (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - cachep->node[node] = n; } + return 0; fail: @@ -3785,7 +3753,7 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; if (!prev) - goto alloc_node; + goto setup_node; for_each_online_cpu(cpu) { LIST_HEAD(list); @@ -3802,8 +3770,8 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, } free_percpu(prev); -alloc_node: - return alloc_kmem_cache_node(cachep, gfp); +setup_node: + return setup_kmem_cache_nodes(cachep, gfp); } static int do_tune_cpucache(struct kmem_cache *cachep, int limit, -- cgit v1.2.3 From 6052b7880a95554993898f7cac075c2669f1dd7c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:17 -0700 Subject: mm/slab: don't keep free slabs if free_objects exceeds free_limit Currently, determination to free a slab is done whenever each freed object is put into the slab. This has a following problem. Assume free_limit = 10 and nr_free = 9. Free happens as following sequence and nr_free changes as following. free(become a free slab) free(not become a free slab) nr_free: 9 -> 10 (at first free) -> 11 (at second free) If we try to check if we can free current slab or not on each object free, we can't free any slab in this situation because current slab isn't a free slab when nr_free exceed free_limit (at second free) even if there is a free slab. However, if we check it lastly, we can free 1 free slab. This problem would cause to keep too much memory in the slab subsystem. This patch try to fix it by checking number of free object after all free work is done. If there is free slab at that time, we can free slab as much as possible so we keep free slab as minimal. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index f1db679c2b5d..3f16475b7189 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3296,6 +3296,9 @@ static void free_block(struct kmem_cache *cachep, void **objpp, { int i; struct kmem_cache_node *n = get_node(cachep, node); + struct page *page; + + n->free_objects += nr_objects; for (i = 0; i < nr_objects; i++) { void *objp; @@ -3308,17 +3311,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp, check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp); STATS_DEC_ACTIVE(cachep); - n->free_objects++; /* fixup slab chains */ - if (page->active == 0) { - if (n->free_objects > n->free_limit) { - n->free_objects -= cachep->num; - list_add_tail(&page->lru, list); - } else { - list_add(&page->lru, &n->slabs_free); - } - } else { + if (page->active == 0) + list_add(&page->lru, &n->slabs_free); + else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3326,6 +3323,14 @@ static void free_block(struct kmem_cache *cachep, void **objpp, list_add_tail(&page->lru, &n->slabs_partial); } } + + while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) { + n->free_objects -= cachep->num; + + page = list_last_entry(&n->slabs_free, struct page, lru); + list_del(&page->lru); + list_add(&page->lru, list); + } } static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) -- cgit v1.2.3 From 03d1d43a1262b347a9aa814980438fff8eb32edc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:20 -0700 Subject: mm/slab: racy access/modify the slab color Slab color isn't needed to be changed strictly. Because locking for changing slab color could cause more lock contention so this patch implements racy access/modify the slab color. This is a preparation step to implement lockless allocation path when there is no free objects in the kmem_cache. Below is the result of concurrent allocation/free in slab allocation benchmark made by Christoph a long time ago. I make the output simpler. The number shows cycle count during alloc/free respectively so less is better. * Before Kmalloc N*alloc N*free(32): Average=365/806 Kmalloc N*alloc N*free(64): Average=452/690 Kmalloc N*alloc N*free(128): Average=736/886 Kmalloc N*alloc N*free(256): Average=1167/985 Kmalloc N*alloc N*free(512): Average=2088/1125 Kmalloc N*alloc N*free(1024): Average=4115/1184 Kmalloc N*alloc N*free(2048): Average=8451/1748 Kmalloc N*alloc N*free(4096): Average=16024/2048 * After Kmalloc N*alloc N*free(32): Average=355/750 Kmalloc N*alloc N*free(64): Average=452/812 Kmalloc N*alloc N*free(128): Average=559/1070 Kmalloc N*alloc N*free(256): Average=1176/980 Kmalloc N*alloc N*free(512): Average=1939/1189 Kmalloc N*alloc N*free(1024): Average=3521/1278 Kmalloc N*alloc N*free(2048): Average=7152/1838 Kmalloc N*alloc N*free(4096): Average=13438/2013 It shows that contention is reduced for object size >= 1024 and performance increases by roughly 15%. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Jesper Dangaard Brouer Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 3f16475b7189..e181cfbf026b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2574,20 +2574,7 @@ static int cache_grow(struct kmem_cache *cachep, } local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = get_node(cachep, nodeid); - spin_lock(&n->list_lock); - - /* Get colour for the slab, and cal the next value. */ - offset = n->colour_next; - n->colour_next++; - if (n->colour_next >= cachep->colour) - n->colour_next = 0; - spin_unlock(&n->list_lock); - - offset *= cachep->colour_off; - if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); @@ -2608,6 +2595,19 @@ static int cache_grow(struct kmem_cache *cachep, if (!page) goto failed; + n = get_node(cachep, nodeid); + + /* Get colour for the slab, and cal the next value. */ + n->colour_next++; + if (n->colour_next >= cachep->colour) + n->colour_next = 0; + + offset = n->colour_next; + if (offset >= cachep->colour) + offset = 0; + + offset *= cachep->colour_off; + /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); -- cgit v1.2.3 From 511e3a05881221a7fc63e36f3d604887040fc845 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:23 -0700 Subject: mm/slab: make cache_grow() handle the page allocated on arbitrary node Currently, cache_grow() assumes that allocated page's nodeid would be same with parameter nodeid which is used for allocation request. If we discard this assumption, we can handle fallback_alloc() case gracefully. So, this patch makes cache_grow() handle the page allocated on arbitrary node and clean-up relevant code. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 60 +++++++++++++++++++++--------------------------------------- 1 file changed, 21 insertions(+), 39 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e181cfbf026b..b303c04c1565 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2556,13 +2556,14 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, struct page *page) +static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *freelist; size_t offset; gfp_t local_flags; + int page_node; struct kmem_cache_node *n; + struct page *page; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -2590,12 +2591,12 @@ static int cache_grow(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!page) - page = kmem_getpages(cachep, local_flags, nodeid); + page = kmem_getpages(cachep, local_flags, nodeid); if (!page) goto failed; - n = get_node(cachep, nodeid); + page_node = page_to_nid(page); + n = get_node(cachep, page_node); /* Get colour for the slab, and cal the next value. */ n->colour_next++; @@ -2610,7 +2611,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Get slab management. */ freelist = alloc_slabmgmt(cachep, page, offset, - local_flags & ~GFP_CONSTRAINT_MASK, nodeid); + local_flags & ~GFP_CONSTRAINT_MASK, page_node); if (OFF_SLAB(cachep) && !freelist) goto opps1; @@ -2629,13 +2630,13 @@ static int cache_grow(struct kmem_cache *cachep, STATS_INC_GROWN(cachep); n->free_objects += cachep->num; spin_unlock(&n->list_lock); - return 1; + return page_node; opps1: kmem_freepages(cachep, page); failed: if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - return 0; + return -1; } #if DEBUG @@ -2916,14 +2917,14 @@ alloc_done: return obj; } - x = cache_grow(cachep, gfp_exact_node(flags), node, NULL); + x = cache_grow(cachep, gfp_exact_node(flags), node); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); node = numa_mem_id(); /* no objects in sight? abort */ - if (!x && ac->avail == 0) + if (x < 0 && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -3052,7 +3053,6 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) { struct zonelist *zonelist; - gfp_t local_flags; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); @@ -3063,8 +3063,6 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; - local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); zonelist = node_zonelist(mempolicy_slab_node(), flags); @@ -3094,33 +3092,17 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. */ - struct page *page; + nid = cache_grow(cache, flags, numa_mem_id()); + if (nid >= 0) { + obj = ____cache_alloc_node(cache, + gfp_exact_node(flags), nid); - if (gfpflags_allow_blocking(local_flags)) - local_irq_enable(); - kmem_flagcheck(cache, flags); - page = kmem_getpages(cache, local_flags, numa_mem_id()); - if (gfpflags_allow_blocking(local_flags)) - local_irq_disable(); - if (page) { /* - * Insert into the appropriate per node queues + * Another processor may allocate the objects in + * the slab since we are not holding any locks. */ - nid = page_to_nid(page); - if (cache_grow(cache, flags, nid, page)) { - obj = ____cache_alloc_node(cache, - gfp_exact_node(flags), nid); - if (!obj) - /* - * Another processor may allocate the - * objects in the slab since we are - * not holding any locks. - */ - goto retry; - } else { - /* cache_grow already freed obj */ - obj = NULL; - } + if (!obj) + goto retry; } } @@ -3171,8 +3153,8 @@ retry: must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, gfp_exact_node(flags), nodeid, NULL); - if (x) + x = cache_grow(cachep, gfp_exact_node(flags), nodeid); + if (x >= 0) goto retry; return fallback_alloc(cachep, flags); -- cgit v1.2.3 From 76b342bdc71badea2cbac7bf6590aa86e895c507 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:26 -0700 Subject: mm/slab: separate cache_grow() to two parts This is a preparation step to implement lockless allocation path when there is no free objects in kmem_cache. What we'd like to do here is to refill cpu cache without holding a node lock. To accomplish this purpose, refill should be done after new slab allocation but before attaching the slab to the management list. So, this patch separates cache_grow() to two parts, allocation and attaching to the list in order to add some code inbetween them in the following patch. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 74 ++++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index b303c04c1565..8c4db214b05b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -213,6 +213,11 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); +static inline void fixup_objfreelist_debug(struct kmem_cache *cachep, + void **list); +static inline void fixup_slab_list(struct kmem_cache *cachep, + struct kmem_cache_node *n, struct page *page, + void **list); static int slab_early_init = 1; #define INDEX_NODE kmalloc_index(sizeof(struct kmem_cache_node)) @@ -1810,7 +1815,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, /* * Needed to avoid possible looping condition - * in cache_grow() + * in cache_grow_begin() */ if (OFF_SLAB(freelist_cache)) continue; @@ -2556,7 +2561,8 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, * Grow (by 1) the number of slabs within a cache. This is called by * kmem_cache_alloc() when there are no active objs left in a cache. */ -static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) +static struct page *cache_grow_begin(struct kmem_cache *cachep, + gfp_t flags, int nodeid) { void *freelist; size_t offset; @@ -2622,21 +2628,40 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - check_irq_off(); - spin_lock(&n->list_lock); - /* Make slab active. */ - list_add_tail(&page->lru, &(n->slabs_free)); - STATS_INC_GROWN(cachep); - n->free_objects += cachep->num; - spin_unlock(&n->list_lock); - return page_node; + return page; + opps1: kmem_freepages(cachep, page); failed: if (gfpflags_allow_blocking(local_flags)) local_irq_disable(); - return -1; + return NULL; +} + +static void cache_grow_end(struct kmem_cache *cachep, struct page *page) +{ + struct kmem_cache_node *n; + void *list = NULL; + + check_irq_off(); + + if (!page) + return; + + INIT_LIST_HEAD(&page->lru); + n = get_node(cachep, page_to_nid(page)); + + spin_lock(&n->list_lock); + if (!page->active) + list_add_tail(&page->lru, &(n->slabs_free)); + else + fixup_slab_list(cachep, n, page, &list); + STATS_INC_GROWN(cachep); + n->free_objects += cachep->num - page->active; + spin_unlock(&n->list_lock); + + fixup_objfreelist_debug(cachep, &list); } #if DEBUG @@ -2847,6 +2872,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) struct array_cache *ac; int node; void *list = NULL; + struct page *page; check_irq_off(); node = numa_mem_id(); @@ -2874,7 +2900,6 @@ retry: } while (batchcount > 0) { - struct page *page; /* Get slab alloc is to come from. */ page = get_first_slab(n, false); if (!page) @@ -2907,8 +2932,6 @@ alloc_done: fixup_objfreelist_debug(cachep, &list); if (unlikely(!ac->avail)) { - int x; - /* Check if we can use obj in pfmemalloc slab */ if (sk_memalloc_socks()) { void *obj = cache_alloc_pfmemalloc(cachep, n, flags); @@ -2917,14 +2940,18 @@ alloc_done: return obj; } - x = cache_grow(cachep, gfp_exact_node(flags), node); + page = cache_grow_begin(cachep, gfp_exact_node(flags), node); + cache_grow_end(cachep, page); - /* cache_grow can reenable interrupts, then ac could change. */ + /* + * cache_grow_begin() can reenable interrupts, + * then ac could change. + */ ac = cpu_cache_get(cachep); node = numa_mem_id(); /* no objects in sight? abort */ - if (x < 0 && ac->avail == 0) + if (!page && ac->avail == 0) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ @@ -3057,6 +3084,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); void *obj = NULL; + struct page *page; int nid; unsigned int cpuset_mems_cookie; @@ -3092,8 +3120,10 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. */ - nid = cache_grow(cache, flags, numa_mem_id()); - if (nid >= 0) { + page = cache_grow_begin(cache, flags, numa_mem_id()); + cache_grow_end(cache, page); + if (page) { + nid = page_to_nid(page); obj = ____cache_alloc_node(cache, gfp_exact_node(flags), nid); @@ -3121,7 +3151,6 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, struct kmem_cache_node *n; void *obj; void *list = NULL; - int x; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); @@ -3153,8 +3182,9 @@ retry: must_grow: spin_unlock(&n->list_lock); - x = cache_grow(cachep, gfp_exact_node(flags), nodeid); - if (x >= 0) + page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + cache_grow_end(cachep, page); + if (page) goto retry; return fallback_alloc(cachep, flags); -- cgit v1.2.3 From 213b46958c65c7adaaf3201102da16ce0264e9cf Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:29 -0700 Subject: mm/slab: refill cpu cache through a new slab without holding a node lock Until now, cache growing makes a free slab on node's slab list and then we can allocate free objects from it. This necessarily requires to hold a node lock which is very contended. If we refill cpu cache before attaching it to node's slab list, we can avoid holding a node lock as much as possible because this newly allocated slab is only visible to the current task. This will reduce lock contention. Below is the result of concurrent allocation/free in slab allocation benchmark made by Christoph a long time ago. I make the output simpler. The number shows cycle count during alloc/free respectively so less is better. * Before Kmalloc N*alloc N*free(32): Average=355/750 Kmalloc N*alloc N*free(64): Average=452/812 Kmalloc N*alloc N*free(128): Average=559/1070 Kmalloc N*alloc N*free(256): Average=1176/980 Kmalloc N*alloc N*free(512): Average=1939/1189 Kmalloc N*alloc N*free(1024): Average=3521/1278 Kmalloc N*alloc N*free(2048): Average=7152/1838 Kmalloc N*alloc N*free(4096): Average=13438/2013 * After Kmalloc N*alloc N*free(32): Average=248/966 Kmalloc N*alloc N*free(64): Average=261/949 Kmalloc N*alloc N*free(128): Average=314/1016 Kmalloc N*alloc N*free(256): Average=741/1061 Kmalloc N*alloc N*free(512): Average=1246/1152 Kmalloc N*alloc N*free(1024): Average=2437/1259 Kmalloc N*alloc N*free(2048): Average=4980/1800 Kmalloc N*alloc N*free(4096): Average=9000/2078 It shows that contention is reduced for all the object sizes and performance increases by 30 ~ 40%. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 68 +++++++++++++++++++++++++++++++++------------------------------ 1 file changed, 36 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 8c4db214b05b..37600e91742f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2865,6 +2865,30 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, return obj; } +/* + * Slab list should be fixed up by fixup_slab_list() for existing slab + * or cache_grow_end() for new slab + */ +static __always_inline int alloc_block(struct kmem_cache *cachep, + struct array_cache *ac, struct page *page, int batchcount) +{ + /* + * There must be at least one object available for + * allocation. + */ + BUG_ON(page->active >= cachep->num); + + while (page->active < cachep->num && batchcount--) { + STATS_INC_ALLOCED(cachep); + STATS_INC_ACTIVE(cachep); + STATS_SET_HIGH(cachep); + + ac->entry[ac->avail++] = slab_get_obj(cachep, page); + } + + return batchcount; +} + static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; @@ -2877,7 +2901,6 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) check_irq_off(); node = numa_mem_id(); -retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2907,21 +2930,7 @@ retry: check_spinlock_acquired(cachep); - /* - * The slab was either on partial or free list so - * there must be at least one object available for - * allocation. - */ - BUG_ON(page->active >= cachep->num); - - while (page->active < cachep->num && batchcount--) { - STATS_INC_ALLOCED(cachep); - STATS_INC_ACTIVE(cachep); - STATS_SET_HIGH(cachep); - - ac->entry[ac->avail++] = slab_get_obj(cachep, page); - } - + batchcount = alloc_block(cachep, ac, page, batchcount); fixup_slab_list(cachep, n, page, &list); } @@ -2941,21 +2950,18 @@ alloc_done: } page = cache_grow_begin(cachep, gfp_exact_node(flags), node); - cache_grow_end(cachep, page); /* * cache_grow_begin() can reenable interrupts, * then ac could change. */ ac = cpu_cache_get(cachep); - node = numa_mem_id(); + if (!ac->avail && page) + alloc_block(cachep, ac, page, batchcount); + cache_grow_end(cachep, page); - /* no objects in sight? abort */ - if (!page && ac->avail == 0) + if (!ac->avail) return NULL; - - if (!ac->avail) /* objects refilled by interrupt? */ - goto retry; } ac->touched = 1; @@ -3149,14 +3155,13 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, { struct page *page; struct kmem_cache_node *n; - void *obj; + void *obj = NULL; void *list = NULL; VM_BUG_ON(nodeid < 0 || nodeid >= MAX_NUMNODES); n = get_node(cachep, nodeid); BUG_ON(!n); -retry: check_irq_off(); spin_lock(&n->list_lock); page = get_first_slab(n, false); @@ -3178,19 +3183,18 @@ retry: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); - goto done; + return obj; must_grow: spin_unlock(&n->list_lock); page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); + if (page) { + /* This slab isn't counted yet so don't update free_objects */ + obj = slab_get_obj(cachep, page); + } cache_grow_end(cachep, page); - if (page) - goto retry; - return fallback_alloc(cachep, flags); - -done: - return obj; + return obj ? obj : fallback_alloc(cachep, flags); } static __always_inline void * -- cgit v1.2.3 From 801faf0db8947e01877920e848a4d338dd7a99e7 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:31 -0700 Subject: mm/slab: lockless decision to grow cache To check whether free objects exist or not precisely, we need to grab a lock. But, accuracy isn't that important because race window would be even small and if there is too much free object, cache reaper would reap it. So, this patch makes the check for free object exisistence not to hold a lock. This will reduce lock contention in heavily allocation case. Note that until now, n->shared can be freed during the processing by writing slabinfo, but, with some trick in this patch, we can access it freely within interrupt disabled period. Below is the result of concurrent allocation/free in slab allocation benchmark made by Christoph a long time ago. I make the output simpler. The number shows cycle count during alloc/free respectively so less is better. * Before Kmalloc N*alloc N*free(32): Average=248/966 Kmalloc N*alloc N*free(64): Average=261/949 Kmalloc N*alloc N*free(128): Average=314/1016 Kmalloc N*alloc N*free(256): Average=741/1061 Kmalloc N*alloc N*free(512): Average=1246/1152 Kmalloc N*alloc N*free(1024): Average=2437/1259 Kmalloc N*alloc N*free(2048): Average=4980/1800 Kmalloc N*alloc N*free(4096): Average=9000/2078 * After Kmalloc N*alloc N*free(32): Average=344/792 Kmalloc N*alloc N*free(64): Average=347/882 Kmalloc N*alloc N*free(128): Average=390/959 Kmalloc N*alloc N*free(256): Average=393/1067 Kmalloc N*alloc N*free(512): Average=683/1229 Kmalloc N*alloc N*free(1024): Average=1295/1325 Kmalloc N*alloc N*free(2048): Average=2513/1664 Kmalloc N*alloc N*free(4096): Average=4742/2172 It shows that allocation performance decreases for the object size up to 128 and it may be due to extra checks in cache_alloc_refill(). But, with considering improvement of free performance, net result looks the same. Result for other size class looks very promising, roughly, 50% performance improvement. Signed-off-by: Joonsoo Kim Cc: Jesper Dangaard Brouer Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 37600e91742f..8133ebea77a4 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -965,6 +965,15 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); + /* + * To protect lockless access to n->shared during irq disabled context. + * If n->shared isn't NULL in irq disabled context, accessing to it is + * guaranteed to be valid until irq is re-enabled, because it will be + * freed after synchronize_sched(). + */ + if (force_change) + synchronize_sched(); + fail: kfree(old_shared); kfree(new_shared); @@ -2893,7 +2902,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) { int batchcount; struct kmem_cache_node *n; - struct array_cache *ac; + struct array_cache *ac, *shared; int node; void *list = NULL; struct page *page; @@ -2914,11 +2923,16 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); + shared = READ_ONCE(n->shared); + if (!n->free_objects && (!shared || !shared->avail)) + goto direct_grow; + spin_lock(&n->list_lock); + shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ - if (n->shared && transfer_objects(ac, n->shared, batchcount)) { - n->shared->touched = 1; + if (shared && transfer_objects(ac, shared, batchcount)) { + shared->touched = 1; goto alloc_done; } @@ -2940,6 +2954,7 @@ alloc_done: spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); +direct_grow: if (unlikely(!ac->avail)) { /* Check if we can use obj in pfmemalloc slab */ if (sk_memalloc_socks()) { -- cgit v1.2.3 From 81ae6d03952c1bfb96e1a716809bd65e7cd14360 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 19 May 2016 17:10:34 -0700 Subject: mm/slub.c: replace kick_all_cpus_sync() with synchronize_sched() in kmem_cache_shrink() When we call __kmem_cache_shrink on memory cgroup removal, we need to synchronize kmem_cache->cpu_partial update with put_cpu_partial that might be running on other cpus. Currently, we achieve that by using kick_all_cpus_sync, which works as a system wide memory barrier. Though fast it is, this method has a flaw - it issues a lot of IPIs, which might hurt high performance or real-time workloads. To fix this, let's replace kick_all_cpus_sync with synchronize_sched. Although the latter one may take much longer to finish, it shouldn't be a problem in this particular case, because memory cgroups are destroyed asynchronously from a workqueue so that no user visible effects should be introduced. OTOH, it will save us from excessive IPIs when someone removes a cgroup. Anyway, even if using synchronize_sched turns out to take too long, we can always introduce a kind of __kmem_cache_shrink batching so that this method would only be called once per one cgroup destruction (not per each per memcg kmem cache as it is now). Signed-off-by: Vladimir Davydov Reported-by: Peter Zijlstra Suggested-by: Peter Zijlstra Acked-by: Peter Zijlstra (Intel) Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4dbb109eb8cd..ba81cf672f08 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3697,7 +3697,7 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) * s->cpu_partial is checked locklessly (see put_cpu_partial), * so we have to make sure the change is visible. */ - kick_all_cpus_sync(); + synchronize_sched(); } flush_all(s); -- cgit v1.2.3 From c7ce4f60ac199fb3521c5fcd64da21cee801ec2b Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Thu, 19 May 2016 17:10:37 -0700 Subject: mm: SLAB freelist randomization Provides an optional config (CONFIG_SLAB_FREELIST_RANDOM) to randomize the SLAB freelist. The list is randomized during initialization of a new set of pages. The order on different freelist sizes is pre-computed at boot for performance. Each kmem_cache has its own randomized freelist. Before pre-computed lists are available freelists are generated dynamically. This security feature reduces the predictability of the kernel SLAB allocator against heap overflows rendering attacks much less stable. For example this attack against SLUB (also applicable against SLAB) would be affected: https://jon.oberheide.org/blog/2010/09/10/linux-kernel-can-slub-overflow/ Also, since v4.6 the freelist was moved at the end of the SLAB. It means a controllable heap is opened to new attacks not yet publicly discussed. A kernel heap overflow can be transformed to multiple use-after-free. This feature makes this type of attack harder too. To generate entropy, we use get_random_bytes_arch because 0 bits of entropy is available in the boot stage. In the worse case this function will fallback to the get_random_bytes sub API. We also generate a shift random number to shift pre-computed freelist for each new set of pages. The config option name is not specific to the SLAB as this approach will be extended to other allocators like SLUB. Performance results highlighted no major changes: Hackbench (running 90 10 times): Before average: 0.0698 After average: 0.0663 (-5.01%) slab_test 1 run on boot. Difference only seen on the 2048 size test being the worse case scenario covered by freelist randomization. New slab pages are constantly being created on the 10000 allocations. Variance should be mainly due to getting new pages every few allocations. Before: Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 99 cycles kfree -> 112 cycles 10000 times kmalloc(16) -> 109 cycles kfree -> 140 cycles 10000 times kmalloc(32) -> 129 cycles kfree -> 137 cycles 10000 times kmalloc(64) -> 141 cycles kfree -> 141 cycles 10000 times kmalloc(128) -> 152 cycles kfree -> 148 cycles 10000 times kmalloc(256) -> 195 cycles kfree -> 167 cycles 10000 times kmalloc(512) -> 257 cycles kfree -> 199 cycles 10000 times kmalloc(1024) -> 393 cycles kfree -> 251 cycles 10000 times kmalloc(2048) -> 649 cycles kfree -> 228 cycles 10000 times kmalloc(4096) -> 806 cycles kfree -> 370 cycles 10000 times kmalloc(8192) -> 814 cycles kfree -> 411 cycles 10000 times kmalloc(16384) -> 892 cycles kfree -> 455 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 121 cycles 10000 times kmalloc(16)/kfree -> 121 cycles 10000 times kmalloc(32)/kfree -> 121 cycles 10000 times kmalloc(64)/kfree -> 121 cycles 10000 times kmalloc(128)/kfree -> 121 cycles 10000 times kmalloc(256)/kfree -> 119 cycles 10000 times kmalloc(512)/kfree -> 119 cycles 10000 times kmalloc(1024)/kfree -> 119 cycles 10000 times kmalloc(2048)/kfree -> 119 cycles 10000 times kmalloc(4096)/kfree -> 121 cycles 10000 times kmalloc(8192)/kfree -> 119 cycles 10000 times kmalloc(16384)/kfree -> 119 cycles After: Single thread testing ===================== 1. Kmalloc: Repeatedly allocate then free test 10000 times kmalloc(8) -> 130 cycles kfree -> 86 cycles 10000 times kmalloc(16) -> 118 cycles kfree -> 86 cycles 10000 times kmalloc(32) -> 121 cycles kfree -> 85 cycles 10000 times kmalloc(64) -> 176 cycles kfree -> 102 cycles 10000 times kmalloc(128) -> 178 cycles kfree -> 100 cycles 10000 times kmalloc(256) -> 205 cycles kfree -> 109 cycles 10000 times kmalloc(512) -> 262 cycles kfree -> 136 cycles 10000 times kmalloc(1024) -> 342 cycles kfree -> 157 cycles 10000 times kmalloc(2048) -> 701 cycles kfree -> 238 cycles 10000 times kmalloc(4096) -> 803 cycles kfree -> 364 cycles 10000 times kmalloc(8192) -> 835 cycles kfree -> 404 cycles 10000 times kmalloc(16384) -> 896 cycles kfree -> 441 cycles 2. Kmalloc: alloc/free test 10000 times kmalloc(8)/kfree -> 121 cycles 10000 times kmalloc(16)/kfree -> 121 cycles 10000 times kmalloc(32)/kfree -> 123 cycles 10000 times kmalloc(64)/kfree -> 142 cycles 10000 times kmalloc(128)/kfree -> 121 cycles 10000 times kmalloc(256)/kfree -> 119 cycles 10000 times kmalloc(512)/kfree -> 119 cycles 10000 times kmalloc(1024)/kfree -> 119 cycles 10000 times kmalloc(2048)/kfree -> 119 cycles 10000 times kmalloc(4096)/kfree -> 119 cycles 10000 times kmalloc(8192)/kfree -> 119 cycles 10000 times kmalloc(16384)/kfree -> 119 cycles [akpm@linux-foundation.org: propagate gfp_t into cache_random_seq_create()] Signed-off-by: Thomas Garnier Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Kees Cook Cc: Greg Thelen Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 4 ++ init/Kconfig | 9 +++ mm/slab.c | 167 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 178 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 9edbbf352340..8694f7a5d92b 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -80,6 +80,10 @@ struct kmem_cache { struct kasan_cache kasan_info; #endif +#ifdef CONFIG_SLAB_FREELIST_RANDOM + void *random_seq; +#endif + struct kmem_cache_node *node[MAX_NUMNODES]; }; diff --git a/init/Kconfig b/init/Kconfig index 0dfd09d54c65..79a91a2c0444 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1742,6 +1742,15 @@ config SLOB endchoice +config SLAB_FREELIST_RANDOM + default n + depends on SLAB + bool "SLAB freelist randomization" + help + Randomizes the freelist order used on creating new SLABs. This + security feature reduces the predictability of the kernel slab + allocator against heap overflows. + config SLUB_CPU_PARTIAL default y depends on SLUB && SMP diff --git a/mm/slab.c b/mm/slab.c index 8133ebea77a4..1ee26a0d358f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1243,6 +1243,61 @@ static void __init set_up_node(struct kmem_cache *cachep, int index) } } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +static void freelist_randomize(struct rnd_state *state, freelist_idx_t *list, + size_t count) +{ + size_t i; + unsigned int rand; + + for (i = 0; i < count; i++) + list[i] = i; + + /* Fisher-Yates shuffle */ + for (i = count - 1; i > 0; i--) { + rand = prandom_u32_state(state); + rand %= (i + 1); + swap(list[i], list[rand]); + } +} + +/* Create a random sequence per cache */ +static int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + unsigned int seed, count = cachep->num; + struct rnd_state state; + + if (count < 2) + return 0; + + /* If it fails, we will just use the global lists */ + cachep->random_seq = kcalloc(count, sizeof(freelist_idx_t), gfp); + if (!cachep->random_seq) + return -ENOMEM; + + /* Get best entropy at this stage */ + get_random_bytes_arch(&seed, sizeof(seed)); + prandom_seed_state(&state, seed); + + freelist_randomize(&state, cachep->random_seq, count); + return 0; +} + +/* Destroy the per-cache random freelist sequence */ +static void cache_random_seq_destroy(struct kmem_cache *cachep) +{ + kfree(cachep->random_seq); + cachep->random_seq = NULL; +} +#else +static inline int cache_random_seq_create(struct kmem_cache *cachep, gfp_t gfp) +{ + return 0; +} +static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + + /* * Initialisation. Called after the page allocator have been initialised and * before smp_init(). @@ -2374,6 +2429,8 @@ void __kmem_cache_release(struct kmem_cache *cachep) int i; struct kmem_cache_node *n; + cache_random_seq_destroy(cachep); + free_percpu(cachep->cpu_cache); /* NUMA: free the node structures */ @@ -2480,15 +2537,115 @@ static void cache_init_objs_debug(struct kmem_cache *cachep, struct page *page) #endif } +#ifdef CONFIG_SLAB_FREELIST_RANDOM +/* Hold information during a freelist initialization */ +union freelist_init_state { + struct { + unsigned int pos; + freelist_idx_t *list; + unsigned int count; + unsigned int rand; + }; + struct rnd_state rnd_state; +}; + +/* + * Initialize the state based on the randomization methode available. + * return true if the pre-computed list is available, false otherwize. + */ +static bool freelist_state_initialize(union freelist_init_state *state, + struct kmem_cache *cachep, + unsigned int count) +{ + bool ret; + unsigned int rand; + + /* Use best entropy available to define a random shift */ + get_random_bytes_arch(&rand, sizeof(rand)); + + /* Use a random state if the pre-computed list is not available */ + if (!cachep->random_seq) { + prandom_seed_state(&state->rnd_state, rand); + ret = false; + } else { + state->list = cachep->random_seq; + state->count = count; + state->pos = 0; + state->rand = rand; + ret = true; + } + return ret; +} + +/* Get the next entry on the list and randomize it using a random shift */ +static freelist_idx_t next_random_slot(union freelist_init_state *state) +{ + return (state->list[state->pos++] + state->rand) % state->count; +} + +/* + * Shuffle the freelist initialization state based on pre-computed lists. + * return true if the list was successfully shuffled, false otherwise. + */ +static bool shuffle_freelist(struct kmem_cache *cachep, struct page *page) +{ + unsigned int objfreelist = 0, i, count = cachep->num; + union freelist_init_state state; + bool precomputed; + + if (count < 2) + return false; + + precomputed = freelist_state_initialize(&state, cachep, count); + + /* Take a random entry as the objfreelist */ + if (OBJFREELIST_SLAB(cachep)) { + if (!precomputed) + objfreelist = count - 1; + else + objfreelist = next_random_slot(&state); + page->freelist = index_to_obj(cachep, page, objfreelist) + + obj_offset(cachep); + count--; + } + + /* + * On early boot, generate the list dynamically. + * Later use a pre-computed list for speed. + */ + if (!precomputed) { + freelist_randomize(&state.rnd_state, page->freelist, count); + } else { + for (i = 0; i < count; i++) + set_free_obj(page, i, next_random_slot(&state)); + } + + if (OBJFREELIST_SLAB(cachep)) + set_free_obj(page, cachep->num - 1, objfreelist); + + return true; +} +#else +static inline bool shuffle_freelist(struct kmem_cache *cachep, + struct page *page) +{ + return false; +} +#endif /* CONFIG_SLAB_FREELIST_RANDOM */ + static void cache_init_objs(struct kmem_cache *cachep, struct page *page) { int i; void *objp; + bool shuffled; cache_init_objs_debug(cachep, page); - if (OBJFREELIST_SLAB(cachep)) { + /* Try to randomize the freelist if enabled */ + shuffled = shuffle_freelist(cachep, page); + + if (!shuffled && OBJFREELIST_SLAB(cachep)) { page->freelist = index_to_obj(cachep, page, cachep->num - 1) + obj_offset(cachep); } @@ -2502,7 +2659,8 @@ static void cache_init_objs(struct kmem_cache *cachep, kasan_poison_object_data(cachep, objp); } - set_free_obj(page, i, i); + if (!shuffled) + set_free_obj(page, i, i); } } @@ -3841,6 +3999,10 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) int shared = 0; int batchcount = 0; + err = cache_random_seq_create(cachep, gfp); + if (err) + goto end; + if (!is_root_cache(cachep)) { struct kmem_cache *root = memcg_root_cache(cachep); limit = root->limit; @@ -3894,6 +4056,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) batchcount = (limit + 1) / 2; skip_setup: err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); +end: if (err) pr_err("enable_cpucache failed for %s, error %d\n", cachep->name, -err); -- cgit v1.2.3 From a3187e438bc6565d6e54a550a19073d1b453f041 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 19 May 2016 17:10:41 -0700 Subject: mm: slab: remove ZONE_DMA_FLAG Now we have IS_ENABLED helper to check if a Kconfig option is enabled or not, so ZONE_DMA_FLAG sounds no longer useful. And, the use of ZONE_DMA_FLAG in slab looks pointless according to the comment [1] from Johannes Weiner, so remove them and ORing passed in flags with the cache gfp flags has been done in kmem_getpages(). [1] https://lkml.org/lkml/2014/9/25/553 Link: http://lkml.kernel.org/r/1462381297-11009-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 5 ----- mm/slab.c | 23 +---------------------- 2 files changed, 1 insertion(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 989f8f3d77e0..d6e9042b99e0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -268,11 +268,6 @@ config ARCH_ENABLE_HUGEPAGE_MIGRATION config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -config ZONE_DMA_FLAG - int - default "0" if !ZONE_DMA - default "1" - config BOUNCE bool "Enable bounce buffers" default y diff --git a/mm/slab.c b/mm/slab.c index 1ee26a0d358f..d81565a92864 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2236,7 +2236,7 @@ done: cachep->freelist_size = cachep->num * sizeof(freelist_idx_t); cachep->flags = flags; cachep->allocflags = __GFP_COMP; - if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) + if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); @@ -2664,16 +2664,6 @@ static void cache_init_objs(struct kmem_cache *cachep, } } -static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) -{ - if (CONFIG_ZONE_DMA_FLAG) { - if (flags & GFP_DMA) - BUG_ON(!(cachep->allocflags & GFP_DMA)); - else - BUG_ON(cachep->allocflags & GFP_DMA); - } -} - static void *slab_get_obj(struct kmem_cache *cachep, struct page *page) { void *objp; @@ -2752,14 +2742,6 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, if (gfpflags_allow_blocking(local_flags)) local_irq_enable(); - /* - * The test for missing atomic flag is performed here, rather than - * the more obvious place, simply to reduce the critical path length - * in kmem_cache_alloc(). If a caller is seriously mis-behaving they - * will eventually be caught here (where it matters). - */ - kmem_flagcheck(cachep, flags); - /* * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. @@ -3145,9 +3127,6 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags) { might_sleep_if(gfpflags_allow_blocking(flags)); -#if DEBUG - kmem_flagcheck(cachep, flags); -#endif } #if DEBUG -- cgit v1.2.3 From 43efd3ea64f3cf8920e8793e6953321a466023e3 Mon Sep 17 00:00:00 2001 From: Li Peng Date: Thu, 19 May 2016 17:10:43 -0700 Subject: mm/slub.c: fix sysfs filename in comment /sys/kernel/slab/xx/defrag_ratio should be remote_node_defrag_ratio. Link: http://lkml.kernel.org/r/1463449242-5366-1-git-send-email-lip@dtdream.com Signed-off-by: Li Peng Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index ba81cf672f08..8671de2e5b12 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1735,11 +1735,11 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags, * may return off node objects because partial slabs are obtained * from other nodes and filled up. * - * If /sys/kernel/slab/xx/defrag_ratio is set to 100 (which makes - * defrag_ratio = 1000) then every (well almost) allocation will - * first attempt to defrag slab caches on other nodes. This means - * scanning over all nodes to look for partial slabs which may be - * expensive if we do it every time we are trying to find a slab + * If /sys/kernel/slab/xx/remote_node_defrag_ratio is set to 100 + * (which makes defrag_ratio = 1000) then every (well almost) + * allocation will first attempt to defrag slab caches on other nodes. + * This means scanning over all nodes to look for partial slabs which + * may be expensive if we do it every time we are trying to find a slab * with available objects. */ if (!s->remote_node_defrag_ratio || -- cgit v1.2.3 From 6d061f9f6136d477932088c24ce155d7dc785746 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:46 -0700 Subject: mm/page_ref: use page_ref helper instead of direct modification of _count page_reference manipulation functions are introduced to track down reference count change of the page. Use it instead of direct modification of _count. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S. Miller" Cc: Sunil Goutham Cc: Chris Metcalf Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 2 +- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- mm/filemap.c | 2 +- net/wireless/util.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c index 06b819db51b1..0ff8e60deccb 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c @@ -23,7 +23,7 @@ static void nicvf_get_page(struct nicvf *nic) if (!nic->rb_pageref || !nic->rb_page) return; - atomic_add(nic->rb_pageref, &nic->rb_page->_count); + page_ref_add(nic->rb_page, nic->rb_pageref); nic->rb_pageref = 0; } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 8114541f327c..3aabfc0adefe 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -920,7 +920,7 @@ static inline int qede_realloc_rx_buffer(struct qede_dev *edev, * network stack to take the ownership of the page * which can be recycled multiple times by the driver. */ - atomic_inc(&curr_cons->data->_count); + page_ref_inc(curr_cons->data); qede_reuse_page(edev, rxq, curr_cons); } diff --git a/mm/filemap.c b/mm/filemap.c index 182b21825255..01690338e3d2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -213,7 +213,7 @@ void __delete_from_page_cache(struct page *page, void *shadow) * some other bad page check should catch it later. */ page_mapcount_reset(page); - atomic_sub(mapcount, &page->_count); + page_ref_sub(page, mapcount); } } diff --git a/net/wireless/util.c b/net/wireless/util.c index 219bd197039e..4e809e978b7d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -651,7 +651,7 @@ __frame_add_frag(struct sk_buff *skb, struct page *page, struct skb_shared_info *sh = skb_shinfo(skb); int page_offset; - atomic_inc(&page->_count); + page_ref_inc(page); page_offset = ptr - page_address(page); skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size); } -- cgit v1.2.3 From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:10:49 -0700 Subject: mm: rename _count, field of the struct page, to _refcount Many developers already know that field for reference count of the struct page is _count and atomic type. They would try to handle it directly and this could break the purpose of page reference count tracepoint. To prevent direct _count modification, this patch rename it to _refcount and add warning message on the code. After that, developer who need to handle reference count will find that field should not be accessed directly. [akpm@linux-foundation.org: fix comments, per Vlastimil] [akpm@linux-foundation.org: Documentation/vm/transhuge.txt too] [sfr@canb.auug.org.au: sync ethernet driver changes] Signed-off-by: Joonsoo Kim Signed-off-by: Stephen Rothwell Cc: Vlastimil Babka Cc: Hugh Dickins Cc: Johannes Berg Cc: "David S. Miller" Cc: Sunil Goutham Cc: Chris Metcalf Cc: Manish Chopra Cc: Yuval Mintz Cc: Tariq Toukan Cc: Saeed Mahameed Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 10 +++++----- arch/tile/mm/init.c | 2 +- drivers/block/aoe/aoecmd.c | 2 +- drivers/hwtracing/intel_th/msu.c | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 20 +++++++++---------- drivers/net/ethernet/qlogic/qede/qede_main.c | 4 ++-- fs/proc/page.c | 2 +- include/linux/mm.h | 2 +- include/linux/mm_types.h | 14 ++++++++----- include/linux/page_ref.h | 26 ++++++++++++------------- include/linux/pagemap.h | 8 ++++---- kernel/kexec_core.c | 2 +- mm/huge_memory.c | 4 ++-- mm/internal.h | 2 +- mm/page_alloc.c | 4 ++-- mm/slub.c | 4 ++-- mm/vmscan.c | 4 ++-- 17 files changed, 58 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index d9cb65cf5cfd..fb0e1f2a19cc 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -394,9 +394,9 @@ hugepage natively. Once finished you can drop the page table lock. Refcounting on THP is mostly consistent with refcounting on other compound pages: - - get_page()/put_page() and GUP operate in head page's ->_count. + - get_page()/put_page() and GUP operate in head page's ->_refcount. - - ->_count in tail pages is always zero: get_page_unless_zero() never + - ->_refcount in tail pages is always zero: get_page_unless_zero() never succeed on tail pages. - map/unmap of the pages with PTE entry increment/decrement ->_mapcount @@ -426,15 +426,15 @@ requests to split pinned huge page: it expects page count to be equal to sum of mapcount of all sub-pages plus one (split_huge_page caller must have reference for head page). -split_huge_page uses migration entries to stabilize page->_count and +split_huge_page uses migration entries to stabilize page->_refcount and page->_mapcount. We safe against physical memory scanners too: the only legitimate way scanner can get reference to a page is get_page_unless_zero(). -All tail pages has zero ->_count until atomic_add(). It prevent scanner +All tail pages has zero ->_refcount until atomic_add(). It prevent scanner from geting reference to tail page up to the point. After the atomic_add() -we don't care about ->_count value. We already known how many references +we don't care about ->_refcount value. We already known how many references with should uncharge from head page. For head page get_page_unless_zero() will succeed and we don't mind. It's diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c index a0582b7f41d3..adce25462b0d 100644 --- a/arch/tile/mm/init.c +++ b/arch/tile/mm/init.c @@ -679,7 +679,7 @@ static void __init init_free_pfn_range(unsigned long start, unsigned long end) * Hacky direct set to avoid unnecessary * lock take/release for EVERY page here. */ - p->_count.counter = 0; + p->_refcount.counter = 0; p->_mapcount.counter = -1; } init_page_count(page); diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index 437b3a822f44..d597e432e195 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -861,7 +861,7 @@ rqbiocnt(struct request *r) * discussion. * * We cannot use get_page in the workaround, because it insists on a - * positive page count as a precondition. So we use _count directly. + * positive page count as a precondition. So we use _refcount directly. */ static void bio_pageinc(struct bio *bio) diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index d9d6022c5aca..d2209147dc89 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -1164,7 +1164,7 @@ static void msc_mmap_close(struct vm_area_struct *vma) if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex)) return; - /* drop page _counts */ + /* drop page _refcounts */ for (pg = 0; pg < msc->nr_pages; pg++) { struct page *page = msc_buffer_get_page(msc, pg); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index f3456798c596..bd947704b59c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -433,8 +433,8 @@ static int mlx5e_alloc_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { if (unlikely(mlx5e_alloc_and_map_page(rq, wi, i))) goto err_unmap; - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_add(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -452,8 +452,8 @@ err_unmap: while (--i >= 0) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq), - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq)); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -477,8 +477,8 @@ void mlx5e_free_rx_fragmented_mpwqe(struct mlx5e_rq *rq, for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { dma_unmap_page(rq->pdev, wi->umr.dma_info[i].addr, PAGE_SIZE, PCI_DMA_FROMDEVICE); - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->umr.dma_info[i].page->_count); + page_ref_sub(wi->umr.dma_info[i].page, + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(wi->umr.dma_info[i].page); } dma_unmap_single(rq->pdev, wi->umr.mtt_addr, mtt_sz, PCI_DMA_TODEVICE); @@ -527,8 +527,8 @@ static int mlx5e_alloc_rx_linear_mpwqe(struct mlx5e_rq *rq, */ split_page(wi->dma_info.page, MLX5_MPWRQ_WQE_PAGE_ORDER); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_add(mlx5e_mpwqe_strides_per_page(rq), - &wi->dma_info.page[i]._count); + page_ref_add(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq)); wi->skbs_frags[i] = 0; } @@ -551,8 +551,8 @@ void mlx5e_free_rx_linear_mpwqe(struct mlx5e_rq *rq, dma_unmap_page(rq->pdev, wi->dma_info.addr, rq->wqe_sz, PCI_DMA_FROMDEVICE); for (i = 0; i < MLX5_MPWRQ_PAGES_PER_WQE; i++) { - atomic_sub(mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i], - &wi->dma_info.page[i]._count); + page_ref_sub(&wi->dma_info.page[i], + mlx5e_mpwqe_strides_per_page(rq) - wi->skbs_frags[i]); put_page(&wi->dma_info.page[i]); } } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 3aabfc0adefe..73dd525fbf08 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -1036,7 +1036,7 @@ static int qede_fill_frag_skb(struct qede_dev *edev, /* Incr page ref count to reuse on allocation failure * so that it doesn't get freed while freeing SKB. */ - atomic_inc(¤t_bd->data->_count); + page_ref_inc(current_bd->data); goto out; } @@ -1487,7 +1487,7 @@ static int qede_rx_int(struct qede_fastpath *fp, int budget) * freeing SKB. */ - atomic_inc(&sw_rx_data->data->_count); + page_ref_inc(sw_rx_data->data); rxq->rx_alloc_errors++; qede_recycle_rx_bd_ring(rxq, edev, fp_cqe->bd_num); diff --git a/fs/proc/page.c b/fs/proc/page.c index 712f1b9992cc..3ecd445e830d 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -142,7 +142,7 @@ u64 stable_page_flags(struct page *page) /* - * Caveats on high order pages: page->_count will only be set + * Caveats on high order pages: page->_refcount will only be set * -1 on the head page; SLUB/SLQB do the same for PG_slab; * SLOB won't set PG_slab at all on compound pages. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 727f799757ab..1193a54ea2b3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -734,7 +734,7 @@ static inline void get_page(struct page *page) page = compound_head(page); /* * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. + * requires to already have an elevated page->_refcount. */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c2d75b4fa86c..1fda9c99ef95 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,9 +73,9 @@ struct page { unsigned long counters; #else /* - * Keep _count separate from slub cmpxchg_double data. - * As the rest of the double word is protected by - * slab_lock but _count is not. + * Keep _refcount separate from slub cmpxchg_double + * data. As the rest of the double word is protected by + * slab_lock but _refcount is not. */ unsigned counters; #endif @@ -97,7 +97,11 @@ struct page { }; int units; /* SLOB */ }; - atomic_t _count; /* Usage count, see below. */ + /* + * Usage count, *USE WRAPPER FUNCTION* + * when manual accounting. See page_ref.h + */ + atomic_t _refcount; }; unsigned int active; /* SLAB */ }; @@ -248,7 +252,7 @@ struct page_frag_cache { __u32 offset; #endif /* we maintain a pagecount bias, so that we dont dirty cache line - * containing page->_count every time we allocate a fragment. + * containing page->_refcount every time we allocate a fragment. */ unsigned int pagecnt_bias; bool pfmemalloc; diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h index e596d5d9540e..8b5e0a9f2431 100644 --- a/include/linux/page_ref.h +++ b/include/linux/page_ref.h @@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v) static inline int page_ref_count(struct page *page) { - return atomic_read(&page->_count); + return atomic_read(&page->_refcount); } static inline int page_count(struct page *page) { - return atomic_read(&compound_head(page)->_count); + return atomic_read(&compound_head(page)->_refcount); } static inline void set_page_count(struct page *page, int v) { - atomic_set(&page->_count, v); + atomic_set(&page->_refcount, v); if (page_ref_tracepoint_active(__tracepoint_page_ref_set)) __page_ref_set(page, v); } @@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page) static inline void page_ref_add(struct page *page, int nr) { - atomic_add(nr, &page->_count); + atomic_add(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, nr); } static inline void page_ref_sub(struct page *page, int nr) { - atomic_sub(nr, &page->_count); + atomic_sub(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -nr); } static inline void page_ref_inc(struct page *page) { - atomic_inc(&page->_count); + atomic_inc(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, 1); } static inline void page_ref_dec(struct page *page) { - atomic_dec(&page->_count); + atomic_dec(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod)) __page_ref_mod(page, -1); } static inline int page_ref_sub_and_test(struct page *page, int nr) { - int ret = atomic_sub_and_test(nr, &page->_count); + int ret = atomic_sub_and_test(nr, &page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -nr, ret); @@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr) static inline int page_ref_dec_and_test(struct page *page) { - int ret = atomic_dec_and_test(&page->_count); + int ret = atomic_dec_and_test(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test)) __page_ref_mod_and_test(page, -1, ret); @@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page) static inline int page_ref_dec_return(struct page *page) { - int ret = atomic_dec_return(&page->_count); + int ret = atomic_dec_return(&page->_refcount); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return)) __page_ref_mod_and_return(page, -1, ret); @@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page) static inline int page_ref_add_unless(struct page *page, int nr, int u) { - int ret = atomic_add_unless(&page->_count, nr, u); + int ret = atomic_add_unless(&page->_refcount, nr, u); if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless)) __page_ref_mod_unless(page, nr, ret); @@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u) static inline int page_ref_freeze(struct page *page, int count) { - int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count); + int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count); if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze)) __page_ref_freeze(page, count, ret); @@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count) VM_BUG_ON_PAGE(page_count(page) != 0, page); VM_BUG_ON(count == 0); - atomic_set(&page->_count, count); + atomic_set(&page->_refcount, count); if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze)) __page_ref_unfreeze(page, count); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7e1ab155c67c..fe1513ffb7bf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. - * If the page is free (_count == 0), then _count is untouched, and 0 - * is returned. Otherwise, _count is incremented by 1 and 1 is returned. + * If the page is free (_refcount == 0), then _refcount is untouched, and 0 + * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): - * this allows allocators to use a synchronize_rcu() to stabilize _count. + * this allows allocators to use a synchronize_rcu() to stabilize _refcount. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. page_count may return higher @@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold); * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * - * Remove-side that cares about stability of _count (eg. reclaim) has the + * Remove-side that cares about stability of _refcount (eg. reclaim) has the * following (with tree_lock held for write): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 1391d3ee3b86..1c03dfb4abfd 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1410,7 +1410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_STRUCT_SIZE(list_head); VMCOREINFO_SIZE(nodemask_t); VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, _refcount); VMCOREINFO_OFFSET(page, mapping); VMCOREINFO_OFFSET(page, lru); VMCOREINFO_OFFSET(page, _mapcount); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b49ee126d4d1..f8ac8f582fd8 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3113,7 +3113,7 @@ static void __split_huge_page_tail(struct page *head, int tail, VM_BUG_ON_PAGE(page_ref_count(page_tail) != 0, page_tail); /* - * tail_page->_count is zero and not changing from under us. But + * tail_page->_refcount is zero and not changing from under us. But * get_page_unless_zero() may be running from under us on the * tail_page. If we used atomic_set() below instead of atomic_inc(), we * would then run atomic_set() concurrently with @@ -3340,7 +3340,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) if (mlocked) lru_add_drain(); - /* Prevent deferred_split_scan() touching ->_count */ + /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock_irqsave(&pgdata->split_queue_lock, flags); count = page_count(head); mapcount = total_mapcount(head); diff --git a/mm/internal.h b/mm/internal.h index b79abb6721cf..098a89e3b97c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -58,7 +58,7 @@ static inline unsigned long ra_submit(struct file_ra_state *ra, } /* - * Turn a non-refcounted page (->_count == 0) into refcounted with + * Turn a non-refcounted page (->_refcount == 0) into refcounted with * a count of one. */ static inline void set_page_refcounted(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c1069efcc4d7..4ce57f938b7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -794,7 +794,7 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mapping != NULL)) bad_reason = "non-NULL mapping"; if (unlikely(page_ref_count(page) != 0)) - bad_reason = "nonzero _count"; + bad_reason = "nonzero _refcount"; if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) { bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; bad_flags = PAGE_FLAGS_CHECK_AT_FREE; @@ -6864,7 +6864,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * We can't use page_count without pin a page * because another CPU can free compound page. * This check already skips compound tails of THP - * because their page->_count is zero at all time. + * because their page->_refcount is zero at all time. */ if (!page_ref_count(page)) { if (PageBuddy(page)) diff --git a/mm/slub.c b/mm/slub.c index 8671de2e5b12..cf1faa4d3992 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -329,8 +329,8 @@ static inline void set_page_slub_counters(struct page *page, unsigned long count tmp.counters = counters_new; /* * page->counters can cover frozen/inuse/objects as well - * as page->_count. If we assign to ->counters directly - * we run the risk of losing updates to page->_count, so + * as page->_refcount. If we assign to ->counters directly + * we run the risk of losing updates to page->_refcount, so * be careful and only assign to the fields we need. */ page->frozen = tmp.frozen; diff --git a/mm/vmscan.c b/mm/vmscan.c index 142cb61f4822..d3a02ac3eed7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -633,7 +633,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_count. + * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. @@ -1720,7 +1720,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * It is safe to rely on PG_active against the non-LRU pages in here because * nobody will play with that bit on a non-LRU page. * - * The downside is that we have to touch page->_count against each page. + * The downside is that we have to touch page->_refcount against each page. * But we had to alter page->flags anyway. */ -- cgit v1.2.3 From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:10:58 -0700 Subject: include/linux/nodemask.h: create next_node_in() helper Lots of code does node = next_node(node, XXX); if (node == MAX_NUMNODES) node = first_node(XXX); so create next_node_in() to do this and use it in various places. [mhocko@suse.com: use next_node_in() helper] Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Michal Hocko Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Cc: Hui Zhu Cc: Wang Xiaoqiang Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/kernel/setup.c | 4 +--- arch/x86/mm/numa.c | 4 +--- include/linux/nodemask.h | 11 ++++++++++- kernel/cpuset.c | 8 +------- lib/Makefile | 2 +- lib/nodemask.c | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 4 +--- mm/memcontrol.c | 4 +--- mm/mempolicy.c | 24 ++---------------------- mm/page_isolation.c | 9 +++------ mm/slab.c | 13 +++---------- 11 files changed, 54 insertions(+), 59 deletions(-) create mode 100644 lib/nodemask.c (limited to 'mm') diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c index a992238e9b58..153020abd2f5 100644 --- a/arch/tile/kernel/setup.c +++ b/arch/tile/kernel/setup.c @@ -962,9 +962,7 @@ static void __init setup_numa_mapping(void) cpumask_set_cpu(best_cpu, &node_2_cpu_mask[node]); cpu_2_node[best_cpu] = node; cpumask_clear_cpu(best_cpu, &unbound_cpus); - node = next_node(node, default_nodes); - if (node == MAX_NUMNODES) - node = first_node(default_nodes); + node = next_node_in(node, default_nodes); } /* Print out node assignments and set defaults for disabled cpus */ diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index f70c1ff46125..9c086c57105c 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -617,9 +617,7 @@ static void __init numa_init_array(void) if (early_cpu_to_node(i) != NUMA_NO_NODE) continue; numa_set_node(i, rr); - rr = next_node(rr, node_online_map); - if (rr == MAX_NUMNODES) - rr = first_node(node_online_map); + rr = next_node_in(rr, node_online_map); } } diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 6e85889cf9ab..f746e44d4046 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -43,8 +43,10 @@ * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES + * int next_node_in(node, mask) Next node past 'node', or wrap to first, + * or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or - * MAX_NUMNODES. + * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set @@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp) return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } +/* + * Find the next present node in src, starting after node n, wrapping around to + * the first node in src if needed. Returns MAX_NUMNODES if src is empty. + */ +#define next_node_in(n, src) __next_node_in((n), &(src)) +int __next_node_in(int node, const nodemask_t *srcp); + static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1902956baba1..611cc69af8f0 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2591,13 +2591,7 @@ int __cpuset_node_allowed(int node, gfp_t gfp_mask) static int cpuset_spread_node(int *rotor) { - int node; - - node = next_node(*rotor, current->mems_allowed); - if (node == MAX_NUMNODES) - node = first_node(current->mems_allowed); - *rotor = node; - return node; + return *rotor = next_node_in(*rotor, current->mems_allowed); } int cpuset_mem_spread_node(void) diff --git a/lib/Makefile b/lib/Makefile index 931396ada5eb..42b69185f963 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -25,7 +25,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ sha1.o md5.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ - earlycpio.o seq_buf.o nmi_backtrace.o + earlycpio.o seq_buf.o nmi_backtrace.o nodemask.o obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o lib-$(CONFIG_MMU) += ioremap.o diff --git a/lib/nodemask.c b/lib/nodemask.c new file mode 100644 index 000000000000..e42a5bf44d33 --- /dev/null +++ b/lib/nodemask.c @@ -0,0 +1,30 @@ +#include +#include +#include + +int __next_node_in(int node, const nodemask_t *srcp) +{ + int ret = __next_node(node, srcp); + + if (ret == MAX_NUMNODES) + ret = __first_node(srcp); + return ret; +} +EXPORT_SYMBOL(__next_node_in); + +#ifdef CONFIG_NUMA +/* + * Return the bit number of a random bit set in the nodemask. + * (returns NUMA_NO_NODE if nodemask is empty) + */ +int node_random(const nodemask_t *maskp) +{ + int w, bit = NUMA_NO_NODE; + + w = nodes_weight(*maskp); + if (w) + bit = bitmap_ord_to_pos(maskp->bits, + get_random_int() % w, MAX_NUMNODES); + return bit; +} +#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..5856093f9062 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -937,9 +937,7 @@ err: */ static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); + nid = next_node_in(nid, *nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fe787f5c41bd..6740c4c2b550 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1389,9 +1389,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) mem_cgroup_may_update_nodemask(memcg); node = memcg->last_scanned_node; - node = next_node(node, memcg->scan_nodes); - if (node == MAX_NUMNODES) - node = first_node(memcg->scan_nodes); + node = next_node_in(node, memcg->scan_nodes); /* * We call this when we hit limit, not when pages are added to LRU. * No LRU may hold pages because all pages are UNEVICTABLE or diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 36cc01bc950a..8d369cee0cd6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -97,7 +97,6 @@ #include #include -#include #include "internal.h" @@ -347,9 +346,7 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, BUG(); if (!node_isset(current->il_next, tmp)) { - current->il_next = next_node(current->il_next, tmp); - if (current->il_next >= MAX_NUMNODES) - current->il_next = first_node(tmp); + current->il_next = next_node_in(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) current->il_next = numa_node_id(); } @@ -1709,9 +1706,7 @@ static unsigned interleave_nodes(struct mempolicy *policy) struct task_struct *me = current; nid = me->il_next; - next = next_node(nid, policy->v.nodes); - if (next >= MAX_NUMNODES) - next = first_node(policy->v.nodes); + next = next_node_in(nid, policy->v.nodes); if (next < MAX_NUMNODES) me->il_next = next; return nid; @@ -1805,21 +1800,6 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } -/* - * Return the bit number of a random bit set in the nodemask. - * (returns NUMA_NO_NODE if nodemask is empty) - */ -int node_random(const nodemask_t *maskp) -{ - int w, bit = NUMA_NO_NODE; - - w = nodes_weight(*maskp); - if (w) - bit = bitmap_ord_to_pos(maskp->bits, - get_random_int() % w, MAX_NUMNODES); - return bit; -} - #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c4f568206544..67bedd18429c 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -288,13 +288,10 @@ struct page *alloc_migrate_target(struct page *page, unsigned long private, * accordance with memory policy of the user process if possible. For * now as a simple work-around, we use the next node for destination. */ - if (PageHuge(page)) { - int node = next_online_node(page_to_nid(page)); - if (node == MAX_NUMNODES) - node = first_online_node; + if (PageHuge(page)) return alloc_huge_page_node(page_hstate(compound_head(page)), - node); - } + next_node_in(page_to_nid(page), + node_online_map)); if (PageHighMem(page)) gfp_mask |= __GFP_HIGHMEM; diff --git a/mm/slab.c b/mm/slab.c index d81565a92864..c11bf5007952 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -522,22 +522,15 @@ static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { - int node; - - node = next_node(cpu_to_mem(cpu), node_online_map); - if (node == MAX_NUMNODES) - node = first_node(node_online_map); - - per_cpu(slab_reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = next_node_in(cpu_to_mem(cpu), + node_online_map); } static void next_reap_node(void) { int node = __this_cpu_read(slab_reap_node); - node = next_node(node, node_online_map); - if (unlikely(node >= MAX_NUMNODES)) - node = first_node(node_online_map); + node = next_node_in(node, node_online_map); __this_cpu_write(slab_reap_node, node); } -- cgit v1.2.3 From 09a95e29cb30a3930db22d340ddd072a82b6b0db Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 19 May 2016 17:11:01 -0700 Subject: mm/hugetlb: optimize minimum size (min_size) accounting It was observed that minimum size accounting associated with the hugetlbfs min_size mount option may not perform optimally and as expected. As huge pages/reservations are released from the filesystem and given back to the global pools, they are reserved for subsequent filesystem use as long as the subpool reserved count is less than subpool minimum size. It does not take into account used pages within the filesystem. The filesystem size limits are not exceeded and this is technically not a bug. However, better behavior would be to wait for the number of used pages/reservations associated with the filesystem to drop below the minimum size before taking reservations to satisfy minimum size. An optimization is also made to the hugepage_subpool_get_pages() routine which is called when pages/reservations are allocated. This does not change behavior, but simply avoids the accounting if all reservations have already been taken (subpool reserved count == 0). Signed-off-by: Mike Kravetz Acked-by: Naoya Horiguchi Cc: Hillf Danton Cc: David Rientjes Cc: Dave Hansen Cc: "Kirill A. Shutemov" Cc: Paul Gortmaker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5856093f9062..fb37ef810655 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -144,7 +144,8 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, } } - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->rsv_hpages) { if (delta > spool->rsv_hpages) { /* * Asking for more reserves than those already taken on @@ -182,7 +183,8 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, if (spool->max_hpages != -1) /* maximum size accounting */ spool->used_hpages -= delta; - if (spool->min_hpages != -1) { /* minimum size accounting */ + /* minimum size accounting */ + if (spool->min_hpages != -1 && spool->used_hpages < spool->min_hpages) { if (spool->rsv_hpages + delta <= spool->min_hpages) ret = 0; else -- cgit v1.2.3 From 9fee021d15ddd884d40d1540913474e8112313fe Mon Sep 17 00:00:00 2001 From: Vaishali Thakkar Date: Thu, 19 May 2016 17:11:04 -0700 Subject: mm/hugetlb: introduce hugetlb_bad_size() When any unsupported hugepage size is specified, 'hugepagesz=' and 'hugepages=' should be ignored during command line parsing until any supported hugepage size is found. But currently incorrect number of hugepages are allocated when unsupported size is specified as it fails to ignore the 'hugepages=' command. Test case: Note that this is specific to x86 architecture. Boot the kernel with command line option 'hugepagesz=256M hugepages=X'. After boot, dmesg output shows that X number of hugepages of the size 2M is pre-allocated instead of 0. So, to handle such command line options, introduce new routine hugetlb_bad_size. The routine hugetlb_bad_size sets the global variable parsed_valid_hugepagesz. We are using parsed_valid_hugepagesz to save the state when unsupported hugepagesize is found so that we can ignore the 'hugepages=' parameters after that and then reset the variable when supported hugepage size is found. The routine hugetlb_bad_size can be called while setting 'hugepagesz=' parameter in an architecture specific code. Signed-off-by: Vaishali Thakkar Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Acked-by: Michal Hocko Cc: Hillf Danton Cc: Yaowei Bai Cc: Dominik Dingel Cc: Kirill A. Shutemov Cc: Paul Gortmaker Cc: Dave Hansen Cc: Benjamin Herrenschmidt Cc: James Hogan Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 1 + mm/hugetlb.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 7d953c2542a8..e44c57876e89 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); +void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); struct hstate *size_to_hstate(unsigned long size); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb37ef810655..0adb74d0a4e1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -51,6 +51,7 @@ __initdata LIST_HEAD(huge_boot_pages); static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; +static bool __initdata parsed_valid_hugepagesz = true; /* * Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages, @@ -2659,6 +2660,11 @@ static int __init hugetlb_init(void) subsys_initcall(hugetlb_init); /* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_bad_size(void) +{ + parsed_valid_hugepagesz = false; +} + void __init hugetlb_add_hstate(unsigned int order) { struct hstate *h; @@ -2691,11 +2697,17 @@ static int __init hugetlb_nrpages_setup(char *s) unsigned long *mhp; static unsigned long *last_mhp; + if (!parsed_valid_hugepagesz) { + pr_warn("hugepages = %s preceded by " + "an unsupported hugepagesz, ignoring\n", s); + parsed_valid_hugepagesz = true; + return 1; + } /* * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!hugetlb_max_hstate) + else if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; -- cgit v1.2.3 From c98940f6fa3d06fa8fec75aa2362b25227573d06 Mon Sep 17 00:00:00 2001 From: Yaowei Bai Date: Thu, 19 May 2016 17:11:26 -0700 Subject: mm/memory_hotplug: is_mem_section_removable() can return bool Make is_mem_section_removable() return bool to improve readability due to this particular function only using either one or zero as its return value. Signed-off-by: Yaowei Bai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 6 +++--- mm/memory_hotplug.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index adbef586e696..20d8a5d4d133 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {} #ifdef CONFIG_MEMORY_HOTREMOVE -extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); +extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern void remove_memory(int nid, u64 start, u64 size); #else -static inline int is_mem_section_removable(unsigned long pfn, +static inline bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages) { - return 0; + return false; } static inline void try_offline_node(int nid) {} diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index aa34431c3f31..b21d8895ea41 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1410,7 +1410,7 @@ static struct page *next_active_pageblock(struct page *page) } /* Checks if this range of memory is likely to be hot-removable. */ -int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) +bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; @@ -1418,12 +1418,12 @@ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { if (!is_pageblock_removable_nolock(page)) - return 0; + return false; cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ - return 1; + return true; } /* -- cgit v1.2.3 From fda3d69be9fe7a24ad32b840cb2ed7c30b6ba1c9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:11:34 -0700 Subject: mm/memcontrol.c:mem_cgroup_select_victim_node(): clarify comment > The comment seems to have not much to do with the code? I guess the comment tries to say that the code path is triggered when we charge the page which happens _before_ it is added to the LRU list and so last_scanned_node might contain the stale data. Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6740c4c2b550..011dac8ab5d7 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1391,10 +1391,9 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) node = next_node_in(node, memcg->scan_nodes); /* - * We call this when we hit limit, not when pages are added to LRU. - * No LRU may hold pages because all pages are UNEVICTABLE or - * memcg is too small and all pages are not on LRU. In that case, - * we use curret node. + * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages + * last time it really checked all the LRUs due to rate limiting. + * Fallback to the current node in that case for simplicity. */ if (unlikely(node == MAX_NUMNODES)) node = numa_node_id(); -- cgit v1.2.3 From 949698a31af3b3808b0ff0cca26f36e68953bd1f Mon Sep 17 00:00:00 2001 From: Li Zhang Date: Thu, 19 May 2016 17:11:37 -0700 Subject: mm/page_alloc: Remove useless parameter of __free_pages_boot_core __free_pages_boot_core has parameter pfn which is not used at all. Remove it. Signed-off-by: Li Zhang Reviewed-by: Pan Xinhui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4ce57f938b7f..34f688bc5c7a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1076,8 +1076,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -static void __init __free_pages_boot_core(struct page *page, - unsigned long pfn, unsigned int order) +static void __init __free_pages_boot_core(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; struct page *p = page; @@ -1154,7 +1153,7 @@ void __init __free_pages_bootmem(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; - return __free_pages_boot_core(page, pfn, order); + return __free_pages_boot_core(page, order); } /* @@ -1239,12 +1238,12 @@ static void __init deferred_free_range(struct page *page, if (nr_pages == MAX_ORDER_NR_PAGES && (pfn & (MAX_ORDER_NR_PAGES-1)) == 0) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); - __free_pages_boot_core(page, pfn, MAX_ORDER-1); + __free_pages_boot_core(page, MAX_ORDER-1); return; } - for (i = 0; i < nr_pages; i++, page++, pfn++) - __free_pages_boot_core(page, pfn, 0); + for (i = 0; i < nr_pages; i++, page++) + __free_pages_boot_core(page, 0); } /* Completion tracking for deferred_init_memmap() threads */ -- cgit v1.2.3 From 54f18d35263334ebcc6bf409fee3c0c8c22e5588 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:11:40 -0700 Subject: mm/hugetlb.c: use first_memory_node Instead of open-coding it. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0adb74d0a4e1..0f580ea7f41d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2684,8 +2684,8 @@ void __init hugetlb_add_hstate(unsigned int order) for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); INIT_LIST_HEAD(&h->hugepage_activelist); - h->next_nid_to_alloc = first_node(node_states[N_MEMORY]); - h->next_nid_to_free = first_node(node_states[N_MEMORY]); + h->next_nid_to_alloc = first_memory_node; + h->next_nid_to_free = first_memory_node; snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); -- cgit v1.2.3 From fee83b3aba4b7ddb0cb1497a04ddebcaa43f236e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:11:43 -0700 Subject: mm/mempolicy.c:offset_il_node() document and clarify This code was pretty obscure and was relying upon obscure side-effects of next_node(-1, ...) and was relying upon NUMA_NO_NODE being equal to -1. Clean that all up and document the function's intent. Acked-by: Vlastimil Babka Cc: Xishi Qiu Cc: Joonsoo Kim Cc: David Rientjes Cc: Naoya Horiguchi Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8d369cee0cd6..7f80ebcd6552 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1758,23 +1758,25 @@ unsigned int mempolicy_slab_node(void) } } -/* Do static interleaving for a VMA with known offset. */ +/* + * Do static interleaving for a VMA with known offset @n. Returns the n'th + * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the + * number of present nodes. + */ static unsigned offset_il_node(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long off) + struct vm_area_struct *vma, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; - int c; - int nid = NUMA_NO_NODE; + int i; + int nid; if (!nnodes) return numa_node_id(); - target = (unsigned int)off % nnodes; - c = 0; - do { + target = (unsigned int)n % nnodes; + nid = first_node(pol->v.nodes); + for (i = 0; i < target; i++) nid = next_node(nid, pol->v.nodes); - c++; - } while (c <= target); return nid; } -- cgit v1.2.3 From e4c5800a3991f0c6a766983535dfc10d51802cf6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 19 May 2016 17:11:46 -0700 Subject: mm/rmap: replace BUG_ON(anon_vma->degree) with VM_WARN_ON This check effectively catches anon vma hierarchy inconsistence and some vma corruptions. It was effective for catching corner cases in anon vma reusing logic. For now this code seems stable so check could be hidden under CONFIG_DEBUG_VM and replaced with WARN because it's not so fatal. Signed-off-by: Konstantin Khlebnikov Suggested-by: Vasily Averin Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 307b555024ef..4cebe8a7c2cb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -409,7 +409,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma) list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { struct anon_vma *anon_vma = avc->anon_vma; - BUG_ON(anon_vma->degree); + VM_WARN_ON(anon_vma->degree); put_anon_vma(anon_vma); list_del(&avc->same_vma); -- cgit v1.2.3 From 06b6640a3902d6d50c1bb4fb1f29a46b207dbf08 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:11:48 -0700 Subject: mm, compaction: wrap calculating first and last pfn of pageblock Compaction code has accumulated numerous instances of manual calculations of the first (inclusive) and last (exclusive) pfn of a pageblock (or a smaller block of given order), given a pfn within the pageblock. Wrap these calculations by introducing pageblock_start_pfn(pfn) and pageblock_end_pfn(pfn) macros. [vbabka@suse.cz: fix crash in get_pfnblock_flags_mask() from isolate_freepages():] Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Cc: Minchan Kim Cc: Michal Hocko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 8fa254043801..017a1a1963cb 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,11 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define CREATE_TRACE_POINTS #include +#define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) +#define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) +#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) +#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) + static unsigned long release_freepages(struct list_head *freelist) { struct page *page, *next; @@ -161,7 +166,7 @@ static void reset_cached_positions(struct zone *zone) zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = - round_down(zone_end_pfn(zone) - 1, pageblock_nr_pages); + pageblock_start_pfn(zone_end_pfn(zone) - 1); } /* @@ -519,10 +524,10 @@ isolate_freepages_range(struct compact_control *cc, LIST_HEAD(freelist); pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn += isolated, block_start_pfn = block_end_pfn, @@ -538,8 +543,8 @@ isolate_freepages_range(struct compact_control *cc, * scanning range to right one. */ if (pfn >= block_end_pfn) { - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_start_pfn = pageblock_start_pfn(pfn); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); } @@ -834,10 +839,10 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, /* Scan block by block. First and last block may be incomplete */ pfn = start_pfn; - block_start_pfn = pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(pfn); if (block_start_pfn < cc->zone->zone_start_pfn) block_start_pfn = cc->zone->zone_start_pfn; - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); for (; pfn < end_pfn; pfn = block_end_pfn, block_start_pfn = block_end_pfn, @@ -924,10 +929,10 @@ static void isolate_freepages(struct compact_control *cc) * is using. */ isolate_start_pfn = cc->free_pfn; - block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = pageblock_start_pfn(cc->free_pfn); block_end_pfn = min(block_start_pfn + pageblock_nr_pages, zone_end_pfn(zone)); - low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); + low_pfn = pageblock_end_pfn(cc->migrate_pfn); /* * Isolate free pages until enough are available to migrate the @@ -1081,12 +1086,12 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * initialized by compact_zone() */ low_pfn = cc->migrate_pfn; - block_start_pfn = cc->migrate_pfn & ~(pageblock_nr_pages - 1); + block_start_pfn = pageblock_start_pfn(low_pfn); if (block_start_pfn < zone->zone_start_pfn) block_start_pfn = zone->zone_start_pfn; /* Only scan within a pageblock boundary */ - block_end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(low_pfn); /* * Iterate over whole pageblocks until we find the first suitable. @@ -1343,7 +1348,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) { - cc->free_pfn = round_down(end_pfn - 1, pageblock_nr_pages); + cc->free_pfn = pageblock_start_pfn(end_pfn - 1); zone->compact_cached_free_pfn = cc->free_pfn; } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) { @@ -1411,7 +1416,7 @@ check_drain: if (cc->order > 0 && cc->last_migrated_pfn) { int cpu; unsigned long current_block_start = - cc->migrate_pfn & ~((1UL << cc->order) - 1); + block_start_pfn(cc->migrate_pfn, cc->order); if (cc->last_migrated_pfn < current_block_start) { cpu = get_cpu(); @@ -1436,7 +1441,7 @@ out: cc->nr_freepages = 0; VM_BUG_ON(free_pfn == 0); /* The cached pfn is always the first in a pageblock */ - free_pfn &= ~(pageblock_nr_pages-1); + free_pfn = pageblock_start_pfn(free_pfn); /* * Only go back, not forward. The cached pfn might have been * already reset to zone end in compact_finished() -- cgit v1.2.3 From a34753d275576896b06af9baa6f54bee258368c2 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:11:51 -0700 Subject: mm, compaction: reduce spurious pcplist drains Compaction drains the local pcplists each time migration scanner moves away from a cc->order aligned block where it isolated pages for migration, so that the pages freed by migrations can merge into higher orders. The detection is currently coarser than it could be. The cc->last_migrated_pfn variable should track the lowest pfn that was isolated for migration. But it is set to the pfn where isolate_migratepages_block() starts scanning, which is typically the first pfn of the pageblock. There, the scanner might fail to isolate several order-aligned blocks, and then isolate COMPACT_CLUSTER_MAX in another block. This would cause the pcplists drain to be performed, although the scanner didn't yet finish the block where it isolated from. This patch thus makes cc->last_migrated_pfn handling more accurate by setting it to the pfn of an actually isolated page in isolate_migratepages_block(). Although practical effects of this patch are likely low, it arguably makes the intent of the code more obvious. Also the next patch will make async direct compaction skip blocks more aggressively, and draining pcplists due to skipped blocks is wasteful. Signed-off-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 017a1a1963cb..329973a1ae45 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -787,6 +787,15 @@ isolate_success: cc->nr_migratepages++; nr_isolated++; + /* + * Record where we could have freed pages by migration and not + * yet flushed them to buddy allocator. + * - this is the lowest page that was isolated and likely be + * then freed by migration. + */ + if (!cc->last_migrated_pfn) + cc->last_migrated_pfn = low_pfn; + /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; @@ -1075,7 +1084,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long block_start_pfn; unsigned long block_end_pfn; unsigned long low_pfn; - unsigned long isolate_start_pfn; struct page *page; const isolate_mode_t isolate_mode = (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) | @@ -1130,7 +1138,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; /* Perform the isolation */ - isolate_start_pfn = low_pfn; low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); @@ -1139,15 +1146,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return ISOLATE_ABORT; } - /* - * Record where we could have freed pages by migration and not - * yet flushed them to buddy allocator. - * - this is the lowest page that could have been isolated and - * then freed by migration. - */ - if (cc->nr_migratepages && !cc->last_migrated_pfn) - cc->last_migrated_pfn = isolate_start_pfn; - /* * Either we isolated something and proceed with migration. Or * we failed and compact_zone should decide if we should -- cgit v1.2.3 From fdd048e12c9a46d058f69822cb15641adae181e1 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:11:55 -0700 Subject: mm, compaction: skip blocks where isolation fails in async direct compaction The goal of direct compaction is to quickly make a high-order page available for the pending allocation. Within an aligned block of pages of desired order, a single allocated page that cannot be isolated for migration means that the block cannot fully merge to a buddy page that would satisfy the allocation request. Therefore we can reduce the allocation stall by skipping the rest of the block immediately on isolation failure. For async compaction, this also means a higher chance of succeeding until it detects contention. We however shouldn't completely sacrifice the second objective of compaction, which is to reduce overal long-term memory fragmentation. As a compromise, perform the eager skipping only in direct async compaction, while sync compaction (including kcompactd) remains thorough. Testing was done using stress-highalloc from mmtests, configured for order-4 GFP_KERNEL allocations: 4.6-rc1 4.6-rc1 before after Success 1 Min 24.00 ( 0.00%) 27.00 (-12.50%) Success 1 Mean 30.20 ( 0.00%) 31.60 ( -4.64%) Success 1 Max 37.00 ( 0.00%) 35.00 ( 5.41%) Success 2 Min 42.00 ( 0.00%) 32.00 ( 23.81%) Success 2 Mean 44.00 ( 0.00%) 44.80 ( -1.82%) Success 2 Max 48.00 ( 0.00%) 52.00 ( -8.33%) Success 3 Min 91.00 ( 0.00%) 92.00 ( -1.10%) Success 3 Mean 92.20 ( 0.00%) 92.80 ( -0.65%) Success 3 Max 94.00 ( 0.00%) 93.00 ( 1.06%) We can see that success rates are unaffected by the skipping. 4.6-rc1 4.6-rc1 before after User 2587.42 2566.53 System 482.89 471.20 Elapsed 1395.68 1382.00 Times are not so useful metric for this benchmark as main portion is the interfering kernel builds, but results do hint at reduced system times. 4.6-rc1 4.6-rc1 before after Direct pages scanned 163614 159608 Kswapd pages scanned 2070139 2078790 Kswapd pages reclaimed 2061707 2069757 Direct pages reclaimed 163354 159505 Reduced direct reclaim was unintended, but could be explained by more successful first attempt at (async) direct compaction, which is attempted before the first reclaim attempt in __alloc_pages_slowpath(). Compaction stalls 33052 39853 Compaction success 12121 19773 Compaction failures 20931 20079 Compaction is indeed more successful, and thus less likely to get deferred, so there are also more direct compaction stalls. Page migrate success 3781876 3326819 Page migrate failure 45817 41774 Compaction pages isolated 7868232 6941457 Compaction migrate scanned 168160492 127269354 Compaction migrate prescanned 0 0 Compaction free scanned 2522142582 2326342620 Compaction free direct alloc 0 0 Compaction free dir. all. miss 0 0 Compaction cost 5252 4476 The patch reduces migration scanned pages by 25% thanks to the eager skipping. [hughd@google.com: prevent nr_isolated_* from going negative] Signed-off-by: Vlastimil Babka Signed-off-by: Hugh Dickins Cc: Joonsoo Kim Cc: Mel Gorman Cc: Rik van Riel Cc: David Rientjes Cc: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 80 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 329973a1ae45..7487067b4613 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -638,12 +638,13 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, { struct zone *zone = cc->zone; unsigned long nr_scanned = 0, nr_isolated = 0; - struct list_head *migratelist = &cc->migratepages; struct lruvec *lruvec; unsigned long flags = 0; bool locked = false; struct page *page = NULL, *valid_page = NULL; unsigned long start_pfn = low_pfn; + bool skip_on_failure = false; + unsigned long next_skip_pfn = 0; /* * Ensure that there are not too many pages isolated from the LRU @@ -664,10 +665,37 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (compact_should_abort(cc)) return 0; + if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) { + skip_on_failure = true; + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { bool is_lru; + if (skip_on_failure && low_pfn >= next_skip_pfn) { + /* + * We have isolated all migration candidates in the + * previous order-aligned block, and did not skip it due + * to failure. We should migrate the pages now and + * hopefully succeed compaction. + */ + if (nr_isolated) + break; + + /* + * We failed to isolate in the previous order-aligned + * block. Set the new boundary to the end of the + * current block. Note we can't simply increase + * next_skip_pfn by 1 << order, as low_pfn might have + * been incremented by a higher number due to skipping + * a compound or a high-order buddy page in the + * previous loop iteration. + */ + next_skip_pfn = block_end_pfn(low_pfn, cc->order); + } + /* * Periodically drop the lock (if held) regardless of its * contention, to give chance to IRQs. Abort async compaction @@ -679,7 +707,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, break; if (!pfn_valid_within(low_pfn)) - continue; + goto isolate_fail; nr_scanned++; page = pfn_to_page(low_pfn); @@ -734,11 +762,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, if (likely(comp_order < MAX_ORDER)) low_pfn += (1UL << comp_order) - 1; - continue; + goto isolate_fail; } if (!is_lru) - continue; + goto isolate_fail; /* * Migration will fail if an anonymous page is pinned in memory, @@ -747,7 +775,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (!page_mapping(page) && page_count(page) > page_mapcount(page)) - continue; + goto isolate_fail; /* If we already hold the lock, we can skip some rechecking */ if (!locked) { @@ -758,7 +786,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Recheck PageLRU and PageCompound under lock */ if (!PageLRU(page)) - continue; + goto isolate_fail; /* * Page become compound since the non-locked check, @@ -767,7 +795,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, */ if (unlikely(PageCompound(page))) { low_pfn += (1UL << compound_order(page)) - 1; - continue; + goto isolate_fail; } } @@ -775,7 +803,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Try isolate the page */ if (__isolate_lru_page(page, isolate_mode) != 0) - continue; + goto isolate_fail; VM_BUG_ON_PAGE(PageCompound(page), page); @@ -783,7 +811,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, del_page_from_lru_list(page, lruvec, page_lru(page)); isolate_success: - list_add(&page->lru, migratelist); + list_add(&page->lru, &cc->migratepages); cc->nr_migratepages++; nr_isolated++; @@ -801,6 +829,37 @@ isolate_success: ++low_pfn; break; } + + continue; +isolate_fail: + if (!skip_on_failure) + continue; + + /* + * We have isolated some pages, but then failed. Release them + * instead of migrating, as we cannot form the cc->order buddy + * page anyway. + */ + if (nr_isolated) { + if (locked) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + locked = false; + } + acct_isolated(zone, cc); + putback_movable_pages(&cc->migratepages); + cc->nr_migratepages = 0; + cc->last_migrated_pfn = 0; + nr_isolated = 0; + } + + if (low_pfn < next_skip_pfn) { + low_pfn = next_skip_pfn - 1; + /* + * The check near the loop beginning would have updated + * next_skip_pfn too, but this is a bit simpler. + */ + next_skip_pfn += 1UL << cc->order; + } } /* @@ -1401,6 +1460,18 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ret = COMPACT_CONTENDED; goto out; } + /* + * We failed to migrate at least one page in the current + * order-aligned block, so skip the rest of it. + */ + if (cc->direct_compaction && + (cc->mode == MIGRATE_ASYNC)) { + cc->migrate_pfn = block_end_pfn( + cc->migrate_pfn - 1, cc->order); + /* Draining pcplists is useless in this case */ + cc->last_migrated_pfn = 0; + + } } check_drain: -- cgit v1.2.3 From 1aa8aea535977f0e0b398f39d052e7befff81da6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 19 May 2016 17:12:00 -0700 Subject: mm: uninline page_mapped() It's huge. Uninlining it saves 206 bytes per callsite. Shaves 4924 bytes from the x86_64 allmodconfig vmlinux. [akpm@linux-foundation.org: coding-style fixes] Cc: Steve Capper Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 21 +-------------------- mm/util.c | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5b375133c695..9c2852cabf01 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1032,26 +1032,7 @@ static inline pgoff_t page_file_index(struct page *page) return page->index; } -/* - * Return true if this page is mapped into pagetables. - * For compound page it returns true if any subpage of compound page is mapped. - */ -static inline bool page_mapped(struct page *page) -{ - int i; - if (likely(!PageCompound(page))) - return atomic_read(&page->_mapcount) >= 0; - page = compound_head(page); - if (atomic_read(compound_mapcount_ptr(page)) >= 0) - return true; - if (PageHuge(page)) - return false; - for (i = 0; i < hpage_nr_pages(page); i++) { - if (atomic_read(&page[i]._mapcount) >= 0) - return true; - } - return false; -} +bool page_mapped(struct page *page); /* * Return true only if the page has been allocated with diff --git a/mm/util.c b/mm/util.c index 6cc81e7b8705..8a1b3a1fb595 100644 --- a/mm/util.c +++ b/mm/util.c @@ -346,6 +346,29 @@ void *page_rmapping(struct page *page) return __page_rmapping(page); } +/* + * Return true if this page is mapped into pagetables. + * For compound page it returns true if any subpage of compound page is mapped. + */ +bool page_mapped(struct page *page) +{ + int i; + + if (likely(!PageCompound(page))) + return atomic_read(&page->_mapcount) >= 0; + page = compound_head(page); + if (atomic_read(compound_mapcount_ptr(page)) >= 0) + return true; + if (PageHuge(page)) + return false; + for (i = 0; i < hpage_nr_pages(page); i++) { + if (atomic_read(&page[i]._mapcount) >= 0) + return true; + } + return false; +} +EXPORT_SYMBOL(page_mapped); + struct anon_vma *page_anon_vma(struct page *page) { unsigned long mapping; -- cgit v1.2.3 From f44b2dda8bc29de36ccdc1e04092de7d0b2d5868 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:03 -0700 Subject: mm/hugetlb: add same zone check in pfn_range_valid_gigantic() This patchset deals with some problematic sites that iterate pfn ranges. There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to take care of this overlapping when iterating pfn range. I audit many iterating sites that uses pfn_valid(), pfn_valid_within(), zone_start_pfn and etc. and others looks safe to me. This is a preparation step for a new CMA implementation, ZONE_CMA (https://lkml.org/lkml/2015/2/12/95), because it would be easily overlapped with other zones. But, zone overlap check is also needed for the general case so I send it separately. This patch (of 5): alloc_gigantic_page() uses alloc_contig_range() and this requires that the requested range is in a single zone. To satisfy this requirement, add this check to pfn_range_valid_gigantic(). Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0f580ea7f41d..949d80609a32 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1031,8 +1031,8 @@ static int __alloc_gigantic_page(unsigned long start_pfn, return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); } -static bool pfn_range_valid_gigantic(unsigned long start_pfn, - unsigned long nr_pages) +static bool pfn_range_valid_gigantic(struct zone *z, + unsigned long start_pfn, unsigned long nr_pages) { unsigned long i, end_pfn = start_pfn + nr_pages; struct page *page; @@ -1043,6 +1043,9 @@ static bool pfn_range_valid_gigantic(unsigned long start_pfn, page = pfn_to_page(i); + if (page_zone(page) != z) + return false; + if (PageReserved(page)) return false; @@ -1075,7 +1078,7 @@ static struct page *alloc_gigantic_page(int nid, unsigned int order) pfn = ALIGN(z->zone_start_pfn, nr_pages); while (zone_spans_last_pfn(z, pfn, nr_pages)) { - if (pfn_range_valid_gigantic(pfn, nr_pages)) { + if (pfn_range_valid_gigantic(z, pfn, nr_pages)) { /* * We release the zone lock here because * alloc_contig_range() will also lock the zone -- cgit v1.2.3 From b9eb63191a6ad70127ebdd83650b810cdc1d1117 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:06 -0700 Subject: mm/memory_hotplug: add comment to some functions related to memory hotplug __offline_isolated_pages() and test_pages_isolated() are used by memory hotplug. These functions require that range is in a single zone but there is no code to do this because memory hotplug checks it before calling these functions. To avoid confusing future user of these functions, this patch adds comments to them. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- mm/page_isolation.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 34f688bc5c7a..19fe7e9c39a6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7176,7 +7176,8 @@ void zone_pcp_reset(struct zone *zone) #ifdef CONFIG_MEMORY_HOTREMOVE /* - * All pages in the range must be isolated before calling this. + * All pages in the range must be in a single zone and isolated + * before calling this. */ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 67bedd18429c..612122bf6a42 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -246,6 +246,7 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, return pfn; } +/* Caller should ensure that requested range is in a single zone */ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, bool skip_hwpoisoned_pages) { -- cgit v1.2.3 From a91c43c7313a995a8908f8f6b911a85d00fdbffd Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:10 -0700 Subject: mm/vmstat: add zone range overlapping check There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to care this overlapping when iterating pfn range. There are two places in vmstat.c that iterates pfn range and they don't consider this overlapping. Add it. Without this patch, above system could over count pageblock number on a zone. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 5e4300482897..0a726e398cfd 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1010,6 +1010,9 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m, if (!memmap_valid_within(pfn, page, zone)) continue; + if (page_zone(page) != zone) + continue; + mtype = get_pageblock_migratetype(page); if (mtype < MIGRATE_TYPES) @@ -1076,6 +1079,10 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (PageBuddy(page)) { pfn += (1UL << page_order(page)) - 1; continue; -- cgit v1.2.3 From 9d43f5aec9506d98ad492a783aa8a18226c5d474 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:13 -0700 Subject: mm/page_owner: add zone range overlapping check There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to care this overlapping when iterating pfn range. There are one place in page_owner.c that iterates pfn range and it doesn't consider this overlapping. Add it. Without this patch, above system could over count early allocated page number before page_owner is activated. Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/page_owner.c b/mm/page_owner.c index ac3d8d129974..438768c092ac 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -301,6 +301,9 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + /* * We are safe to check buddy flag and order, because * this is init stage and only single thread runs. -- cgit v1.2.3 From ba6b0979e346fd91d3b7ef6956d7155877308f0f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:16 -0700 Subject: power: add zone range overlapping check There is a system thats node's pfns are overlapped as follows: -----pfn--------> N0 N1 N2 N0 N1 N2 Therefore, we need to care this overlapping when iterating pfn range. mark_free_pages() iterates requested zone's pfn range and unset all range's bitmap first. And then it marks freepages in a zone to the bitmap. If there is an overlapping zone, above unset could clear previous marked bit and reference to this bitmap in the future will cause the problem. To prevent it, this patch adds a zone check in mark_free_pages(). Signed-off-by: Joonsoo Kim Acked-by: Vlastimil Babka Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: "Aneesh Kumar K.V" Cc: "Rafael J. Wysocki" Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 19fe7e9c39a6..e132705d19fd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2156,6 +2156,10 @@ void mark_free_pages(struct zone *zone) for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { page = pfn_to_page(pfn); + + if (page_zone(page) != zone) + continue; + if (!swsusp_page_is_forbidden(page)) swsusp_unset_page_free(page); } -- cgit v1.2.3 From 09b4ab3c433733127f8644290e94548a4cb8122f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:20 -0700 Subject: mm/writeback: correct dirty page calculation for highmem ZONE_MOVABLE could be treated as highmem so we need to consider it for accurate calculation of dirty pages. And, in following patches, ZONE_CMA will be introduced and it can be treated as highmem, too. So, instead of manually adding stat of ZONE_MOVABLE, looping all zones and check whether the zone is highmem or not and add stat of the zone which can be treated as highmem. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bc5149d5ec38..3b88795ab46e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -296,11 +296,15 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) #ifdef CONFIG_HIGHMEM int node; unsigned long x = 0; + int i; for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *z = &NODE_DATA(node)->node_zones[i]; - x += zone_dirtyable_memory(z); + if (is_highmem(z)) + x += zone_dirtyable_memory(z); + } } /* * Unreclaimable memory (kernel memory or anonymous memory -- cgit v1.2.3 From fc2bd799c7c79c84a59da6f9221370bc6f38c503 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:23 -0700 Subject: mm/page_alloc: correct highmem memory statistics ZONE_MOVABLE could be treated as highmem so we need to consider it for accurate statistics. And, in following patches, ZONE_CMA will be introduced and it can be treated as highmem, too. So, instead of manually adding stat of ZONE_MOVABLE, looping all zones and check whether the zone is highmem or not and add stat of the zone which can be treated as highmem. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e132705d19fd..da6d339f1936 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3793,6 +3793,8 @@ void si_meminfo_node(struct sysinfo *val, int nid) { int zone_type; /* needs to be signed */ unsigned long managed_pages = 0; + unsigned long managed_highpages = 0; + unsigned long free_highpages = 0; pg_data_t *pgdat = NODE_DATA(nid); for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) @@ -3801,12 +3803,19 @@ void si_meminfo_node(struct sysinfo *val, int nid) val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM - val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; - val->freehigh = zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { + struct zone *zone = &pgdat->node_zones[zone_type]; + + if (is_highmem(zone)) { + managed_highpages += zone->managed_pages; + free_highpages += zone_page_state(zone, NR_FREE_PAGES); + } + } + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #else - val->totalhigh = 0; - val->freehigh = 0; + val->totalhigh = managed_highpages; + val->freehigh = free_highpages; #endif val->mem_unit = PAGE_SIZE; } -- cgit v1.2.3 From 33499bfe507c844f2c6f55ae8cec17705d0eda95 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:26 -0700 Subject: mm/highmem: make nr_free_highpages() handles all highmem zones by itself nr_free_highpages() manually adds statistics per each highmem zone and returns a total value for them. Whenever we add a new highmem zone, we need to consider this function and it's really troublesome. Make it handle all highmem zones by itself. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 123bcd3ed4f2..50b4ca6787f0 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -112,16 +112,12 @@ EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); unsigned int nr_free_highpages (void) { - pg_data_t *pgdat; + struct zone *zone; unsigned int pages = 0; - for_each_online_pgdat(pgdat) { - pages += zone_page_state(&pgdat->node_zones[ZONE_HIGHMEM], - NR_FREE_PAGES); - if (zone_movable_is_highmem()) - pages += zone_page_state( - &pgdat->node_zones[ZONE_MOVABLE], - NR_FREE_PAGES); + for_each_populated_zone(zone) { + if (is_highmem(zone)) + pages += zone_page_state(zone, NR_FREE_PAGES); } return pages; -- cgit v1.2.3 From e87d59f7a2002aa2d4582d4ea16da91dd3c72752 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 19 May 2016 17:12:29 -0700 Subject: mm/vmstat: make node_page_state() handles all zones by itself node_page_state() manually adds statistics per each zone and returns total value for all zones. Whenever we add a new zone, we need to consider this function and it's really troublesome. Make it handle all zones by itself. Signed-off-by: Joonsoo Kim Reviewed-by: Aneesh Kumar K.V Cc: Rik van Riel Cc: Johannes Weiner Cc: Mel Gorman Cc: Laura Abbott Cc: Minchan Kim Cc: Marek Szyprowski Cc: Michal Nazarewicz Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 0a726e398cfd..a7de9adacbd9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -600,19 +600,13 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) unsigned long node_page_state(int node, enum zone_stat_item item) { struct zone *zones = NODE_DATA(node)->node_zones; + int i; + unsigned long count = 0; - return -#ifdef CONFIG_ZONE_DMA - zone_page_state(&zones[ZONE_DMA], item) + -#endif -#ifdef CONFIG_ZONE_DMA32 - zone_page_state(&zones[ZONE_DMA32], item) + -#endif -#ifdef CONFIG_HIGHMEM - zone_page_state(&zones[ZONE_HIGHMEM], item) + -#endif - zone_page_state(&zones[ZONE_NORMAL], item) + - zone_page_state(&zones[ZONE_MOVABLE], item); + for (i = 0; i < MAX_NR_ZONES; i++) + count += zone_page_state(zones + i, item); + + return count; } #endif -- cgit v1.2.3 From 1269019e69a6798db15edea8921f83215ef954d6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 19 May 2016 17:12:32 -0700 Subject: mm/mmap: kill hook arch_rebalance_pgtables() Nobody uses it. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bd2e1a533bc1..fba246b8f1a5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -55,10 +55,6 @@ #define arch_mmap_check(addr, len, flags) (0) #endif -#ifndef arch_rebalance_pgtables -#define arch_rebalance_pgtables(addr, len) (addr) -#endif - #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS const int mmap_rnd_bits_min = CONFIG_ARCH_MMAP_RND_BITS_MIN; const int mmap_rnd_bits_max = CONFIG_ARCH_MMAP_RND_BITS_MAX; @@ -1911,7 +1907,6 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, if (offset_in_page(addr)) return -EINVAL; - addr = arch_rebalance_pgtables(addr, len); error = security_mmap_addr(addr); return error ? error : addr; } -- cgit v1.2.3 From ca707239e8a7958ffb1c31737d41cae1a674c938 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:35 -0700 Subject: mm: update_lru_size warn and reset bad lru_size Though debug kernels have a VM_BUG_ON to help protect from misaccounting lru_size, non-debug kernels are liable to wrap it around: and then the vast unsigned long size draws page reclaim into a loop of repeatedly doing nothing on an empty list, without even a cond_resched(). That soft lockup looks confusingly like an over-busy reclaim scenario, with lots of contention on the lru_lock in shrink_inactive_list(): yet has a totally different origin. Help differentiate with a custom warning in mem_cgroup_update_lru_size(), even in non-debug kernels; and reset the size to avoid the lockup. But the particular bug which suggested this change was mine alone, and since fixed. Make it a WARN_ONCE: the first occurrence is the most informative, a flurry may follow, yet even when rate-limited little more is learnt. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_inline.h | 2 +- mm/memcontrol.c | 24 ++++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 712e8c37a200..d8cea81ab1ac 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -35,8 +35,8 @@ static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); list_del(&page->lru); + mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 011dac8ab5d7..6a0199706f00 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1023,22 +1023,38 @@ out: * @lru: index of lru list the page is sitting on * @nr_pages: positive when adding or negative when removing * - * This function must be called when a page is added to or removed from an - * lru list. + * This function must be called under lru_lock, just before a page is added + * to or just after a page is removed from an lru list (that ordering being + * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int nr_pages) { struct mem_cgroup_per_zone *mz; unsigned long *lru_size; + long size; + bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); lru_size = mz->lru_size + lru; - *lru_size += nr_pages; - VM_BUG_ON((long)(*lru_size) < 0); + empty = list_empty(lruvec->lists + lru); + + if (nr_pages < 0) + *lru_size += nr_pages; + + size = *lru_size; + if (WARN_ONCE(size < 0 || empty != !size, + "%s(%p, %d, %d): lru_size %ld but %sempty\n", + __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + VM_BUG_ON(1); + *lru_size = 0; + } + + if (nr_pages > 0) + *lru_size += nr_pages; } bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg) -- cgit v1.2.3 From 9d5e6a9f22311b00a20ff9b072760ad3e73f0d99 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:38 -0700 Subject: mm: update_lru_size do the __mod_zone_page_state Konstantin Khlebnikov pointed out (nearly four years ago, when lumpy reclaim was removed) that lru_size can be updated by -nr_taken once per call to isolate_lru_pages(), instead of page by page. Update it inside isolate_lru_pages(), or at its two callsites? I chose to update it at the callsites, rearranging and grouping the updates by nr_taken and nr_scanned together in both. With one exception, mem_cgroup_update_lru_size(,lru,) is then used where __mod_zone_page_state(,NR_LRU_BASE+lru,) is used; and we shall be adding some more calls in a future commit. Make the code a little smaller and simpler by incorporating stat update in lru_size update. The exception was move_active_pages_to_lru(), which aggregated the pgmoved stat update separately from the individual lru_size updates; but I still think this a simplification worth making. However, the __mod_zone_page_state is not peculiar to mem_cgroups: so better use the name update_lru_size, calls mem_cgroup_update_lru_size when CONFIG_MEMCG. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ include/linux/mm_inline.h | 24 ++++++++++++++++++------ mm/memcontrol.c | 2 ++ mm/vmscan.c | 23 ++++++++++------------- 4 files changed, 30 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1191d79aa495..94da96738df3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return 0; } -static inline void -mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int increment) -{ -} - static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index d8cea81ab1ac..5bd29ba4f174 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page) return !PageSwapBacked(page); } +static __always_inline void __update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ + __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); +} + +static __always_inline void update_lru_size(struct lruvec *lruvec, + enum lru_list lru, int nr_pages) +{ +#ifdef CONFIG_MEMCG + mem_cgroup_update_lru_size(lruvec, lru, nr_pages); +#else + __update_lru_size(lruvec, lru, nr_pages); +#endif +} + static __always_inline void add_page_to_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, hpage_nr_pages(page)); list_add(&page->lru, &lruvec->lists[lru]); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages); } static __always_inline void del_page_from_lru_list(struct page *page, struct lruvec *lruvec, enum lru_list lru) { - int nr_pages = hpage_nr_pages(page); list_del(&page->lru); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); - __mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages); + update_lru_size(lruvec, lru, -hpage_nr_pages(page)); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a0199706f00..1b40dcad2b90 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1035,6 +1035,8 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, long size; bool empty; + __update_lru_size(lruvec, lru, nr_pages); + if (mem_cgroup_disabled()) return; diff --git a/mm/vmscan.c b/mm/vmscan.c index d3a02ac3eed7..dcfdfc1a0942 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1374,7 +1374,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); scan++) { struct page *page; - int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); @@ -1383,10 +1382,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode)) { case 0: - nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, -nr_pages); + nr_taken += hpage_nr_pages(page); list_move(&page->lru, dst); - nr_taken += nr_pages; break; case -EBUSY: @@ -1602,8 +1599,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, &nr_scanned, sc, isolate_mode, lru); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); + update_lru_size(lruvec, lru, -nr_taken); __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); @@ -1624,8 +1622,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&zone->lru_lock); - reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) { if (current_is_kswapd()) __count_zone_vm_events(PGSTEAL_KSWAPD, zone, @@ -1742,7 +1738,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, SetPageLRU(page); nr_pages = hpage_nr_pages(page); - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + update_lru_size(lruvec, lru, nr_pages); list_move(&page->lru, &lruvec->lists[lru]); pgmoved += nr_pages; @@ -1760,7 +1756,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, list_add(&page->lru, pages_to_free); } } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) __count_vm_events(PGDEACTIVATE, pgmoved); } @@ -1794,14 +1790,15 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); - if (global_reclaim(sc)) - __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); + update_lru_size(lruvec, lru, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; + if (global_reclaim(sc)) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); __count_zone_vm_events(PGREFILL, zone, nr_scanned); - __mod_zone_page_state(zone, NR_LRU_BASE + lru, -nr_taken); - __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); + spin_unlock_irq(&zone->lru_lock); while (!list_empty(&l_hold)) { -- cgit v1.2.3 From fa9949da59a15017a02c86b087c7499d7b5702be Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:41 -0700 Subject: mm: use __SetPageSwapBacked and dont ClearPageSwapBacked v3.16 commit 07a427884348 ("mm: shmem: avoid atomic operation during shmem_getpage_gfp") rightly replaced one instance of SetPageSwapBacked by __SetPageSwapBacked, pointing out that the newly allocated page is not yet visible to other users (except speculative get_page_unless_zero- ers, who may not update page flags before their further checks). That was part of a series in which Mel was focused on tmpfs profiles: but almost all SetPageSwapBacked uses can be so optimized, with the same justification. Remove ClearPageSwapBacked from __read_swap_cache_async() error path: it's not an error to free a page with PG_swapbacked set. Follow a convention of __SetPageLocked, __SetPageSwapBacked instead of doing it differently in different places; but that's for tidiness - if the ordering actually mattered, we should not be using the __variants. There's probably scope for further __SetPageFlags in other places, but SwapBacked is the one I'm interested in at the moment. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Reviewed-by: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 6 +++--- mm/rmap.c | 2 +- mm/shmem.c | 4 ++-- mm/swap_state.c | 3 +-- 4 files changed, 7 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index f9dfb18a4eba..53ab6398e7a2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -332,7 +332,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); return MIGRATEPAGE_SUCCESS; } @@ -378,7 +378,7 @@ int migrate_page_move_mapping(struct address_space *mapping, newpage->index = page->index; newpage->mapping = page->mapping; if (PageSwapBacked(page)) - SetPageSwapBacked(newpage); + __SetPageSwapBacked(newpage); get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { @@ -1791,7 +1791,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, /* Prepare a page as a migration target */ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); /* anon mapping, we can simply copy page->mapping to the new page: */ new_page->mapping = page->mapping; diff --git a/mm/rmap.c b/mm/rmap.c index 4cebe8a7c2cb..8a839935b18c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1249,7 +1249,7 @@ void page_add_new_anon_rmap(struct page *page, int nr = compound ? hpage_nr_pages(page) : 1; VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); - SetPageSwapBacked(page); + __SetPageSwapBacked(page); if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); /* increment count (starts at -1) */ diff --git a/mm/shmem.c b/mm/shmem.c index e684a9140228..9e609d58df73 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1085,8 +1085,8 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, flush_dcache_page(newpage); __SetPageLocked(newpage); + __SetPageSwapBacked(newpage); SetPageUptodate(newpage); - SetPageSwapBacked(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1276,8 +1276,8 @@ repeat: goto decused; } - __SetPageSwapBacked(page); __SetPageLocked(page); + __SetPageSwapBacked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); diff --git a/mm/swap_state.c b/mm/swap_state.c index 366ce3518703..0d457e7db8d6 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -358,7 +358,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __SetPageLocked(new_page); - SetPageSwapBacked(new_page); + __SetPageSwapBacked(new_page); err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { radix_tree_preload_end(); @@ -370,7 +370,6 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return new_page; } radix_tree_preload_end(); - ClearPageSwapBacked(new_page); __ClearPageLocked(new_page); /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely -- cgit v1.2.3 From 75edd345e8ede51bc8f00672feff5d622f2b3af6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:44 -0700 Subject: tmpfs: preliminary minor tidyups Make a few cleanups in mm/shmem.c, before going on to complicate it. shmem_alloc_page() will become more complicated: we can't afford to to have that complication duplicated between a CONFIG_NUMA version and a !CONFIG_NUMA version, so rearrange the #ifdef'ery there to yield a single shmem_swapin() and a single shmem_alloc_page(). Yes, it's a shame to inflict the horrid pseudo-vma on non-NUMA configurations, but eliminating it is a larger cleanup: I have an alloc_pages_mpol() patchset not yet ready - mpol handling is subtle and bug-prone, and changed yet again since my last version. Move __SetPageLocked, __SetPageSwapBacked from shmem_getpage_gfp() to shmem_alloc_page(): that SwapBacked flag will be useful in future, to help to distinguish different cases appropriately. And the SGP_DIRTY variant of SGP_CACHE is hard to understand and of little use (IIRC it dates back to when shmem_getpage() returned the page unlocked): kill it and do the necessary in shmem_file_read_iter(). But an arm64 build then complained that info may be uninitialized (where shmem_getpage_gfp() deletes a freshly alloced page beyond eof), and advancing to an "sgp <= SGP_CACHE" test jogged it back to reality. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 6 +++++ mm/shmem.c | 69 ++++++++++++++++++----------------------------- 2 files changed, 32 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 6978a99e571f..4429d255c8ab 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p) { } +static inline struct mempolicy * +mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) +{ + return NULL; +} + #define vma_policy(vma) NULL static inline int diff --git a/mm/shmem.c b/mm/shmem.c index 9e609d58df73..6d2de2c1bf11 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -101,7 +101,6 @@ struct shmem_falloc { enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_CACHE, /* don't exceed i_size, may allocate page */ - SGP_DIRTY, /* like SGP_CACHE, but set new page dirty */ SGP_WRITE, /* may exceed i_size, may allocate !Uptodate page */ SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; @@ -169,7 +168,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as - * pages are allocated, in order to allow huge sparse files. + * pages are allocated, in order to allow large sparse files. * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ @@ -947,8 +946,7 @@ redirty: return 0; } -#ifdef CONFIG_NUMA -#ifdef CONFIG_TMPFS +#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) { char buffer[64]; @@ -972,7 +970,18 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) } return mpol; } -#endif /* CONFIG_TMPFS */ +#else /* !CONFIG_NUMA || !CONFIG_TMPFS */ +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) +{ +} +static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) +{ + return NULL; +} +#endif /* CONFIG_NUMA && CONFIG_TMPFS */ +#ifndef CONFIG_NUMA +#define vm_policy vm_private_data +#endif static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) @@ -1008,39 +1017,17 @@ static struct page *shmem_alloc_page(gfp_t gfp, pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_pages_vma(gfp, 0, &pvma, 0, numa_node_id(), false); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + } /* Drop reference taken by mpol_shared_policy_lookup() */ mpol_cond_put(pvma.vm_policy); return page; } -#else /* !CONFIG_NUMA */ -#ifdef CONFIG_TMPFS -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -{ -} -#endif /* CONFIG_TMPFS */ - -static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return swapin_readahead(swap, gfp, NULL, 0); -} - -static inline struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return alloc_page(gfp); -} -#endif /* CONFIG_NUMA */ - -#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS) -static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) -{ - return NULL; -} -#endif /* * When a page is moved from swapcache to shmem filecache (either by the @@ -1084,8 +1071,6 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, copy_highpage(newpage, oldpage); flush_dcache_page(newpage); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); SetPageUptodate(newpage); set_page_private(newpage, swap_index); SetPageSwapCache(newpage); @@ -1155,7 +1140,7 @@ repeat: page = NULL; } - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { error = -EINVAL; goto unlock; @@ -1275,9 +1260,6 @@ repeat: error = -ENOMEM; goto decused; } - - __SetPageLocked(page); - __SetPageSwapBacked(page); if (sgp == SGP_WRITE) __SetPageReferenced(page); @@ -1321,12 +1303,10 @@ clear: flush_dcache_page(page); SetPageUptodate(page); } - if (sgp == SGP_DIRTY) - set_page_dirty(page); } /* Perhaps the file has been truncated since we checked */ - if (sgp != SGP_WRITE && sgp != SGP_FALLOC && + if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { ClearPageDirty(page); @@ -1633,7 +1613,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * and even mark them dirty, so it cannot exceed the max_blocks limit. */ if (!iter_is_iovec(to)) - sgp = SGP_DIRTY; + sgp = SGP_CACHE; index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; @@ -1659,8 +1639,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = 0; break; } - if (page) + if (page) { + if (sgp == SGP_CACHE) + set_page_dirty(page); unlock_page(page); + } /* * We must evaluate after, since reads (unlike writes) -- cgit v1.2.3 From 9e18eb29356b7dfd55183bd42cf73919d1590835 Mon Sep 17 00:00:00 2001 From: Andres Lagar-Cavilla Date: Thu, 19 May 2016 17:12:47 -0700 Subject: tmpfs: mem_cgroup charge fault to vm_mm not current mm Although shmem_fault() has been careful to count a major fault to vm_mm, shmem_getpage_gfp() has been careless in charging a remote access fault to current->mm owner's memcg instead of to vma->vm_mm owner's memcg: that is inconsistent with all the mem_cgroup charging on remote access faults in mm/memory.c. Fix it by passing fault_mm along with fault_type to shmem_get_page_gfp(); but in that case, now knowing the right mm, it's better for it to handle the PGMAJFAULT updates itself. And let's keep this clutter out of most callers' way: change the common shmem_getpage() wrapper to hide fault_mm and fault_type as well as gfp. Signed-off-by: Andres Lagar-Cavilla Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 61 ++++++++++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 6d2de2c1bf11..e418a995427d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -121,13 +121,14 @@ static bool shmem_should_replace_page(struct page *page, gfp_t gfp); static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index); static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct mm_struct *fault_mm, int *fault_type); static inline int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, int *fault_type) + struct page **pagep, enum sgp_type sgp) { return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), fault_type); + mapping_gfp_mask(inode->i_mapping), NULL, NULL); } static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) @@ -527,7 +528,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (partial_start) { struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ, NULL); + shmem_getpage(inode, start - 1, &page, SGP_READ); if (page) { unsigned int top = PAGE_SIZE; if (start > end) { @@ -542,7 +543,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } if (partial_end) { struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ, NULL); + shmem_getpage(inode, end, &page, SGP_READ); if (page) { zero_user_segment(page, 0, partial_end); set_page_dirty(page); @@ -1115,14 +1116,19 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap - * entry since a page cannot live in both the swap and page cache + * entry since a page cannot live in both the swap and page cache. + * + * fault_mm and fault_type are only supplied by shmem_fault: + * otherwise they are NULL. */ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) + struct page **pagep, enum sgp_type sgp, gfp_t gfp, + struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mm_struct *charge_mm; struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; @@ -1168,14 +1174,19 @@ repeat: */ info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); + charge_mm = fault_mm ? : current->mm; if (swap.val) { /* Look it up and read it in.. */ page = lookup_swap_cache(swap); if (!page) { - /* here we actually do the io */ - if (fault_type) + /* Or update major stats only when swapin succeeds?? */ + if (fault_type) { *fault_type |= VM_FAULT_MAJOR; + count_vm_event(PGMAJFAULT); + mem_cgroup_count_vm_event(fault_mm, PGMAJFAULT); + } + /* Here we actually start the io */ page = shmem_swapin(swap, gfp, info, index); if (!page) { error = -ENOMEM; @@ -1202,7 +1213,7 @@ repeat: goto failed; } - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, @@ -1263,7 +1274,7 @@ repeat: if (sgp == SGP_WRITE) __SetPageReferenced(page); - error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg, + error = mem_cgroup_try_charge(page, charge_mm, gfp, &memcg, false); if (error) goto decused; @@ -1352,6 +1363,7 @@ unlock: static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); + gfp_t gfp = mapping_gfp_mask(inode->i_mapping); int error; int ret = VM_FAULT_LOCKED; @@ -1413,14 +1425,10 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); + error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + gfp, vma->vm_mm, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); - - if (ret & VM_FAULT_MAJOR) { - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - } return ret; } @@ -1567,7 +1575,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + return shmem_getpage(inode, index, pagep, SGP_WRITE); } static int @@ -1633,7 +1641,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, sgp, NULL); + error = shmem_getpage(inode, index, &page, sgp); if (error) { if (error == -EINVAL) error = 0; @@ -1749,7 +1757,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, error = 0; while (spd.nr_pages < nr_pages) { - error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -1771,8 +1779,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, page = spd.pages[page_nr]; if (!PageUptodate(page) || page->mapping != mapping) { - error = shmem_getpage(inode, index, &page, - SGP_CACHE, NULL); + error = shmem_getpage(inode, index, &page, SGP_CACHE); if (error) break; unlock_page(page); @@ -2215,8 +2222,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC, - NULL); + error = shmem_getpage(inode, index, &page, SGP_FALLOC); if (error) { /* Remove the !PageUptodate pages we added */ shmem_undo_range(inode, @@ -2534,7 +2540,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); + error = shmem_getpage(inode, 0, &page, SGP_WRITE); if (error) { iput(inode); return error; @@ -2575,7 +2581,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ, NULL); + error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); unlock_page(page); @@ -3479,7 +3485,8 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, int error; BUG_ON(mapping->a_ops != &shmem_aops); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + gfp, NULL, NULL); if (error) page = ERR_PTR(error); else -- cgit v1.2.3 From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:50 -0700 Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update Provide /proc/sys/vm/stat_refresh to force an immediate update of per-cpu into global vmstats: useful to avoid a sleep(2) or whatever before checking counts when testing. Originally added to work around a bug which left counts stranded indefinitely on a cpu going idle (an inaccuracy magnified when small below-batch numbers represent "huge" amounts of memory), but I believe that bug is now fixed: nonetheless, this is still a useful knob. Its schedule_on_each_cpu() is probably too expensive just to fold into reading /proc/meminfo itself: give this mode 0600 to prevent abuse. Allow a write or a read to do the same: nothing to read, but "grep -h Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. Oh, and since global_page_state() itself is careful to disguise any underflow as 0, hack in an "Invalid argument" and pr_warn() if a counter is negative after the refresh - this helped to fix a misaccounting of NR_ISOLATED_FILE in my migration code. But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED often go negative some of the time. I have not yet worked out why, but have no evidence that it's actually harmful. Punt for the moment by just ignoring the anomaly on those. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 14 +++++++++++ include/linux/vmstat.h | 4 +++ kernel/sysctl.c | 7 ++++++ mm/vmstat.c | 60 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 85 insertions(+) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 34a5fece3121..720355cbdf45 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm: - panic_on_oom - percpu_pagelist_fraction - stat_interval +- stat_refresh - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -755,6 +756,19 @@ is 1 second. ============================================================== +stat_refresh + +Any read or write (by root only) flushes all the per-cpu vm statistics +into their global totals, for more accurate reports when testing +e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo + +As a side-effect, it also checks for negative totals (elsewhere reported +as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. +(At time of writing, a few stats are known sometimes to be found negative, +with no ill effects: errors and warnings on these stats are suppressed.) + +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 73fae8c4a5fb..02fce415b3d9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -193,6 +193,10 @@ void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); +struct ctl_table; +int vmstat_refresh(struct ctl_table *, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); int calculate_pressure_threshold(struct zone *zone); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..2effd84d83e3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "stat_refresh", + .data = NULL, + .maxlen = 0, + .mode = 0600, + .proc_handler = vmstat_refresh, + }, #endif #ifdef CONFIG_MMU { diff --git a/mm/vmstat.c b/mm/vmstat.c index a7de9adacbd9..c831be32a1a3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; static cpumask_var_t cpu_stat_off; +#ifdef CONFIG_PROC_FS +static void refresh_vm_stats(struct work_struct *work) +{ + refresh_cpu_vm_stats(true); +} + +int vmstat_refresh(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + long val; + int err; + int i; + + /* + * The regular update, every sysctl_stat_interval, may come later + * than expected: leaving a significant amount in per_cpu buckets. + * This is particularly misleading when checking a quantity of HUGE + * pages, immediately after running a test. /proc/sys/vm/stat_refresh, + * which can equally be echo'ed to or cat'ted from (by root), + * can be used to update the stats just before reading them. + * + * Oh, and since global_page_state() etc. are so careful to hide + * transiently negative values, report an error here if any of + * the stats is negative, so we know to go looking for imbalance. + */ + err = schedule_on_each_cpu(refresh_vm_stats); + if (err) + return err; + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { + val = atomic_long_read(&vm_stat[i]); + if (val < 0) { + switch (i) { + case NR_ALLOC_BATCH: + case NR_PAGES_SCANNED: + /* + * These are often seen to go negative in + * recent kernels, but not to go permanently + * negative. Whilst it would be nicer not to + * have exceptions, rooting them out would be + * another task, of rather low priority. + */ + break; + default: + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; + break; + } + } + } + if (err) + return err; + if (write) + *ppos += *lenp; + else + *lenp = 0; + return 0; +} +#endif /* CONFIG_PROC_FS */ + static void vmstat_update(struct work_struct *w) { if (refresh_cpu_vm_stats(true)) { -- cgit v1.2.3 From bf8616d5fa179d6c755f06726567c6d63c6fbbc7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:54 -0700 Subject: huge mm: move_huge_pmd does not need new_vma Remove move_huge_pmd()'s redundant new_vma arg: all it was used for was a VM_NOHUGEPAGE check on new_vma flags, but the new_vma is cloned from the old vma, so a trans_huge_pmd in the new_vma will be as acceptable as it was in the old vma, alignment and size permitting. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 4 +--- mm/huge_memory.c | 7 ++----- mm/mremap.c | 5 ++--- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d7b9e5346fba..419fb9e03447 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); -extern bool move_huge_pmd(struct vm_area_struct *vma, - struct vm_area_struct *new_vma, - unsigned long old_addr, +extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8ac8f582fd8..66675eed67be 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1698,20 +1698,17 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } -bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, - unsigned long old_addr, +bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; pmd_t pmd; - struct mm_struct *mm = vma->vm_mm; if ((old_addr & ~HPAGE_PMD_MASK) || (new_addr & ~HPAGE_PMD_MASK) || - old_end - old_addr < HPAGE_PMD_SIZE || - (new_vma->vm_flags & VM_NOHUGEPAGE)) + old_end - old_addr < HPAGE_PMD_SIZE) return false; /* diff --git a/mm/mremap.c b/mm/mremap.c index 3fa0a467df66..7d98fe1adc12 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -198,9 +198,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma, /* See comment in move_ptes() */ if (need_rmap_locks) anon_vma_lock_write(vma->anon_vma); - moved = move_huge_pmd(vma, new_vma, old_addr, - new_addr, old_end, - old_pmd, new_pmd); + moved = move_huge_pmd(vma, old_addr, new_addr, + old_end, old_pmd, new_pmd); if (need_rmap_locks) anon_vma_unlock_write(vma->anon_vma); if (moved) { -- cgit v1.2.3 From 1d069b7dd56728a0eb6acb138dce0d37600dee00 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:12:57 -0700 Subject: huge pagecache: extend mremap pmd rmap lockout to files Whatever huge pagecache implementation we go with, file rmap locking must be added to anon rmap locking, when mremap's move_page_tables() finds a pmd_trans_huge pmd entry: a simple change, let's do it now. Factor out take_rmap_locks() and drop_rmap_locks() to handle the locking for make move_ptes() and move_page_tables(), and delete the VM_BUG_ON_VMA which rejected vm_file and required anon_vma. Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Andres Lagar-Cavilla Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mremap.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mremap.c b/mm/mremap.c index 7d98fe1adc12..9dc499977924 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -70,6 +70,22 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return pmd; } +static void take_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + if (vma->anon_vma) + anon_vma_lock_write(vma->anon_vma); +} + +static void drop_rmap_locks(struct vm_area_struct *vma) +{ + if (vma->anon_vma) + anon_vma_unlock_write(vma->anon_vma); + if (vma->vm_file) + i_mmap_unlock_write(vma->vm_file->f_mapping); +} + static pte_t move_soft_dirty_pte(pte_t pte) { /* @@ -90,8 +106,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct vm_area_struct *new_vma, pmd_t *new_pmd, unsigned long new_addr, bool need_rmap_locks) { - struct address_space *mapping = NULL; - struct anon_vma *anon_vma = NULL; struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; @@ -114,16 +128,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). */ - if (need_rmap_locks) { - if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; - i_mmap_lock_write(mapping); - } - if (vma->anon_vma) { - anon_vma = vma->anon_vma; - anon_vma_lock_write(anon_vma); - } - } + if (need_rmap_locks) + take_rmap_locks(vma); /* * We don't have to worry about the ordering of src and dst @@ -151,10 +157,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); - if (anon_vma) - anon_vma_unlock_write(anon_vma); - if (mapping) - i_mmap_unlock_write(mapping); + if (need_rmap_locks) + drop_rmap_locks(vma); } #define LATENCY_LIMIT (64 * PAGE_SIZE) @@ -193,15 +197,13 @@ unsigned long move_page_tables(struct vm_area_struct *vma, if (pmd_trans_huge(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; - VM_BUG_ON_VMA(vma->vm_file || !vma->anon_vma, - vma); /* See comment in move_ptes() */ if (need_rmap_locks) - anon_vma_lock_write(vma->anon_vma); + take_rmap_locks(vma); moved = move_huge_pmd(vma, old_addr, new_addr, old_end, old_pmd, new_pmd); if (need_rmap_locks) - anon_vma_unlock_write(vma->anon_vma); + drop_rmap_locks(vma); if (moved) { need_flush = true; continue; -- cgit v1.2.3 From 8604d9e534a3e662600e288bcfd1a5acd2763d28 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 19 May 2016 17:13:03 -0700 Subject: memory_hotplug: introduce CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE This patchset continues the work I started with commit 31bc3858ea3e ("memory-hotplug: add automatic onlining policy for the newly added memory"). Initially I was going to stop there and bring the policy setting logic to userspace. I met two issues on this way: 1) It is possible to have memory hotplugged at boot (e.g. with QEMU). These blocks stay offlined if we turn the onlining policy on by userspace. 2) My attempt to bring this policy setting to systemd failed, systemd maintainers suggest to change the default in kernel or ... to use tmpfiles.d to alter the policy (which looks like a hack to me): https://github.com/systemd/systemd/pull/2938 Here I suggest to add a config option to set the default value for the policy and a kernel command line parameter to make the override. This patch (of 2): Introduce config option to set the default value for memory hotplug onlining policy (/sys/devices/system/memory/auto_online_blocks). The reason one would want to turn this option on are to have early onlining for hotpluggable memory available at boot and to not require any userspace actions to make memory hotplug work. [akpm@linux-foundation.org: tweak Kconfig text] Signed-off-by: Vitaly Kuznetsov Cc: Jonathan Corbet Cc: Dan Williams Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: David Vrabel Cc: David Rientjes Cc: Igor Mammedov Cc: Lennart Poettering Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/memory-hotplug.txt | 9 +++++---- mm/Kconfig | 16 ++++++++++++++++ mm/memory_hotplug.c | 4 ++++ 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 443f4b44ad97..0d7cb955aa01 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -261,10 +261,11 @@ it according to the policy which can be read from "auto_online_blocks" file: % cat /sys/devices/system/memory/auto_online_blocks -The default is "offline" which means the newly added memory is not in a -ready-to-use state and you have to "online" the newly added memory blocks -manually. Automatic onlining can be requested by writing "online" to -"auto_online_blocks" file: +The default depends on the CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config +option. If it is disabled the default is "offline" which means the newly added +memory is not in a ready-to-use state and you have to "online" the newly added +memory blocks manually. Automatic onlining can be requested by writing "online" +to "auto_online_blocks" file: % echo online > /sys/devices/system/memory/auto_online_blocks diff --git a/mm/Kconfig b/mm/Kconfig index d6e9042b99e0..b0432b71137d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -192,6 +192,22 @@ config MEMORY_HOTPLUG_SPARSE def_bool y depends on SPARSEMEM && MEMORY_HOTPLUG +config MEMORY_HOTPLUG_DEFAULT_ONLINE + bool "Online the newly added memory blocks by default" + default n + depends on MEMORY_HOTPLUG + help + This option sets the default policy setting for memory hotplug + onlining policy (/sys/devices/system/memory/auto_online_blocks) which + determines what happens to newly added memory regions. Policy setting + can always be changed at runtime. + See Documentation/memory-hotplug.txt for more information. + + Say Y here if you want all hot-plugged memory blocks to appear in + 'online' state by default. + Say N here if you want the default policy to keep all hot-plugged + memory blocks in 'offline' state. + config MEMORY_HOTREMOVE bool "Allow for memory hot remove" select MEMORY_ISOLATION diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b21d8895ea41..fcafbfcff044 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -78,7 +78,11 @@ static struct { #define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) #define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) +#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE bool memhp_auto_online; +#else +bool memhp_auto_online = true; +#endif EXPORT_SYMBOL_GPL(memhp_auto_online); void get_online_mems(void) -- cgit v1.2.3 From 86dd995d63241039e0ad9123f9b424013c611510 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 19 May 2016 17:13:06 -0700 Subject: memory_hotplug: introduce memhp_default_state= command line parameter CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE specifies the default value for the memory hotplug onlining policy. Add a command line parameter to make it possible to override the default. It may come handy for debug and testing purposes. Signed-off-by: Vitaly Kuznetsov Cc: Jonathan Corbet Cc: Dan Williams Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: David Vrabel Cc: David Rientjes Cc: Igor Mammedov Cc: Lennart Poettering Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 8 ++++++++ mm/memory_hotplug.c | 11 +++++++++++ 2 files changed, 19 insertions(+) (limited to 'mm') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 794403166a8f..2edb27bdc680 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2161,6 +2161,14 @@ bytes respectively. Such letter suffixes can also be entirely omitted. [KNL,SH] Allow user to override the default size for per-device physically contiguous DMA buffers. + memhp_default_state=online/offline + [KNL] Set the initial state for the memory hotplug + onlining policy. If not specified, the default value is + set according to the + CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE kernel config + option. + See Documentation/memory-hotplug.txt. + memmap=exactmap [KNL,X86] Enable setting of an exact E820 memory map, as specified by the user. Such memmap=exactmap lines can be constructed based on diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fcafbfcff044..caf2a14c37ad 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -85,6 +85,17 @@ bool memhp_auto_online = true; #endif EXPORT_SYMBOL_GPL(memhp_auto_online); +static int __init setup_memhp_default_state(char *str) +{ + if (!strcmp(str, "online")) + memhp_auto_online = true; + else if (!strcmp(str, "offline")) + memhp_auto_online = false; + + return 1; +} +__setup("memhp_default_state=", setup_memhp_default_state); + void get_online_mems(void) { might_sleep(); -- cgit v1.2.3 From 3da88fb3bacfaa33ff9d13730d17110bb2d9604d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:09 -0700 Subject: mm, oom: move GFP_NOFS check to out_of_memory __alloc_pages_may_oom is the central place to decide when the out_of_memory should be invoked. This is a good approach for most checks there because they are page allocator specific and the allocation fails right after for all of them. The notable exception is GFP_NOFS context which is faking did_some_progress and keep the page allocator looping even though there couldn't have been any progress from the OOM killer. This patch doesn't change this behavior because we are not ready to allow those allocation requests to fail yet (and maybe we will face the reality that we will never manage to safely fail these request). Instead __GFP_FS check is moved down to out_of_memory and prevent from OOM victim selection there. There are two reasons for that - OOM notifiers might release some memory even from this context as none of the registered notifier seems to be FS related - this might help a dying thread to get an access to memory reserves and move on which will make the behavior more consistent with the case when the task gets killed from a different context. Keep a comment in __alloc_pages_may_oom to make sure we do not forget how GFP_NOFS is special and that we really want to do something about it. Note to the current oom_notifier users: The observable difference for you is that oom notifiers cannot depend on any fs locks because we could deadlock. Not that this would be allowed today because that would just lockup machine in most of the cases and ruling out the OOM killer along the way. Another difference is that callbacks might be invoked sooner now because GFP_NOFS is a weaker reclaim context and so there could be reclaimable memory which is just not reachable now. That would require GFP_NOFS only loads which are really rare and more importantly the observable result would be dropping of reconstructible object and potential performance drop which is not such a big deal when we are struggling to fulfill other important allocation requests. Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E. McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 9 +++++++++ mm/page_alloc.c | 24 ++++++++++-------------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 86349586eacb..32d8210b8773 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -876,6 +876,15 @@ bool out_of_memory(struct oom_control *oc) return true; } + /* + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure exclude 0 mask - all other users should have at least + * ___GFP_DIRECT_RECLAIM to get here. + */ + if (oc->gfp_mask && !(oc->gfp_mask & (__GFP_FS|__GFP_NOFAIL))) + return true; + /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index da6d339f1936..6d1c8b06b458 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2875,22 +2875,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, /* The OOM killer does not needlessly kill tasks for lowmem */ if (ac->high_zoneidx < ZONE_NORMAL) goto out; - /* The OOM killer does not compensate for IO-less reclaim */ - if (!(gfp_mask & __GFP_FS)) { - /* - * XXX: Page reclaim didn't yield anything, - * and the OOM killer can't be invoked, but - * keep looping as per tradition. - * - * But do not keep looping if oom_killer_disable() - * was already called, for the system is trying to - * enter a quiescent state during suspend. - */ - *did_some_progress = !oom_killer_disabled; - goto out; - } if (pm_suspended_storage()) goto out; + /* + * XXX: GFP_NOFS allocations should rather fail than rely on + * other request to make a forward progress. + * We are in an unfortunate situation where out_of_memory cannot + * do much for this context but let's try it to at least get + * access to memory reserved if the current task is killed (see + * out_of_memory). Once filesystems are ready to handle allocation + * failures more gracefully we should just bail out here. + */ + /* The OOM killer may not free memory on a specific node */ if (gfp_mask & __GFP_THISNODE) goto out; -- cgit v1.2.3 From 3ef22dfff2390e75b379f9715388a852aa56e0d5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:12 -0700 Subject: oom, oom_reaper: try to reap tasks which skip regular OOM killer path If either the current task is already killed or PF_EXITING or a selected task is PF_EXITING then the oom killer is suppressed and so is the oom reaper. This patch adds try_oom_reaper which checks the given task and queues it for the oom reaper if that is safe to be done meaning that the task doesn't share the mm with an alive process. This might help to release the memory pressure while the task tries to exit. [akpm@linux-foundation.org: fix nommu build] Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E. McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 8 +++++ mm/memcontrol.c | 1 + mm/oom_kill.c | 86 ++++++++++++++++++++++++++++++++++++++++++----------- 3 files changed, 77 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/include/linux/oom.h b/include/linux/oom.h index 628a43242a34..83b9c39bd8b7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p) extern void mark_oom_victim(struct task_struct *tsk); +#ifdef CONFIG_MMU +extern void try_oom_reaper(struct task_struct *tsk); +#else +static inline void try_oom_reaper(struct task_struct *tsk) +{ +} +#endif + extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b40dcad2b90..d71d387868e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1275,6 +1275,7 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (fatal_signal_pending(current) || task_will_free_mem(current)) { mark_oom_victim(current); + try_oom_reaper(current); goto unlock; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 32d8210b8773..850b6ff66bdf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -412,6 +412,25 @@ bool oom_killer_disabled __read_mostly; #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * task->mm can be NULL if the task is the exited group leader. So to + * determine whether the task is using a particular mm, we examine all the + * task's threads: if one of those is using this mm then this task was also + * using it. + */ +static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) +{ + struct task_struct *t; + + for_each_thread(p, t) { + struct mm_struct *t_mm = READ_ONCE(t->mm); + if (t_mm) + return t_mm == mm; + } + return false; +} + + #ifdef CONFIG_MMU /* * OOM Reaper kernel thread which tries to reap the memory used by the OOM @@ -563,6 +582,53 @@ static void wake_oom_reaper(struct task_struct *tsk) wake_up(&oom_reaper_wait); } +/* Check if we can reap the given task. This has to be called with stable + * tsk->mm + */ +void try_oom_reaper(struct task_struct *tsk) +{ + struct mm_struct *mm = tsk->mm; + struct task_struct *p; + + if (!mm) + return; + + /* + * There might be other threads/processes which are either not + * dying or even not killable. + */ + if (atomic_read(&mm->mm_users) > 1) { + rcu_read_lock(); + for_each_process(p) { + bool exiting; + + if (!process_shares_mm(p, mm)) + continue; + if (same_thread_group(p, tsk)) + continue; + if (fatal_signal_pending(p)) + continue; + + /* + * If the task is exiting make sure the whole thread group + * is exiting and cannot acces mm anymore. + */ + spin_lock_irq(&p->sighand->siglock); + exiting = signal_group_exit(p->signal); + spin_unlock_irq(&p->sighand->siglock); + if (exiting) + continue; + + /* Give up */ + rcu_read_unlock(); + return; + } + rcu_read_unlock(); + } + + wake_oom_reaper(tsk); +} + static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); @@ -652,24 +718,6 @@ void oom_killer_enable(void) oom_killer_disabled = false; } -/* - * task->mm can be NULL if the task is the exited group leader. So to - * determine whether the task is using a particular mm, we examine all the - * task's threads: if one of those is using this mm then this task was also - * using it. - */ -static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) -{ - struct task_struct *t; - - for_each_thread(p, t) { - struct mm_struct *t_mm = READ_ONCE(t->mm); - if (t_mm) - return t_mm == mm; - } - return false; -} - /* * Must be called while holding a reference to p, which will be released upon * returning. @@ -694,6 +742,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, task_lock(p); if (p->mm && task_will_free_mem(p)) { mark_oom_victim(p); + try_oom_reaper(p); task_unlock(p); put_task_struct(p); return; @@ -873,6 +922,7 @@ bool out_of_memory(struct oom_control *oc) if (current->mm && (fatal_signal_pending(current) || task_will_free_mem(current))) { mark_oom_victim(current); + try_oom_reaper(current); return true; } -- cgit v1.2.3 From 449d777d7ad6d7f9ac5ed8f618fa13e6ff36c32f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 19 May 2016 17:13:15 -0700 Subject: mm, oom_reaper: clear TIF_MEMDIE for all tasks queued for oom_reaper Right now the oom reaper will clear TIF_MEMDIE only for tasks which were successfully reaped. This is the safest option because we know that such an oom victim would only block forward progress of the oom killer without a good reason because it is highly unlikely it would release much more memory. Basically most of its memory has been already torn down. We can relax this assumption to catch more corner cases though. The first obvious one is when the oom victim clears its mm and gets stuck later on. oom_reaper would back of on find_lock_task_mm returning NULL. We can safely try to clear TIF_MEMDIE in this case because such a task would be ignored by the oom killer anyway. The flag would be cleared by that time already most of the time anyway. The less obvious one is when the oom reaper fails due to mmap_sem contention. Even if we clear TIF_MEMDIE for this task then it is not very likely that we would select another task too easily because we haven't reaped the last victim and so it would be still the #1 candidate. There is a rare race condition possible when the current victim terminates before the next select_bad_process but considering that oom_reap_task had retried several times before giving up then this sounds like a borderline thing. After this patch we should have a guarantee that the OOM killer will not be block for unbounded amount of time for most cases. Signed-off-by: Michal Hocko Cc: Raushaniya Maksudova Cc: Michael S. Tsirkin Cc: Paul E. McKenney Cc: David Rientjes Cc: Tetsuo Handa Cc: Daniel Vetter Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 850b6ff66bdf..415f7eb913fa 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -510,14 +510,10 @@ static bool __oom_reap_task(struct task_struct *tsk) up_read(&mm->mmap_sem); /* - * Clear TIF_MEMDIE because the task shouldn't be sitting on a - * reasonably reclaimable memory anymore. OOM killer can continue - * by selecting other victim if unmapping hasn't led to any - * improvements. This also means that selecting this task doesn't - * make any sense. + * This task can be safely ignored because we cannot do much more + * to release its memory. */ tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; - exit_oom_victim(tsk); out: mmput(mm); return ret; @@ -538,6 +534,15 @@ static void oom_reap_task(struct task_struct *tsk) debug_show_all_locks(); } + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore or it is not a good candidate + * for the oom victim right now because it cannot release its memory + * itself nor by the oom reaper. + */ + tsk->oom_reaper_list = NULL; + exit_oom_victim(tsk); + /* Drop a reference taken by wake_oom_reaper */ put_task_struct(tsk); } -- cgit v1.2.3 From d61f8590397480981f0d3ee7a2b38b5ea990db52 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:18 -0700 Subject: mm, page_alloc: only check PageCompound for high-order pages Another year, another round of page allocator optimisations focusing this time on the alloc and free fast paths. This should be of help to workloads that are allocator-intensive from kernel space where the cost of zeroing is not nceessraily incurred. The series is motivated by the observation that page alloc microbenchmarks on multiple machines regressed between 3.12.44 and 4.4. Second, there is discussions before LSF/MM considering the possibility of adding another page allocator which is potentially hazardous but a patch series improving performance is better than whining. After the series is applied, there are still hazards. In the free paths, the debugging checking and page zone/pageblock lookups dominate but there was not an obvious solution to that. In the alloc path, the major contributers are dealing with zonelists, new page preperation, the fair zone allocation and numerous statistic updates. The fair zone allocator is removed by the per-node LRU series if that gets merged so it's nor a major concern at the moment. On normal userspace benchmarks, there is little impact as the zeroing cost is significant but it's visible aim9 4.6.0-rc3 4.6.0-rc3 vanilla deferalloc-v3 Min page_test 828693.33 ( 0.00%) 887060.00 ( 7.04%) Min brk_test 4847266.67 ( 0.00%) 4966266.67 ( 2.45%) Min exec_test 1271.00 ( 0.00%) 1275.67 ( 0.37%) Min fork_test 12371.75 ( 0.00%) 12380.00 ( 0.07%) The overall impact on a page allocator microbenchmark for a range of orders and number of pages allocated in a batch is 4.6.0-rc3 4.6.0-rc3 vanilla deferalloc-v3r7 Min alloc-odr0-1 428.00 ( 0.00%) 316.00 ( 26.17%) Min alloc-odr0-2 314.00 ( 0.00%) 231.00 ( 26.43%) Min alloc-odr0-4 256.00 ( 0.00%) 192.00 ( 25.00%) Min alloc-odr0-8 222.00 ( 0.00%) 166.00 ( 25.23%) Min alloc-odr0-16 207.00 ( 0.00%) 154.00 ( 25.60%) Min alloc-odr0-32 197.00 ( 0.00%) 148.00 ( 24.87%) Min alloc-odr0-64 193.00 ( 0.00%) 144.00 ( 25.39%) Min alloc-odr0-128 191.00 ( 0.00%) 143.00 ( 25.13%) Min alloc-odr0-256 203.00 ( 0.00%) 153.00 ( 24.63%) Min alloc-odr0-512 212.00 ( 0.00%) 165.00 ( 22.17%) Min alloc-odr0-1024 221.00 ( 0.00%) 172.00 ( 22.17%) Min alloc-odr0-2048 225.00 ( 0.00%) 179.00 ( 20.44%) Min alloc-odr0-4096 232.00 ( 0.00%) 185.00 ( 20.26%) Min alloc-odr0-8192 235.00 ( 0.00%) 187.00 ( 20.43%) Min alloc-odr0-16384 236.00 ( 0.00%) 188.00 ( 20.34%) Min alloc-odr1-1 519.00 ( 0.00%) 450.00 ( 13.29%) Min alloc-odr1-2 391.00 ( 0.00%) 336.00 ( 14.07%) Min alloc-odr1-4 313.00 ( 0.00%) 268.00 ( 14.38%) Min alloc-odr1-8 277.00 ( 0.00%) 235.00 ( 15.16%) Min alloc-odr1-16 256.00 ( 0.00%) 218.00 ( 14.84%) Min alloc-odr1-32 252.00 ( 0.00%) 212.00 ( 15.87%) Min alloc-odr1-64 244.00 ( 0.00%) 206.00 ( 15.57%) Min alloc-odr1-128 244.00 ( 0.00%) 207.00 ( 15.16%) Min alloc-odr1-256 243.00 ( 0.00%) 207.00 ( 14.81%) Min alloc-odr1-512 245.00 ( 0.00%) 209.00 ( 14.69%) Min alloc-odr1-1024 248.00 ( 0.00%) 214.00 ( 13.71%) Min alloc-odr1-2048 253.00 ( 0.00%) 220.00 ( 13.04%) Min alloc-odr1-4096 258.00 ( 0.00%) 224.00 ( 13.18%) Min alloc-odr1-8192 261.00 ( 0.00%) 229.00 ( 12.26%) Min alloc-odr2-1 560.00 ( 0.00%) 753.00 (-34.46%) Min alloc-odr2-2 424.00 ( 0.00%) 351.00 ( 17.22%) Min alloc-odr2-4 339.00 ( 0.00%) 393.00 (-15.93%) Min alloc-odr2-8 298.00 ( 0.00%) 246.00 ( 17.45%) Min alloc-odr2-16 276.00 ( 0.00%) 227.00 ( 17.75%) Min alloc-odr2-32 271.00 ( 0.00%) 221.00 ( 18.45%) Min alloc-odr2-64 264.00 ( 0.00%) 217.00 ( 17.80%) Min alloc-odr2-128 264.00 ( 0.00%) 217.00 ( 17.80%) Min alloc-odr2-256 264.00 ( 0.00%) 218.00 ( 17.42%) Min alloc-odr2-512 269.00 ( 0.00%) 223.00 ( 17.10%) Min alloc-odr2-1024 279.00 ( 0.00%) 230.00 ( 17.56%) Min alloc-odr2-2048 283.00 ( 0.00%) 235.00 ( 16.96%) Min alloc-odr2-4096 285.00 ( 0.00%) 239.00 ( 16.14%) Min alloc-odr3-1 629.00 ( 0.00%) 505.00 ( 19.71%) Min alloc-odr3-2 472.00 ( 0.00%) 374.00 ( 20.76%) Min alloc-odr3-4 383.00 ( 0.00%) 301.00 ( 21.41%) Min alloc-odr3-8 341.00 ( 0.00%) 266.00 ( 21.99%) Min alloc-odr3-16 316.00 ( 0.00%) 248.00 ( 21.52%) Min alloc-odr3-32 308.00 ( 0.00%) 241.00 ( 21.75%) Min alloc-odr3-64 305.00 ( 0.00%) 241.00 ( 20.98%) Min alloc-odr3-128 308.00 ( 0.00%) 244.00 ( 20.78%) Min alloc-odr3-256 317.00 ( 0.00%) 249.00 ( 21.45%) Min alloc-odr3-512 327.00 ( 0.00%) 256.00 ( 21.71%) Min alloc-odr3-1024 331.00 ( 0.00%) 261.00 ( 21.15%) Min alloc-odr3-2048 333.00 ( 0.00%) 266.00 ( 20.12%) Min alloc-odr4-1 767.00 ( 0.00%) 572.00 ( 25.42%) Min alloc-odr4-2 578.00 ( 0.00%) 429.00 ( 25.78%) Min alloc-odr4-4 474.00 ( 0.00%) 346.00 ( 27.00%) Min alloc-odr4-8 422.00 ( 0.00%) 310.00 ( 26.54%) Min alloc-odr4-16 399.00 ( 0.00%) 295.00 ( 26.07%) Min alloc-odr4-32 392.00 ( 0.00%) 293.00 ( 25.26%) Min alloc-odr4-64 394.00 ( 0.00%) 293.00 ( 25.63%) Min alloc-odr4-128 405.00 ( 0.00%) 305.00 ( 24.69%) Min alloc-odr4-256 417.00 ( 0.00%) 319.00 ( 23.50%) Min alloc-odr4-512 425.00 ( 0.00%) 326.00 ( 23.29%) Min alloc-odr4-1024 426.00 ( 0.00%) 329.00 ( 22.77%) Min free-odr0-1 216.00 ( 0.00%) 178.00 ( 17.59%) Min free-odr0-2 152.00 ( 0.00%) 125.00 ( 17.76%) Min free-odr0-4 120.00 ( 0.00%) 99.00 ( 17.50%) Min free-odr0-8 106.00 ( 0.00%) 85.00 ( 19.81%) Min free-odr0-16 97.00 ( 0.00%) 80.00 ( 17.53%) Min free-odr0-32 92.00 ( 0.00%) 76.00 ( 17.39%) Min free-odr0-64 89.00 ( 0.00%) 74.00 ( 16.85%) Min free-odr0-128 89.00 ( 0.00%) 73.00 ( 17.98%) Min free-odr0-256 107.00 ( 0.00%) 90.00 ( 15.89%) Min free-odr0-512 117.00 ( 0.00%) 108.00 ( 7.69%) Min free-odr0-1024 125.00 ( 0.00%) 118.00 ( 5.60%) Min free-odr0-2048 132.00 ( 0.00%) 125.00 ( 5.30%) Min free-odr0-4096 135.00 ( 0.00%) 130.00 ( 3.70%) Min free-odr0-8192 137.00 ( 0.00%) 130.00 ( 5.11%) Min free-odr0-16384 137.00 ( 0.00%) 131.00 ( 4.38%) Min free-odr1-1 318.00 ( 0.00%) 289.00 ( 9.12%) Min free-odr1-2 228.00 ( 0.00%) 207.00 ( 9.21%) Min free-odr1-4 182.00 ( 0.00%) 165.00 ( 9.34%) Min free-odr1-8 163.00 ( 0.00%) 146.00 ( 10.43%) Min free-odr1-16 151.00 ( 0.00%) 135.00 ( 10.60%) Min free-odr1-32 146.00 ( 0.00%) 129.00 ( 11.64%) Min free-odr1-64 145.00 ( 0.00%) 130.00 ( 10.34%) Min free-odr1-128 148.00 ( 0.00%) 134.00 ( 9.46%) Min free-odr1-256 148.00 ( 0.00%) 137.00 ( 7.43%) Min free-odr1-512 151.00 ( 0.00%) 140.00 ( 7.28%) Min free-odr1-1024 154.00 ( 0.00%) 143.00 ( 7.14%) Min free-odr1-2048 156.00 ( 0.00%) 144.00 ( 7.69%) Min free-odr1-4096 156.00 ( 0.00%) 142.00 ( 8.97%) Min free-odr1-8192 156.00 ( 0.00%) 140.00 ( 10.26%) Min free-odr2-1 361.00 ( 0.00%) 457.00 (-26.59%) Min free-odr2-2 258.00 ( 0.00%) 224.00 ( 13.18%) Min free-odr2-4 208.00 ( 0.00%) 223.00 ( -7.21%) Min free-odr2-8 185.00 ( 0.00%) 160.00 ( 13.51%) Min free-odr2-16 173.00 ( 0.00%) 149.00 ( 13.87%) Min free-odr2-32 166.00 ( 0.00%) 145.00 ( 12.65%) Min free-odr2-64 166.00 ( 0.00%) 146.00 ( 12.05%) Min free-odr2-128 169.00 ( 0.00%) 148.00 ( 12.43%) Min free-odr2-256 170.00 ( 0.00%) 152.00 ( 10.59%) Min free-odr2-512 177.00 ( 0.00%) 156.00 ( 11.86%) Min free-odr2-1024 182.00 ( 0.00%) 162.00 ( 10.99%) Min free-odr2-2048 181.00 ( 0.00%) 160.00 ( 11.60%) Min free-odr2-4096 180.00 ( 0.00%) 159.00 ( 11.67%) Min free-odr3-1 431.00 ( 0.00%) 367.00 ( 14.85%) Min free-odr3-2 306.00 ( 0.00%) 259.00 ( 15.36%) Min free-odr3-4 249.00 ( 0.00%) 208.00 ( 16.47%) Min free-odr3-8 224.00 ( 0.00%) 186.00 ( 16.96%) Min free-odr3-16 208.00 ( 0.00%) 176.00 ( 15.38%) Min free-odr3-32 206.00 ( 0.00%) 174.00 ( 15.53%) Min free-odr3-64 210.00 ( 0.00%) 178.00 ( 15.24%) Min free-odr3-128 215.00 ( 0.00%) 182.00 ( 15.35%) Min free-odr3-256 224.00 ( 0.00%) 189.00 ( 15.62%) Min free-odr3-512 232.00 ( 0.00%) 195.00 ( 15.95%) Min free-odr3-1024 230.00 ( 0.00%) 195.00 ( 15.22%) Min free-odr3-2048 229.00 ( 0.00%) 193.00 ( 15.72%) Min free-odr4-1 561.00 ( 0.00%) 439.00 ( 21.75%) Min free-odr4-2 418.00 ( 0.00%) 318.00 ( 23.92%) Min free-odr4-4 339.00 ( 0.00%) 269.00 ( 20.65%) Min free-odr4-8 299.00 ( 0.00%) 239.00 ( 20.07%) Min free-odr4-16 289.00 ( 0.00%) 234.00 ( 19.03%) Min free-odr4-32 291.00 ( 0.00%) 235.00 ( 19.24%) Min free-odr4-64 298.00 ( 0.00%) 238.00 ( 20.13%) Min free-odr4-128 308.00 ( 0.00%) 251.00 ( 18.51%) Min free-odr4-256 321.00 ( 0.00%) 267.00 ( 16.82%) Min free-odr4-512 327.00 ( 0.00%) 269.00 ( 17.74%) Min free-odr4-1024 326.00 ( 0.00%) 271.00 ( 16.87%) Min total-odr0-1 644.00 ( 0.00%) 494.00 ( 23.29%) Min total-odr0-2 466.00 ( 0.00%) 356.00 ( 23.61%) Min total-odr0-4 376.00 ( 0.00%) 291.00 ( 22.61%) Min total-odr0-8 328.00 ( 0.00%) 251.00 ( 23.48%) Min total-odr0-16 304.00 ( 0.00%) 234.00 ( 23.03%) Min total-odr0-32 289.00 ( 0.00%) 224.00 ( 22.49%) Min total-odr0-64 282.00 ( 0.00%) 218.00 ( 22.70%) Min total-odr0-128 280.00 ( 0.00%) 216.00 ( 22.86%) Min total-odr0-256 310.00 ( 0.00%) 243.00 ( 21.61%) Min total-odr0-512 329.00 ( 0.00%) 273.00 ( 17.02%) Min total-odr0-1024 346.00 ( 0.00%) 290.00 ( 16.18%) Min total-odr0-2048 357.00 ( 0.00%) 304.00 ( 14.85%) Min total-odr0-4096 367.00 ( 0.00%) 315.00 ( 14.17%) Min total-odr0-8192 372.00 ( 0.00%) 317.00 ( 14.78%) Min total-odr0-16384 373.00 ( 0.00%) 319.00 ( 14.48%) Min total-odr1-1 838.00 ( 0.00%) 739.00 ( 11.81%) Min total-odr1-2 619.00 ( 0.00%) 543.00 ( 12.28%) Min total-odr1-4 495.00 ( 0.00%) 433.00 ( 12.53%) Min total-odr1-8 440.00 ( 0.00%) 382.00 ( 13.18%) Min total-odr1-16 407.00 ( 0.00%) 353.00 ( 13.27%) Min total-odr1-32 398.00 ( 0.00%) 341.00 ( 14.32%) Min total-odr1-64 389.00 ( 0.00%) 336.00 ( 13.62%) Min total-odr1-128 392.00 ( 0.00%) 341.00 ( 13.01%) Min total-odr1-256 391.00 ( 0.00%) 344.00 ( 12.02%) Min total-odr1-512 396.00 ( 0.00%) 349.00 ( 11.87%) Min total-odr1-1024 402.00 ( 0.00%) 357.00 ( 11.19%) Min total-odr1-2048 409.00 ( 0.00%) 364.00 ( 11.00%) Min total-odr1-4096 414.00 ( 0.00%) 366.00 ( 11.59%) Min total-odr1-8192 417.00 ( 0.00%) 369.00 ( 11.51%) Min total-odr2-1 921.00 ( 0.00%) 1210.00 (-31.38%) Min total-odr2-2 682.00 ( 0.00%) 576.00 ( 15.54%) Min total-odr2-4 547.00 ( 0.00%) 616.00 (-12.61%) Min total-odr2-8 483.00 ( 0.00%) 406.00 ( 15.94%) Min total-odr2-16 449.00 ( 0.00%) 376.00 ( 16.26%) Min total-odr2-32 437.00 ( 0.00%) 366.00 ( 16.25%) Min total-odr2-64 431.00 ( 0.00%) 363.00 ( 15.78%) Min total-odr2-128 433.00 ( 0.00%) 365.00 ( 15.70%) Min total-odr2-256 434.00 ( 0.00%) 371.00 ( 14.52%) Min total-odr2-512 446.00 ( 0.00%) 379.00 ( 15.02%) Min total-odr2-1024 461.00 ( 0.00%) 392.00 ( 14.97%) Min total-odr2-2048 464.00 ( 0.00%) 395.00 ( 14.87%) Min total-odr2-4096 465.00 ( 0.00%) 398.00 ( 14.41%) Min total-odr3-1 1060.00 ( 0.00%) 872.00 ( 17.74%) Min total-odr3-2 778.00 ( 0.00%) 633.00 ( 18.64%) Min total-odr3-4 632.00 ( 0.00%) 510.00 ( 19.30%) Min total-odr3-8 565.00 ( 0.00%) 452.00 ( 20.00%) Min total-odr3-16 524.00 ( 0.00%) 424.00 ( 19.08%) Min total-odr3-32 514.00 ( 0.00%) 415.00 ( 19.26%) Min total-odr3-64 515.00 ( 0.00%) 419.00 ( 18.64%) Min total-odr3-128 523.00 ( 0.00%) 426.00 ( 18.55%) Min total-odr3-256 541.00 ( 0.00%) 438.00 ( 19.04%) Min total-odr3-512 559.00 ( 0.00%) 451.00 ( 19.32%) Min total-odr3-1024 561.00 ( 0.00%) 456.00 ( 18.72%) Min total-odr3-2048 562.00 ( 0.00%) 459.00 ( 18.33%) Min total-odr4-1 1328.00 ( 0.00%) 1011.00 ( 23.87%) Min total-odr4-2 997.00 ( 0.00%) 747.00 ( 25.08%) Min total-odr4-4 813.00 ( 0.00%) 615.00 ( 24.35%) Min total-odr4-8 721.00 ( 0.00%) 550.00 ( 23.72%) Min total-odr4-16 689.00 ( 0.00%) 529.00 ( 23.22%) Min total-odr4-32 683.00 ( 0.00%) 528.00 ( 22.69%) Min total-odr4-64 692.00 ( 0.00%) 531.00 ( 23.27%) Min total-odr4-128 713.00 ( 0.00%) 556.00 ( 22.02%) Min total-odr4-256 738.00 ( 0.00%) 586.00 ( 20.60%) Min total-odr4-512 753.00 ( 0.00%) 595.00 ( 20.98%) Min total-odr4-1024 752.00 ( 0.00%) 600.00 ( 20.21%) This patch (of 27): order-0 pages by definition cannot be compound so avoid the check in the fast path for those pages. [akpm@linux-foundation.org: use unlikely(order) in free_pages_prepare(), per Vlastimil] Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6d1c8b06b458..087ba3e417ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1024,24 +1024,33 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) static bool free_pages_prepare(struct page *page, unsigned int order) { - bool compound = PageCompound(page); - int i, bad = 0; + int bad = 0; VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); kasan_free_pages(page, order); + /* + * Check tail pages before head page information is cleared to + * avoid checking PageCompound for order-0 pages. + */ + if (unlikely(order)) { + bool compound = PageCompound(page); + int i; + + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); + bad += free_pages_check(page + i); + } + } if (PageAnon(page)) page->mapping = NULL; bad += free_pages_check(page); - for (i = 1; i < (1 << order); i++) { - if (compound) - bad += free_tail_pages_check(page, page + i); - bad += free_pages_check(page + i); - } if (bad) return false; -- cgit v1.2.3 From 175145748d00794369317070dd19ce12dd816241 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:21 -0700 Subject: mm, page_alloc: use new PageAnonHead helper in the free page fast path The PageAnon check always checks for compound_head but this is a relatively expensive check if the caller already knows the page is a head page. This patch creates a helper and uses it in the page free path which only operates on head pages. With this patch and "Only check PageCompound for high-order pages", the performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 vanilla nocompound-v1r20 Min alloc-odr0-1 425.00 ( 0.00%) 417.00 ( 1.88%) Min alloc-odr0-2 313.00 ( 0.00%) 308.00 ( 1.60%) Min alloc-odr0-4 257.00 ( 0.00%) 253.00 ( 1.56%) Min alloc-odr0-8 224.00 ( 0.00%) 221.00 ( 1.34%) Min alloc-odr0-16 208.00 ( 0.00%) 205.00 ( 1.44%) Min alloc-odr0-32 199.00 ( 0.00%) 199.00 ( 0.00%) Min alloc-odr0-64 195.00 ( 0.00%) 193.00 ( 1.03%) Min alloc-odr0-128 192.00 ( 0.00%) 191.00 ( 0.52%) Min alloc-odr0-256 204.00 ( 0.00%) 200.00 ( 1.96%) Min alloc-odr0-512 213.00 ( 0.00%) 212.00 ( 0.47%) Min alloc-odr0-1024 219.00 ( 0.00%) 219.00 ( 0.00%) Min alloc-odr0-2048 225.00 ( 0.00%) 225.00 ( 0.00%) Min alloc-odr0-4096 230.00 ( 0.00%) 231.00 ( -0.43%) Min alloc-odr0-8192 235.00 ( 0.00%) 234.00 ( 0.43%) Min alloc-odr0-16384 235.00 ( 0.00%) 234.00 ( 0.43%) Min free-odr0-1 215.00 ( 0.00%) 191.00 ( 11.16%) Min free-odr0-2 152.00 ( 0.00%) 136.00 ( 10.53%) Min free-odr0-4 119.00 ( 0.00%) 107.00 ( 10.08%) Min free-odr0-8 106.00 ( 0.00%) 96.00 ( 9.43%) Min free-odr0-16 97.00 ( 0.00%) 87.00 ( 10.31%) Min free-odr0-32 91.00 ( 0.00%) 83.00 ( 8.79%) Min free-odr0-64 89.00 ( 0.00%) 81.00 ( 8.99%) Min free-odr0-128 88.00 ( 0.00%) 80.00 ( 9.09%) Min free-odr0-256 106.00 ( 0.00%) 95.00 ( 10.38%) Min free-odr0-512 116.00 ( 0.00%) 111.00 ( 4.31%) Min free-odr0-1024 125.00 ( 0.00%) 118.00 ( 5.60%) Min free-odr0-2048 133.00 ( 0.00%) 126.00 ( 5.26%) Min free-odr0-4096 136.00 ( 0.00%) 130.00 ( 4.41%) Min free-odr0-8192 138.00 ( 0.00%) 130.00 ( 5.80%) Min free-odr0-16384 137.00 ( 0.00%) 130.00 ( 5.11%) There is a sizable boost to the free allocator performance. While there is an apparent boost on the allocation side, it's likely a co-incidence or due to the patches slightly reducing cache footprint. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 7 ++++++- mm/page_alloc.c | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 6b052aa7b5b7..a61e06e5fbce 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY) #define PAGE_MAPPING_KSM 2 #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM) +static __always_inline int PageAnonHead(struct page *page) +{ + return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; +} + static __always_inline int PageAnon(struct page *page) { page = compound_head(page); - return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; + return PageAnonHead(page); } #ifdef CONFIG_KSM diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 087ba3e417ec..7be1ce8b6be0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1048,7 +1048,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) bad += free_pages_check(page + i); } } - if (PageAnon(page)) + if (PageAnonHead(page)) page->mapping = NULL; bad += free_pages_check(page); if (bad) -- cgit v1.2.3 From b9f00e147f27d86691f7f52a3c8126d25432477c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:24 -0700 Subject: mm, page_alloc: reduce branches in zone_statistics zone_statistics has more branches than it really needs to take an unlikely GFP flag into account. Reduce the number and annotate the unlikely flag. The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 nocompound-v1r10 statbranch-v1r10 Min alloc-odr0-1 417.00 ( 0.00%) 419.00 ( -0.48%) Min alloc-odr0-2 308.00 ( 0.00%) 305.00 ( 0.97%) Min alloc-odr0-4 253.00 ( 0.00%) 250.00 ( 1.19%) Min alloc-odr0-8 221.00 ( 0.00%) 219.00 ( 0.90%) Min alloc-odr0-16 205.00 ( 0.00%) 203.00 ( 0.98%) Min alloc-odr0-32 199.00 ( 0.00%) 195.00 ( 2.01%) Min alloc-odr0-64 193.00 ( 0.00%) 191.00 ( 1.04%) Min alloc-odr0-128 191.00 ( 0.00%) 189.00 ( 1.05%) Min alloc-odr0-256 200.00 ( 0.00%) 198.00 ( 1.00%) Min alloc-odr0-512 212.00 ( 0.00%) 210.00 ( 0.94%) Min alloc-odr0-1024 219.00 ( 0.00%) 216.00 ( 1.37%) Min alloc-odr0-2048 225.00 ( 0.00%) 221.00 ( 1.78%) Min alloc-odr0-4096 231.00 ( 0.00%) 227.00 ( 1.73%) Min alloc-odr0-8192 234.00 ( 0.00%) 232.00 ( 0.85%) Min alloc-odr0-16384 234.00 ( 0.00%) 232.00 ( 0.85%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index c831be32a1a3..d585de27e960 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -581,17 +581,21 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) */ void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) { - if (z->zone_pgdat == preferred_zone->zone_pgdat) { + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); } else { __inc_zone_state(z, NUMA_MISS); __inc_zone_state(preferred_zone, NUMA_FOREIGN); } - if (z->node == ((flags & __GFP_OTHER_NODE) ? - preferred_zone->node : numa_node_id())) - __inc_zone_state(z, NUMA_LOCAL); - else - __inc_zone_state(z, NUMA_OTHER); } /* -- cgit v1.2.3 From 060e74173f292fb3e0398b3dca8765568d195ff1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:27 -0700 Subject: mm, page_alloc: inline zone_statistics zone_statistics has one call-site but it's a public function. Make it static and inline. The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 statbranch-v1r20 statinline-v1r20 Min alloc-odr0-1 419.00 ( 0.00%) 412.00 ( 1.67%) Min alloc-odr0-2 305.00 ( 0.00%) 301.00 ( 1.31%) Min alloc-odr0-4 250.00 ( 0.00%) 247.00 ( 1.20%) Min alloc-odr0-8 219.00 ( 0.00%) 215.00 ( 1.83%) Min alloc-odr0-16 203.00 ( 0.00%) 199.00 ( 1.97%) Min alloc-odr0-32 195.00 ( 0.00%) 191.00 ( 2.05%) Min alloc-odr0-64 191.00 ( 0.00%) 187.00 ( 2.09%) Min alloc-odr0-128 189.00 ( 0.00%) 185.00 ( 2.12%) Min alloc-odr0-256 198.00 ( 0.00%) 193.00 ( 2.53%) Min alloc-odr0-512 210.00 ( 0.00%) 207.00 ( 1.43%) Min alloc-odr0-1024 216.00 ( 0.00%) 213.00 ( 1.39%) Min alloc-odr0-2048 221.00 ( 0.00%) 220.00 ( 0.45%) Min alloc-odr0-4096 227.00 ( 0.00%) 226.00 ( 0.44%) Min alloc-odr0-8192 232.00 ( 0.00%) 229.00 ( 1.29%) Min alloc-odr0-16384 232.00 ( 0.00%) 229.00 ( 1.29%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 2 -- mm/page_alloc.c | 31 +++++++++++++++++++++++++++++++ mm/vmstat.c | 29 ----------------------------- 3 files changed, 31 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 02fce415b3d9..d2da8e053210 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, #ifdef CONFIG_NUMA extern unsigned long node_page_state(int node, enum zone_stat_item item); -extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #else #define node_page_state(node, item) global_page_state(item) -#define zone_statistics(_zl, _z, gfp) do { } while (0) #endif /* CONFIG_NUMA */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7be1ce8b6be0..36384baa74e1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2354,6 +2354,37 @@ int split_free_page(struct page *page) return nr_pages; } +/* + * Update NUMA hit/miss statistics + * + * Must be called with interrupts disabled. + * + * When __GFP_OTHER_NODE is set assume the node of the preferred + * zone is the local node. This is useful for daemons who allocate + * memory on behalf of other processes. + */ +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, + gfp_t flags) +{ +#ifdef CONFIG_NUMA + int local_nid = numa_node_id(); + enum zone_stat_item local_stat = NUMA_LOCAL; + + if (unlikely(flags & __GFP_OTHER_NODE)) { + local_stat = NUMA_OTHER; + local_nid = preferred_zone->node; + } + + if (z->node == local_nid) { + __inc_zone_state(z, NUMA_HIT); + __inc_zone_state(z, local_stat); + } else { + __inc_zone_state(z, NUMA_MISS); + __inc_zone_state(preferred_zone, NUMA_FOREIGN); + } +#endif +} + /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ diff --git a/mm/vmstat.c b/mm/vmstat.c index d585de27e960..f1a73bfb77b5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -569,35 +569,6 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) #endif #ifdef CONFIG_NUMA -/* - * zonelist = the list of zones passed to the allocator - * z = the zone from which the allocation occurred. - * - * Must be called with interrupts disabled. - * - * When __GFP_OTHER_NODE is set assume the node of the preferred - * zone is the local node. This is useful for daemons who allocate - * memory on behalf of other processes. - */ -void zone_statistics(struct zone *preferred_zone, struct zone *z, gfp_t flags) -{ - int local_nid = numa_node_id(); - enum zone_stat_item local_stat = NUMA_LOCAL; - - if (unlikely(flags & __GFP_OTHER_NODE)) { - local_stat = NUMA_OTHER; - local_nid = preferred_zone->node; - } - - if (z->node == local_nid) { - __inc_zone_state(z, NUMA_HIT); - __inc_zone_state(z, local_stat); - } else { - __inc_zone_state(z, NUMA_MISS); - __inc_zone_state(preferred_zone, NUMA_FOREIGN); - } -} - /* * Determine the per node value of a stat item. */ -- cgit v1.2.3 From 682a3385e7734fa3abbd504cbeb5fe91793f1827 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:30 -0700 Subject: mm, page_alloc: inline the fast path of the zonelist iterator The page allocator iterates through a zonelist for zones that match the addressing limitations and nodemask of the caller but many allocations will not be restricted. Despite this, there is always functional call overhead which builds up. This patch inlines the optimistic basic case and only calls the iterator function for the complex case. A hindrance was the fact that cpuset_current_mems_allowed is used in the fastpath as the allowed nodemask even though all nodes are allowed on most systems. The patch handles this by only considering cpuset_current_mems_allowed if a cpuset exists. As well as being faster in the fast-path, this removes some junk in the slowpath. The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 statinline-v1r20 optiter-v1r20 Min alloc-odr0-1 412.00 ( 0.00%) 382.00 ( 7.28%) Min alloc-odr0-2 301.00 ( 0.00%) 282.00 ( 6.31%) Min alloc-odr0-4 247.00 ( 0.00%) 233.00 ( 5.67%) Min alloc-odr0-8 215.00 ( 0.00%) 203.00 ( 5.58%) Min alloc-odr0-16 199.00 ( 0.00%) 188.00 ( 5.53%) Min alloc-odr0-32 191.00 ( 0.00%) 182.00 ( 4.71%) Min alloc-odr0-64 187.00 ( 0.00%) 177.00 ( 5.35%) Min alloc-odr0-128 185.00 ( 0.00%) 175.00 ( 5.41%) Min alloc-odr0-256 193.00 ( 0.00%) 184.00 ( 4.66%) Min alloc-odr0-512 207.00 ( 0.00%) 197.00 ( 4.83%) Min alloc-odr0-1024 213.00 ( 0.00%) 203.00 ( 4.69%) Min alloc-odr0-2048 220.00 ( 0.00%) 209.00 ( 5.00%) Min alloc-odr0-4096 226.00 ( 0.00%) 214.00 ( 5.31%) Min alloc-odr0-8192 229.00 ( 0.00%) 218.00 ( 4.80%) Min alloc-odr0-16384 229.00 ( 0.00%) 219.00 ( 4.37%) perf indicated that next_zones_zonelist disappeared in the profile and __next_zones_zonelist did not appear. This is expected as the micro-benchmark would hit the inlined fast-path every time. Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 13 +++++++++++-- mm/mmzone.c | 2 +- mm/page_alloc.c | 26 +++++++++----------------- 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 150c6049f961..cfcd7723edb6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -919,6 +919,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) #endif /* CONFIG_NUMA */ } +struct zoneref *__next_zones_zonelist(struct zoneref *z, + enum zone_type highest_zoneidx, + nodemask_t *nodes); + /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z - The cursor used as a starting point for the search @@ -931,9 +935,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref) * being examined. It should be advanced by one before calling * next_zones_zonelist again. */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, - nodemask_t *nodes); + nodemask_t *nodes) +{ + if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) + return z; + return __next_zones_zonelist(z, highest_zoneidx, nodes); +} /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist diff --git a/mm/mmzone.c b/mm/mmzone.c index 52687fb4de6f..5652be858e5e 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -52,7 +52,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) } /* Returns the next zone at or below highest_zoneidx in a zonelist */ -struct zoneref *next_zones_zonelist(struct zoneref *z, +struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 36384baa74e1..789e5f065e8d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3192,17 +3192,6 @@ retry: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); - /* - * Find the true preferred zone if the allocation is unconstrained by - * cpusets. - */ - if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) { - struct zoneref *preferred_zoneref; - preferred_zoneref = first_zones_zonelist(ac->zonelist, - ac->high_zoneidx, NULL, &ac->preferred_zone); - ac->classzone_idx = zonelist_zone_idx(preferred_zoneref); - } - /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -3358,14 +3347,21 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), + .zonelist = zonelist, .nodemask = nodemask, .migratetype = gfpflags_to_migratetype(gfp_mask), }; + if (cpusets_enabled()) { + alloc_flags |= ALLOC_CPUSET; + if (!ac.nodemask) + ac.nodemask = &cpuset_current_mems_allowed; + } + gfp_mask &= gfp_allowed_mask; lockdep_trace_alloc(gfp_mask); @@ -3389,16 +3385,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); - /* We set it here, as __alloc_pages_slowpath might have changed it */ - ac.zonelist = zonelist; - /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask ? : &cpuset_current_mems_allowed, - &ac.preferred_zone); + ac.nodemask, &ac.preferred_zone); if (!ac.preferred_zone) goto out; ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); -- cgit v1.2.3 From 754078eb45df8069f389f3371002e7e24962e1a2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:33 -0700 Subject: mm, page_alloc: use __dec_zone_state for order-0 page allocation __dec_zone_state is cheaper to use for removing an order-0 page as it has fewer conditions to check. The performance difference on a page allocator microbenchmark is; 4.6.0-rc2 4.6.0-rc2 optiter-v1r20 decstat-v1r20 Min alloc-odr0-1 382.00 ( 0.00%) 381.00 ( 0.26%) Min alloc-odr0-2 282.00 ( 0.00%) 275.00 ( 2.48%) Min alloc-odr0-4 233.00 ( 0.00%) 229.00 ( 1.72%) Min alloc-odr0-8 203.00 ( 0.00%) 199.00 ( 1.97%) Min alloc-odr0-16 188.00 ( 0.00%) 186.00 ( 1.06%) Min alloc-odr0-32 182.00 ( 0.00%) 179.00 ( 1.65%) Min alloc-odr0-64 177.00 ( 0.00%) 174.00 ( 1.69%) Min alloc-odr0-128 175.00 ( 0.00%) 172.00 ( 1.71%) Min alloc-odr0-256 184.00 ( 0.00%) 181.00 ( 1.63%) Min alloc-odr0-512 197.00 ( 0.00%) 193.00 ( 2.03%) Min alloc-odr0-1024 203.00 ( 0.00%) 201.00 ( 0.99%) Min alloc-odr0-2048 209.00 ( 0.00%) 206.00 ( 1.44%) Min alloc-odr0-4096 214.00 ( 0.00%) 212.00 ( 0.93%) Min alloc-odr0-8192 218.00 ( 0.00%) 215.00 ( 1.38%) Min alloc-odr0-16384 219.00 ( 0.00%) 216.00 ( 1.37%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 789e5f065e8d..8f6b6eab074f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2417,6 +2417,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, else page = list_first_entry(list, struct page, lru); + __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); pcp->count--; } else { @@ -2438,11 +2439,11 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); __mod_zone_freepage_state(zone, -(1 << order), get_pcppage_migratetype(page)); } - __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); if (atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH]) <= 0 && !test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) set_bit(ZONE_FAIR_DEPLETED, &zone->flags); -- cgit v1.2.3 From f75fb889d18d362e336f8d3fba158a8636d0a063 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:36 -0700 Subject: mm, page_alloc: avoid unnecessary zone lookups during pageblock operations Pageblocks have an associated bitmap to store migrate types and whether the pageblock should be skipped during compaction. The bitmap may be associated with a memory section or a zone but the zone is looked up unconditionally. The compiler should optimise this away automatically so this is a cosmetic patch only in many cases. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f6b6eab074f..7f328cfb137d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6767,23 +6767,23 @@ void *__init alloc_large_system_hash(const char *tablename, } /* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct zone *zone, +static inline unsigned long *get_pageblock_bitmap(struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM return __pfn_to_section(pfn)->pageblock_flags; #else - return zone->pageblock_flags; + return page_zone(page)->pageblock_flags; #endif /* CONFIG_SPARSEMEM */ } -static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) { #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #else - pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; #endif /* CONFIG_SPARSEMEM */ } @@ -6801,14 +6801,12 @@ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { - struct zone *zone; unsigned long *bitmap; unsigned long bitidx, word_bitidx; unsigned long word; - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); @@ -6830,20 +6828,18 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, unsigned long end_bitidx, unsigned long mask) { - struct zone *zone; unsigned long *bitmap; unsigned long bitidx, word_bitidx; unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - zone = page_zone(page); - bitmap = get_pageblock_bitmap(zone, pfn); - bitidx = pfn_to_bitidx(zone, pfn); + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); word_bitidx = bitidx / BITS_PER_LONG; bitidx &= (BITS_PER_LONG-1); - VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); bitidx += end_bitidx; mask <<= (BITS_PER_LONG - bitidx - 1); -- cgit v1.2.3 From c603844bdcb5238980de8d58b393f52d7729d651 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:38 -0700 Subject: mm, page_alloc: convert alloc_flags to unsigned alloc_flags is a bitmask of flags but it is signed which does not necessarily generate the best code depending on the compiler. Even without an impact, it makes more sense that this be unsigned. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 6 +++--- include/linux/mmzone.h | 3 ++- mm/compaction.c | 12 +++++++----- mm/internal.h | 2 +- mm/page_alloc.c | 26 ++++++++++++++------------ 5 files changed, 27 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index d7c8de583a23..242b660f64e6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended); + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx); + unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); extern bool compaction_deferred(struct zone *zone, int order); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cfcd7723edb6..327f0fa1e1ce 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -747,7 +747,8 @@ extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags); + unsigned long mark, int classzone_idx, + unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx); enum memmap_context { diff --git a/mm/compaction.c b/mm/compaction.c index 7487067b4613..8f339ca25621 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1313,7 +1313,8 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_CONTINUE - If compaction should run now */ static unsigned long __compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { int fragindex; unsigned long watermark; @@ -1358,7 +1359,8 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, } unsigned long compaction_suitable(struct zone *zone, int order, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, + int classzone_idx) { unsigned long ret; @@ -1530,7 +1532,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, - int alloc_flags, int classzone_idx) + unsigned int alloc_flags, int classzone_idx) { unsigned long ret; struct compact_control cc = { @@ -1571,8 +1573,8 @@ int sysctl_extfrag_threshold = 500; * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended) + unsigned int alloc_flags, const struct alloc_context *ac, + enum migrate_mode mode, int *contended) { int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; diff --git a/mm/internal.h b/mm/internal.h index 098a89e3b97c..114593aab55c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -175,7 +175,7 @@ struct compact_control { bool direct_compaction; /* False from kcompactd or /proc/... */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const int alloc_flags; /* alloc flags of a direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ const int classzone_idx; /* zone index of a direct compactor */ struct zone *zone; int contended; /* Signal need_sched() or lock diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7f328cfb137d..094587a4ed81 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1525,7 +1525,7 @@ static inline bool free_pages_prezeroed(bool poisoned) } static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - int alloc_flags) + unsigned int alloc_flags) { int i; bool poisoned = true; @@ -2391,7 +2391,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, static inline struct page *buffered_rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int alloc_flags, int migratetype) + gfp_t gfp_flags, unsigned int alloc_flags, + int migratetype) { unsigned long flags; struct page *page; @@ -2545,12 +2546,13 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * to check in the allocation paths if no pages are free. */ static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, int alloc_flags, + unsigned long mark, int classzone_idx, + unsigned int alloc_flags, long free_pages) { long min = mark; int o; - const int alloc_harder = (alloc_flags & ALLOC_HARDER); + const bool alloc_harder = (alloc_flags & ALLOC_HARDER); /* free_pages may go negative - that's OK */ free_pages -= (1 << order) - 1; @@ -2613,7 +2615,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, } bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, - int classzone_idx, int alloc_flags) + int classzone_idx, unsigned int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); @@ -2957,7 +2959,7 @@ out: /* Try memory compaction for high-order allocations before reclaim */ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3013,7 +3015,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { @@ -3053,7 +3055,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, - int alloc_flags, const struct alloc_context *ac, + unsigned int alloc_flags, const struct alloc_context *ac, unsigned long *did_some_progress) { struct page *page = NULL; @@ -3092,10 +3094,10 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); } -static inline int +static inline unsigned int gfp_to_alloc_flags(gfp_t gfp_mask) { - int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; + unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); @@ -3156,7 +3158,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; struct page *page = NULL; - int alloc_flags; + unsigned int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; @@ -3348,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zoneref *preferred_zoneref; struct page *page = NULL; unsigned int cpuset_mems_cookie; - int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; + unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), -- cgit v1.2.3 From fa379b9586c7507c607d031dadf3681ed29614a9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:41 -0700 Subject: mm, page_alloc: convert nr_fair_skipped to bool The number of zones skipped to a zone expiring its fair zone allocation quota is irrelevant. Convert to bool. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 094587a4ed81..41d20bf66ac8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2680,7 +2680,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, struct zoneref *z; struct page *page = NULL; struct zone *zone; - int nr_fair_skipped = 0; + bool fair_skipped; bool zonelist_rescan; zonelist_scan: @@ -2708,7 +2708,7 @@ zonelist_scan: if (!zone_local(ac->preferred_zone, zone)) break; if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { - nr_fair_skipped++; + fair_skipped = true; continue; } } @@ -2801,7 +2801,7 @@ try_this_zone: */ if (alloc_flags & ALLOC_FAIR) { alloc_flags &= ~ALLOC_FAIR; - if (nr_fair_skipped) { + if (fair_skipped) { zonelist_rescan = true; reset_alloc_batches(ac->preferred_zone); } -- cgit v1.2.3 From 4dfa6cd8fdb1682586d463bc34888980fe98eb46 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:44 -0700 Subject: mm, page_alloc: remove unnecessary local variable in get_page_from_freelist zonelist here is a copy of a struct field that is used once. Ditch it. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 41d20bf66ac8..1d705803fc66 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2676,7 +2676,6 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zonelist *zonelist = ac->zonelist; struct zoneref *z; struct page *page = NULL; struct zone *zone; @@ -2690,7 +2689,7 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { unsigned long mark; -- cgit v1.2.3 From be06af002f6d50de10fd602ce3a6aa5d28e88d38 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:47 -0700 Subject: mm, page_alloc: remove unnecessary initialisation in get_page_from_freelist Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1d705803fc66..1096ac8f5ed1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2677,7 +2677,6 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { struct zoneref *z; - struct page *page = NULL; struct zone *zone; bool fair_skipped; bool zonelist_rescan; @@ -2691,6 +2690,7 @@ zonelist_scan: */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { + struct page *page; unsigned long mark; if (cpusets_enabled() && -- cgit v1.2.3 From 5bb1b169757875a72e05bfcbb76e22602cb1a760 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:50 -0700 Subject: mm, page_alloc: remove unnecessary initialisation from __alloc_pages_nodemask() page is guaranteed to be set before it is read with or without the initialisation. [akpm@linux-foundation.org: fix warning] Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1096ac8f5ed1..f9ca6cc553c7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3347,7 +3347,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { struct zoneref *preferred_zoneref; - struct page *page = NULL; + struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ @@ -3393,8 +3393,11 @@ retry_cpuset: /* The preferred zone is used for statistics later */ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask, &ac.preferred_zone); - if (!ac.preferred_zone) + if (!ac.preferred_zone) { + page = NULL; goto out; + } + ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ -- cgit v1.2.3 From 83d4ca8148fd9092715fd8ef75b30bbfd67fd2a9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:56 -0700 Subject: mm, page_alloc: move __GFP_HARDWALL modifications out of the fastpath __GFP_HARDWALL only has meaning in the context of cpusets but the fast path always applies the flag on the first attempt. Move the manipulations into the cpuset paths where they will be masked by a static branch in the common case. With the other micro-optimisations in this series combined, the impact on a page allocator microbenchmark is 4.6.0-rc2 4.6.0-rc2 decstat-v1r20 micro-v1r20 Min alloc-odr0-1 381.00 ( 0.00%) 377.00 ( 1.05%) Min alloc-odr0-2 275.00 ( 0.00%) 273.00 ( 0.73%) Min alloc-odr0-4 229.00 ( 0.00%) 226.00 ( 1.31%) Min alloc-odr0-8 199.00 ( 0.00%) 196.00 ( 1.51%) Min alloc-odr0-16 186.00 ( 0.00%) 183.00 ( 1.61%) Min alloc-odr0-32 179.00 ( 0.00%) 175.00 ( 2.23%) Min alloc-odr0-64 174.00 ( 0.00%) 172.00 ( 1.15%) Min alloc-odr0-128 172.00 ( 0.00%) 170.00 ( 1.16%) Min alloc-odr0-256 181.00 ( 0.00%) 183.00 ( -1.10%) Min alloc-odr0-512 193.00 ( 0.00%) 191.00 ( 1.04%) Min alloc-odr0-1024 201.00 ( 0.00%) 199.00 ( 1.00%) Min alloc-odr0-2048 206.00 ( 0.00%) 204.00 ( 0.97%) Min alloc-odr0-4096 212.00 ( 0.00%) 210.00 ( 0.94%) Min alloc-odr0-8192 215.00 ( 0.00%) 213.00 ( 0.93%) Min alloc-odr0-16384 216.00 ( 0.00%) 214.00 ( 0.93%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9ca6cc553c7..48afc1a42bbd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3350,7 +3350,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; - gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */ + gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */ struct alloc_context ac = { .high_zoneidx = gfp_zone(gfp_mask), .zonelist = zonelist, @@ -3359,6 +3359,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, }; if (cpusets_enabled()) { + alloc_mask |= __GFP_HARDWALL; alloc_flags |= ALLOC_CPUSET; if (!ac.nodemask) ac.nodemask = &cpuset_current_mems_allowed; @@ -3401,7 +3402,6 @@ retry_cpuset: ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ - alloc_mask = gfp_mask|__GFP_HARDWALL; page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (unlikely(!page)) { /* @@ -3427,8 +3427,10 @@ out: * the mask is being updated. If a page allocation is about to fail, * check if the cpuset changed during allocation and if so, retry. */ - if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) + if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) { + alloc_mask = gfp_mask; goto retry_cpuset; + } return page; } -- cgit v1.2.3 From 3777999dd47ec00ec34a151b1d93c0a2b721e822 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:13:58 -0700 Subject: mm, page_alloc: check once if a zone has isolated pageblocks When bulk freeing pages from the per-cpu lists the zone is checked for isolated pageblocks on every release. This patch checks it once per drain. [mgorman@techsingularity.net: fix locking radce, per Vlastimil] Signed-off-by: Mel Gorman Signed-off-by: Vlastimil Babka Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48afc1a42bbd..a3b7eb86f912 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -831,8 +831,10 @@ static void free_pcppages_bulk(struct zone *zone, int count, int batch_free = 0; int to_free = count; unsigned long nr_scanned; + bool isolated_pageblocks; spin_lock(&zone->lock); + isolated_pageblocks = has_isolate_pageblock(zone); nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); @@ -870,7 +872,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ - if (unlikely(has_isolate_pageblock(zone))) + if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); __free_one_page(page, page_to_pfn(page), zone, 0, mt); -- cgit v1.2.3 From 4fcb0971175f6037590d7b7772fe6619261d2165 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:01 -0700 Subject: mm, page_alloc: shorten the page allocator fast path The page allocator fast path checks page multiple times unnecessarily. This patch avoids all the slowpath checks if the first allocation attempt succeeds. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a3b7eb86f912..8380011d77db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3398,31 +3398,26 @@ retry_cpuset: ac.nodemask, &ac.preferred_zone); if (!ac.preferred_zone) { page = NULL; - goto out; + goto no_zone; } ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); - if (unlikely(!page)) { - /* - * Runtime PM, block IO and its error handling path - * can deadlock because I/O on the device might not - * complete. - */ - alloc_mask = memalloc_noio_flags(gfp_mask); - ac.spread_dirty_pages = false; - - page = __alloc_pages_slowpath(alloc_mask, order, &ac); - } + if (likely(page)) + goto out; - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); + /* + * Runtime PM, block IO and its error handling path can deadlock + * because I/O on the device might not complete. + */ + alloc_mask = memalloc_noio_flags(gfp_mask); + ac.spread_dirty_pages = false; - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + page = __alloc_pages_slowpath(alloc_mask, order, &ac); -out: +no_zone: /* * When updating a task's mems_allowed, it is possible to race with * parallel threads in such a way that an allocation can fail while @@ -3434,6 +3429,12 @@ out: goto retry_cpuset; } +out: + if (kmemcheck_enabled && page) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); + + trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); + return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); -- cgit v1.2.3 From 305347550becd08fdb576df32fc0767842ed71a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:04 -0700 Subject: mm, page_alloc: reduce cost of fair zone allocation policy retry The fair zone allocation policy is not without cost but it can be reduced slightly. This patch removes an unnecessary local variable, checks the likely conditions of the fair zone policy first, uses a bool instead of a flags check and falls through when a remote node is encountered instead of doing a full restart. The benefit is marginal but it's there 4.6.0-rc2 4.6.0-rc2 decstat-v1r20 optfair-v1r20 Min alloc-odr0-1 377.00 ( 0.00%) 380.00 ( -0.80%) Min alloc-odr0-2 273.00 ( 0.00%) 273.00 ( 0.00%) Min alloc-odr0-4 226.00 ( 0.00%) 227.00 ( -0.44%) Min alloc-odr0-8 196.00 ( 0.00%) 196.00 ( 0.00%) Min alloc-odr0-16 183.00 ( 0.00%) 183.00 ( 0.00%) Min alloc-odr0-32 175.00 ( 0.00%) 173.00 ( 1.14%) Min alloc-odr0-64 172.00 ( 0.00%) 169.00 ( 1.74%) Min alloc-odr0-128 170.00 ( 0.00%) 169.00 ( 0.59%) Min alloc-odr0-256 183.00 ( 0.00%) 180.00 ( 1.64%) Min alloc-odr0-512 191.00 ( 0.00%) 190.00 ( 0.52%) Min alloc-odr0-1024 199.00 ( 0.00%) 198.00 ( 0.50%) Min alloc-odr0-2048 204.00 ( 0.00%) 204.00 ( 0.00%) Min alloc-odr0-4096 210.00 ( 0.00%) 209.00 ( 0.48%) Min alloc-odr0-8192 213.00 ( 0.00%) 213.00 ( 0.00%) Min alloc-odr0-16384 214.00 ( 0.00%) 214.00 ( 0.00%) The benefit is marginal at best but one of the most important benefits, avoiding a second search when falling back to another node is not triggered by this particular test so the benefit for some corner cases is understated. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8380011d77db..a4124936303b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2680,12 +2680,10 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, { struct zoneref *z; struct zone *zone; - bool fair_skipped; - bool zonelist_rescan; + bool fair_skipped = false; + bool apply_fair = (alloc_flags & ALLOC_FAIR); zonelist_scan: - zonelist_rescan = false; - /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. @@ -2705,13 +2703,16 @@ zonelist_scan: * page was allocated in should have no effect on the * time the page has in memory before being reclaimed. */ - if (alloc_flags & ALLOC_FAIR) { - if (!zone_local(ac->preferred_zone, zone)) - break; + if (apply_fair) { if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) { fair_skipped = true; continue; } + if (!zone_local(ac->preferred_zone, zone)) { + if (fair_skipped) + goto reset_fair; + apply_fair = false; + } } /* * When allocating a page cache page for writing, we @@ -2800,18 +2801,13 @@ try_this_zone: * include remote zones now, before entering the slowpath and waking * kswapd: prefer spilling to a remote zone over swapping locally. */ - if (alloc_flags & ALLOC_FAIR) { - alloc_flags &= ~ALLOC_FAIR; - if (fair_skipped) { - zonelist_rescan = true; - reset_alloc_batches(ac->preferred_zone); - } - if (nr_online_nodes > 1) - zonelist_rescan = true; - } - - if (zonelist_rescan) + if (fair_skipped) { +reset_fair: + apply_fair = false; + fair_skipped = false; + reset_alloc_batches(ac->preferred_zone); goto zonelist_scan; + } return NULL; } -- cgit v1.2.3 From 48ee5f3696f62496481a8b6d852bcad9b3ebbe37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:07 -0700 Subject: mm, page_alloc: shortcut watermark checks for order-0 pages Watermarks have to be checked on every allocation including the number of pages being allocated and whether reserves can be accessed. The reserves only matter if memory is limited and the free_pages adjustment only applies to high-order pages. This patch adds a shortcut for order-0 pages that avoids numerous calculations if there is plenty of free memory yielding the following performance difference in a page allocator microbenchmark; 4.6.0-rc2 4.6.0-rc2 optfair-v1r20 fastmark-v1r20 Min alloc-odr0-1 380.00 ( 0.00%) 364.00 ( 4.21%) Min alloc-odr0-2 273.00 ( 0.00%) 262.00 ( 4.03%) Min alloc-odr0-4 227.00 ( 0.00%) 214.00 ( 5.73%) Min alloc-odr0-8 196.00 ( 0.00%) 186.00 ( 5.10%) Min alloc-odr0-16 183.00 ( 0.00%) 173.00 ( 5.46%) Min alloc-odr0-32 173.00 ( 0.00%) 165.00 ( 4.62%) Min alloc-odr0-64 169.00 ( 0.00%) 161.00 ( 4.73%) Min alloc-odr0-128 169.00 ( 0.00%) 159.00 ( 5.92%) Min alloc-odr0-256 180.00 ( 0.00%) 168.00 ( 6.67%) Min alloc-odr0-512 190.00 ( 0.00%) 180.00 ( 5.26%) Min alloc-odr0-1024 198.00 ( 0.00%) 190.00 ( 4.04%) Min alloc-odr0-2048 204.00 ( 0.00%) 196.00 ( 3.92%) Min alloc-odr0-4096 209.00 ( 0.00%) 202.00 ( 3.35%) Min alloc-odr0-8192 213.00 ( 0.00%) 206.00 ( 3.29%) Min alloc-odr0-16384 214.00 ( 0.00%) 206.00 ( 3.74%) Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a4124936303b..732875b1bdfb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2623,6 +2623,32 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, zone_page_state(z, NR_FREE_PAGES)); } +static inline bool zone_watermark_fast(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, unsigned int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + long cma_pages = 0; + +#ifdef CONFIG_CMA + /* If allocation can't use CMA areas don't use free CMA pages */ + if (!(alloc_flags & ALLOC_CMA)) + cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES); +#endif + + /* + * Fast check for order-0 only. If this fails then the reserves + * need to be calculated. There is a corner case where the check + * passes but only the high-order atomic reserve are free. If + * the caller is !atomic then it'll uselessly search the free + * list. That corner case is then slower but it is harmless. + */ + if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx]) + return true; + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); +} + bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx) { @@ -2744,7 +2770,7 @@ zonelist_scan: continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, + if (!zone_watermark_fast(zone, order, mark, ac->classzone_idx, alloc_flags)) { int ret; -- cgit v1.2.3 From c33d6c06f60f710f0305ae792773e1c2560e1e51 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:10 -0700 Subject: mm, page_alloc: avoid looking up the first zone in a zonelist twice The allocator fast path looks up the first usable zone in a zonelist and then get_page_from_freelist does the same job in the zonelist iterator. This patch preserves the necessary information. 4.6.0-rc2 4.6.0-rc2 fastmark-v1r20 initonce-v1r20 Min alloc-odr0-1 364.00 ( 0.00%) 359.00 ( 1.37%) Min alloc-odr0-2 262.00 ( 0.00%) 260.00 ( 0.76%) Min alloc-odr0-4 214.00 ( 0.00%) 214.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 186.00 ( 0.00%) Min alloc-odr0-16 173.00 ( 0.00%) 173.00 ( 0.00%) Min alloc-odr0-32 165.00 ( 0.00%) 165.00 ( 0.00%) Min alloc-odr0-64 161.00 ( 0.00%) 162.00 ( -0.62%) Min alloc-odr0-128 159.00 ( 0.00%) 161.00 ( -1.26%) Min alloc-odr0-256 168.00 ( 0.00%) 170.00 ( -1.19%) Min alloc-odr0-512 180.00 ( 0.00%) 181.00 ( -0.56%) Min alloc-odr0-1024 190.00 ( 0.00%) 190.00 ( 0.00%) Min alloc-odr0-2048 196.00 ( 0.00%) 196.00 ( 0.00%) Min alloc-odr0-4096 202.00 ( 0.00%) 202.00 ( 0.00%) Min alloc-odr0-8192 206.00 ( 0.00%) 205.00 ( 0.49%) Min alloc-odr0-16384 206.00 ( 0.00%) 205.00 ( 0.49%) The benefit is negligible and the results are within the noise but each cycle counts. Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 10 +++++----- include/linux/mmzone.h | 18 +++++++++++------- mm/internal.h | 2 +- mm/mempolicy.c | 19 ++++++++++--------- mm/page_alloc.c | 34 ++++++++++++++++------------------ 5 files changed, 43 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/fs/buffer.c b/fs/buffer.c index af0d9a82a8ed..754813a6962b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -255,17 +255,17 @@ out: */ static void free_more_memory(void) { - struct zone *zone; + struct zoneref *z; int nid; wakeup_flusher_threads(1024, WB_REASON_FREE_MORE_MEM); yield(); for_each_online_node(nid) { - (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), - gfp_zone(GFP_NOFS), NULL, - &zone); - if (zone) + + z = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL); + if (z->zone) try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, GFP_NOFS, NULL); } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 327f0fa1e1ce..4b28d2f8125e 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -959,13 +959,10 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, */ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, - nodemask_t *nodes, - struct zone **zone) + nodemask_t *nodes) { - struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs, + return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); - *zone = zonelist_zone(z); - return z; } /** @@ -980,10 +977,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ - for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \ + for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ - zone = zonelist_zone(z)) \ + zone = zonelist_zone(z)) + +#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ + for (zone = z->zone; \ + zone; \ + z = next_zones_zonelist(++z, highidx, nodemask), \ + zone = zonelist_zone(z)) + /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index diff --git a/mm/internal.h b/mm/internal.h index 114593aab55c..d1ddd71c1bbf 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -102,7 +102,7 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; - struct zone *preferred_zone; + struct zoneref *preferred_zoneref; int classzone_idx; int migratetype; enum zone_type high_zoneidx; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7f80ebcd6552..297d6854f849 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1739,18 +1739,18 @@ unsigned int mempolicy_slab_node(void) return interleave_nodes(policy); case MPOL_BIND: { + struct zoneref *z; + /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; - struct zone *zone; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[0]; - (void)first_zones_zonelist(zonelist, highest_zoneidx, - &policy->v.nodes, - &zone); - return zone ? zone->node : node; + z = first_zones_zonelist(zonelist, highest_zoneidx, + &policy->v.nodes); + return z->zone ? z->zone->node : node; } default: @@ -2266,7 +2266,7 @@ static void sp_free(struct sp_node *n) int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; - struct zone *zone; + struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); @@ -2298,6 +2298,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long break; case MPOL_BIND: + /* * allows binding to multiple nodes. * use current page if in policy nodemask, @@ -2306,11 +2307,11 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long */ if (node_isset(curnid, pol->v.nodes)) goto out; - (void)first_zones_zonelist( + z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), - &pol->v.nodes, &zone); - polnid = zone->node; + &pol->v.nodes); + polnid = z->zone->node; break; default: diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 732875b1bdfb..dba8cfd0b2d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2704,7 +2704,7 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zoneref *z; + struct zoneref *z = ac->preferred_zoneref; struct zone *zone; bool fair_skipped = false; bool apply_fair = (alloc_flags & ALLOC_FAIR); @@ -2714,7 +2714,7 @@ zonelist_scan: * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ - for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { struct page *page; unsigned long mark; @@ -2734,7 +2734,7 @@ zonelist_scan: fair_skipped = true; continue; } - if (!zone_local(ac->preferred_zone, zone)) { + if (!zone_local(ac->preferred_zoneref->zone, zone)) { if (fair_skipped) goto reset_fair; apply_fair = false; @@ -2780,7 +2780,7 @@ zonelist_scan: goto try_this_zone; if (zone_reclaim_mode == 0 || - !zone_allows_reclaim(ac->preferred_zone, zone)) + !zone_allows_reclaim(ac->preferred_zoneref->zone, zone)) continue; ret = zone_reclaim(zone, gfp_mask, order); @@ -2802,7 +2802,7 @@ zonelist_scan: } try_this_zone: - page = buffered_rmqueue(ac->preferred_zone, zone, order, + page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { if (prep_new_page(page, order, gfp_mask, alloc_flags)) @@ -2831,7 +2831,7 @@ try_this_zone: reset_fair: apply_fair = false; fair_skipped = false; - reset_alloc_batches(ac->preferred_zone); + reset_alloc_batches(ac->preferred_zoneref->zone); goto zonelist_scan; } @@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone)); + wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref)); } static inline unsigned int @@ -3332,7 +3332,7 @@ retry: if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50); + wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); goto retry; } @@ -3370,7 +3370,6 @@ struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, nodemask_t *nodemask) { - struct zoneref *preferred_zoneref; struct page *page; unsigned int cpuset_mems_cookie; unsigned int alloc_flags = ALLOC_WMARK_LOW|ALLOC_FAIR; @@ -3416,14 +3415,14 @@ retry_cpuset: ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); /* The preferred zone is used for statistics later */ - preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, - ac.nodemask, &ac.preferred_zone); - if (!ac.preferred_zone) { + ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, + ac.high_zoneidx, ac.nodemask); + if (!ac.preferred_zoneref) { page = NULL; goto no_zone; } - ac.classzone_idx = zonelist_zone_idx(preferred_zoneref); + ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); @@ -4462,13 +4461,12 @@ static void build_zonelists(pg_data_t *pgdat) */ int local_memory_node(int node) { - struct zone *zone; + struct zoneref *z; - (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), - NULL, - &zone); - return zone->node; + NULL); + return z->zone->node; } #endif -- cgit v1.2.3 From 93ea9964d14ad583492ffb9ab7543f015876aaf2 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:13 -0700 Subject: mm, page_alloc: remove field from alloc_context The classzone_idx can be inferred from preferred_zoneref so remove the unnecessary field and save stack space. Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++-- mm/internal.h | 3 ++- mm/page_alloc.c | 8 +++----- 3 files changed, 7 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 8f339ca25621..eda3c2244f30 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1602,7 +1602,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, - ac->classzone_idx); + ac_classzone_idx(ac)); rc = max(status, rc); /* * It takes at least one zone that wasn't lock contended @@ -1612,7 +1612,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), - ac->classzone_idx, alloc_flags)) { + ac_classzone_idx(ac), alloc_flags)) { /* * We think the allocation will succeed in this zone, * but it is not certain, hence the false. The caller diff --git a/mm/internal.h b/mm/internal.h index d1ddd71c1bbf..3ac544f1963f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -103,12 +103,13 @@ struct alloc_context { struct zonelist *zonelist; nodemask_t *nodemask; struct zoneref *preferred_zoneref; - int classzone_idx; int migratetype; enum zone_type high_zoneidx; bool spread_dirty_pages; }; +#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref) + /* * Locate the struct page for both the matching buddy in our * pair (buddy1) and the combined O(n+1) page they form (page). diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dba8cfd0b2d6..9da66e792e17 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2771,7 +2771,7 @@ zonelist_scan: mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_fast(zone, order, mark, - ac->classzone_idx, alloc_flags)) { + ac_classzone_idx(ac), alloc_flags)) { int ret; /* Checked here to keep the fast path fast */ @@ -2794,7 +2794,7 @@ zonelist_scan: default: /* did we reclaim enough */ if (zone_watermark_ok(zone, order, mark, - ac->classzone_idx, alloc_flags)) + ac_classzone_idx(ac), alloc_flags)) goto try_this_zone; continue; @@ -3114,7 +3114,7 @@ static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) - wakeup_kswapd(zone, order, zonelist_zone_idx(ac->preferred_zoneref)); + wakeup_kswapd(zone, order, ac_classzone_idx(ac)); } static inline unsigned int @@ -3422,8 +3422,6 @@ retry_cpuset: goto no_zone; } - ac.classzone_idx = zonelist_zone_idx(ac.preferred_zoneref); - /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) -- cgit v1.2.3 From 7bfec6f47bb0ffd207c7e813e819235e6c1c0f34 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:15 -0700 Subject: mm, page_alloc: check multiple page fields with a single branch Every page allocated or freed is checked for sanity to avoid corruptions that are difficult to detect later. A bad page could be due to a number of fields. Instead of using multiple branches, this patch combines multiple fields into a single branch. A detailed check is only necessary if that check fails. 4.6.0-rc2 4.6.0-rc2 initonce-v1r20 multcheck-v1r20 Min alloc-odr0-1 359.00 ( 0.00%) 348.00 ( 3.06%) Min alloc-odr0-2 260.00 ( 0.00%) 254.00 ( 2.31%) Min alloc-odr0-4 214.00 ( 0.00%) 213.00 ( 0.47%) Min alloc-odr0-8 186.00 ( 0.00%) 186.00 ( 0.00%) Min alloc-odr0-16 173.00 ( 0.00%) 173.00 ( 0.00%) Min alloc-odr0-32 165.00 ( 0.00%) 166.00 ( -0.61%) Min alloc-odr0-64 162.00 ( 0.00%) 162.00 ( 0.00%) Min alloc-odr0-128 161.00 ( 0.00%) 160.00 ( 0.62%) Min alloc-odr0-256 170.00 ( 0.00%) 169.00 ( 0.59%) Min alloc-odr0-512 181.00 ( 0.00%) 180.00 ( 0.55%) Min alloc-odr0-1024 190.00 ( 0.00%) 188.00 ( 1.05%) Min alloc-odr0-2048 196.00 ( 0.00%) 194.00 ( 1.02%) Min alloc-odr0-4096 202.00 ( 0.00%) 199.00 ( 1.49%) Min alloc-odr0-8192 205.00 ( 0.00%) 202.00 ( 1.46%) Min alloc-odr0-16384 205.00 ( 0.00%) 203.00 ( 0.98%) Again, the benefit is marginal but avoiding excessive branches is important. Ideally the paths would not have to check these conditions at all but regrettably abandoning the tests would make use-after-free bugs much harder to detect. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9da66e792e17..76a394812776 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -784,10 +784,42 @@ out: zone->free_area[order].nr_free++; } +/* + * A bad page could be due to a number of fields. Instead of multiple branches, + * try and check multiple fields with one check. The caller must do a detailed + * check if necessary. + */ +static inline bool page_expected_state(struct page *page, + unsigned long check_flags) +{ + if (unlikely(atomic_read(&page->_mapcount) != -1)) + return false; + + if (unlikely((unsigned long)page->mapping | + page_ref_count(page) | +#ifdef CONFIG_MEMCG + (unsigned long)page->mem_cgroup | +#endif + (page->flags & check_flags))) + return false; + + return true; +} + static inline int free_pages_check(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; + const char *bad_reason; + unsigned long bad_flags; + + if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)) { + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; + } + + /* Something has gone sideways, find it */ + bad_reason = NULL; + bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; @@ -803,14 +835,8 @@ static inline int free_pages_check(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; - } - page_cpupid_reset_last(page); - if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - return 0; + bad_page(page, bad_reason, bad_flags); + return 1; } /* @@ -1492,9 +1518,14 @@ static inline void expand(struct zone *zone, struct page *page, */ static inline int check_new_page(struct page *page) { - const char *bad_reason = NULL; - unsigned long bad_flags = 0; + const char *bad_reason; + unsigned long bad_flags; + + if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)) + return 0; + bad_reason = NULL; + bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) -- cgit v1.2.3 From bb552ac6c6b4f24e7a7b491286f87b63f9478d42 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:18 -0700 Subject: mm, page_alloc: un-inline the bad part of free_pages_check From: Vlastimil Babka !DEBUG_VM size and bloat-o-meter: add/remove: 1/0 grow/shrink: 0/2 up/down: 124/-370 (-246) function old new delta free_pages_check_bad - 124 +124 free_pcppages_bulk 1288 1171 -117 __free_pages_ok 948 695 -253 DEBUG_VM: add/remove: 1/0 grow/shrink: 0/1 up/down: 124/-214 (-90) function old new delta free_pages_check_bad - 124 +124 free_pages_prepare 1112 898 -214 [akpm@linux-foundation.org: fix whitespace] Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76a394812776..d51543de1813 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -806,18 +806,11 @@ static inline bool page_expected_state(struct page *page, return true; } -static inline int free_pages_check(struct page *page) +static void free_pages_check_bad(struct page *page) { const char *bad_reason; unsigned long bad_flags; - if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)) { - page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - return 0; - } - - /* Something has gone sideways, find it */ bad_reason = NULL; bad_flags = 0; @@ -836,6 +829,18 @@ static inline int free_pages_check(struct page *page) bad_reason = "page still charged to cgroup"; #endif bad_page(page, bad_reason, bad_flags); +} + +static inline int free_pages_check(struct page *page) +{ + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) { + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; + } + + /* Something has gone sideways, find it */ + free_pages_check_bad(page); return 1; } -- cgit v1.2.3 From da838d4fcba675cbf864f225d76f970e91220ee6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:21 -0700 Subject: mm, page_alloc: pull out side effects from free_pages_check Check without side-effects should be easier to maintain. It also removes the duplicated cpupid and flags reset done in !DEBUG_VM variant of both free_pcp_prepare() and then bulkfree_pcp_prepare(). Finally, it enables the next patch. It shouldn't result in new branches, thanks to inlining of the check. !DEBUG_VM bloat-o-meter: add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-27 (-27) function old new delta __free_pages_ok 748 739 -9 free_pcppages_bulk 1403 1385 -18 DEBUG_VM: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-28 (-28) function old new delta free_pages_prepare 806 778 -28 This is also slightly faster because cpupid information is not set on tail pages so we can avoid resets there. Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d51543de1813..fea50b0cb405 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -833,11 +833,8 @@ static void free_pages_check_bad(struct page *page) static inline int free_pages_check(struct page *page) { - if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) { - page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) return 0; - } /* Something has gone sideways, find it */ free_pages_check_bad(page); @@ -1078,7 +1075,11 @@ static bool free_pages_prepare(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - bad += free_pages_check(page + i); + if (unlikely(free_pages_check(page + i))) { + bad++; + continue; + } + (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; } } if (PageAnonHead(page)) @@ -1087,6 +1088,8 @@ static bool free_pages_prepare(struct page *page, unsigned int order) if (bad) return false; + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); if (!PageHighMem(page)) { -- cgit v1.2.3 From e5b31ac2ca2cd0cf6bf2fcbb708ed01466c89aaa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:24 -0700 Subject: mm, page_alloc: remove unnecessary variable from free_pcppages_bulk The original count is never reused so it can be removed. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fea50b0cb405..822ce86fc883 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -857,7 +857,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, { int migratetype = 0; int batch_free = 0; - int to_free = count; unsigned long nr_scanned; bool isolated_pageblocks; @@ -867,7 +866,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (nr_scanned) __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); - while (to_free) { + while (count) { struct page *page; struct list_head *list; @@ -887,7 +886,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* This is the only non-empty list. Free them all. */ if (batch_free == MIGRATE_PCPTYPES) - batch_free = to_free; + batch_free = count; do { int mt; /* migratetype of the to-be-freed page */ @@ -905,7 +904,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - } while (--to_free && --batch_free && !list_empty(list)); + } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); } -- cgit v1.2.3 From 0b423ca22f95a867f789aab1fe57ee4e378df43b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:27 -0700 Subject: mm, page_alloc: inline pageblock lookup in page free fast paths The function call overhead of get_pfnblock_flags_mask() is measurable in the page free paths. This patch uses an inlined version that is faster. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 7 -- mm/page_alloc.c | 188 ++++++++++++++++++++++++++----------------------- mm/page_owner.c | 2 +- mm/vmstat.c | 2 +- 4 files changed, 102 insertions(+), 97 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4b28d2f8125e..c60db2096fd8 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled; get_pfnblock_flags_mask(page, page_to_pfn(page), \ PB_migrate_end, MIGRATETYPE_MASK) -static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) -{ - BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); - return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, - MIGRATETYPE_MASK); -} - struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 822ce86fc883..bdf7a13311b5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -352,6 +352,106 @@ static inline bool update_defer_init(pg_data_t *pgdat, } #endif +/* Return a pointer to the bitmap storing bits affecting a block of pages */ +static inline unsigned long *get_pageblock_bitmap(struct page *page, + unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + return __pfn_to_section(pfn)->pageblock_flags; +#else + return page_zone(page)->pageblock_flags; +#endif /* CONFIG_SPARSEMEM */ +} + +static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) +{ +#ifdef CONFIG_SPARSEMEM + pfn &= (PAGES_PER_SECTION-1); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#else + pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; +#endif /* CONFIG_SPARSEMEM */ +} + +/** + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest to retrieve + * @mask: mask of bits that the caller is interested in + * + * Return: pageblock_bits flags + */ +static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long word; + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; +} + +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask); +} + +static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) +{ + return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK); +} + +/** + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * @page: The page within the block of interest + * @flags: The flags to set + * @pfn: The target page frame number + * @end_bitidx: The last bit of interest + * @mask: mask of bits that the caller is interested in + */ +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, + unsigned long end_bitidx, + unsigned long mask) +{ + unsigned long *bitmap; + unsigned long bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); + + bitmap = get_pageblock_bitmap(page, pfn); + bitidx = pfn_to_bitidx(page, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + + VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); + + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = READ_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } +} void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -6831,94 +6931,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -/* Return a pointer to the bitmap storing bits affecting a block of pages */ -static inline unsigned long *get_pageblock_bitmap(struct page *page, - unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - return __pfn_to_section(pfn)->pageblock_flags; -#else - return page_zone(page)->pageblock_flags; -#endif /* CONFIG_SPARSEMEM */ -} - -static inline int pfn_to_bitidx(struct page *page, unsigned long pfn) -{ -#ifdef CONFIG_SPARSEMEM - pfn &= (PAGES_PER_SECTION-1); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); - return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; -#endif /* CONFIG_SPARSEMEM */ -} - -/** - * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest to retrieve - * @mask: mask of bits that the caller is interested in - * - * Return: pageblock_bits flags - */ -unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long word; - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - word = bitmap[word_bitidx]; - bitidx += end_bitidx; - return (word >> (BITS_PER_LONG - bitidx - 1)) & mask; -} - -/** - * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages - * @page: The page within the block of interest - * @flags: The flags to set - * @pfn: The target page frame number - * @end_bitidx: The last bit of interest - * @mask: mask of bits that the caller is interested in - */ -void set_pfnblock_flags_mask(struct page *page, unsigned long flags, - unsigned long pfn, - unsigned long end_bitidx, - unsigned long mask) -{ - unsigned long *bitmap; - unsigned long bitidx, word_bitidx; - unsigned long old_word, word; - - BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); - - bitmap = get_pageblock_bitmap(page, pfn); - bitidx = pfn_to_bitidx(page, pfn); - word_bitidx = bitidx / BITS_PER_LONG; - bitidx &= (BITS_PER_LONG-1); - - VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); - - bitidx += end_bitidx; - mask <<= (BITS_PER_LONG - bitidx - 1); - flags <<= (BITS_PER_LONG - bitidx - 1); - - word = READ_ONCE(bitmap[word_bitidx]); - for (;;) { - old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); - if (word == old_word) - break; - word = old_word; - } -} - /* * This function checks whether pageblock includes unmovable pages or not. * If @count is not zero, it is okay to include less @count unmovable pages diff --git a/mm/page_owner.c b/mm/page_owner.c index 438768c092ac..792b56da13d8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -143,7 +143,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, goto err; /* Print information relevant to grouping pages by mobility */ - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); page_mt = gfpflags_to_migratetype(page_ext->gfp_mask); ret += snprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", diff --git a/mm/vmstat.c b/mm/vmstat.c index f1a73bfb77b5..5b72a8ad2813 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1041,7 +1041,7 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, block_end_pfn = min(block_end_pfn, end_pfn); page = pfn_to_page(pfn); - pageblock_mt = get_pfnblock_migratetype(page, pfn); + pageblock_mt = get_pageblock_migratetype(page); for (; pfn < block_end_pfn; pfn++) { if (!pfn_valid_within(pfn)) -- cgit v1.2.3 From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:14:30 -0700 Subject: cpuset: use static key better and convert to new API An important function for cpusets is cpuset_node_allowed(), which optimizes on the fact if there's a single root CPU set, it must be trivially allowed. But the check "nr_cpusets() <= 1" doesn't use the cpusets_enabled_key static key the right way where static keys eliminate branching overhead with jump labels. This patch converts it so that static key is used properly. It's also switched to the new static key API and the checking functions are converted to return bool instead of int. We also provide a new variant __cpuset_zone_allowed() which expects that the static key check was already done and they key was enabled. This is needed for get_page_from_freelist() where we want to also avoid the relatively slower check when ALLOC_CPUSET is not set in alloc_flags. The impact on the page allocator microbenchmark is less than expected but the cleanup in itself is worthwhile. 4.6.0-rc2 4.6.0-rc2 multcheck-v1r20 cpuset-v1r20 Min alloc-odr0-1 348.00 ( 0.00%) 348.00 ( 0.00%) Min alloc-odr0-2 254.00 ( 0.00%) 254.00 ( 0.00%) Min alloc-odr0-4 213.00 ( 0.00%) 213.00 ( 0.00%) Min alloc-odr0-8 186.00 ( 0.00%) 183.00 ( 1.61%) Min alloc-odr0-16 173.00 ( 0.00%) 171.00 ( 1.16%) Min alloc-odr0-32 166.00 ( 0.00%) 163.00 ( 1.81%) Min alloc-odr0-64 162.00 ( 0.00%) 159.00 ( 1.85%) Min alloc-odr0-128 160.00 ( 0.00%) 157.00 ( 1.88%) Min alloc-odr0-256 169.00 ( 0.00%) 166.00 ( 1.78%) Min alloc-odr0-512 180.00 ( 0.00%) 180.00 ( 0.00%) Min alloc-odr0-1024 188.00 ( 0.00%) 187.00 ( 0.53%) Min alloc-odr0-2048 194.00 ( 0.00%) 193.00 ( 0.52%) Min alloc-odr0-4096 199.00 ( 0.00%) 198.00 ( 0.50%) Min alloc-odr0-8192 202.00 ( 0.00%) 201.00 ( 0.50%) Min alloc-odr0-16384 203.00 ( 0.00%) 202.00 ( 0.49%) Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Acked-by: Zefan Li Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++-------------- kernel/cpuset.c | 14 +++++++------- mm/page_alloc.c | 2 +- 3 files changed, 36 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 85a868ccb493..bfc204e70338 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -16,26 +16,26 @@ #ifdef CONFIG_CPUSETS -extern struct static_key cpusets_enabled_key; +extern struct static_key_false cpusets_enabled_key; static inline bool cpusets_enabled(void) { - return static_key_false(&cpusets_enabled_key); + return static_branch_unlikely(&cpusets_enabled_key); } static inline int nr_cpusets(void) { /* jump label reference count + the top-level cpuset */ - return static_key_count(&cpusets_enabled_key) + 1; + return static_key_count(&cpusets_enabled_key.key) + 1; } static inline void cpuset_inc(void) { - static_key_slow_inc(&cpusets_enabled_key); + static_branch_inc(&cpusets_enabled_key); } static inline void cpuset_dec(void) { - static_key_slow_dec(&cpusets_enabled_key); + static_branch_dec(&cpusets_enabled_key); } extern int cpuset_init(void); @@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p); void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); -extern int __cpuset_node_allowed(int node, gfp_t gfp_mask); +extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask); -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask); + if (cpusets_enabled()) + return __cpuset_node_allowed(node, gfp_mask); + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return cpuset_node_allowed(zone_to_nid(z), gfp_mask); + return __cpuset_node_allowed(zone_to_nid(z), gfp_mask); +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + if (cpusets_enabled()) + return __cpuset_zone_allowed(z, gfp_mask); + return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, @@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) return 1; } -static inline int cpuset_node_allowed(int node, gfp_t gfp_mask) +static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask) { - return 1; + return true; } -static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { - return 1; + return true; +} + +static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) +{ + return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 611cc69af8f0..73e93e53884d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -61,7 +61,7 @@ #include #include -struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key); /* See "Frequency meter" comments, below. */ @@ -2528,27 +2528,27 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) * GFP_KERNEL - any node in enclosing hardwalled cpuset ok * GFP_USER - only nodes in current tasks mems allowed ok. */ -int __cpuset_node_allowed(int node, gfp_t gfp_mask) +bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ unsigned long flags; if (in_interrupt()) - return 1; + return true; if (node_isset(node, current->mems_allowed)) - return 1; + return true; /* * Allow tasks that have access to memory reserves because they have * been OOM killed to get memory anywhere. */ if (unlikely(test_thread_flag(TIF_MEMDIE))) - return 1; + return true; if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ - return 0; + return false; if (current->flags & PF_EXITING) /* Let dying task have memory */ - return 1; + return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ spin_lock_irqsave(&callback_lock, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdf7a13311b5..39c441bb8d61 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2859,7 +2859,7 @@ zonelist_scan: if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && - !cpuset_zone_allowed(zone, gfp_mask)) + !__cpuset_zone_allowed(zone, gfp_mask)) continue; /* * Distribute pages in proportion to the individual -- cgit v1.2.3 From 4db7548ccbd9ec8e666f35df4a530f55904dec39 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:32 -0700 Subject: mm, page_alloc: defer debugging checks of freed pages until a PCP drain Every page free checks a number of page fields for validity. This catches premature frees and corruptions but it is also expensive. This patch weakens the debugging check by checking PCP pages at the time they are drained from the PCP list. This will trigger the bug but the site that freed the corrupt page will be lost. To get the full context, a kernel rebuild with DEBUG_VM is necessary. [akpm@linux-foundation.org: fix build] Signed-off-by: Mel Gorman Cc: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 152 +++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 101 insertions(+), 51 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 39c441bb8d61..759d3f60bea0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -941,6 +941,103 @@ static inline int free_pages_check(struct page *page) return 1; } +static int free_tail_pages_check(struct page *head_page, struct page *page) +{ + int ret = 1; + + /* + * We rely page->lru.next never has bit 0 set, unless the page + * is PageTail(). Let's make sure that's true even for poisoned ->lru. + */ + BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); + + if (!IS_ENABLED(CONFIG_DEBUG_VM)) { + ret = 0; + goto out; + } + switch (page - head_page) { + case 1: + /* the first tail page: ->mapping is compound_mapcount() */ + if (unlikely(compound_mapcount(page))) { + bad_page(page, "nonzero compound_mapcount", 0); + goto out; + } + break; + case 2: + /* + * the second tail page: ->mapping is + * page_deferred_list().next -- ignore value. + */ + break; + default: + if (page->mapping != TAIL_MAPPING) { + bad_page(page, "corrupted mapping in tail page", 0); + goto out; + } + break; + } + if (unlikely(!PageTail(page))) { + bad_page(page, "PageTail not set", 0); + goto out; + } + if (unlikely(compound_head(page) != head_page)) { + bad_page(page, "compound_head not consistent", 0); + goto out; + } + ret = 0; +out: + page->mapping = NULL; + clear_compound_head(page); + return ret; +} + +static bool free_pages_prepare(struct page *page, unsigned int order); + +#ifdef CONFIG_DEBUG_VM +static inline bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0); +} + +static inline bool bulkfree_pcp_prepare(struct page *page) +{ + return false; +} +#else +static bool free_pcp_prepare(struct page *page) +{ + VM_BUG_ON_PAGE(PageTail(page), page); + + trace_mm_page_free(page, 0); + kmemcheck_free_shadow(page, 0); + kasan_free_pages(page, 0); + + if (PageAnonHead(page)) + page->mapping = NULL; + + reset_page_owner(page, 0); + + if (!PageHighMem(page)) { + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE); + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE); + } + arch_free_page(page, 0); + kernel_poison_pages(page, 0, 0); + kernel_map_pages(page, 0, 0); + + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return true; +} + +static bool bulkfree_pcp_prepare(struct page *page) +{ + return free_pages_check(page); +} +#endif /* CONFIG_DEBUG_VM */ + /* * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. @@ -1002,6 +1099,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page); + if (bulkfree_pcp_prepare(page)) + continue; + __free_one_page(page, page_to_pfn(page), zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); } while (--count && --batch_free && !list_empty(list)); @@ -1028,56 +1128,6 @@ static void free_one_page(struct zone *zone, spin_unlock(&zone->lock); } -static int free_tail_pages_check(struct page *head_page, struct page *page) -{ - int ret = 1; - - /* - * We rely page->lru.next never has bit 0 set, unless the page - * is PageTail(). Let's make sure that's true even for poisoned ->lru. - */ - BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); - - if (!IS_ENABLED(CONFIG_DEBUG_VM)) { - ret = 0; - goto out; - } - switch (page - head_page) { - case 1: - /* the first tail page: ->mapping is compound_mapcount() */ - if (unlikely(compound_mapcount(page))) { - bad_page(page, "nonzero compound_mapcount", 0); - goto out; - } - break; - case 2: - /* - * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. - */ - break; - default: - if (page->mapping != TAIL_MAPPING) { - bad_page(page, "corrupted mapping in tail page", 0); - goto out; - } - break; - } - if (unlikely(!PageTail(page))) { - bad_page(page, "PageTail not set", 0); - goto out; - } - if (unlikely(compound_head(page) != head_page)) { - bad_page(page, "compound_head not consistent", 0); - goto out; - } - ret = 0; -out: - page->mapping = NULL; - clear_compound_head(page); - return ret; -} - static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { @@ -2339,7 +2389,7 @@ void free_hot_cold_page(struct page *page, bool cold) unsigned long pfn = page_to_pfn(page); int migratetype; - if (!free_pages_prepare(page, 0)) + if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); -- cgit v1.2.3 From 479f854a207ce2b97545a0a83856778b541063d0 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:35 -0700 Subject: mm, page_alloc: defer debugging checks of pages allocated from the PCP Every page allocated checks a number of page fields for validity. This catches corruption bugs of pages that are already freed but it is expensive. This patch weakens the debugging check by checking PCP pages only when the PCP lists are being refilled. All compound pages are checked. This potentially avoids debugging checks entirely if the PCP lists are never emptied and refilled so some corruption issues may be missed. Full checking requires DEBUG_VM. With the two deferred debugging patches applied, the impact to a page allocator microbenchmark is 4.6.0-rc3 4.6.0-rc3 inline-v3r6 deferalloc-v3r7 Min alloc-odr0-1 344.00 ( 0.00%) 317.00 ( 7.85%) Min alloc-odr0-2 248.00 ( 0.00%) 231.00 ( 6.85%) Min alloc-odr0-4 209.00 ( 0.00%) 192.00 ( 8.13%) Min alloc-odr0-8 181.00 ( 0.00%) 166.00 ( 8.29%) Min alloc-odr0-16 168.00 ( 0.00%) 154.00 ( 8.33%) Min alloc-odr0-32 161.00 ( 0.00%) 148.00 ( 8.07%) Min alloc-odr0-64 158.00 ( 0.00%) 145.00 ( 8.23%) Min alloc-odr0-128 156.00 ( 0.00%) 143.00 ( 8.33%) Min alloc-odr0-256 168.00 ( 0.00%) 154.00 ( 8.33%) Min alloc-odr0-512 178.00 ( 0.00%) 167.00 ( 6.18%) Min alloc-odr0-1024 186.00 ( 0.00%) 174.00 ( 6.45%) Min alloc-odr0-2048 192.00 ( 0.00%) 180.00 ( 6.25%) Min alloc-odr0-4096 198.00 ( 0.00%) 184.00 ( 7.07%) Min alloc-odr0-8192 200.00 ( 0.00%) 188.00 ( 6.00%) Min alloc-odr0-16384 201.00 ( 0.00%) 188.00 ( 6.47%) Min free-odr0-1 189.00 ( 0.00%) 180.00 ( 4.76%) Min free-odr0-2 132.00 ( 0.00%) 126.00 ( 4.55%) Min free-odr0-4 104.00 ( 0.00%) 99.00 ( 4.81%) Min free-odr0-8 90.00 ( 0.00%) 85.00 ( 5.56%) Min free-odr0-16 84.00 ( 0.00%) 80.00 ( 4.76%) Min free-odr0-32 80.00 ( 0.00%) 76.00 ( 5.00%) Min free-odr0-64 78.00 ( 0.00%) 74.00 ( 5.13%) Min free-odr0-128 77.00 ( 0.00%) 73.00 ( 5.19%) Min free-odr0-256 94.00 ( 0.00%) 91.00 ( 3.19%) Min free-odr0-512 108.00 ( 0.00%) 112.00 ( -3.70%) Min free-odr0-1024 115.00 ( 0.00%) 118.00 ( -2.61%) Min free-odr0-2048 120.00 ( 0.00%) 125.00 ( -4.17%) Min free-odr0-4096 123.00 ( 0.00%) 129.00 ( -4.88%) Min free-odr0-8192 126.00 ( 0.00%) 130.00 ( -3.17%) Min free-odr0-16384 126.00 ( 0.00%) 131.00 ( -3.97%) Note that the free paths for large numbers of pages is impacted as the debugging cost gets shifted into that path when the page data is no longer necessarily cache-hot. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 92 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 64 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 759d3f60bea0..193ed34a2780 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1714,7 +1714,41 @@ static inline bool free_pages_prezeroed(bool poisoned) page_poisoning_enabled() && poisoned; } -static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, +#ifdef CONFIG_DEBUG_VM +static bool check_pcp_refill(struct page *page) +{ + return false; +} + +static bool check_new_pcp(struct page *page) +{ + return check_new_page(page); +} +#else +static bool check_pcp_refill(struct page *page) +{ + return check_new_page(page); +} +static bool check_new_pcp(struct page *page) +{ + return false; +} +#endif /* CONFIG_DEBUG_VM */ + +static bool check_new_pages(struct page *page, unsigned int order) +{ + int i; + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + + if (unlikely(check_new_page(p))) + return true; + } + + return false; +} + +static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, unsigned int alloc_flags) { int i; @@ -1722,8 +1756,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, for (i = 0; i < (1 << order); i++) { struct page *p = page + i; - if (unlikely(check_new_page(p))) - return 1; if (poisoned) poisoned &= page_is_poisoned(p); } @@ -1755,8 +1787,6 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, set_page_pfmemalloc(page); else clear_page_pfmemalloc(page); - - return 0; } /* @@ -2178,6 +2208,9 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, if (unlikely(page == NULL)) break; + if (unlikely(check_pcp_refill(page))) + continue; + /* * Split buddy pages returned by expand() are received here * in physical page order. The page is added to the callers and @@ -2593,20 +2626,22 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, struct list_head *list; local_irq_save(flags); - pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype, cold); - if (unlikely(list_empty(list))) - goto failed; - } + do { + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) + goto failed; + } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); + if (cold) + page = list_last_entry(list, struct page, lru); + else + page = list_first_entry(list, struct page, lru); + } while (page && check_new_pcp(page)); __dec_zone_state(zone, NR_ALLOC_BATCH); list_del(&page->lru); @@ -2619,14 +2654,16 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); spin_lock_irqsave(&zone->lock, flags); - page = NULL; - if (alloc_flags & ALLOC_HARDER) { - page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); - if (page) - trace_mm_page_alloc_zone_locked(page, order, migratetype); - } - if (!page) - page = __rmqueue(zone, order, migratetype); + do { + page = NULL; + if (alloc_flags & ALLOC_HARDER) { + page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); + } + if (!page) + page = __rmqueue(zone, order, migratetype); + } while (page && check_new_pages(page, order)); spin_unlock(&zone->lock); if (!page) goto failed; @@ -2993,8 +3030,7 @@ try_this_zone: page = buffered_rmqueue(ac->preferred_zoneref->zone, zone, order, gfp_mask, alloc_flags, ac->migratetype); if (page) { - if (prep_new_page(page, order, gfp_mask, alloc_flags)) - goto try_this_zone; + prep_new_page(page, order, gfp_mask, alloc_flags); /* * If this is a high-order atomic allocation then check -- cgit v1.2.3 From e2769dbdc51f1baa1908ecf6c84d50f19577e1db Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:38 -0700 Subject: mm, page_alloc: don't duplicate code in free_pcp_prepare The new free_pcp_prepare() function shares a lot of code with free_pages_prepare(), which makes this a maintenance risk when some future patch modifies only one of them. We should be able to achieve the same effect (skipping free_pages_check() from !DEBUG_VM configs) by adding a parameter to free_pages_prepare() and making it inline, so the checks (and the order != 0 parts) are eliminated from the call from free_pcp_prepare(). !DEBUG_VM: bloat-o-meter reports no difference, as my gcc was already inlining free_pages_prepare() and the elimination seems to work as expected DEBUG_VM bloat-o-meter: add/remove: 0/1 grow/shrink: 2/0 up/down: 1035/-778 (257) function old new delta __free_pages_ok 297 1060 +763 free_hot_cold_page 480 752 +272 free_pages_prepare 778 - -778 Here inlining didn't occur before, and added some code, but it's ok for a debug option. [akpm@linux-foundation.org: fix build] Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 133 +++++++++++++++++++++++--------------------------------- 1 file changed, 55 insertions(+), 78 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 193ed34a2780..7d8f642c498d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -991,47 +991,77 @@ out: return ret; } -static bool free_pages_prepare(struct page *page, unsigned int order); - -#ifdef CONFIG_DEBUG_VM -static inline bool free_pcp_prepare(struct page *page) +static __always_inline bool free_pages_prepare(struct page *page, + unsigned int order, bool check_free) { - return free_pages_prepare(page, 0); -} + int bad = 0; -static inline bool bulkfree_pcp_prepare(struct page *page) -{ - return false; -} -#else -static bool free_pcp_prepare(struct page *page) -{ VM_BUG_ON_PAGE(PageTail(page), page); - trace_mm_page_free(page, 0); - kmemcheck_free_shadow(page, 0); - kasan_free_pages(page, 0); + trace_mm_page_free(page, order); + kmemcheck_free_shadow(page, order); + kasan_free_pages(page, order); + + /* + * Check tail pages before head page information is cleared to + * avoid checking PageCompound for order-0 pages. + */ + if (unlikely(order)) { + bool compound = PageCompound(page); + int i; + + VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); + for (i = 1; i < (1 << order); i++) { + if (compound) + bad += free_tail_pages_check(page, page + i); + if (unlikely(free_pages_check(page + i))) { + bad++; + continue; + } + (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + } + } if (PageAnonHead(page)) page->mapping = NULL; + if (check_free) + bad += free_pages_check(page); + if (bad) + return false; - reset_page_owner(page, 0); + page_cpupid_reset_last(page); + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + reset_page_owner(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), - PAGE_SIZE); + PAGE_SIZE << order); debug_check_no_obj_freed(page_address(page), - PAGE_SIZE); + PAGE_SIZE << order); } - arch_free_page(page, 0); - kernel_poison_pages(page, 0, 0); - kernel_map_pages(page, 0, 0); + arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); + kernel_map_pages(page, 1 << order, 0); - page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return true; } +#ifdef CONFIG_DEBUG_VM +static inline bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, true); +} + +static inline bool bulkfree_pcp_prepare(struct page *page) +{ + return false; +} +#else +static bool free_pcp_prepare(struct page *page) +{ + return free_pages_prepare(page, 0, false); +} + static bool bulkfree_pcp_prepare(struct page *page) { return free_pages_check(page); @@ -1201,66 +1231,13 @@ void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) } } -static bool free_pages_prepare(struct page *page, unsigned int order) -{ - int bad = 0; - - VM_BUG_ON_PAGE(PageTail(page), page); - - trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); - kasan_free_pages(page, order); - - /* - * Check tail pages before head page information is cleared to - * avoid checking PageCompound for order-0 pages. - */ - if (unlikely(order)) { - bool compound = PageCompound(page); - int i; - - VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); - - for (i = 1; i < (1 << order); i++) { - if (compound) - bad += free_tail_pages_check(page, page + i); - if (unlikely(free_pages_check(page + i))) { - bad++; - continue; - } - (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - } - } - if (PageAnonHead(page)) - page->mapping = NULL; - bad += free_pages_check(page); - if (bad) - return false; - - page_cpupid_reset_last(page); - page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; - reset_page_owner(page, order); - - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), - PAGE_SIZE << order); - debug_check_no_obj_freed(page_address(page), - PAGE_SIZE << order); - } - arch_free_page(page, order); - kernel_poison_pages(page, 1 << order, 0); - kernel_map_pages(page, 1 << order, 0); - - return true; -} - static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; unsigned long pfn = page_to_pfn(page); - if (!free_pages_prepare(page, order)) + if (!free_pages_prepare(page, order, true)) return; migratetype = get_pfnblock_migratetype(page, pfn); -- cgit v1.2.3 From 4e6118016eb7986109ad61b00186579f384f956a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 19 May 2016 17:14:41 -0700 Subject: mm, page_alloc: uninline the bad page part of check_new_page() Bad pages should be rare so the code handling them doesn't need to be inline for performance reasons. Put it to separate function which returns void. This also assumes that the initial page_expected_state() result will match the result of the thorough check, i.e. the page doesn't become "good" in the meanwhile. This matches the same expectations already in place in free_pages_check(). !DEBUG_VM bloat-o-meter: add/remove: 1/0 grow/shrink: 0/1 up/down: 134/-274 (-140) function old new delta check_new_page_bad - 134 +134 get_page_from_freelist 3468 3194 -274 Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7d8f642c498d..ecf663358b0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1647,19 +1647,11 @@ static inline void expand(struct zone *zone, struct page *page, } } -/* - * This page is about to be returned from the page allocator - */ -static inline int check_new_page(struct page *page) +static void check_new_page_bad(struct page *page) { - const char *bad_reason; - unsigned long bad_flags; + const char *bad_reason = NULL; + unsigned long bad_flags = 0; - if (page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)) - return 0; - - bad_reason = NULL; - bad_flags = 0; if (unlikely(atomic_read(&page->_mapcount) != -1)) bad_reason = "nonzero mapcount"; if (unlikely(page->mapping != NULL)) @@ -1678,11 +1670,20 @@ static inline int check_new_page(struct page *page) if (unlikely(page->mem_cgroup)) bad_reason = "page still charged to cgroup"; #endif - if (unlikely(bad_reason)) { - bad_page(page, bad_reason, bad_flags); - return 1; - } - return 0; + bad_page(page, bad_reason, bad_flags); +} + +/* + * This page is about to be returned from the page allocator + */ +static inline int check_new_page(struct page *page) +{ + if (likely(page_expected_state(page, + PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) + return 0; + + check_new_page_bad(page); + return 1; } static inline bool free_pages_prezeroed(bool poisoned) -- cgit v1.2.3 From 4741526b83c5d3a3d661d1896f9e7414c5730bcb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 19 May 2016 17:14:44 -0700 Subject: mm, page_alloc: restore the original nodemask if the fast path allocation failed The page allocator fast path uses either the requested nodemask or cpuset_current_mems_allowed if cpusets are enabled. If the allocation context allows watermarks to be ignored then it can also ignore memory policies. However, on entering the allocator slowpath the nodemask may still be cpuset_current_mems_allowed and the policies are enforced. This patch resets the nodemask appropriately before entering the slowpath. Link: http://lkml.kernel.org/r/20160504143628.GU2858@techsingularity.net Signed-off-by: Vlastimil Babka Signed-off-by: Mel Gorman Cc: Jesper Dangaard Brouer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ecf663358b0d..5c469c1dfb8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3636,6 +3636,12 @@ retry_cpuset: alloc_mask = memalloc_noio_flags(gfp_mask); ac.spread_dirty_pages = false; + /* + * Restore the original nodemask if it was potentially replaced with + * &cpuset_current_mems_allowed to optimize the fast-path attempt. + */ + if (cpusets_enabled()) + ac.nodemask = nodemask; page = __alloc_pages_slowpath(alloc_mask, order, &ac); no_zone: -- cgit v1.2.3 From f0281a00fe80f0e689dd51e68c3aed5f6ef1bf58 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 20 May 2016 16:56:25 -0700 Subject: mm: workingset: only do workingset activations on reads This is a follow-up to http://www.spinics.net/lists/linux-mm/msg101739.html where Andres reported his database workingset being pushed out by the minimum size enforcement of the inactive file list - currently 50% of cache - as well as repeatedly written file pages that are never actually read. Two changes fell out of the discussions. The first change observes that pages that are only ever written don't benefit from caching beyond what the writeback cache does for partial page writes, and so we shouldn't promote them to the active file list where they compete with pages whose cached data is actually accessed repeatedly. This change comes in two patches - one for in-cache write accesses and one for refaults triggered by writes, neither of which should promote a cache page. Second, with the refault detection we don't need to set 50% of the cache aside for used-once cache anymore since we can detect frequently used pages even when they are evicted between accesses. We can allow the active list to be bigger and thus protect a bigger workingset that isn't challenged by streamers. Depending on the access patterns, this can increase major faults during workingset transitions for better performance during stable phases. This patch (of 3): When rewriting a page, the data in that page is replaced with new data. This means that evicting something else from the active file list, in order to cache data that will be replaced by something else, is likely to be a waste of memory. It is better to save the active list for frequently read pages, because reads actually use the data that is in the page. This patch ignores partial writes, because it is unclear whether the complexity of identifying those is worth any potential performance gain obtained from better caching pages that see repeated partial writes at large enough intervals to not get caught by the use-twice promotion code used for the inactive file list. Signed-off-by: Rik van Riel Signed-off-by: Johannes Weiner Reported-by: Andres Freund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 01690338e3d2..beba6bd6b511 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -713,8 +713,12 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, * The page might have been evicted from cache only * recently, in which case it should be activated like * any other repeatedly accessed page. + * The exception is pages getting rewritten; evicting other + * data from the working set, only to cache data that will + * get overwritten with something else, is a waste of memory. */ - if (shadow && workingset_refault(shadow)) { + if (!(gfp_mask & __GFP_WRITE) && + shadow && workingset_refault(shadow)) { SetPageActive(page); workingset_activation(page); } else -- cgit v1.2.3 From bbddabe2e436aa7869b3ac5248df5c14ddde0cbf Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 20 May 2016 16:56:28 -0700 Subject: mm: filemap: only do access activations on reads Andres observed that his database workload is struggling with the transaction journal creating pressure on frequently read pages. Access patterns like transaction journals frequently write the same pages over and over, but in the majority of cases those pages are never read back. There are no caching benefits to be had for those pages, so activating them and having them put pressure on pages that do benefit from caching is a bad choice. Leave page activations to read accesses and don't promote pages based on writes alone. It could be said that partially written pages do contain cache-worthy data, because even if *userspace* does not access the unwritten part, the kernel still has to read it from the filesystem for correctness. However, a counter argument is that these pages enjoy at least *some* protection over other inactive file pages through the writeback cache, in the sense that dirty pages are written back with a delay and cache reclaim leaves them alone until they have been written back to disk. Should that turn out to be insufficient and we see increased read IO from partial writes under memory pressure, we can always go back and update grab_cache_page_write_begin() to take (pos, len) so that it can tell partial writes from pages that don't need partial reads. But for now, keep it simple. Signed-off-by: Johannes Weiner Reported-by: Andres Freund Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index beba6bd6b511..8f4859989f1b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2578,7 +2578,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { struct page *page; - int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; + int fgp_flags = FGP_LOCK|FGP_WRITE|FGP_CREAT; if (flags & AOP_FLAG_NOFS) fgp_flags |= FGP_NOFS; -- cgit v1.2.3 From 59dc76b0d4dfdd7dc46a1010e4afb44f60f3e97f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 20 May 2016 16:56:31 -0700 Subject: mm: vmscan: reduce size of inactive file list The inactive file list should still be large enough to contain readahead windows and freshly written file data, but it no longer is the only source for detecting multiple accesses to file pages. The workingset refault measurement code causes recently evicted file pages that get accessed again after a shorter interval to be promoted directly to the active list. With that mechanism in place, we can afford to (on a larger system) dedicate more memory to the active file list, so we can actually cache more of the frequently used file pages in memory, and not have them pushed out by streaming writes, once-used streaming file reads, etc. This can help things like database workloads, where only half the page cache can currently be used to cache the database working set. This patch automatically increases that fraction on larger systems, using the same ratio that has already been used for anonymous memory. [hannes@cmpxchg.org: cgroup-awareness] Signed-off-by: Rik van Riel Signed-off-by: Johannes Weiner Reported-by: Andres Freund Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 25 ----------- mm/page_alloc.c | 44 ------------------- mm/vmscan.c | 104 ++++++++++++++++++--------------------------- 3 files changed, 42 insertions(+), 131 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 94da96738df3..a805474df4ab 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -415,25 +415,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) return mz->lru_size[lru]; } -static inline bool mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - unsigned long inactive_ratio; - unsigned long inactive; - unsigned long active; - unsigned long gb; - - inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); - active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); - - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); - else - inactive_ratio = 1; - - return inactive * inactive_ratio < active; -} - void mem_cgroup_handle_over_high(void); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, @@ -646,12 +627,6 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) return true; } -static inline bool -mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) -{ - return true; -} - static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5c469c1dfb8b..edbdf56b3c9b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6670,49 +6670,6 @@ void setup_per_zone_wmarks(void) mutex_unlock(&zonelists_mutex); } -/* - * The inactive anon list should be small enough that the VM never has to - * do too much work, but large enough that each inactive page has a chance - * to be referenced again before it is swapped out. - * - * The inactive_anon ratio is the target ratio of ACTIVE_ANON to - * INACTIVE_ANON pages on this zone's LRU, maintained by the - * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of - * the anonymous pages are kept on the inactive list. - * - * total target max - * memory ratio inactive anon - * ------------------------------------- - * 10MB 1 5MB - * 100MB 1 50MB - * 1GB 3 250MB - * 10GB 10 0.9GB - * 100GB 31 3GB - * 1TB 101 10GB - * 10TB 320 32GB - */ -static void __meminit calculate_zone_inactive_ratio(struct zone *zone) -{ - unsigned int gb, ratio; - - /* Zone size in gigabytes */ - gb = zone->managed_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; - - zone->inactive_ratio = ratio; -} - -static void __meminit setup_per_zone_inactive_ratio(void) -{ - struct zone *zone; - - for_each_zone(zone) - calculate_zone_inactive_ratio(zone); -} - /* * Initialise min_free_kbytes. * @@ -6758,7 +6715,6 @@ int __meminit init_per_zone_wmark_min(void) setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); - setup_per_zone_inactive_ratio(); return 0; } core_initcall(init_per_zone_wmark_min) diff --git a/mm/vmscan.c b/mm/vmscan.c index dcfdfc1a0942..38d6d06c955f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1862,83 +1862,63 @@ static void shrink_active_list(unsigned long nr_to_scan, free_hot_cold_page_list(&l_hold, true); } -#ifdef CONFIG_SWAP -static bool inactive_anon_is_low_global(struct zone *zone) -{ - unsigned long active, inactive; - - active = zone_page_state(zone, NR_ACTIVE_ANON); - inactive = zone_page_state(zone, NR_INACTIVE_ANON); - - return inactive * zone->inactive_ratio < active; -} - -/** - * inactive_anon_is_low - check if anonymous pages need to be deactivated - * @lruvec: LRU vector to check +/* + * The inactive anon list should be small enough that the VM never has + * to do too much work. * - * Returns true if the zone does not have enough inactive anon pages, - * meaning some active anon pages need to be deactivated. - */ -static bool inactive_anon_is_low(struct lruvec *lruvec) -{ - /* - * If we don't have swap space, anonymous page deactivation - * is pointless. - */ - if (!total_swap_pages) - return false; - - if (!mem_cgroup_disabled()) - return mem_cgroup_inactive_anon_is_low(lruvec); - - return inactive_anon_is_low_global(lruvec_zone(lruvec)); -} -#else -static inline bool inactive_anon_is_low(struct lruvec *lruvec) -{ - return false; -} -#endif - -/** - * inactive_file_is_low - check if file pages need to be deactivated - * @lruvec: LRU vector to check + * The inactive file list should be small enough to leave most memory + * to the established workingset on the scan-resistant active list, + * but large enough to avoid thrashing the aggregate readahead window. * - * When the system is doing streaming IO, memory pressure here - * ensures that active file pages get deactivated, until more - * than half of the file pages are on the inactive list. + * Both inactive lists should also be large enough that each inactive + * page has a chance to be referenced again before it is reclaimed. * - * Once we get to that situation, protect the system's working - * set from being evicted by disabling active file page aging. + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * of 3 means 3:1 or 25% of the pages are kept on the inactive list. * - * This uses a different ratio than the anonymous pages, because - * the page cache uses a use-once replacement algorithm. + * total target max + * memory ratio inactive + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB */ -static bool inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_list_is_low(struct lruvec *lruvec, bool file) { + unsigned long inactive_ratio; unsigned long inactive; unsigned long active; + unsigned long gb; - inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_FILE); - active = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE); + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!file && !total_swap_pages) + return false; - return active > inactive; -} + inactive = lruvec_lru_size(lruvec, file * LRU_FILE); + active = lruvec_lru_size(lruvec, file * LRU_FILE + LRU_ACTIVE); -static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) -{ - if (is_file_lru(lru)) - return inactive_file_is_low(lruvec); + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); else - return inactive_anon_is_low(lruvec); + inactive_ratio = 1; + + return inactive * inactive_ratio < active; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, lru)) + if (inactive_list_is_low(lruvec, is_file_lru(lru))) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2059,7 +2039,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_file_is_low(lruvec) && + if (!inactive_list_is_low(lruvec, true) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2301,7 +2281,7 @@ static void shrink_zone_memcg(struct zone *zone, struct mem_cgroup *memcg, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(lruvec)) + if (inactive_list_is_low(lruvec, false)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -2962,7 +2942,7 @@ static void age_active_anon(struct zone *zone, struct scan_control *sc) do { struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); - if (inactive_anon_is_low(lruvec)) + if (inactive_list_is_low(lruvec, false)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); -- cgit v1.2.3 From b6459cc154e804f0de0d61fa023c4946b742cc96 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:34 -0700 Subject: vmscan: consider classzone_idx in compaction_ready Motivation: As pointed out by Linus [2][3] relying on zone_reclaimable as a way to communicate the reclaim progress is rater dubious. I tend to agree, not only it is really obscure, it is not hard to imagine cases where a single page freed in the loop keeps all the reclaimers looping without getting any progress because their gfp_mask wouldn't allow to get that page anyway (e.g. single GFP_ATOMIC alloc and free loop). This is rather rare so it doesn't happen in the practice but the current logic which we have is rather obscure and hard to follow a also non-deterministic. This is an attempt to make the OOM detection more deterministic and easier to follow because each reclaimer basically tracks its own progress which is implemented at the page allocator layer rather spread out between the allocator and the reclaim. The more on the implementation is described in the first patch. I have tested several different scenarios but it should be clear that testing OOM killer is quite hard to be representative. There is usually a tiny gap between almost OOM and full blown OOM which is often time sensitive. Anyway, I have tested the following 2 scenarios and I would appreciate if there are more to test. Testing environment: a virtual machine with 2G of RAM and 2CPUs without any swap to make the OOM more deterministic. 1) 2 writers (each doing dd with 4M blocks to an xfs partition with 1G file size, removes the files and starts over again) running in parallel for 10s to build up a lot of dirty pages when 100 parallel mem_eaters (anon private populated mmap which waits until it gets signal) with 80M each. This causes an OOM flood of course and I have compared both patched and unpatched kernels. The test is considered finished after there are no OOM conditions detected. This should tell us whether there are any excessive kills or some of them premature (e.g. due to dirty pages): I have performed two runs this time each after a fresh boot. * base kernel $ grep "Out of memory:" base-oom-run1.log | wc -l 78 $ grep "Out of memory:" base-oom-run2.log | wc -l 78 $ grep "Kill process" base-oom-run1.log | tail -n1 [ 91.391203] Out of memory: Kill process 3061 (mem_eater) score 39 or sacrifice child $ grep "Kill process" base-oom-run2.log | tail -n1 [ 82.141919] Out of memory: Kill process 3086 (mem_eater) score 39 or sacrifice child $ grep "DMA32 free:" base-oom-run1.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk min: 5376.00 max: 6776.00 avg: 5530.75 std: 166.50 nr: 61 $ grep "DMA32 free:" base-oom-run2.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk min: 5416.00 max: 5608.00 avg: 5514.15 std: 42.94 nr: 52 $ grep "DMA32.*all_unreclaimable? no" base-oom-run1.log | wc -l 1 $ grep "DMA32.*all_unreclaimable? no" base-oom-run2.log | wc -l 3 * patched kernel $ grep "Out of memory:" patched-oom-run1.log | wc -l 78 miso@tiehlicka /mnt/share/devel/miso/kvm $ grep "Out of memory:" patched-oom-run2.log | wc -l 77 e grep "Kill process" patched-oom-run1.log | tail -n1 [ 497.317732] Out of memory: Kill process 3108 (mem_eater) score 39 or sacrifice child $ grep "Kill process" patched-oom-run2.log | tail -n1 [ 316.169920] Out of memory: Kill process 3093 (mem_eater) score 39 or sacrifice child $ grep "DMA32 free:" patched-oom-run1.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk min: 5420.00 max: 5808.00 avg: 5513.90 std: 60.45 nr: 78 $ grep "DMA32 free:" patched-oom-run2.log | sed 's@.*free:\([0-9]*\)kB.*@\1@' | calc_min_max.awk min: 5380.00 max: 6384.00 avg: 5520.94 std: 136.84 nr: 77 e grep "DMA32.*all_unreclaimable? no" patched-oom-run1.log | wc -l 2 $ grep "DMA32.*all_unreclaimable? no" patched-oom-run2.log | wc -l 3 The patched kernel run noticeably longer while invoking OOM killer same number of times. This means that the original implementation is much more aggressive and triggers the OOM killer sooner. free pages stats show that neither kernels went OOM too early most of the time, though. I guess the difference is in the backoff when retries without any progress do sleep for a while if there is memory under writeback or dirty which is highly likely considering the parallel IO. Both kernels have seen races where zone wasn't marked unreclaimable and we still hit the OOM killer. This is most likely a race where a task managed to exit between the last allocation attempt and the oom killer invocation. 2) 2 writers again with 10s of run and then 10 mem_eaters to consume as much memory as possible without triggering the OOM killer. This required a lot of tuning but I've considered 3 consecutive runs in three different boots without OOM as a success. * base kernel size=$(awk '/MemFree/{printf "%dK", ($2/10)-(16*1024)}' /proc/meminfo) * patched kernel size=$(awk '/MemFree/{printf "%dK", ($2/10)-(12*1024)}' /proc/meminfo) That means 40M more memory was usable without triggering OOM killer. The base kernel sometimes managed to handle the same as patched but it wasn't consistent and failed in at least on of the 3 runs. This seems like a minor improvement. I was testing also GPF_REPEAT costly requests (hughetlb) with fragmented memory and under memory pressure. The results are in patch 11 where the logic is implemented. In short I can see huge improvement there. I am certainly interested in other usecases as well as well as any feedback. Especially those which require higher order requests. This patch (of 14): While playing with the oom detection rework [1] I have noticed that my heavy order-9 (hugetlb) load close to OOM ended up in an endless loop where the reclaim hasn't made any progress but did_some_progress didn't reflect that and compaction_suitable was backing off because no zone is above low wmark + 1 << order. It turned out that this is in fact an old standing bug in compaction_ready which ignores the requested_highidx and did the watermark check for 0 classzone_idx. This succeeds for zone DMA most of the time as the zone is mostly unused because of lowmem protection. As a result costly high order allocatios always report a successfull progress even when there was none. This wasn't a problem so far because these allocations usually fail quite early or retry only few times with __GFP_REPEAT but this will change after later patch in this series so make sure to not lie about the progress and propagate requested_highidx down to compaction_ready and use it for both the watermak check and compaction_suitable to fix this issue. [1] http://lkml.kernel.org/r/1459855533-4600-1-git-send-email-mhocko@kernel.org [2] https://lkml.org/lkml/2015/10/12/808 [3] https://lkml.org/lkml/2015/10/13/597 Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: Johannes Weiner Cc: Mel Gorman Cc: David Rientjes Cc: Tetsuo Handa Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 38d6d06c955f..a386454c015a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2459,7 +2459,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc, * Returns true if compaction should go ahead for a high-order request, or * the high-order allocation would succeed without compaction. */ -static inline bool compaction_ready(struct zone *zone, int order) +static inline bool compaction_ready(struct zone *zone, int order, int classzone_idx) { unsigned long balance_gap, watermark; bool watermark_ok; @@ -2473,7 +2473,7 @@ static inline bool compaction_ready(struct zone *zone, int order) balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, classzone_idx); /* * If compaction is deferred, reclaim up to a point where @@ -2486,7 +2486,7 @@ static inline bool compaction_ready(struct zone *zone, int order) * If compaction is not ready to start and allocation is not likely * to succeed without it, then keep reclaiming. */ - if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) + if (compaction_suitable(zone, order, 0, classzone_idx) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2566,7 +2566,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (IS_ENABLED(CONFIG_COMPACTION) && sc->order > PAGE_ALLOC_COSTLY_ORDER && zonelist_zone_idx(z) <= requested_highidx && - compaction_ready(zone, sc->order)) { + compaction_ready(zone, sc->order, requested_highidx)) { sc->compaction_ready = true; continue; } -- cgit v1.2.3 From ea7ab982b6bdb7ce218fd3a7850bb2e2b414fdd0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:38 -0700 Subject: mm, compaction: change COMPACT_ constants into enum Compaction code is doing weird dances between COMPACT_FOO -> int -> unsigned long But there doesn't seem to be any reason for that. All functions which return/use one of those constants are not expecting any other value so it really makes sense to define an enum for them and make it clear that no other values are expected. This is a pure cleanup and shouldn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 45 +++++++++++++++++++++++++++------------------ mm/compaction.c | 27 ++++++++++++++------------- mm/page_alloc.c | 2 +- 3 files changed, 42 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 242b660f64e6..706cbf00e919 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,21 +2,29 @@ #define _LINUX_COMPACTION_H /* Return values for compact_zone() and try_to_compact_pages() */ -/* compaction didn't start as it was deferred due to past failures */ -#define COMPACT_DEFERRED 0 -/* compaction didn't start as it was not possible or direct reclaim was more suitable */ -#define COMPACT_SKIPPED 1 -/* compaction should continue to another pageblock */ -#define COMPACT_CONTINUE 2 -/* direct compaction partially compacted a zone and there are suitable pages */ -#define COMPACT_PARTIAL 3 -/* The full zone was compacted */ -#define COMPACT_COMPLETE 4 -/* For more detailed tracepoint output */ -#define COMPACT_NO_SUITABLE_PAGE 5 -#define COMPACT_NOT_SUITABLE_ZONE 6 -#define COMPACT_CONTENDED 7 /* When adding new states, please adjust include/trace/events/compaction.h */ +enum compact_result { + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* + * compaction didn't start as it was not possible or direct reclaim + * was more suitable + */ + COMPACT_SKIPPED, + /* compaction should continue to another pageblock */ + COMPACT_CONTINUE, + /* + * direct compaction partially compacted a zone and there are suitable + * pages + */ + COMPACT_PARTIAL, + /* The full zone was compacted */ + COMPACT_COMPLETE, + /* For more detailed tracepoint output */ + COMPACT_NO_SUITABLE_PAGE, + COMPACT_NOT_SUITABLE_ZONE, + COMPACT_CONTENDED, +}; /* Used to signal whether compaction detected need_sched() or lock contention */ /* No contention detected */ @@ -38,12 +46,13 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int sysctl_compact_unevictable_allowed; extern int fragmentation_index(struct zone *zone, unsigned int order); -extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +extern enum compact_result try_to_compact_pages(gfp_t gfp_mask, + unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); -extern unsigned long compaction_suitable(struct zone *zone, int order, +extern enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx); extern void defer_compaction(struct zone *zone, int order); @@ -57,7 +66,7 @@ extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); #else -static inline unsigned long try_to_compact_pages(gfp_t gfp_mask, +static inline enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) @@ -73,7 +82,7 @@ static inline void reset_isolation_suitable(pg_data_t *pgdat) { } -static inline unsigned long compaction_suitable(struct zone *zone, int order, +static inline enum compact_result compaction_suitable(struct zone *zone, int order, int alloc_flags, int classzone_idx) { return COMPACT_SKIPPED; diff --git a/mm/compaction.c b/mm/compaction.c index eda3c2244f30..e721d252c5d2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1229,7 +1229,7 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static int __compact_finished(struct zone *zone, struct compact_control *cc, +static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, const int migratetype) { unsigned int order; @@ -1292,8 +1292,9 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc, return COMPACT_NO_SUITABLE_PAGE; } -static int compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result compact_finished(struct zone *zone, + struct compact_control *cc, + const int migratetype) { int ret; @@ -1312,7 +1313,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc, * COMPACT_PARTIAL - If the allocation would succeed without compaction * COMPACT_CONTINUE - If compaction should run now */ -static unsigned long __compaction_suitable(struct zone *zone, int order, +static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { @@ -1358,11 +1359,11 @@ static unsigned long __compaction_suitable(struct zone *zone, int order, return COMPACT_CONTINUE; } -unsigned long compaction_suitable(struct zone *zone, int order, +enum compact_result compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); trace_mm_compaction_suitable(zone, order, ret); @@ -1372,9 +1373,9 @@ unsigned long compaction_suitable(struct zone *zone, int order, return ret; } -static int compact_zone(struct zone *zone, struct compact_control *cc) +static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { - int ret; + enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); @@ -1530,11 +1531,11 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, int order, +static enum compact_result compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, enum migrate_mode mode, int *contended, unsigned int alloc_flags, int classzone_idx) { - unsigned long ret; + enum compact_result ret; struct compact_control cc = { .nr_freepages = 0, .nr_migratepages = 0, @@ -1572,7 +1573,7 @@ int sysctl_extfrag_threshold = 500; * * This is the main entry point for direct page compaction. */ -unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, +enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, int *contended) { @@ -1580,7 +1581,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - int rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_DEFERRED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1594,7 +1595,7 @@ unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { - int status; + enum compact_result status; int zone_contended; if (compaction_deferred(zone, order)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index edbdf56b3c9b..ed62c4b90598 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3188,7 +3188,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum migrate_mode mode, int *contended_compaction, bool *deferred_compaction) { - unsigned long compact_result; + enum compact_result compact_result; struct page *page; if (!order) -- cgit v1.2.3 From c46649deae3f00aa8ba8716f0ddb8eef2dc9532f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:41 -0700 Subject: mm, compaction: cover all compaction mode in compact_zone The compiler is complaining after "mm, compaction: change COMPACT_ constants into enum" mm/compaction.c: In function `compact_zone': mm/compaction.c:1350:2: warning: enumeration value `COMPACT_DEFERRED' not handled in switch [-Wswitch] switch (ret) { ^ mm/compaction.c:1350:2: warning: enumeration value `COMPACT_COMPLETE' not handled in switch [-Wswitch] mm/compaction.c:1350:2: warning: enumeration value `COMPACT_NO_SUITABLE_PAGE' not handled in switch [-Wswitch] mm/compaction.c:1350:2: warning: enumeration value `COMPACT_NOT_SUITABLE_ZONE' not handled in switch [-Wswitch] mm/compaction.c:1350:2: warning: enumeration value `COMPACT_CONTENDED' not handled in switch [-Wswitch] compaction_suitable is allowed to return only COMPACT_PARTIAL, COMPACT_SKIPPED and COMPACT_CONTINUE so other cases are simply impossible. Put a VM_BUG_ON to catch an impossible return value. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index e721d252c5d2..455ecd87f48d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1383,15 +1383,12 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); - switch (ret) { - case COMPACT_PARTIAL: - case COMPACT_SKIPPED: - /* Compaction is likely to fail */ + /* Compaction is likely to fail */ + if (ret == COMPACT_PARTIAL || ret == COMPACT_SKIPPED) return ret; - case COMPACT_CONTINUE: - /* Fall through to compaction */ - ; - } + + /* huh, compaction_suitable is returning something unexpected */ + VM_BUG_ON(ret != COMPACT_CONTINUE); /* * Clear pageblock skip if there were failures recently and compaction -- cgit v1.2.3 From 1d4746d395975e0ff5103e20ab169d1a95b4ef9e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:44 -0700 Subject: mm, compaction: distinguish COMPACT_DEFERRED from COMPACT_SKIPPED try_to_compact_pages() can currently return COMPACT_SKIPPED even when the compaction is defered for some zone just because zone DMA is skipped in 99% of cases due to watermark checks. This makes COMPACT_DEFERRED basically unusable for the page allocator as a feedback mechanism. Make sure we distinguish those two states properly and switch their ordering in the enum. This would mean that the COMPACT_SKIPPED will be returned only when all eligible zones are skipped. As a result COMPACT_DEFERRED handling for THP in __alloc_pages_slowpath will be more precise and we would bail out rather than reclaim. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 7 +++++-- include/trace/events/compaction.h | 2 +- mm/compaction.c | 8 +++++--- 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 706cbf00e919..11f228712ed5 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -4,13 +4,16 @@ /* Return values for compact_zone() and try_to_compact_pages() */ /* When adding new states, please adjust include/trace/events/compaction.h */ enum compact_result { - /* compaction didn't start as it was deferred due to past failures */ - COMPACT_DEFERRED, /* * compaction didn't start as it was not possible or direct reclaim * was more suitable */ COMPACT_SKIPPED, + /* compaction didn't start as it was deferred due to past failures */ + COMPACT_DEFERRED, + /* compaction not active last round */ + COMPACT_INACTIVE = COMPACT_DEFERRED, + /* compaction should continue to another pageblock */ COMPACT_CONTINUE, /* diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index e215bf68f521..6ba16c86d7db 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -10,8 +10,8 @@ #include #define COMPACTION_STATUS \ - EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_SKIPPED, "skipped") \ + EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ EM( COMPACT_COMPLETE, "complete") \ diff --git a/mm/compaction.c b/mm/compaction.c index 455ecd87f48d..b2b94474dd28 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1578,7 +1578,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, int may_perform_io = gfp_mask & __GFP_IO; struct zoneref *z; struct zone *zone; - enum compact_result rc = COMPACT_DEFERRED; + enum compact_result rc = COMPACT_SKIPPED; int all_zones_contended = COMPACT_CONTENDED_LOCK; /* init for &= op */ *contended = COMPACT_CONTENDED_NONE; @@ -1595,8 +1595,10 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, enum compact_result status; int zone_contended; - if (compaction_deferred(zone, order)) + if (compaction_deferred(zone, order)) { + rc = max_t(enum compact_result, COMPACT_DEFERRED, rc); continue; + } status = compact_zone_order(zone, order, gfp_mask, mode, &zone_contended, alloc_flags, @@ -1667,7 +1669,7 @@ break_loop: * If at least one zone wasn't deferred or skipped, we report if all * zones that were tried were lock contended. */ - if (rc > COMPACT_SKIPPED && all_zones_contended) + if (rc > COMPACT_INACTIVE && all_zones_contended) *contended = COMPACT_CONTENDED_LOCK; return rc; -- cgit v1.2.3 From c8f7de0bfae36e8532e5e25a39d15407f02aca78 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:47 -0700 Subject: mm, compaction: distinguish between full and partial COMPACT_COMPLETE COMPACT_COMPLETE now means that compaction and free scanner met. This is not very useful information if somebody just wants to use this feedback and make any decisions based on that. The current caller might be a poor guy who just happened to scan tiny portion of the zone and that could be the reason no suitable pages were compacted. Make sure we distinguish the full and partial zone walks. Consumers should treat COMPACT_PARTIAL_SKIPPED as a potential success and be optimistic in retrying. The existing users of COMPACT_COMPLETE are conservatively changed to use COMPACT_PARTIAL_SKIPPED as well but some of them should be probably reconsidered and only defer the compaction only for COMPACT_COMPLETE with the new semantic. This patch shouldn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 +++++++++- include/trace/events/compaction.h | 1 + mm/compaction.c | 14 +++++++++++--- mm/internal.h | 1 + 4 files changed, 22 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 11f228712ed5..9b37f9d3f7a8 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,7 +21,15 @@ enum compact_result { * pages */ COMPACT_PARTIAL, - /* The full zone was compacted */ + /* + * direct compaction has scanned part of the zone but wasn't successfull + * to compact suitable pages. + */ + COMPACT_PARTIAL_SKIPPED, + /* + * The full zone was compacted scanned but wasn't successfull to compact + * suitable pages. + */ COMPACT_COMPLETE, /* For more detailed tracepoint output */ COMPACT_NO_SUITABLE_PAGE, diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 6ba16c86d7db..36e2d6fb1360 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -14,6 +14,7 @@ EM( COMPACT_DEFERRED, "deferred") \ EM( COMPACT_CONTINUE, "continue") \ EM( COMPACT_PARTIAL, "partial") \ + EM( COMPACT_PARTIAL_SKIPPED, "partial_skipped") \ EM( COMPACT_COMPLETE, "complete") \ EM( COMPACT_NO_SUITABLE_PAGE, "no_suitable_page") \ EM( COMPACT_NOT_SUITABLE_ZONE, "not_suitable_zone") \ diff --git a/mm/compaction.c b/mm/compaction.c index b2b94474dd28..4af1577adb5c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1252,7 +1252,10 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (cc->direct_compaction) zone->compact_blockskip_flush = true; - return COMPACT_COMPLETE; + if (cc->whole_zone) + return COMPACT_COMPLETE; + else + return COMPACT_PARTIAL_SKIPPED; } if (is_via_compact_memory(cc->order)) @@ -1413,6 +1416,10 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } + + if (cc->migrate_pfn == start_pfn) + cc->whole_zone = true; + cc->last_migrated_pfn = 0; trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, @@ -1634,7 +1641,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order, goto break_loop; } - if (mode != MIGRATE_ASYNC && status == COMPACT_COMPLETE) { + if (mode != MIGRATE_ASYNC && (status == COMPACT_COMPLETE || + status == COMPACT_PARTIAL_SKIPPED)) { /* * We think that allocation won't succeed in this zone * so we defer compaction there. If it ends up @@ -1881,7 +1889,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.classzone_idx, 0)) { success = true; compaction_defer_reset(zone, cc.order, false); - } else if (status == COMPACT_COMPLETE) { + } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) { /* * We use sync migration mode here, so we defer like * sync direct compaction does. diff --git a/mm/internal.h b/mm/internal.h index 3ac544f1963f..f6f3353b0868 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -174,6 +174,7 @@ struct compact_control { enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool direct_compaction; /* False from kcompactd or /proc/... */ + bool whole_zone; /* Whole zone has been scanned */ int order; /* order a direct compactor needs */ const gfp_t gfp_mask; /* gfp mask of a direct compactor */ const unsigned int alloc_flags; /* alloc flags of a direct compactor */ -- cgit v1.2.3 From c5d01d0d18e2ab7a21f0371b00e4d1a06f79cdf5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:56:53 -0700 Subject: mm, compaction: simplify __alloc_pages_direct_compact feedback interface __alloc_pages_direct_compact communicates potential back off by two variables: - deferred_compaction tells that the compaction returned COMPACT_DEFERRED - contended_compaction is set when there is a contention on zone->lock resp. zone->lru_lock locks __alloc_pages_slowpath then backs of for THP allocation requests to prevent from long stalls. This is rather messy and it would be much cleaner to return a single compact result value and hide all the nasty details into __alloc_pages_direct_compact. This patch shouldn't introduce any functional changes. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 67 ++++++++++++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ed62c4b90598..8bcc10616fab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3185,29 +3185,21 @@ out: static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended_compaction, - bool *deferred_compaction) + enum migrate_mode mode, enum compact_result *compact_result) { - enum compact_result compact_result; struct page *page; + int contended_compaction; if (!order) return NULL; current->flags |= PF_MEMALLOC; - compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, - mode, contended_compaction); + *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, + mode, &contended_compaction); current->flags &= ~PF_MEMALLOC; - switch (compact_result) { - case COMPACT_DEFERRED: - *deferred_compaction = true; - /* fall-through */ - case COMPACT_SKIPPED: + if (*compact_result <= COMPACT_INACTIVE) return NULL; - default: - break; - } /* * At least in one zone compaction wasn't deferred or skipped, so let's @@ -3233,6 +3225,24 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, */ count_vm_event(COMPACTFAIL); + /* + * In all zones where compaction was attempted (and not + * deferred or skipped), lock contention has been detected. + * For THP allocation we do not want to disrupt the others + * so we fallback to base pages instead. + */ + if (contended_compaction == COMPACT_CONTENDED_LOCK) + *compact_result = COMPACT_CONTENDED; + + /* + * If compaction was aborted due to need_resched(), we do not + * want to further increase allocation latency, unless it is + * khugepaged trying to collapse. + */ + if (contended_compaction == COMPACT_CONTENDED_SCHED + && !(current->flags & PF_KTHREAD)) + *compact_result = COMPACT_CONTENDED; + cond_resched(); return NULL; @@ -3241,8 +3251,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, - enum migrate_mode mode, int *contended_compaction, - bool *deferred_compaction) + enum migrate_mode mode, enum compact_result *compact_result) { return NULL; } @@ -3387,8 +3396,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; - bool deferred_compaction = false; - int contended_compaction = COMPACT_CONTENDED_NONE; + enum compact_result compact_result; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3475,8 +3483,7 @@ retry: */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); + &compact_result); if (page) goto got_pg; @@ -3489,25 +3496,14 @@ retry: * to heavily disrupt the system, so we fail the allocation * instead of entering direct reclaim. */ - if (deferred_compaction) - goto nopage; - - /* - * In all zones where compaction was attempted (and not - * deferred or skipped), lock contention has been detected. - * For THP allocation we do not want to disrupt the others - * so we fallback to base pages instead. - */ - if (contended_compaction == COMPACT_CONTENDED_LOCK) + if (compact_result == COMPACT_DEFERRED) goto nopage; /* - * If compaction was aborted due to need_resched(), we do not - * want to further increase allocation latency, unless it is - * khugepaged trying to collapse. + * Compaction is contended so rather back off than cause + * excessive stalls. */ - if (contended_compaction == COMPACT_CONTENDED_SCHED - && !(current->flags & PF_KTHREAD)) + if(compact_result == COMPACT_CONTENDED) goto nopage; } @@ -3555,8 +3551,7 @@ noretry: */ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, migration_mode, - &contended_compaction, - &deferred_compaction); + &compact_result); if (page) goto got_pg; nopage: -- cgit v1.2.3 From 0a0337e0d1d134465778a16f5cbea95086e8e9e0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:00 -0700 Subject: mm, oom: rework oom detection __alloc_pages_slowpath has traditionally relied on the direct reclaim and did_some_progress as an indicator that it makes sense to retry allocation rather than declaring OOM. shrink_zones had to rely on zone_reclaimable if shrink_zone didn't make any progress to prevent from a premature OOM killer invocation - the LRU might be full of dirty or writeback pages and direct reclaim cannot clean those up. zone_reclaimable allows to rescan the reclaimable lists several times and restart if a page is freed. This is really subtle behavior and it might lead to a livelock when a single freed page keeps allocator looping but the current task will not be able to allocate that single page. OOM killer would be more appropriate than looping without any progress for unbounded amount of time. This patch changes OOM detection logic and pulls it out from shrink_zone which is too low to be appropriate for any high level decisions such as OOM which is per zonelist property. It is __alloc_pages_slowpath which knows how many attempts have been done and what was the progress so far therefore it is more appropriate to implement this logic. The new heuristic is implemented in should_reclaim_retry helper called from __alloc_pages_slowpath. It tries to be more deterministic and easier to follow. It builds on an assumption that retrying makes sense only if the currently reclaimable memory + free pages would allow the current allocation request to succeed (as per __zone_watermark_ok) at least for one zone in the usable zonelist. This alone wouldn't be sufficient, though, because the writeback might get stuck and reclaimable pages might be pinned for a really long time or even depend on the current allocation context. Therefore there is a backoff mechanism implemented which reduces the reclaim target after each reclaim round without any progress. This means that we should eventually converge to only NR_FREE_PAGES as the target and fail on the wmark check and proceed to OOM. The backoff is simple and linear with 1/16 of the reclaimable pages for each round without any progress. We are optimistic and reset counter for successful reclaim rounds. Costly high order pages mostly preserve their semantic and those without __GFP_REPEAT fail right away while those which have the flag set will back off after the amount of reclaimable pages reaches equivalent of the requested order. The only difference is that if there was no progress during the reclaim we rely on zone watermark check. This is more logical thing to do than previous 1< Acked-by: Hillf Danton Cc: Vladimir Davydov Cc: Johannes Weiner Cc: David Rientjes Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/page_alloc.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++----- mm/vmscan.c | 25 +++---------- 3 files changed, 97 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index ad220359f1b0..0af2bb2028fd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -316,6 +316,7 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, struct vm_area_struct *vma); /* linux/mm/vmscan.c */ +extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8bcc10616fab..fa39efc3a692 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3386,6 +3386,77 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) return (gfp_mask & (GFP_TRANSHUGE | __GFP_KSWAPD_RECLAIM)) == GFP_TRANSHUGE; } +/* + * Maximum number of reclaim retries without any progress before OOM killer + * is consider as the only way to move forward. + */ +#define MAX_RECLAIM_RETRIES 16 + +/* + * Checks whether it makes sense to retry the reclaim to make a forward progress + * for the given allocation request. + * The reclaim feedback represented by did_some_progress (any progress during + * the last reclaim round), pages_reclaimed (cumulative number of reclaimed + * pages) and no_progress_loops (number of reclaim rounds without any progress + * in a row) is considered as well as the reclaimable pages on the applicable + * zone list (with a backoff mechanism which is a function of no_progress_loops). + * + * Returns true if a retry is viable or false to enter the oom path. + */ +static inline bool +should_reclaim_retry(gfp_t gfp_mask, unsigned order, + struct alloc_context *ac, int alloc_flags, + bool did_some_progress, unsigned long pages_reclaimed, + int no_progress_loops) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure we converge to OOM if we cannot make any progress + * several times in the row. + */ + if (no_progress_loops > MAX_RECLAIM_RETRIES) + return false; + + if (order > PAGE_ALLOC_COSTLY_ORDER) { + if (pages_reclaimed >= (1<zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + + available = zone_reclaimable_pages(zone); + available -= DIV_ROUND_UP(no_progress_loops * available, + MAX_RECLAIM_RETRIES); + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + + /* + * Would the allocation succeed if we reclaimed the whole + * available? + */ + if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), + ac->high_zoneidx, alloc_flags, available)) { + /* Wait for some write requests to complete then retry */ + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50); + return true; + } + } + + return false; +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) @@ -3397,6 +3468,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; enum compact_result compact_result; + int no_progress_loops = 0; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -3525,23 +3597,35 @@ retry: if (gfp_mask & __GFP_NORETRY) goto noretry; - /* Keep reclaiming pages as long as there is reasonable progress */ - pages_reclaimed += did_some_progress; - if ((did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) || - ((gfp_mask & __GFP_REPEAT) && pages_reclaimed < (1 << order))) { - /* Wait for some write requests to complete then retry */ - wait_iff_congested(ac->preferred_zoneref->zone, BLK_RW_ASYNC, HZ/50); - goto retry; + /* + * Do not retry costly high order allocations unless they are + * __GFP_REPEAT + */ + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + goto noretry; + + if (did_some_progress) { + no_progress_loops = 0; + pages_reclaimed += did_some_progress; + } else { + no_progress_loops++; } + if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, + did_some_progress > 0, pages_reclaimed, + no_progress_loops)) + goto retry; + /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) goto got_pg; /* Retry as long as the OOM killer is making progress */ - if (did_some_progress) + if (did_some_progress) { + no_progress_loops = 0; goto retry; + } noretry: /* diff --git a/mm/vmscan.c b/mm/vmscan.c index a386454c015a..c4a2f4512fca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -191,7 +191,7 @@ static bool sane_reclaim(struct scan_control *sc) } #endif -static unsigned long zone_reclaimable_pages(struct zone *zone) +unsigned long zone_reclaimable_pages(struct zone *zone) { unsigned long nr; @@ -2507,10 +2507,8 @@ static inline bool compaction_ready(struct zone *zone, int order, int classzone_ * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. - * - * Returns true if a zone was reclaimable. */ -static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) +static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; @@ -2518,7 +2516,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_scanned; gfp_t orig_mask; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); - bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2583,17 +2580,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; - if (nr_soft_reclaimed) - reclaimable = true; /* need some check for avoid more shrink_zone() */ } - if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) - reclaimable = true; - - if (global_reclaim(sc) && - !reclaimable && zone_reclaimable(zone)) - reclaimable = true; + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); } /* @@ -2601,8 +2591,6 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) * promoted it to __GFP_HIGHMEM. */ sc->gfp_mask = orig_mask; - - return reclaimable; } /* @@ -2627,7 +2615,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int initial_priority = sc->priority; unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool zones_reclaimable; retry: delayacct_freepages_start(); @@ -2638,7 +2625,7 @@ retry: vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - zones_reclaimable = shrink_zones(zonelist, sc); + shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) @@ -2685,10 +2672,6 @@ retry: goto retry; } - /* Any of the zones still reclaimable? Don't OOM. */ - if (zones_reclaimable) - return 1; - return 0; } -- cgit v1.2.3 From ede37713737834d98ec72ed299a305d53e909f73 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:03 -0700 Subject: mm: throttle on IO only when there are too many dirty and writeback pages wait_iff_congested has been used to throttle allocator before it retried another round of direct reclaim to allow the writeback to make some progress and prevent reclaim from looping over dirty/writeback pages without making any progress. We used to do congestion_wait before commit 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone") but that led to undesirable stalls and sleeping for the full timeout even when the BDI wasn't congested. Hence wait_iff_congested was used instead. But it seems that even wait_iff_congested doesn't work as expected. We might have a small file LRU list with all pages dirty/writeback and yet the bdi is not congested so this is just a cond_resched in the end and can end up triggering pre mature OOM. This patch replaces the unconditional wait_iff_congested by congestion_wait which is executed only if we _know_ that the last round of direct reclaim didn't make any progress and dirty+writeback pages are more than a half of the reclaimable pages on the zone which might be usable for our target allocation. This shouldn't reintroduce stalls fixed by 0e093d99763e because congestion_wait is called only when we are getting hopeless when sleeping is a better choice than OOM with many pages under IO. We have to preserve logic introduced by commit 373ccbe59270 ("mm, vmstat: allow WQ concurrency to discover memory reclaim doesn't make any progress") into the __alloc_pages_slowpath now that wait_iff_congested is not used anymore. As the only remaining user of wait_iff_congested is shrink_inactive_list we can remove the WQ specific short sleep from wait_iff_congested because the sleep is needed to be done only once in the allocation retry cycle. [mhocko@suse.com: high_zoneidx->ac_classzone_idx to evaluate memory reserves properly] Link: http://lkml.kernel.org/r/1463051677-29418-2-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 20 +++----------------- mm/page_alloc.c | 41 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 40 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 0c6317b7db38..ed173b8ae8f2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -957,9 +957,8 @@ EXPORT_SYMBOL(congestion_wait); * jiffies for either a BDI to exit congestion of the given @sync queue * or a write to complete. * - * In the absence of zone congestion, a short sleep or a cond_resched is - * performed to yield the processor and to allow other subsystems to make - * a forward progress. + * In the absence of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function @@ -979,20 +978,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout) */ if (atomic_read(&nr_wb_congested[sync]) == 0 || !test_bit(ZONE_CONGESTED, &zone->flags)) { - - /* - * Memory allocation/reclaim might be called from a WQ - * context and the current implementation of the WQ - * concurrency control doesn't recognize that a particular - * WQ is congested if the worker thread is looping without - * ever sleeping. Therefore we have to do a short sleep - * here rather than calling cond_resched(). - */ - if (current->flags & PF_WQ_WORKER) - schedule_timeout_uninterruptible(1); - else - cond_resched(); - + cond_resched(); /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); if (ret < 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fa39efc3a692..f51c302126a1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3436,8 +3436,9 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { unsigned long available; + unsigned long reclaimable; - available = zone_reclaimable_pages(zone); + available = reclaimable = zone_reclaimable_pages(zone); available -= DIV_ROUND_UP(no_progress_loops * available, MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); @@ -3447,9 +3448,41 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * available? */ if (__zone_watermark_ok(zone, order, min_wmark_pages(zone), - ac->high_zoneidx, alloc_flags, available)) { - /* Wait for some write requests to complete then retry */ - wait_iff_congested(zone, BLK_RW_ASYNC, HZ/50); + ac_classzone_idx(ac), alloc_flags, available)) { + /* + * If we didn't make any progress and have a lot of + * dirty + writeback pages then we should wait for + * an IO to complete to slow down the reclaim and + * prevent from pre mature OOM + */ + if (!did_some_progress) { + unsigned long writeback; + unsigned long dirty; + + writeback = zone_page_state_snapshot(zone, + NR_WRITEBACK); + dirty = zone_page_state_snapshot(zone, NR_FILE_DIRTY); + + if (2*(writeback + dirty) > reclaimable) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + return true; + } + } + + /* + * Memory allocation/reclaim might be called from a WQ + * context and the current implementation of the WQ + * concurrency control doesn't recognize that + * a particular WQ is congested if the worker thread is + * looping without ever sleeping. Therefore we have to + * do a short sleep here rather than calling + * cond_resched(). + */ + if (current->flags & PF_WQ_WORKER) + schedule_timeout_uninterruptible(1); + else + cond_resched(); + return true; } } -- cgit v1.2.3 From 33c2d21438daea807947923377995c73ee8ed3fc Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:06 -0700 Subject: mm, oom: protect !costly allocations some more should_reclaim_retry will give up retries for higher order allocations if none of the eligible zones has any requested or higher order pages available even if we pass the watermak check for order-0. This is done because there is no guarantee that the reclaimable and currently free pages will form the required order. This can, however, lead to situations where the high-order request (e.g. order-2 required for the stack allocation during fork) will trigger OOM too early - e.g. after the first reclaim/compaction round. Such a system would have to be highly fragmented and there is no guarantee further reclaim/compaction attempts would help but at least make sure that the compaction was active before we go OOM and keep retrying even if should_reclaim_retry tells us to oom if - the last compaction round backed off or - we haven't completed at least MAX_COMPACT_RETRIES active compaction rounds. The first rule ensures that the very last attempt for compaction was not ignored while the second guarantees that the compaction has done some work. Multiple retries might be needed to prevent occasional pigggy backing of other contexts to steal the compacted pages before the current context manages to retry to allocate them. compaction_failed() is taken as a final word from the compaction that the retry doesn't make much sense. We have to be careful though because the first compaction round is MIGRATE_ASYNC which is rather weak as it ignores pages under writeback and gives up too easily in other situations. We therefore have to make sure that MIGRATE_SYNC_LIGHT mode has been used before we give up. With this logic in place we do not have to increase the migration mode unconditionally and rather do it only if the compaction failed for the weaker mode. A nice side effect is that the stronger migration mode is used only when really needed so this has a potential of smaller latencies in some cases. Please note that the compaction doesn't tell us much about how successful it was when returning compaction_made_progress so we just have to blindly trust that another retry is worthwhile and cap the number to something reasonable to guarantee a convergence. If the given number of successful retries is not sufficient for a reasonable workloads we should focus on the collected compaction tracepoints data and try to address the issue in the compaction code. If this is not feasible we can increase the retries limit. [mhocko@suse.com: fix warning] Link: http://lkml.kernel.org/r/20160512061636.GA4200@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 78 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f51c302126a1..38ad6dd7cba0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3180,6 +3180,13 @@ out: return page; } + +/* + * Maximum number of compaction retries wit a progress before OOM + * killer is consider as the only way to move forward. + */ +#define MAX_COMPACT_RETRIES 16 + #ifdef CONFIG_COMPACTION /* Try memory compaction for high-order allocations before reclaim */ static struct page * @@ -3247,14 +3254,60 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; } + +static inline bool +should_compact_retry(unsigned int order, enum compact_result compact_result, + enum migrate_mode *migrate_mode, + int compaction_retries) +{ + if (!order) + return false; + + /* + * compaction considers all the zone as desperately out of memory + * so it doesn't really make much sense to retry except when the + * failure could be caused by weak migration mode. + */ + if (compaction_failed(compact_result)) { + if (*migrate_mode == MIGRATE_ASYNC) { + *migrate_mode = MIGRATE_SYNC_LIGHT; + return true; + } + return false; + } + + /* + * !costly allocations are really important and we have to make sure + * the compaction wasn't deferred or didn't bail out early due to locks + * contention before we go OOM. Still cap the reclaim retry loops with + * progress to prevent from looping forever and potential trashing. + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER) { + if (compaction_withdrawn(compact_result)) + return true; + if (compaction_retries <= MAX_COMPACT_RETRIES) + return true; + } + + return false; +} #else static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, unsigned int alloc_flags, const struct alloc_context *ac, enum migrate_mode mode, enum compact_result *compact_result) { + *compact_result = COMPACT_SKIPPED; return NULL; } + +static inline bool +should_compact_retry(unsigned int order, enum compact_result compact_result, + enum migrate_mode *migrate_mode, + int compaction_retries) +{ + return false; +} #endif /* CONFIG_COMPACTION */ /* Perform direct synchronous page reclaim */ @@ -3501,6 +3554,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; enum migrate_mode migration_mode = MIGRATE_ASYNC; enum compact_result compact_result; + int compaction_retries = 0; int no_progress_loops = 0; /* @@ -3612,13 +3666,8 @@ retry: goto nopage; } - /* - * It can become very expensive to allocate transparent hugepages at - * fault, so use asynchronous memory compaction for THP unless it is - * khugepaged trying to collapse. - */ - if (!is_thp_gfp_mask(gfp_mask) || (current->flags & PF_KTHREAD)) - migration_mode = MIGRATE_SYNC_LIGHT; + if (order && compaction_made_progress(compact_result)) + compaction_retries++; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac, @@ -3649,6 +3698,17 @@ retry: no_progress_loops)) goto retry; + /* + * It doesn't make any sense to retry for the compaction if the order-0 + * reclaim is not able to make any progress because the current + * implementation of the compaction depends on the sufficient amount + * of free memory (see __compaction_suitable) + */ + if (did_some_progress > 0 && + should_compact_retry(order, compact_result, + &migration_mode, compaction_retries)) + goto retry; + /* Reclaim has failed us, start killing things */ page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) @@ -3662,10 +3722,18 @@ retry: noretry: /* - * High-order allocations do not necessarily loop after - * direct reclaim and reclaim/compaction depends on compaction - * being called after reclaim so call directly if necessary + * High-order allocations do not necessarily loop after direct reclaim + * and reclaim/compaction depends on compaction being called after + * reclaim so call directly if necessary. + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. All other requests should tolerate + * at least light sync migration. */ + if (is_thp_gfp_mask(gfp_mask) && !(current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_ASYNC; + else + migration_mode = MIGRATE_SYNC_LIGHT; page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, migration_mode, &compact_result); -- cgit v1.2.3 From 7854ea6c28c6076050e24773eeb78e2925bd7411 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:09 -0700 Subject: mm: consider compaction feedback also for costly allocation PAGE_ALLOC_COSTLY_ORDER retry logic is mostly handled inside should_reclaim_retry currently where we decide to not retry after at least order worth of pages were reclaimed or the watermark check for at least one zone would succeed after reclaiming all pages if the reclaim hasn't made any progress. Compaction feedback is mostly ignored and we just try to make sure that the compaction did at least something before giving up. The first condition was added by a41f24ea9fd6 ("page allocator: smarter retry of costly-order allocations) and it assumed that lumpy reclaim could have created a page of the sufficient order. Lumpy reclaim, has been removed quite some time ago so the assumption doesn't hold anymore. Remove the check for the number of reclaimed pages and rely on the compaction feedback solely. should_reclaim_retry now only makes sure that we keep retrying reclaim for high order pages only if they are hidden by watermaks so order-0 reclaim makes really sense. should_compact_retry now keeps retrying even for the costly allocations. The number of retries is reduced wrt. !costly requests because they are less important and harder to grant and so their pressure shouldn't cause contention for other requests or cause an over reclaim. We also do not reset no_progress_loops for costly request to make sure we do not keep reclaiming too agressively. This has been tested by running a process which fragments memory: - compact memory - mmap large portion of the memory (1920M on 2GRAM machine with 2G of swapspace) - MADV_DONTNEED single page in PAGE_SIZE*((1UL< /proc/sys/vm/overcommit_memory;echo 1 > /proc/sys/vm/compact_memory; ./fragment-mem-and-run /root/alloc_hugepages.sh 1920M 250M Node 0, zone DMA 31 28 31 10 2 0 2 1 2 3 1 Node 0, zone DMA32 437 319 171 50 28 25 20 16 16 14 437 * This is the /proc/buddyinfo after the compaction Done fragmenting. size=2013265920 freed=262144000 Node 0, zone DMA 165 48 3 1 2 0 2 2 2 2 0 Node 0, zone DMA32 35109 14575 185 51 41 12 6 0 0 0 0 * /proc/buddyinfo after memory got fragmented Executing "/root/alloc_hugepages.sh" Eating some pagecache 508623+0 records in 508623+0 records out 2083319808 bytes (2.1 GB) copied, 11.7292 s, 178 MB/s Node 0, zone DMA 3 5 3 1 2 0 2 2 2 2 0 Node 0, zone DMA32 111 344 153 20 24 10 3 0 0 0 0 * /proc/buddyinfo after page cache got eaten Trying to allocate 129 129 * 129 hugepages requested and all of them granted. Node 0, zone DMA 3 5 3 1 2 0 2 2 2 2 0 Node 0, zone DMA32 127 97 30 99 11 6 2 1 4 0 0 * /proc/buddyinfo after hugetlb allocation. 10 runs will behave as follows: Trying to allocate 130 130 -- Trying to allocate 129 129 -- Trying to allocate 128 128 -- Trying to allocate 129 129 -- Trying to allocate 128 128 -- Trying to allocate 129 129 -- Trying to allocate 132 132 -- Trying to allocate 129 129 -- Trying to allocate 128 128 -- Trying to allocate 129 129 So basically 100% success for all 10 attempts. Without the patch numbers looked much worse: Trying to allocate 128 12 -- Trying to allocate 129 14 -- Trying to allocate 129 7 -- Trying to allocate 129 16 -- Trying to allocate 129 30 -- Trying to allocate 129 38 -- Trying to allocate 129 19 -- Trying to allocate 129 37 -- Trying to allocate 129 28 -- Trying to allocate 129 37 Just for completness the base kernel without oom detection rework looks as follows: Trying to allocate 127 30 -- Trying to allocate 129 12 -- Trying to allocate 129 52 -- Trying to allocate 128 32 -- Trying to allocate 129 12 -- Trying to allocate 129 10 -- Trying to allocate 129 32 -- Trying to allocate 128 14 -- Trying to allocate 128 16 -- Trying to allocate 129 8 As we can see the success rate is much more volatile and smaller without this patch. So the patch not only makes the retry logic for costly requests more sensible the success rate is even higher. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Hillf Danton Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 63 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 32 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 38ad6dd7cba0..dea406a62e3d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3260,6 +3260,8 @@ should_compact_retry(unsigned int order, enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { + int max_retries = MAX_COMPACT_RETRIES; + if (!order) return false; @@ -3277,17 +3279,24 @@ should_compact_retry(unsigned int order, enum compact_result compact_result, } /* - * !costly allocations are really important and we have to make sure - * the compaction wasn't deferred or didn't bail out early due to locks - * contention before we go OOM. Still cap the reclaim retry loops with - * progress to prevent from looping forever and potential trashing. + * make sure the compaction wasn't deferred or didn't bail out early + * due to locks contention before we declare that we should give up. */ - if (order <= PAGE_ALLOC_COSTLY_ORDER) { - if (compaction_withdrawn(compact_result)) - return true; - if (compaction_retries <= MAX_COMPACT_RETRIES) - return true; - } + if (compaction_withdrawn(compact_result)) + return true; + + /* + * !costly requests are much more important than __GFP_REPEAT + * costly ones because they are de facto nofail and invoke OOM + * killer to move on while costly can fail and users are ready + * to cope with that. 1/4 retries is rather arbitrary but we + * would need much more detailed feedback from compaction to + * make a better decision. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + max_retries /= 4; + if (compaction_retries <= max_retries) + return true; return false; } @@ -3449,18 +3458,17 @@ static inline bool is_thp_gfp_mask(gfp_t gfp_mask) * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round), pages_reclaimed (cumulative number of reclaimed - * pages) and no_progress_loops (number of reclaim rounds without any progress - * in a row) is considered as well as the reclaimable pages on the applicable - * zone list (with a backoff mechanism which is a function of no_progress_loops). + * the last reclaim round) and no_progress_loops (number of reclaim rounds without + * any progress in a row) is considered as well as the reclaimable pages on the + * applicable zone list (with a backoff mechanism which is a function of + * no_progress_loops). * * Returns true if a retry is viable or false to enter the oom path. */ static inline bool should_reclaim_retry(gfp_t gfp_mask, unsigned order, struct alloc_context *ac, int alloc_flags, - bool did_some_progress, unsigned long pages_reclaimed, - int no_progress_loops) + bool did_some_progress, int no_progress_loops) { struct zone *zone; struct zoneref *z; @@ -3472,14 +3480,6 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, if (no_progress_loops > MAX_RECLAIM_RETRIES) return false; - if (order > PAGE_ALLOC_COSTLY_ORDER) { - if (pages_reclaimed >= (1< PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) goto noretry; - if (did_some_progress) { + /* + * Costly allocations might have made a progress but this doesn't mean + * their order will become available due to high fragmentation so + * always increment the no progress counter for them + */ + if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER) no_progress_loops = 0; - pages_reclaimed += did_some_progress; - } else { + else no_progress_loops++; - } if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, - did_some_progress > 0, pages_reclaimed, - no_progress_loops)) + did_some_progress > 0, no_progress_loops)) goto retry; /* -- cgit v1.2.3 From 86a294a81f93d6f36d00ec3ff779d36d218f852d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:12 -0700 Subject: mm, oom, compaction: prevent from should_compact_retry looping for ever for costly orders "mm: consider compaction feedback also for costly allocation" has removed the upper bound for the reclaim/compaction retries based on the number of reclaimed pages for costly orders. While this is desirable the patch did miss a mis interaction between reclaim, compaction and the retry logic. The direct reclaim tries to get zones over min watermark while compaction backs off and returns COMPACT_SKIPPED when all zones are below low watermark + 1< Acked-by: Hillf Danton Acked-by: Vlastimil Babka Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Mel Gorman Cc: Tetsuo Handa Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 4 ++++ include/linux/mmzone.h | 3 +++ mm/compaction.c | 42 +++++++++++++++++++++++++++++++++++++++--- mm/page_alloc.c | 23 +++++++++++++---------- 4 files changed, 59 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 8d8c916fe67a..a58c852a268f 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -142,6 +142,10 @@ static inline bool compaction_withdrawn(enum compact_result result) return false; } + +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags); + extern int kcompactd_run(int nid); extern void kcompactd_stop(int nid); extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c60db2096fd8..8dd0333b01dc 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -739,6 +739,9 @@ static inline bool is_dev_zone(const struct zone *zone) extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, unsigned int alloc_flags); diff --git a/mm/compaction.c b/mm/compaction.c index 4af1577adb5c..d8a20fcf8678 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1318,7 +1318,8 @@ static enum compact_result compact_finished(struct zone *zone, */ static enum compact_result __compaction_suitable(struct zone *zone, int order, unsigned int alloc_flags, - int classzone_idx) + int classzone_idx, + unsigned long wmark_target) { int fragindex; unsigned long watermark; @@ -1341,7 +1342,8 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, * allocated and for a short time, the footprint is higher */ watermark += (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, classzone_idx, alloc_flags)) + if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx, + alloc_flags, wmark_target)) return COMPACT_SKIPPED; /* @@ -1368,7 +1370,8 @@ enum compact_result compaction_suitable(struct zone *zone, int order, { enum compact_result ret; - ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx); + ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx, + zone_page_state(zone, NR_FREE_PAGES)); trace_mm_compaction_suitable(zone, order, ret); if (ret == COMPACT_NOT_SUITABLE_ZONE) ret = COMPACT_SKIPPED; @@ -1376,6 +1379,39 @@ enum compact_result compaction_suitable(struct zone *zone, int order, return ret; } +bool compaction_zonelist_suitable(struct alloc_context *ac, int order, + int alloc_flags) +{ + struct zone *zone; + struct zoneref *z; + + /* + * Make sure at least one zone would pass __compaction_suitable if we continue + * retrying the reclaim. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + unsigned long available; + enum compact_result compact_result; + + /* + * Do not consider all the reclaimable memory because we do not + * want to trash just for a single high order allocation which + * is even not guaranteed to appear even if __compaction_suitable + * is happy about the watermark check. + */ + available = zone_reclaimable_pages(zone) / order; + available += zone_page_state_snapshot(zone, NR_FREE_PAGES); + compact_result = __compaction_suitable(zone, order, alloc_flags, + ac_classzone_idx(ac), available); + if (compact_result != COMPACT_SKIPPED && + compact_result != COMPACT_NOT_SUITABLE_ZONE) + return true; + } + + return false; +} + static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc) { enum compact_result ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dea406a62e3d..089f760ce64a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2750,10 +2750,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * one free page of a suitable size. Checking now avoids taking the zone lock * to check in the allocation paths if no pages are free. */ -static bool __zone_watermark_ok(struct zone *z, unsigned int order, - unsigned long mark, int classzone_idx, - unsigned int alloc_flags, - long free_pages) +bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, + int classzone_idx, unsigned int alloc_flags, + long free_pages) { long min = mark; int o; @@ -3256,8 +3255,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, - enum migrate_mode *migrate_mode, +should_compact_retry(struct alloc_context *ac, int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { int max_retries = MAX_COMPACT_RETRIES; @@ -3281,9 +3280,11 @@ should_compact_retry(unsigned int order, enum compact_result compact_result, /* * make sure the compaction wasn't deferred or didn't bail out early * due to locks contention before we declare that we should give up. + * But do not retry if the given zonelist is not suitable for + * compaction. */ if (compaction_withdrawn(compact_result)) - return true; + return compaction_zonelist_suitable(ac, order, alloc_flags); /* * !costly requests are much more important than __GFP_REPEAT @@ -3311,7 +3312,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, } static inline bool -should_compact_retry(unsigned int order, enum compact_result compact_result, +should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags, + enum compact_result compact_result, enum migrate_mode *migrate_mode, int compaction_retries) { @@ -3706,8 +3708,9 @@ retry: * of free memory (see __compaction_suitable) */ if (did_some_progress > 0 && - should_compact_retry(order, compact_result, - &migration_mode, compaction_retries)) + should_compact_retry(ac, order, alloc_flags, + compact_result, &migration_mode, + compaction_retries)) goto retry; /* Reclaim has failed us, start killing things */ -- cgit v1.2.3 From 31e49bfda18464b9b0cfe42044d5a4be4a0ca0f3 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:15 -0700 Subject: mm, oom: protect !costly allocations some more for !CONFIG_COMPACTION Joonsoo has reported that he is able to trigger OOM for !costly high order requests (heavy fork() workload close the OOM) with the new oom detection rework. This is because we rely only on should_reclaim_retry when the compaction is disabled and it only checks watermarks for the requested order and so we might trigger OOM when there is a lot of free memory. It is not very clear what are the usual workloads when the compaction is disabled. Relying on high order allocations heavily without any mechanism to create those orders except for unbound amount of reclaim is certainly not a good idea. To prevent from potential regressions let's help this configuration some. We have to sacrifice the determinsm though because there simply is none here possible. should_compact_retry implementation for !CONFIG_COMPACTION, which was empty so far, will do watermark check for order-0 on all eligible zones. This will cause retrying until either the reclaim cannot make any further progress or all the zones are depleted even for order-0 pages. This means that the number of retries is basically unbounded for !costly orders but that was the case before the rework as well so this shouldn't regress. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1463051677-29418-3-git-send-email-mhocko@kernel.org Reported-by: Joonsoo Kim Signed-off-by: Michal Hocko Acked-by: Hillf Danton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 089f760ce64a..7e49bc3705b6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3317,6 +3317,24 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla enum migrate_mode *migrate_mode, int compaction_retries) { + struct zone *zone; + struct zoneref *z; + + if (!order || order > PAGE_ALLOC_COSTLY_ORDER) + return false; + + /* + * There are setups with compaction disabled which would prefer to loop + * inside the allocator rather than hit the oom killer prematurely. + * Let's give them a good hope and keep retrying while the order-0 + * watermarks are OK. + */ + for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, + ac->nodemask) { + if (zone_watermark_ok(zone, 0, min_wmark_pages(zone), + ac_classzone_idx(ac), alloc_flags)) + return true; + } return false; } #endif /* CONFIG_COMPACTION */ -- cgit v1.2.3 From bb8a4b7fd1266ef888b3a80aa5f266062b224ef4 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:18 -0700 Subject: mm, oom_reaper: hide oom reaped tasks from OOM killer more carefully Commit 36324a990cf5 ("oom: clear TIF_MEMDIE after oom_reaper managed to unmap the address space") not only clears TIF_MEMDIE for oom reaped task but also set OOM_SCORE_ADJ_MIN for the target task to hide it from the oom killer. This works in simple cases but it is not sufficient for (unlikely) cases where the mm is shared between independent processes (as they do not share signal struct). If the mm had only small amount of memory which could be reaped then another task sharing the mm could be selected and that wouldn't help to move out from the oom situation. Introduce MMF_OOM_REAPED mm flag which is checked in oom_badness (same as OOM_SCORE_ADJ_MIN) and task is skipped if the flag is set. Set the flag after __oom_reap_task is done with a task. This will force the select_bad_process() to ignore all already oom reaped tasks as well as no such task is sacrificed for its parent. Signed-off-by: Michal Hocko Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index 31bd0d97d178..40eabf176ce2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -521,6 +521,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_REAPED 21 /* mm has been already reaped */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 415f7eb913fa..c0376efa79ec 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -174,8 +174,13 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; + /* + * Do not even consider tasks which are explicitly marked oom + * unkillable or have been already oom reaped. + */ adj = (long)p->signal->oom_score_adj; - if (adj == OOM_SCORE_ADJ_MIN) { + if (adj == OOM_SCORE_ADJ_MIN || + test_bit(MMF_OOM_REAPED, &p->mm->flags)) { task_unlock(p); return 0; } @@ -513,7 +518,7 @@ static bool __oom_reap_task(struct task_struct *tsk) * This task can be safely ignored because we cannot do much more * to release its memory. */ - tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + set_bit(MMF_OOM_REAPED, &mm->flags); out: mmput(mm); return ret; -- cgit v1.2.3 From ec8d7c14ea14922fe21945b458a75e39f11dd832 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: mm, oom_reaper: do not mmput synchronously from the oom reaper context Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 5 +++++ kernel/fork.c | 50 +++++++++++++++++++++++++++++++++--------------- mm/oom_kill.c | 8 ++++++-- 4 files changed, 48 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1fda9c99ef95..d553855503e6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -513,6 +514,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif + struct work_struct async_put_work; }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index 40eabf176ce2..479e3cade7e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2730,6 +2730,11 @@ static inline void mmdrop(struct mm_struct * mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +/* same as above but performs the slow path from the async kontext. Can + * be called from the atomic context as well + */ +extern void mmput_async(struct mm_struct *); + /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* diff --git a/kernel/fork.c b/kernel/fork.c index 3e8451527cbe..8fbed7194af1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -699,6 +699,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -706,24 +726,24 @@ void mmput(struct mm_struct *mm) { might_sleep(); + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); +} +EXPORT_SYMBOL_GPL(mmput); + +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); } } -EXPORT_SYMBOL_GPL(mmput); /** * set_mm_exe_file - change a reference to the mm's executable file diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0376efa79ec..c0e37dd1422f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -446,7 +446,6 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); - static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; @@ -520,7 +519,12 @@ static bool __oom_reap_task(struct task_struct *tsk) */ set_bit(MMF_OOM_REAPED, &mm->flags); out: - mmput(mm); + /* + * Drop our reference but make sure the mmput slow path is called from a + * different context because we shouldn't risk we get stuck there and + * put the oom_reaper out of the way. + */ + mmput_async(mm); return ret; } -- cgit v1.2.3 From f44666b04605d1c7fd94ab90b7ccf633e7eff228 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 20 May 2016 16:57:27 -0700 Subject: mm,oom: speed up select_bad_process() loop Since commit 3a5dda7a17cf ("oom: prevent unnecessary oom kills or kernel panics"), select_bad_process() is using for_each_process_thread(). Since oom_unkillable_task() scans all threads in the caller's thread group and oom_task_origin() scans signal_struct of the caller's thread group, we don't need to call oom_unkillable_task() and oom_task_origin() on each thread. Also, since !mm test will be done later at oom_badness(), we don't need to do !mm test on each thread. Therefore, we only need to do TIF_MEMDIE test on each thread. Although the original code was correct it was quite inefficient because each thread group was scanned num_threads times which can be a lot especially with processes with many threads. Even though the OOM is extremely cold path it is always good to be as effective as possible when we are inside rcu_read_lock() - aka unpreemptible context. If we track number of TIF_MEMDIE threads inside signal_struct, we don't need to do TIF_MEMDIE test on each thread. This will allow select_bad_process() to use for_each_process(). This patch adds a counter to signal_struct for tracking how many TIF_MEMDIE threads are in a given thread group, and check it at oom_scan_process_thread() so that select_bad_process() can use for_each_process() rather than for_each_process_thread(). [mhocko@suse.com: do not blow the signal_struct size] Link: http://lkml.kernel.org/r/20160520075035.GF19172@dhcp22.suse.cz Link: http://lkml.kernel.org/r/201605182230.IDC73435.MVSOHLFOQFOJtF@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Cc: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + mm/oom_kill.c | 17 ++++++----------- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/include/linux/sched.h b/include/linux/sched.h index 479e3cade7e9..01fe1bb68754 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -669,6 +669,7 @@ struct signal_struct { atomic_t sigcnt; atomic_t live; int nr_threads; + atomic_t oom_victims; /* # of TIF_MEDIE threads in this thread group */ struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0e37dd1422f..5bb2f7698ad7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -283,12 +283,8 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, * This task already has access to memory reserves and is being killed. * Don't allow any other task to have access to the reserves. */ - if (test_tsk_thread_flag(task, TIF_MEMDIE)) { - if (!is_sysrq_oom(oc)) - return OOM_SCAN_ABORT; - } - if (!task->mm) - return OOM_SCAN_CONTINUE; + if (!is_sysrq_oom(oc) && atomic_read(&task->signal->oom_victims)) + return OOM_SCAN_ABORT; /* * If task is allocating a lot of memory and has been marked to be @@ -307,12 +303,12 @@ enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, static struct task_struct *select_bad_process(struct oom_control *oc, unsigned int *ppoints, unsigned long totalpages) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; unsigned long chosen_points = 0; rcu_read_lock(); - for_each_process_thread(g, p) { + for_each_process(p) { unsigned int points; switch (oom_scan_process_thread(oc, p, totalpages)) { @@ -331,9 +327,6 @@ static struct task_struct *select_bad_process(struct oom_control *oc, points = oom_badness(p, NULL, oc->nodemask, totalpages); if (!points || points < chosen_points) continue; - /* Prefer thread group leaders for display purposes */ - if (points == chosen_points && thread_group_leader(chosen)) - continue; chosen = p; chosen_points = points; @@ -673,6 +666,7 @@ void mark_oom_victim(struct task_struct *tsk) /* OOM killer might race with memcg OOM */ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_inc(&tsk->signal->oom_victims); /* * Make sure that the task is woken up from uninterruptible sleep * if it is frozen because OOM killer wouldn't be able to free @@ -690,6 +684,7 @@ void exit_oom_victim(struct task_struct *tsk) { if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) return; + atomic_dec(&tsk->signal->oom_victims); if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); -- cgit v1.2.3 From 340a43bed674a70308f196f2a61ec0b01f8a14d9 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 20 May 2016 16:57:30 -0700 Subject: mm: thp: simplify the implementation of mk_huge_pmd() The implementation of mk_huge_pmd looks verbose, it could be just simplified to one line code. Signed-off-by: Yang Shi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 66675eed67be..86eafd91c982 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -764,10 +764,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) static inline pmd_t mk_huge_pmd(struct page *page, pgprot_t prot) { - pmd_t entry; - entry = mk_pmd(page, prot); - entry = pmd_mkhuge(entry); - return entry; + return pmd_mkhuge(mk_pmd(page, prot)); } static inline struct list_head *page_deferred_list(struct page *page) -- cgit v1.2.3 From 495367c051fb200a42636bdc63be78ca1713a85a Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Fri, 20 May 2016 16:57:32 -0700 Subject: mm/memory-failure.c: replace "MCE" with "Memory failure" HWPoison was specific to some particular x86 platforms. And it is often seen as high level machine check handler. And therefore, 'MCE' is used for the format prefix of printk(). However, 'PowerNV' has also used HWPoison for handling memory errors[1], so 'MCE' is no longer suitable to memory_failure.c. Additionally, 'MCE' and 'Memory failure' have different context. The former belongs to exception context and the latter belongs to process context. Furthermore, HWPoison can also be used for off-lining those sub-health pages that do not trigger any machine check exception. This patch aims to replace 'MCE' with a more appropriate prefix. [1] commit 75eb3d9b60c2 ("powerpc/powernv: Get FSP memory errors and plumb into memory poison infrastructure.") Signed-off-by: Chen Yucong Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 72 +++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ca5acee53b7a..2fcca6b0e005 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -184,8 +184,8 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, struct siginfo si; int ret; - pr_err("MCE %#lx: Killing %s:%d due to hardware memory corruption\n", - pfn, t->comm, t->pid); + pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n", + pfn, t->comm, t->pid); si.si_signo = SIGBUS; si.si_errno = 0; si.si_addr = (void *)addr; @@ -208,7 +208,7 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ } if (ret < 0) - pr_info("MCE: Error sending signal to %s:%d: %d\n", + pr_info("Memory failure: Error sending signal to %s:%d: %d\n", t->comm, t->pid, ret); return ret; } @@ -289,7 +289,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, } else { tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); if (!tk) { - pr_err("MCE: Out of memory while machine check handling\n"); + pr_err("Memory failure: Out of memory while machine check handling\n"); return; } } @@ -303,7 +303,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_info("MCE: Unable to find user space address %lx in %s\n", + pr_info("Memory failure: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -334,7 +334,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, * signal and then access the memory. Just kill it. */ if (fail || tk->addr_valid == 0) { - pr_err("MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", pfn, tk->tsk->comm, tk->tsk->pid); force_sig(SIGKILL, tk->tsk); } @@ -347,7 +347,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, */ else if (kill_proc(tk->tsk, tk->addr, trapno, pfn, page, flags) < 0) - pr_err("MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pr_err("Memory failure: %#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } put_task_struct(tk->tsk); @@ -559,7 +559,7 @@ static int me_kernel(struct page *p, unsigned long pfn) */ static int me_unknown(struct page *p, unsigned long pfn) { - pr_err("MCE %#lx: Unknown page state\n", pfn); + pr_err("Memory failure: %#lx: Unknown page state\n", pfn); return MF_FAILED; } @@ -604,11 +604,12 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (mapping->a_ops->error_remove_page) { err = mapping->a_ops->error_remove_page(mapping, p); if (err != 0) { - pr_info("MCE %#lx: Failed to punch page: %d\n", + pr_info("Memory failure: %#lx: Failed to punch page: %d\n", pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_info("MCE %#lx: failed to release buffers\n", pfn); + pr_info("Memory failure: %#lx: failed to release buffers\n", + pfn); } else { ret = MF_RECOVERED; } @@ -620,7 +621,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) if (invalidate_inode_page(p)) ret = MF_RECOVERED; else - pr_info("MCE %#lx: Failed to invalidate\n", pfn); + pr_info("Memory failure: %#lx: Failed to invalidate\n", + pfn); } return ret; } @@ -833,7 +835,7 @@ static void action_result(unsigned long pfn, enum mf_action_page_type type, { trace_memory_failure_event(pfn, type, result); - pr_err("MCE %#lx: recovery action for %s: %s\n", + pr_err("Memory failure: %#lx: recovery action for %s: %s\n", pfn, action_page_types[type], action_name[result]); } @@ -849,7 +851,7 @@ static int page_action(struct page_state *ps, struct page *p, if (ps->action == me_swapcache_dirty && result == MF_DELAYED) count--; if (count != 0) { - pr_err("MCE %#lx: %s still referenced by %d users\n", + pr_err("Memory failure: %#lx: %s still referenced by %d users\n", pfn, action_page_types[ps->type], count); result = MF_FAILED; } @@ -882,7 +884,7 @@ int get_hwpoison_page(struct page *page) * tries to touch the "partially handled" page. */ if (!PageAnon(head)) { - pr_err("MCE: %#lx: non anonymous thp\n", + pr_err("Memory failure: %#lx: non anonymous thp\n", page_to_pfn(page)); return 0; } @@ -892,7 +894,8 @@ int get_hwpoison_page(struct page *page) if (head == compound_head(page)) return 1; - pr_info("MCE: %#lx cannot catch tail\n", page_to_pfn(page)); + pr_info("Memory failure: %#lx cannot catch tail\n", + page_to_pfn(page)); put_page(head); } @@ -931,12 +934,13 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, return SWAP_SUCCESS; if (PageKsm(p)) { - pr_err("MCE %#lx: can't handle KSM pages.\n", pfn); + pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn); return SWAP_FAIL; } if (PageSwapCache(p)) { - pr_err("MCE %#lx: keeping poisoned page in swap cache\n", pfn); + pr_err("Memory failure: %#lx: keeping poisoned page in swap cache\n", + pfn); ttu |= TTU_IGNORE_HWPOISON; } @@ -954,7 +958,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } else { kill = 0; ttu |= TTU_IGNORE_HWPOISON; - pr_info("MCE %#lx: corrupted page was clean: dropped without side effects\n", + pr_info("Memory failure: %#lx: corrupted page was clean: dropped without side effects\n", pfn); } } @@ -972,7 +976,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) - pr_err("MCE %#lx: failed to unmap page (mapcount=%d)\n", + pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); /* @@ -1040,14 +1044,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) panic("Memory failure from trap %d on page %lx", trapno, pfn); if (!pfn_valid(pfn)) { - pr_err("MCE %#lx: memory outside kernel control\n", pfn); + pr_err("Memory failure: %#lx: memory outside kernel control\n", + pfn); return -ENXIO; } p = pfn_to_page(pfn); orig_head = hpage = compound_head(p); if (TestSetPageHWPoison(p)) { - pr_err("MCE %#lx: already hardware poisoned\n", pfn); + pr_err("Memory failure: %#lx: already hardware poisoned\n", + pfn); return 0; } @@ -1112,9 +1118,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) { unlock_page(hpage); if (!PageAnon(hpage)) - pr_err("MCE: %#lx: non anonymous thp\n", pfn); + pr_err("Memory failure: %#lx: non anonymous thp\n", + pfn); else - pr_err("MCE: %#lx: thp split failed\n", pfn); + pr_err("Memory failure: %#lx: thp split failed\n", + pfn); if (TestClearPageHWPoison(p)) num_poisoned_pages_sub(nr_pages); put_hwpoison_page(p); @@ -1178,7 +1186,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags) * unpoison always clear PG_hwpoison inside page lock */ if (!PageHWPoison(p)) { - pr_err("MCE %#lx: just unpoisoned\n", pfn); + pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); num_poisoned_pages_sub(nr_pages); unlock_page(hpage); put_hwpoison_page(hpage); @@ -1395,25 +1403,25 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n", + unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); return 0; } if (page_count(page) > 1) { - unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n", + unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); return 0; } if (page_mapped(page)) { - unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n", + unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); return 0; } if (page_mapping(page)) { - unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n", + unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); return 0; } @@ -1424,7 +1432,7 @@ int unpoison_memory(unsigned long pfn) * In such case, we yield to memory_failure() and make unpoison fail. */ if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("MCE: Memory failure is now running on %#lx\n", + unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", pfn, &unpoison_rs); return 0; } @@ -1439,13 +1447,13 @@ int unpoison_memory(unsigned long pfn) * to the end. */ if (PageHuge(page)) { - unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", + unpoison_pr_info("Unpoison: Memory failure is now running on free hugepage %#lx\n", pfn, &unpoison_rs); return 0; } if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); - unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n", + unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", pfn, &unpoison_rs); return 0; } @@ -1458,7 +1466,7 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n", + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", pfn, &unpoison_rs); num_poisoned_pages_sub(nr_pages); freeit = 1; -- cgit v1.2.3 From f705ac4b39f30a6a5f8411a42114758f4d4655bc Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Fri, 20 May 2016 16:57:35 -0700 Subject: mm/memblock.c: move memblock_{add,reserve}_region into memblock_{add,reserve} memblock_add_region() and memblock_reserve_region() do nothing specific before the call of memblock_add_range(), only print debug output. We can do the same in memblock_add() and memblock_reserve() since both memblock_add_region() and memblock_reserve_region() are not used by anybody outside of memblock.c and memblock_{add,reserve}() have the same set of flags and nids. Since memblock_add_region() and memblock_reserve_region() will be inlined, there will not be functional changes, but will improve code readability a little. Signed-off-by: Alexander Kuleshov Acked-by: Ard Biesheuvel Cc: Mel Gorman Cc: Pekka Enberg Cc: Tony Luck Cc: Tang Chen Cc: David Gibson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 28 ++++++---------------------- 1 file changed, 6 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index b570dddb4cb9..3b93daa46fc5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -606,22 +606,14 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size, return memblock_add_range(&memblock.memory, base, size, nid, 0); } -static int __init_memblock memblock_add_region(phys_addr_t base, - phys_addr_t size, - int nid, - unsigned long flags) +int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) { memblock_dbg("memblock_add: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, - flags, (void *)_RET_IP_); - - return memblock_add_range(&memblock.memory, base, size, nid, flags); -} + 0UL, (void *)_RET_IP_); -int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) -{ - return memblock_add_region(base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0); } /** @@ -732,22 +724,14 @@ int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) return memblock_remove_range(&memblock.reserved, base, size); } -static int __init_memblock memblock_reserve_region(phys_addr_t base, - phys_addr_t size, - int nid, - unsigned long flags) +int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) { memblock_dbg("memblock_reserve: [%#016llx-%#016llx] flags %#02lx %pF\n", (unsigned long long)base, (unsigned long long)base + size - 1, - flags, (void *)_RET_IP_); - - return memblock_add_range(&memblock.reserved, base, size, nid, flags); -} + 0UL, (void *)_RET_IP_); -int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) -{ - return memblock_reserve_region(base, size, MAX_NUMNODES, 0); + return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } /** -- cgit v1.2.3 From 80c4bd7a5e4368b680e0aeb57050a1b06eb573d8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 20 May 2016 16:57:38 -0700 Subject: mm/vmalloc: keep a separate lazy-free list When mixing lots of vmallocs and set_memory_*() (which calls vm_unmap_aliases()) I encountered situations where the performance degraded severely due to the walking of the entire vmap_area list each invocation. One simple improvement is to add the lazily freed vmap_area to a separate lockless free list, such that we then avoid having to walk the full list on each purge. Signed-off-by: Chris Wilson Reviewed-by: Roman Pen Cc: Joonas Lahtinen Cc: Tvrtko Ursulin Cc: Daniel Vetter Cc: David Rientjes Cc: Joonsoo Kim Cc: Roman Pen Cc: Mel Gorman Cc: Toshi Kani Cc: Shawn Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 ++- mm/vmalloc.c | 39 +++++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d1f1d338af20..957adb741b6f 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -4,6 +4,7 @@ #include #include #include +#include #include /* pgprot_t */ #include @@ -44,7 +45,7 @@ struct vmap_area { unsigned long flags; struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ - struct list_head purge_list; /* "lazy purge" list */ + struct llist_node purge_list; /* "lazy purge" list */ struct vm_struct *vm; struct rcu_head rcu_head; }; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ae7d20b447ff..6e3291882739 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -274,13 +274,12 @@ EXPORT_SYMBOL(vmalloc_to_pfn); /*** Global kva allocator ***/ -#define VM_LAZY_FREE 0x01 -#define VM_LAZY_FREEING 0x02 #define VM_VM_AREA 0x04 static DEFINE_SPINLOCK(vmap_area_lock); /* Export for kexec only */ LIST_HEAD(vmap_area_list); +static LLIST_HEAD(vmap_purge_list); static struct rb_root vmap_area_root = RB_ROOT; /* The vmap cache globals are protected by vmap_area_lock */ @@ -601,7 +600,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { static DEFINE_SPINLOCK(purge_lock); - LIST_HEAD(valist); + struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; int nr = 0; @@ -620,20 +619,14 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (sync) purge_fragmented_blocks_allcpus(); - rcu_read_lock(); - list_for_each_entry_rcu(va, &vmap_area_list, list) { - if (va->flags & VM_LAZY_FREE) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - list_add_tail(&va->purge_list, &valist); - va->flags |= VM_LAZY_FREEING; - va->flags &= ~VM_LAZY_FREE; - } + valist = llist_del_all(&vmap_purge_list); + llist_for_each_entry(va, valist, purge_list) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; } - rcu_read_unlock(); if (nr) atomic_sub(nr, &vmap_lazy_nr); @@ -643,7 +636,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (nr) { spin_lock(&vmap_area_lock); - list_for_each_entry_safe(va, n_va, &valist, purge_list) + llist_for_each_entry_safe(va, n_va, valist, purge_list) __free_vmap_area(va); spin_unlock(&vmap_area_lock); } @@ -678,9 +671,15 @@ static void purge_vmap_area_lazy(void) */ static void free_vmap_area_noflush(struct vmap_area *va) { - va->flags |= VM_LAZY_FREE; - atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); - if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + int nr_lazy; + + nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT, + &vmap_lazy_nr); + + /* After this point, we may free va at any time */ + llist_add(&va->purge_list, &vmap_purge_list); + + if (unlikely(nr_lazy > lazy_max_pages())) try_purge_vmap_area_lazy(); } -- cgit v1.2.3 From d5957d2fc232a689543bdbed1a5ff8002f0e9843 Mon Sep 17 00:00:00 2001 From: Yongji Xie Date: Fri, 20 May 2016 16:57:41 -0700 Subject: mm: fix incorrect pfn passed to untrack_pfn() in remap_pfn_range() We use generic hooks in remap_pfn_range() to help archs to track pfnmap regions. The code is something like: int remap_pfn_range() { ... track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); ... pfn -= addr >> PAGE_SHIFT; ... untrack_pfn(vma, pfn, PAGE_ALIGN(size)); ... } Here we can easily find the pfn is changed but not recovered before untrack_pfn() is called. That's incorrect. There are no known runtime effects - this is from inspection. Signed-off-by: Yongji Xie Cc: Kirill A. Shutemov Cc: Jerome Marchand Cc: Ingo Molnar Cc: Vlastimil Babka Cc: Dave Hansen Cc: Dan Williams Cc: Matthew Wilcox Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Andy Lutomirski Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 07493e34ab7e..007c72ad03f6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1744,6 +1744,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; + unsigned long remap_pfn = pfn; int err; /* @@ -1770,7 +1771,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_pgoff = pfn; } - err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size)); + err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; @@ -1789,7 +1790,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, } while (pgd++, addr = next, addr != end); if (err) - untrack_pfn(vma, pfn, PAGE_ALIGN(size)); + untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); return err; } -- cgit v1.2.3 From f4fcd55841fc9e46daac553b39361572453c2b88 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Fri, 20 May 2016 16:57:45 -0700 Subject: mm: enable RLIMIT_DATA by default with workaround for valgrind Since commit 84638335900f ("mm: rework virtual memory accounting") RLIMIT_DATA limits both brk() and private mmap() but this's disabled by default because of incompatibility with older versions of valgrind. Valgrind always set limit to zero and fails if RLIMIT_DATA is enabled. Fortunately it changes only rlim_cur and keeps rlim_max for reverting limit back when needed. This patch checks current usage also against rlim_max if rlim_cur is zero. This is safe because task anyway can increase rlim_cur up to rlim_max. Size of brk is still checked against rlim_cur, so this part is completely compatible - zero rlim_cur forbids brk() but allows private mmap(). Link: http://lkml.kernel.org/r/56A28613.5070104@de.ibm.com Signed-off-by: Konstantin Khlebnikov Acked-by: Linus Torvalds Cc: Cyrill Gorcunov Cc: Christian Borntraeger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index fba246b8f1a5..b9274a0c82c9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -66,7 +66,7 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif -static bool ignore_rlimit_data = true; +static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); static void unmap_region(struct mm_struct *mm, @@ -2886,13 +2886,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages) if (is_data_mapping(flags) && mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) { - if (ignore_rlimit_data) - pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Will be forbidden soon.\n", + /* Workaround for Valgrind */ + if (rlimit(RLIMIT_DATA) == 0 && + mm->data_vm + npages <= rlimit_max(RLIMIT_DATA) >> PAGE_SHIFT) + return true; + if (!ignore_rlimit_data) { + pr_warn_once("%s (%d): VmData %lu exceed data ulimit %lu. Update limits or use boot option ignore_rlimit_data.\n", current->comm, current->pid, (mm->data_vm + npages) << PAGE_SHIFT, rlimit(RLIMIT_DATA)); - else return false; + } } return true; -- cgit v1.2.3 From 297880f4af4e492ed5084be9397d65a18ade56ee Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 20 May 2016 16:57:50 -0700 Subject: mm, hugetlb_cgroup: round limit_in_bytes down to hugepage size The page_counter rounds limits down to page size values. This makes sense, except in the case of hugetlb_cgroup where it's not possible to charge partial hugepages. If the hugetlb_cgroup margin is less than the hugepage size being charged, it will fail as expected. Round the hugetlb_cgroup limit down to hugepage size, since it is the effective limit of the cgroup. For consistency, round down PAGE_COUNTER_MAX as well when a hugetlb_cgroup is created: this prevents error reports when a user cannot restore the value to the kernel default. Signed-off-by: David Rientjes Cc: Michal Hocko Cc: Nikolay Borisov Cc: Johannes Weiner Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb_cgroup.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d8fb10de0f14..eec1150125b9 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -67,26 +67,42 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) return false; } +static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, + struct hugetlb_cgroup *parent_h_cgroup) +{ + int idx; + + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) { + struct page_counter *counter = &h_cgroup->hugepage[idx]; + struct page_counter *parent = NULL; + unsigned long limit; + int ret; + + if (parent_h_cgroup) + parent = &parent_h_cgroup->hugepage[idx]; + page_counter_init(counter, parent); + + limit = round_down(PAGE_COUNTER_MAX, + 1 << huge_page_order(&hstates[idx])); + ret = page_counter_limit(counter, limit); + VM_BUG_ON(ret); + } +} + static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; - int idx; h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); - if (parent_h_cgroup) { - for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - page_counter_init(&h_cgroup->hugepage[idx], - &parent_h_cgroup->hugepage[idx]); - } else { + if (!parent_h_cgroup) root_h_cgroup = h_cgroup; - for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) - page_counter_init(&h_cgroup->hugepage[idx], NULL); - } + + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; } @@ -285,6 +301,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, return ret; idx = MEMFILE_IDX(of_cft(of)->private); + nr_pages = round_down(nr_pages, 1 << huge_page_order(&hstates[idx])); switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: -- cgit v1.2.3 From a4a921aa5c7a1ab621e71fd1a4289e76fe230cbd Mon Sep 17 00:00:00 2001 From: Ming Li Date: Fri, 20 May 2016 16:57:56 -0700 Subject: mm/swap.c: put activate_page_pvecs and other pagevecs together Put the activate_page_pvecs definition next to those of the other pagevecs, for clarity. Signed-off-by: Ming Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 03aacbcb013f..95916142fc46 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -47,6 +47,9 @@ static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); +#endif /* * This path almost never happens for VM activity - pages are normally @@ -274,8 +277,6 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, } #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); - static void activate_page_drain(int cpu) { struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); -- cgit v1.2.3 From 2a138dc7e50bfdc90f9db9b52584ac5564952425 Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Fri, 20 May 2016 16:58:13 -0700 Subject: mm: use existing helper to convert "on"/"off" to boolean It's more convenient to use existing function helper to convert string "on/off" to boolean. Link: http://lkml.kernel.org/r/1461908824-16129-1-git-send-email-mnghuan@gmail.com Signed-off-by: Minfei Huang Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +-------- mm/page_poison.c | 8 +------- 2 files changed, 2 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e49bc3705b6..3f4b69aaa23a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -613,14 +613,7 @@ static int __init early_debug_pagealloc(char *buf) { if (!buf) return -EINVAL; - - if (strcmp(buf, "on") == 0) - _debug_pagealloc_enabled = true; - - if (strcmp(buf, "off") == 0) - _debug_pagealloc_enabled = false; - - return 0; + return kstrtobool(buf, &_debug_pagealloc_enabled); } early_param("debug_pagealloc", early_debug_pagealloc); diff --git a/mm/page_poison.c b/mm/page_poison.c index 479e7ea2bea6..1eae5fad2446 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -13,13 +13,7 @@ static int early_page_poison_param(char *buf) { if (!buf) return -EINVAL; - - if (strcmp(buf, "on") == 0) - want_page_poisoning = true; - else if (strcmp(buf, "off") == 0) - want_page_poisoning = false; - - return 0; + return strtobool(buf, &want_page_poisoning); } early_param("page_poison", early_page_poison_param); -- cgit v1.2.3 From 51038171b7ecf0017fbe3e06bd246eaa26f4d2e7 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Fri, 20 May 2016 16:58:18 -0700 Subject: memcg: fix stale mem_cgroup_force_empty() comment Commit f61c42a7d911 ("memcg: remove tasks/children test from mem_cgroup_force_empty()") removed memory reparenting from the function. Fix the function's comment. Link: http://lkml.kernel.org/r/1462569810-54496-1-git-send-email-gthelen@google.com Signed-off-by: Greg Thelen Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d71d387868e6..b3f16ab4b431 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2652,8 +2652,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg) } /* - * Reclaims as many pages from the given memcg as possible and moves - * the rest to the parent. + * Reclaims as many pages from the given memcg as possible. * * Caller is responsible for holding css reference for memcg. */ -- cgit v1.2.3 From 7b8da4c7f0777489f8690115b5fd7704ac0abb8f Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 20 May 2016 16:58:21 -0700 Subject: vmstat: get rid of the ugly cpu_stat_off variable The cpu_stat_off variable is unecessary since we can check if a workqueue request is pending otherwise. Removal of cpu_stat_off makes it pretty easy for the vmstat shepherd to ensure that the proper things happen. Removing the state also removes all races related to it. Should a workqueue not be scheduled as needed for vmstat_update then the shepherd will notice and schedule it as needed. Should a workqueue be unecessarily scheduled then the vmstat updater will disable it. [akpm@linux-foundation.org: fix indentation, per Michal] Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1605061306460.17934@east.gentwo.org Signed-off-by: Christoph Lameter Cc: Tejun Heo Acked-by: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 51 ++++++++++----------------------------------------- 1 file changed, 10 insertions(+), 41 deletions(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 5b72a8ad2813..77e42ef388c2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1352,7 +1352,6 @@ static const struct file_operations proc_vmstat_file_operations = { static struct workqueue_struct *vmstat_wq; static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; -static cpumask_var_t cpu_stat_off; #ifdef CONFIG_PROC_FS static void refresh_vm_stats(struct work_struct *work) @@ -1421,24 +1420,10 @@ static void vmstat_update(struct work_struct *w) * Counters were updated so we expect more updates * to occur in the future. Keep on running the * update worker thread. - * If we were marked on cpu_stat_off clear the flag - * so that vmstat_shepherd doesn't schedule us again. */ - if (!cpumask_test_and_clear_cpu(smp_processor_id(), - cpu_stat_off)) { - queue_delayed_work_on(smp_processor_id(), vmstat_wq, + queue_delayed_work_on(smp_processor_id(), vmstat_wq, this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); - } - } else { - /* - * We did not update any counters so the app may be in - * a mode where it does not cause counter updates. - * We may be uselessly running vmstat_update. - * Defer the checking for differentials to the - * shepherd thread on a different processor. - */ - cpumask_set_cpu(smp_processor_id(), cpu_stat_off); } } @@ -1470,16 +1455,17 @@ static bool need_update(int cpu) return false; } +/* + * Switch off vmstat processing and then fold all the remaining differentials + * until the diffs stay at zero. The function is used by NOHZ and can only be + * invoked when tick processing is not active. + */ void quiet_vmstat(void) { if (system_state != SYSTEM_RUNNING) return; - /* - * If we are already in hands of the shepherd then there - * is nothing for us to do here. - */ - if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off)) + if (!delayed_work_pending(this_cpu_ptr(&vmstat_work))) return; if (!need_update(smp_processor_id())) @@ -1494,7 +1480,6 @@ void quiet_vmstat(void) refresh_cpu_vm_stats(false); } - /* * Shepherd worker thread that checks the * differentials of processors that have their worker @@ -1511,20 +1496,11 @@ static void vmstat_shepherd(struct work_struct *w) get_online_cpus(); /* Check processors whose vmstat worker threads have been disabled */ - for_each_cpu(cpu, cpu_stat_off) { + for_each_online_cpu(cpu) { struct delayed_work *dw = &per_cpu(vmstat_work, cpu); - if (need_update(cpu)) { - if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) - queue_delayed_work_on(cpu, vmstat_wq, dw, 0); - } else { - /* - * Cancel the work if quiet_vmstat has put this - * cpu on cpu_stat_off because the work item might - * be still scheduled - */ - cancel_delayed_work(dw); - } + if (!delayed_work_pending(dw) && need_update(cpu)) + queue_delayed_work_on(cpu, vmstat_wq, dw, 0); } put_online_cpus(); @@ -1540,10 +1516,6 @@ static void __init start_shepherd_timer(void) INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), vmstat_update); - if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) - BUG(); - cpumask_copy(cpu_stat_off, cpu_online_mask); - vmstat_wq = alloc_workqueue("vmstat", WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); @@ -1578,16 +1550,13 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); node_set_state(cpu_to_node(cpu), N_CPU); - cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - cpumask_clear_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DEAD: case CPU_DEAD_FROZEN: -- cgit v1.2.3 From d5ee7c3bcca6fe2b4f7a1fdee253250059c110d2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 20 May 2016 16:58:27 -0700 Subject: mm: thp: split_huge_pmd_address() comment improvement Comment is partly wrong, this improves it by including the case of split_huge_pmd_address() called by try_to_unmap_one if TTU_SPLIT_HUGE_PMD is set. Link: http://lkml.kernel.org/r/1462547040-1737-4-git-send-email-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Acked-by: Kirill A. Shutemov Cc: Alex Williamson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86eafd91c982..1764184c4774 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3023,8 +3023,10 @@ void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, return; /* - * Caller holds the mmap_sem write mode, so a huge pmd cannot - * materialize from under us. + * Caller holds the mmap_sem write mode or the anon_vma lock, + * so a huge pmd cannot materialize from under us (khugepaged + * holds both the mmap_sem write mode and the anon_vma lock + * write mode). */ __split_huge_pmd(vma, pmd, address, freeze); } -- cgit v1.2.3 From 9a001fc19cccdeb9be4c3b89ad089d92df303c44 Mon Sep 17 00:00:00 2001 From: Vitaly Wool Date: Fri, 20 May 2016 16:58:30 -0700 Subject: z3fold: the 3-fold allocator for compressed pages This patch introduces z3fold, a special purpose allocator for storing compressed pages. It is designed to store up to three compressed pages per physical page. It is a ZBUD derivative which allows for higher compression ratio keeping the simplicity and determinism of its predecessor. This patch comes as a follow-up to the discussions at the Embedded Linux Conference in San-Diego related to the talk [1]. The outcome of these discussions was that it would be good to have a compressed page allocator as stable and deterministic as zbud with with higher compression ratio. To keep the determinism and simplicity, z3fold, just like zbud, always stores an integral number of compressed pages per page, but it can store up to 3 pages unlike zbud which can store at most 2. Therefore the compression ratio goes to around 2.6x while zbud's one is around 1.7x. The patch is based on the latest linux.git tree. This version has been updated after testing on various simulators (e.g. ARM Versatile Express, MIPS Malta, x86_64/Haswell) and basing on comments from Dan Streetman [3]. [1] https://openiotelc2016.sched.org/event/6DAC/swapping-and-embedded-compression-relieves-the-pressure-vitaly-wool-softprise-consulting-ou [2] https://lkml.org/lkml/2016/4/21/799 [3] https://lkml.org/lkml/2016/5/4/852 Link: http://lkml.kernel.org/r/20160509151753.ec3f9fda3c9898d31ff52a32@gmail.com Signed-off-by: Vitaly Wool Cc: Seth Jennings Cc: Dan Streetman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/z3fold.txt | 26 ++ mm/Kconfig | 12 +- mm/Makefile | 1 + mm/z3fold.c | 792 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 830 insertions(+), 1 deletion(-) create mode 100644 Documentation/vm/z3fold.txt create mode 100644 mm/z3fold.c (limited to 'mm') diff --git a/Documentation/vm/z3fold.txt b/Documentation/vm/z3fold.txt new file mode 100644 index 000000000000..38e4dac810b6 --- /dev/null +++ b/Documentation/vm/z3fold.txt @@ -0,0 +1,26 @@ +z3fold +------ + +z3fold is a special purpose allocator for storing compressed pages. +It is designed to store up to three compressed pages per physical page. +It is a zbud derivative which allows for higher compression +ratio keeping the simplicity and determinism of its predecessor. + +The main differences between z3fold and zbud are: +* unlike zbud, z3fold allows for up to PAGE_SIZE allocations +* z3fold can hold up to 3 compressed pages in its page +* z3fold doesn't export any API itself and is thus intended to be used + via the zpool API. + +To keep the determinism and simplicity, z3fold, just like zbud, always +stores an integral number of compressed pages per page, but it can store +up to 3 pages unlike zbud which can store at most 2. Therefore the +compression ratio goes to around 2.7x while zbud's one is around 1.7x. + +Unlike zbud (but like zsmalloc for that matter) z3fold_alloc() does not +return a dereferenceable pointer. Instead, it returns an unsigned long +handle which encodes actual location of the allocated object. + +Keeping effective compression ratio close to zsmalloc's, z3fold doesn't +depend on MMU enabled and provides more predictable reclaim behavior +which makes it a better fit for small and response-critical systems. diff --git a/mm/Kconfig b/mm/Kconfig index b0432b71137d..1a6a28ebcb8b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -567,7 +567,7 @@ config ZPOOL zsmalloc. config ZBUD - tristate "Low density storage for compressed pages" + tristate "Low (Up to 2x) density storage for compressed pages" default n help A special purpose allocator for storing compressed pages. @@ -576,6 +576,16 @@ config ZBUD deterministic reclaim properties that make it preferable to a higher density approach when reclaim will be used. +config Z3FOLD + tristate "Up to 3x density storage for compressed pages" + depends on ZPOOL + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to three compressed pages per physical + page. It is a ZBUD derivative so the simplicity and determinism are + still there. + config ZSMALLOC tristate "Memory allocator for compressed pages" depends on MMU diff --git a/mm/Makefile b/mm/Makefile index deb467edca2d..78c6f7dedb83 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o +obj-$(CONFIG_Z3FOLD) += z3fold.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o diff --git a/mm/z3fold.c b/mm/z3fold.c new file mode 100644 index 000000000000..34917d55d311 --- /dev/null +++ b/mm/z3fold.c @@ -0,0 +1,792 @@ +/* + * z3fold.c + * + * Author: Vitaly Wool + * Copyright (C) 2016, Sony Mobile Communications Inc. + * + * This implementation is based on zbud written by Seth Jennings. + * + * z3fold is an special purpose allocator for storing compressed pages. It + * can store up to three compressed pages per page which improves the + * compression ratio of zbud while retaining its main concepts (e. g. always + * storing an integral number of objects per page) and simplicity. + * It still has simple and deterministic reclaim properties that make it + * preferable to a higher density approach (with no requirement on integral + * number of object per page) when reclaim is used. + * + * As in zbud, pages are divided into "chunks". The size of the chunks is + * fixed at compile time and is determined by NCHUNKS_ORDER below. + * + * z3fold doesn't export any API and is meant to be used via zpool API. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64. As one chunk + * in allocated page is occupied by z3fold header, NCHUNKS will be calculated + * to 63 which shows the max number of free chunks in z3fold page, also there + * will be 63 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE +#define NCHUNKS ((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT) + +#define BUDDY_MASK ((1 << NCHUNKS_ORDER) - 1) + +struct z3fold_pool; +struct z3fold_ops { + int (*evict)(struct z3fold_pool *pool, unsigned long handle); +}; + +/** + * struct z3fold_pool - stores metadata for each z3fold pool + * @lock: protects all pool fields and first|last_chunk fields of any + * z3fold page in the pool + * @unbuddied: array of lists tracking z3fold pages that contain 2- buddies; + * the lists each z3fold page is added to depends on the size of + * its free region. + * @buddied: list tracking the z3fold pages that contain 3 buddies; + * these z3fold pages are full + * @lru: list tracking the z3fold pages in LRU order by most recently + * added buddy. + * @pages_nr: number of z3fold pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular z3fold pool. + */ +struct z3fold_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + const struct z3fold_ops *ops; + struct zpool *zpool; + const struct zpool_ops *zpool_ops; +}; + +enum buddy { + HEADLESS = 0, + FIRST, + MIDDLE, + LAST, + BUDDIES_MAX +}; + +/* + * struct z3fold_header - z3fold page metadata occupying the first chunk of each + * z3fold page, except for HEADLESS pages + * @buddy: links the z3fold page into the relevant list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @middle_chunks: the size of the middle buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + * @first_num: the starting number (for the first handle) + */ +struct z3fold_header { + struct list_head buddy; + unsigned short first_chunks; + unsigned short middle_chunks; + unsigned short last_chunks; + unsigned short start_middle; + unsigned short first_num:NCHUNKS_ORDER; +}; + +/* + * Internal z3fold page flags + */ +enum z3fold_page_flags { + UNDER_RECLAIM = 0, + PAGE_HEADLESS, + MIDDLE_CHUNK_MAPPED, +}; + +/***************** + * Helpers +*****************/ + +/* Converts an allocation size in bytes to size in z3fold chunks */ +static int size_to_chunks(size_t size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the z3fold header of a newly allocated z3fold page */ +static struct z3fold_header *init_z3fold_page(struct page *page) +{ + struct z3fold_header *zhdr = page_address(page); + + INIT_LIST_HEAD(&page->lru); + clear_bit(UNDER_RECLAIM, &page->private); + clear_bit(PAGE_HEADLESS, &page->private); + clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + + zhdr->first_chunks = 0; + zhdr->middle_chunks = 0; + zhdr->last_chunks = 0; + zhdr->first_num = 0; + zhdr->start_middle = 0; + INIT_LIST_HEAD(&zhdr->buddy); + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_z3fold_page(struct z3fold_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a z3fold page + * Pool lock should be held as this function accesses first_num + */ +static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + handle = (unsigned long)zhdr; + if (bud != HEADLESS) + handle += (bud + zhdr->first_num) & BUDDY_MASK; + return handle; +} + +/* Returns the z3fold page where a given handle is stored */ +static struct z3fold_header *handle_to_z3fold_header(unsigned long handle) +{ + return (struct z3fold_header *)(handle & PAGE_MASK); +} + +/* Returns buddy number */ +static enum buddy handle_to_buddy(unsigned long handle) +{ + struct z3fold_header *zhdr = handle_to_z3fold_header(handle); + return (handle - zhdr->first_num) & BUDDY_MASK; +} + +/* + * Returns the number of free chunks in a z3fold page. + * NB: can't be used with HEADLESS pages. + */ +static int num_free_chunks(struct z3fold_header *zhdr) +{ + int nfree; + /* + * If there is a middle object, pick up the bigger free space + * either before or after it. Otherwise just subtract the number + * of chunks occupied by the first and the last objects. + */ + if (zhdr->middle_chunks != 0) { + int nfree_before = zhdr->first_chunks ? + 0 : zhdr->start_middle - 1; + int nfree_after = zhdr->last_chunks ? + 0 : NCHUNKS - zhdr->start_middle - zhdr->middle_chunks; + nfree = max(nfree_before, nfree_after); + } else + nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks; + return nfree; +} + +/***************** + * API Functions +*****************/ +/** + * z3fold_create_pool() - create a new z3fold pool + * @gfp: gfp flags when allocating the z3fold pool structure + * @ops: user-defined operations for the z3fold pool + * + * Return: pointer to the new z3fold pool or NULL if the metadata allocation + * failed. + */ +static struct z3fold_pool *z3fold_create_pool(gfp_t gfp, + const struct z3fold_ops *ops) +{ + struct z3fold_pool *pool; + int i; + + pool = kzalloc(sizeof(struct z3fold_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * z3fold_destroy_pool() - destroys an existing z3fold pool + * @pool: the z3fold pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +static void z3fold_destroy_pool(struct z3fold_pool *pool) +{ + kfree(pool); +} + +/* Has to be called with lock held */ +static int z3fold_compact_page(struct z3fold_header *zhdr) +{ + struct page *page = virt_to_page(zhdr); + void *beg = zhdr; + + + if (!test_bit(MIDDLE_CHUNK_MAPPED, &page->private) && + zhdr->middle_chunks != 0 && + zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + memmove(beg + ZHDR_SIZE_ALIGNED, + beg + (zhdr->start_middle << CHUNK_SHIFT), + zhdr->middle_chunks << CHUNK_SHIFT); + zhdr->first_chunks = zhdr->middle_chunks; + zhdr->middle_chunks = 0; + zhdr->start_middle = 0; + zhdr->first_num++; + return 1; + } + return 0; +} + +/** + * z3fold_alloc() - allocates a region of a given size + * @pool: z3fold pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as z3fold pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL if the size or + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate + * a new page. + */ +static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + int chunks = 0, i, freechunks; + struct z3fold_header *zhdr = NULL; + enum buddy bud; + struct page *page; + + if (!size || (gfp & __GFP_HIGHMEM)) + return -EINVAL; + + if (size > PAGE_SIZE) + return -ENOSPC; + + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) + bud = HEADLESS; + else { + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied z3fold page. */ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct z3fold_header, buddy); + page = virt_to_page(zhdr); + if (zhdr->first_chunks == 0) { + if (zhdr->middle_chunks != 0 && + chunks >= zhdr->start_middle) + bud = LAST; + else + bud = FIRST; + } else if (zhdr->last_chunks == 0) + bud = LAST; + else if (zhdr->middle_chunks == 0) + bud = MIDDLE; + else { + pr_err("No free chunks in unbuddied\n"); + WARN_ON(1); + continue; + } + list_del(&zhdr->buddy); + goto found; + } + } + bud = FIRST; + spin_unlock(&pool->lock); + } + + /* Couldn't find unbuddied z3fold page, create new one */ + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_z3fold_page(page); + + if (bud == HEADLESS) { + set_bit(PAGE_HEADLESS, &page->private); + goto headless; + } + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else if (bud == LAST) + zhdr->last_chunks = chunks; + else { + zhdr->middle_chunks = chunks; + zhdr->start_middle = zhdr->first_chunks + 1; + } + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 || + zhdr->middle_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + +headless: + /* Add/move z3fold page to beginning of LRU */ + if (!list_empty(&page->lru)) + list_del(&page->lru); + + list_add(&page->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * z3fold_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by z3fold_alloc() + * + * In the case that the z3fold page in which the allocation resides is under + * reclaim, as indicated by the PG_reclaim flag being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see z3fold_reclaim_page() below). + */ +static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + int freechunks; + struct page *page; + enum buddy bud; + + spin_lock(&pool->lock); + zhdr = handle_to_z3fold_header(handle); + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) { + /* HEADLESS page stored */ + bud = HEADLESS; + } else { + bud = (handle - zhdr->first_num) & BUDDY_MASK; + + switch (bud) { + case FIRST: + zhdr->first_chunks = 0; + break; + case MIDDLE: + zhdr->middle_chunks = 0; + zhdr->start_middle = 0; + break; + case LAST: + zhdr->last_chunks = 0; + break; + default: + pr_err("%s: unknown bud %d\n", __func__, bud); + WARN_ON(1); + spin_unlock(&pool->lock); + return; + } + } + + if (test_bit(UNDER_RECLAIM, &page->private)) { + /* z3fold page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + if (bud != HEADLESS) { + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + } + + if (bud == HEADLESS || + (zhdr->first_chunks == 0 && zhdr->middle_chunks == 0 && + zhdr->last_chunks == 0)) { + /* z3fold page is empty, free */ + list_del(&page->lru); + clear_bit(PAGE_HEADLESS, &page->private); + free_z3fold_page(zhdr); + pool->pages_nr--; + } else { + z3fold_compact_page(zhdr); + /* Add to the unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +/** + * z3fold_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retires: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * z3fold reclaim is different from normal system reclaim in that it is done + * from the bottom, up. This is because only the bottom layer, z3fold, has + * information on how the allocations are organized within each z3fold page. + * This has the potential to create interesting locking situations between + * z3fold and the user, however. + * + * To avoid these, this is how z3fold_reclaim_page() should be called: + + * The user detects a page should be reclaimed and calls z3fold_reclaim_page(). + * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and + * call the user-defined eviction handler with the pool and handle as + * arguments. + * + * If the handle can not be evicted, the eviction handler should return + * non-zero. z3fold_reclaim_page() will add the z3fold page back to the + * appropriate list and try the next z3fold page on the LRU up to + * a user defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the setting of the PG_reclaim flag on the underlying page. + * + * If all buddies in the z3fold page are successfully evicted, then the + * z3fold page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict or an eviction handler is not registered, -EAGAIN if + * the retry limit was hit. + */ +static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries) +{ + int i, ret = 0, freechunks; + struct z3fold_header *zhdr; + struct page *page; + unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + page = list_last_entry(&pool->lru, struct page, lru); + list_del(&page->lru); + + /* Protect z3fold page against free */ + set_bit(UNDER_RECLAIM, &page->private); + zhdr = page_address(page); + if (!test_bit(PAGE_HEADLESS, &page->private)) { + list_del(&zhdr->buddy); + /* + * We need encode the handles before unlocking, since + * we can race with free that will set + * (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + middle_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->middle_chunks) + middle_handle = encode_handle(zhdr, MIDDLE); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + } else { + first_handle = encode_handle(zhdr, HEADLESS); + last_handle = middle_handle = 0; + } + + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (middle_handle) { + ret = pool->ops->evict(pool, middle_handle); + if (ret) + goto next; + } + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + clear_bit(UNDER_RECLAIM, &page->private); + if ((test_bit(PAGE_HEADLESS, &page->private) && ret == 0) || + (zhdr->first_chunks == 0 && zhdr->last_chunks == 0 && + zhdr->middle_chunks == 0)) { + /* + * All buddies are now free, free the z3fold page and + * return success. + */ + clear_bit(PAGE_HEADLESS, &page->private); + free_z3fold_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks != 0 && + zhdr->last_chunks != 0 && zhdr->middle_chunks != 0) { + /* Full, add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } else if (!test_bit(PAGE_HEADLESS, &page->private)) { + z3fold_compact_page(zhdr); + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + /* add to beginning of LRU */ + list_add(&page->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * z3fold_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * Extracts the buddy number from handle and constructs the pointer to the + * correct starting chunk within the page. + * + * Returns: a pointer to the mapped allocation + */ +static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + struct page *page; + void *addr; + enum buddy buddy; + + spin_lock(&pool->lock); + zhdr = handle_to_z3fold_header(handle); + addr = zhdr; + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) + goto out; + + buddy = handle_to_buddy(handle); + switch (buddy) { + case FIRST: + addr += ZHDR_SIZE_ALIGNED; + break; + case MIDDLE: + addr += zhdr->start_middle << CHUNK_SHIFT; + set_bit(MIDDLE_CHUNK_MAPPED, &page->private); + break; + case LAST: + addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + break; + default: + pr_err("unknown buddy id %d\n", buddy); + WARN_ON(1); + addr = NULL; + break; + } +out: + spin_unlock(&pool->lock); + return addr; +} + +/** + * z3fold_unmap() - unmaps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle) +{ + struct z3fold_header *zhdr; + struct page *page; + enum buddy buddy; + + spin_lock(&pool->lock); + zhdr = handle_to_z3fold_header(handle); + page = virt_to_page(zhdr); + + if (test_bit(PAGE_HEADLESS, &page->private)) { + spin_unlock(&pool->lock); + return; + } + + buddy = handle_to_buddy(handle); + if (buddy == MIDDLE) + clear_bit(MIDDLE_CHUNK_MAPPED, &page->private); + spin_unlock(&pool->lock); +} + +/** + * z3fold_get_pool_size() - gets the z3fold pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr. + */ +static u64 z3fold_get_pool_size(struct z3fold_pool *pool) +{ + return pool->pages_nr; +} + +/***************** + * zpool + ****************/ + +static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle) +{ + if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict) + return pool->zpool_ops->evict(pool->zpool, handle); + else + return -ENOENT; +} + +static const struct z3fold_ops z3fold_zpool_ops = { + .evict = z3fold_zpool_evict +}; + +static void *z3fold_zpool_create(const char *name, gfp_t gfp, + const struct zpool_ops *zpool_ops, + struct zpool *zpool) +{ + struct z3fold_pool *pool; + + pool = z3fold_create_pool(gfp, zpool_ops ? &z3fold_zpool_ops : NULL); + if (pool) { + pool->zpool = zpool; + pool->zpool_ops = zpool_ops; + } + return pool; +} + +static void z3fold_zpool_destroy(void *pool) +{ + z3fold_destroy_pool(pool); +} + +static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return z3fold_alloc(pool, size, gfp, handle); +} +static void z3fold_zpool_free(void *pool, unsigned long handle) +{ + z3fold_free(pool, handle); +} + +static int z3fold_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = z3fold_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *z3fold_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return z3fold_map(pool, handle); +} +static void z3fold_zpool_unmap(void *pool, unsigned long handle) +{ + z3fold_unmap(pool, handle); +} + +static u64 z3fold_zpool_total_size(void *pool) +{ + return z3fold_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver z3fold_zpool_driver = { + .type = "z3fold", + .owner = THIS_MODULE, + .create = z3fold_zpool_create, + .destroy = z3fold_zpool_destroy, + .malloc = z3fold_zpool_malloc, + .free = z3fold_zpool_free, + .shrink = z3fold_zpool_shrink, + .map = z3fold_zpool_map, + .unmap = z3fold_zpool_unmap, + .total_size = z3fold_zpool_total_size, +}; + +MODULE_ALIAS("zpool-z3fold"); + +static int __init init_z3fold(void) +{ + /* Make sure the z3fold header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct z3fold_header) > ZHDR_SIZE_ALIGNED); + zpool_register_driver(&z3fold_zpool_driver); + + return 0; +} + +static void __exit exit_z3fold(void) +{ + zpool_unregister_driver(&z3fold_zpool_driver); +} + +module_init(init_z3fold); +module_exit(exit_z3fold); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vitaly Wool "); +MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages"); -- cgit v1.2.3 From cd33a76b0f2805fb0d6a05a2d53933f3817ccc9b Mon Sep 17 00:00:00 2001 From: Richard Leitner Date: Fri, 20 May 2016 16:58:33 -0700 Subject: mm/memblock.c: remove unnecessary always-true comparison Comparing an u64 variable to >= 0 returns always true and can therefore be removed. This issue was detected using the -Wtype-limits gcc flag. This patch fixes following type-limits warning: mm/memblock.c: In function `__next_reserved_mem_region': mm/memblock.c:843:11: warning: comparison of unsigned expression >= 0 is always true [-Wtype-limits] if (*idx >= 0 && *idx < type->cnt) { Link: http://lkml.kernel.org/r/20160510103625.3a7f8f32@g0hl1n.net Signed-off-by: Richard Leitner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 3b93daa46fc5..ac1248933b31 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -824,7 +824,7 @@ void __init_memblock __next_reserved_mem_region(u64 *idx, { struct memblock_type *type = &memblock.reserved; - if (*idx >= 0 && *idx < type->cnt) { + if (*idx < type->cnt) { struct memblock_region *r = &type->regions[*idx]; phys_addr_t base = r->base; phys_addr_t size = r->size; -- cgit v1.2.3 From 4b50bcc7eda4d3cc9e3f2a0aa60e590fedf728c5 Mon Sep 17 00:00:00 2001 From: Stefan Bader Date: Fri, 20 May 2016 16:58:38 -0700 Subject: mm: use phys_addr_t for reserve_bootmem_region() arguments Since commit 92923ca3aace ("mm: meminit: only set page reserved in the memblock region") the reserved bit is set on reserved memblock regions. However start and end address are passed as unsigned long. This is only 32bit on i386, so it can end up marking the wrong pages reserved for ranges at 4GB and above. This was observed on a 32bit Xen dom0 which was booted with initial memory set to a value below 4G but allowing to balloon in memory (dom0_mem=1024M for example). This would define a reserved bootmem region for the additional memory (for example on a 8GB system there was a reverved region covering the 4GB-8GB range). But since the addresses were passed on as unsigned long, this was actually marking all pages from 0 to 4GB as reserved. Fixes: 92923ca3aacef63 ("mm: meminit: only set page reserved in the memblock region") Link: http://lkml.kernel.org/r/1463491221-10573-1-git-send-email-stefan.bader@canonical.com Signed-off-by: Stefan Bader Cc: [4.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 65d18a45b8e8..fbdb9d40847f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1763,7 +1763,7 @@ extern void free_highmem_page(struct page *page); extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); -extern void reserve_bootmem_region(unsigned long start, unsigned long end); +extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3f4b69aaa23a..2dd1ba4e70cc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1205,7 +1205,7 @@ static inline void init_reserved_page(unsigned long pfn) * marks the pages PageReserved. The remaining valid pages are later * sent to the buddy page allocator. */ -void __meminit reserve_bootmem_region(unsigned long start, unsigned long end) +void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) { unsigned long start_pfn = PFN_DOWN(start); unsigned long end_pfn = PFN_UP(end); -- cgit v1.2.3 From 5c0a85fad949212b3e059692deecdeed74ae7ec7 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 20 May 2016 16:58:41 -0700 Subject: mm: make faultaround produce old ptes Currently, faultaround code produces young pte. This can screw up vmscan behaviour[1], as it makes vmscan think that these pages are hot and not push them out on first round. During sparse file access faultaround gets more pages mapped and all of them are young. Under memory pressure, this makes vmscan swap out anon pages instead, or to drop other page cache pages which otherwise stay resident. Modify faultaround to produce old ptes, so they can easily be reclaimed under memory pressure. This can to some extend defeat the purpose of faultaround on machines without hardware accessed bit as it will not help us with reducing the number of minor page faults. We may want to disable faultaround on such machines altogether, but that's subject for separate patchset. Minchan: "I tested 512M mmap sequential word read test on non-HW access bit system (i.e., ARM) and confirmed it doesn't increase minor fault any more. old: 4096 fault_around minor fault: 131291 elapsed time: 6747645 usec new: 65536 fault_around minor fault: 131291 elapsed time: 6709263 usec 0.56% benefit" [1] https://lkml.kernel.org/r/1460992636-711-1-git-send-email-vinmenon@codeaurora.org Link: http://lkml.kernel.org/r/1463488366-47723-1-git-send-email-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Minchan Kim Tested-by: Minchan Kim Acked-by: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Vinayak Menon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/filemap.c | 2 +- mm/memory.c | 23 ++++++++++++++++++----- 3 files changed, 20 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index fbdb9d40847f..f223ac26b5d9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -596,7 +596,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) } void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon); + struct page *page, pte_t *pte, bool write, bool anon, bool old); #endif /* diff --git a/mm/filemap.c b/mm/filemap.c index 8f4859989f1b..b418405903bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2191,7 +2191,7 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false); + do_set_pte(vma, addr, page, pte, false, false, true); unlock_page(page); goto next; unlock: diff --git a/mm/memory.c b/mm/memory.c index 007c72ad03f6..f29e5ab0342d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2876,7 +2876,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, * vm_ops->map_pages. */ void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon) + struct page *page, pte_t *pte, bool write, bool anon, bool old) { pte_t entry; @@ -2884,6 +2884,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (old) + entry = pte_mkold(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); @@ -3021,9 +3023,20 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); - do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) goto unlock_out; + do_fault_around(vma, address, pte, pgoff, flags); + /* Check if the fault is handled by faultaround */ + if (!pte_same(*pte, orig_pte)) { + /* + * Faultaround produce old pte, but the pte we've + * handler fault for should be young. + */ + pte_t entry = pte_mkyoung(*pte); + if (ptep_set_access_flags(vma, address, pte, entry, 0)) + update_mmu_cache(vma, address, pte); + goto unlock_out; + } pte_unmap_unlock(pte, ptl); } @@ -3038,7 +3051,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, false, false); + do_set_pte(vma, address, fault_page, pte, false, false, false); unlock_page(fault_page); unlock_out: pte_unmap_unlock(pte, ptl); @@ -3090,7 +3103,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } goto uncharge_out; } - do_set_pte(vma, address, new_page, pte, true, true); + do_set_pte(vma, address, new_page, pte, true, true, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); @@ -3147,7 +3160,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false); + do_set_pte(vma, address, fault_page, pte, true, false, false); pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) -- cgit v1.2.3 From d0834a6c2c5b0c76cfb806bd7dba6556d8b4edbb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 20 May 2016 16:58:44 -0700 Subject: mm: disable fault around on emulated access bit architecture fault_around aims to reduce minor faults of file-backed pages via speculative ahead pte mapping and relying on readahead logic. However, on non-HW access bit architecture the benefit is highly limited because they should emulate the young bit with minor faults for reclaim's page aging algorithm. IOW, we cannot reduce minor faults on those architectures. I did quick a test on my ARM machine. 512M file mmap sequential every word read on eSATA drive 4 times. stddev is stable. = fault_around 4096 = elapsed time(usec): 6747645 = fault_around 65536 = elapsed time(usec): 6709263 0.5% gain. Even when I tested it with eMMC there is no gain because I guess with slow storage the major fault is the dominant factor. Also, fault_around has the side effect of shrinking slab more aggressively and causes higher vmpressure, so if such speculation fails, it can evict slab more which can result in page I/O (e.g., inode cache). In the end, it would make void any benefit of fault_around. So let's make the default "disabled" on those architectures. Link: http://lkml.kernel.org/r/20160518014229.GB21538@bbox Signed-off-by: Minchan Kim Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index f29e5ab0342d..a1b93d9e4449 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2899,8 +2899,16 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } +/* + * If architecture emulates "accessed" or "young" bit without HW support, + * there is no much gain with fault_around. + */ static unsigned long fault_around_bytes __read_mostly = +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS + PAGE_SIZE; +#else rounddown_pow_of_two(65536); +#endif #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) -- cgit v1.2.3 From 29b52de182acf50f85a8284ad39104d84c9bbf57 Mon Sep 17 00:00:00 2001 From: "seokhoon.yoon" Date: Fri, 20 May 2016 16:58:47 -0700 Subject: mm, kasan: fix to call kasan_free_pages() after poisoning page When CONFIG_PAGE_POISONING and CONFIG_KASAN is enabled, free_pages_prepare()'s codeflow is below. 1)kmemcheck_free_shadow() 2)kasan_free_pages() - set shadow byte of page is freed 3)kernel_poison_pages() 3.1) check access to page is valid or not using kasan ---> error occur, kasan think it is invalid access 3.2) poison page 4)kernel_map_pages() So kasan_free_pages() should be called after poisoning the page. Link: http://lkml.kernel.org/r/1463220405-7455-1-git-send-email-iamyooon@gmail.com Signed-off-by: seokhoon.yoon Cc: Andrey Ryabinin Cc: Laura Abbott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2dd1ba4e70cc..383b14b4f61d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -993,7 +993,6 @@ static __always_inline bool free_pages_prepare(struct page *page, trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); - kasan_free_pages(page, order); /* * Check tail pages before head page information is cleared to @@ -1035,6 +1034,7 @@ static __always_inline bool free_pages_prepare(struct page *page, arch_free_page(page, order); kernel_poison_pages(page, 1 << order, 0); kernel_map_pages(page, 1 << order, 0); + kasan_free_pages(page, order); return true; } -- cgit v1.2.3 From e570f56cccd215db68e50870ee74b7d9c0022109 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Fri, 20 May 2016 16:58:50 -0700 Subject: mm: check_new_page_bad() directly returns in __PG_HWPOISON case Currently we check page->flags twice for "HWPoisoned" case of check_new_page_bad(), which can cause a race with unpoisoning. This race unnecessarily taints kernel with "BUG: Bad page state". check_new_page_bad() is the only caller of bad_page() which is interested in __PG_HWPOISON, so let's move the hwpoison related code in bad_page() to it. Link: http://lkml.kernel.org/r/20160518100949.GA17299@hori1.linux.bs1.fc.nec.co.jp Signed-off-by: Naoya Horiguchi Acked-by: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 383b14b4f61d..f8f3bfc435ee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -522,12 +522,6 @@ static void bad_page(struct page *page, const char *reason, static unsigned long nr_shown; static unsigned long nr_unshown; - /* Don't complain about poisoned pages */ - if (PageHWPoison(page)) { - page_mapcount_reset(page); /* remove PageBuddy */ - return; - } - /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -1654,6 +1648,9 @@ static void check_new_page_bad(struct page *page) if (unlikely(page->flags & __PG_HWPOISON)) { bad_reason = "HWPoisoned (hardware-corrupted)"; bad_flags = __PG_HWPOISON; + /* Don't complain about hwpoisoned pages */ + page_mapcount_reset(page); /* remove PageBuddy */ + return; } if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) { bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set"; -- cgit v1.2.3 From a53eaff8c1192bb5bdfda5deb484bc8f415c5dfd Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 20 May 2016 16:58:53 -0700 Subject: MM: increase safety margin provided by PF_LESS_THROTTLE When nfsd is exporting a filesystem over NFS which is then NFS-mounted on the local machine there is a risk of deadlock. This happens when there are lots of dirty pages in the NFS filesystem and they cause NFSD to be throttled, either in throttle_vm_writeout() or in balance_dirty_pages(). To avoid this problem the PF_LESS_THROTTLE flag is set for NFSD threads and it provides a 25% increase to the limits that affect NFSD. Any process writing to an NFS filesystem will be throttled well before the number of dirty NFS pages reaches the limit imposed on NFSD, so NFSD will not deadlock on pages that it needs to write out. At least it shouldn't. All processes are allowed a small excess margin to avoid performing too many calculations: ratelimit_pages. ratelimit_pages is set so that if a thread on every CPU uses the entire margin, the total will only go 3% over the limit, and this is much less than the 25% bonus that PF_LESS_THROTTLE provides, so this margin shouldn't be a problem. But it is. The "total memory" that these 3% and 25% are calculated against are not really total memory but are "global_dirtyable_memory()" which doesn't include anonymous memory, just free memory and page-cache memory. The "ratelimit_pages" number is based on whatever the global_dirtyable_memory was on the last CPU hot-plug, which might not be what you expect, but is probably close to the total freeable memory. The throttle threshold uses the global_dirtable_memory at the moment when the throttling happens, which could be much less than at the last CPU hotplug. So if lots of anonymous memory has been allocated, thus pushing out lots of page-cache pages, then NFSD might end up being throttled due to dirty NFS pages because the "25%" bonus it gets is calculated against a rather small amount of dirtyable memory, while the "3%" margin that other processes are allowed to dirty without penalty is calculated against a much larger number. To remove this possibility of deadlock we need to make sure that the margin granted to PF_LESS_THROTTLE exceeds that rate-limit margin. Simply adding ratelimit_pages isn't enough as that should be multiplied by the number of cpus. So add "global_wb_domain.dirty_limit / 32" as that more accurately reflects the current total over-shoot margin. This ensures that the number of dirty NFS pages never gets so high that nfsd will be throttled waiting for them to be written. Link: http://lkml.kernel.org/r/87futgowwv.fsf@notabene.neil.brown.name Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3b88795ab46e..b9956fdee8f5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -411,8 +411,8 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) bg_thresh = thresh / 2; tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - bg_thresh += bg_thresh / 4; - thresh += thresh / 4; + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; } dtc->thresh = thresh; dtc->bg_thresh = bg_thresh; -- cgit v1.2.3 From f0508977787b3bcfd54e82111ab50d6245b9f4df Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 20 May 2016 16:58:56 -0700 Subject: mm, thp: khugepaged should scan when sleep value is written If a large value is written to scan_sleep_millisecs, for example, that period must lapse before khugepaged will wake up for periodic collapsing. If this value is tuned to 1 day, for example, and then re-tuned to its default 10s, khugepaged will still wait for a day before scanning again. This patch causes khugepaged to wakeup immediately when the value is changed and then sleep until that value is rewritten or the new value lapses. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1605181453200.4786@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1764184c4774..41ef7547e822 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,6 +89,7 @@ static unsigned int khugepaged_full_scans; static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000; /* during fragmentation poll the hugepage allocator once every minute */ static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000; +static unsigned long khugepaged_sleep_expire; static struct task_struct *khugepaged_thread __read_mostly; static DEFINE_MUTEX(khugepaged_mutex); static DEFINE_SPINLOCK(khugepaged_mm_lock); @@ -467,6 +468,7 @@ static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, return -EINVAL; khugepaged_scan_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; @@ -494,6 +496,7 @@ static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, return -EINVAL; khugepaged_alloc_sleep_millisecs = msecs; + khugepaged_sleep_expire = 0; wake_up_interruptible(&khugepaged_wait); return count; @@ -2791,15 +2794,25 @@ static void khugepaged_do_scan(void) put_page(hpage); } +static bool khugepaged_should_wakeup(void) +{ + return kthread_should_stop() || + time_after_eq(jiffies, khugepaged_sleep_expire); +} + static void khugepaged_wait_work(void) { if (khugepaged_has_work()) { - if (!khugepaged_scan_sleep_millisecs) + const unsigned long scan_sleep_jiffies = + msecs_to_jiffies(khugepaged_scan_sleep_millisecs); + + if (!scan_sleep_jiffies) return; + khugepaged_sleep_expire = jiffies + scan_sleep_jiffies; wait_event_freezable_timeout(khugepaged_wait, - kthread_should_stop(), - msecs_to_jiffies(khugepaged_scan_sleep_millisecs)); + khugepaged_should_wakeup(), + scan_sleep_jiffies); return; } -- cgit v1.2.3 From 6cd9dc3e75078ef646076fa63adfb9b85ced0b66 Mon Sep 17 00:00:00 2001 From: Chen Feng Date: Fri, 20 May 2016 16:59:02 -0700 Subject: mm/compaction.c: fix zoneindex in kcompactd() While testing the kcompactd in my platform 3G MEM only DMA ZONE. I found the kcompactd never wakeup. It seems the zoneindex has already minus 1 before. So the traverse here should be <=. It fixes a regression where kswapd could previously compact, but kcompactd not. Not a crash fix though. [akpm@linux-foundation.org: fix kcompactd_do_work() as well, per Hugh] Link: http://lkml.kernel.org/r/1463659121-84124-1-git-send-email-puck.chen@hisilicon.com Fixes: accf62422b3a ("mm, kswapd: replace kswapd compaction with waking up kcompactd") Signed-off-by: Chen Feng Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Michal Hocko Cc: Kirill A. Shutemov Cc: Johannes Weiner Cc: Tejun Heo Cc: Zhuangluan Su Cc: Yiping Xu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index d8a20fcf8678..1427366ad673 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1862,7 +1862,7 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat) struct zone *zone; enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx; - for (zoneid = 0; zoneid < classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= classzone_idx; zoneid++) { zone = &pgdat->node_zones[zoneid]; if (!populated_zone(zone)) @@ -1897,7 +1897,7 @@ static void kcompactd_do_work(pg_data_t *pgdat) cc.classzone_idx); count_vm_event(KCOMPACTD_WAKE); - for (zoneid = 0; zoneid < cc.classzone_idx; zoneid++) { + for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) { int status; zone = &pgdat->node_zones[zoneid]; -- cgit v1.2.3 From dfef2ef4027b1304149a65dc33794eab65e8a3bf Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Fri, 20 May 2016 16:59:05 -0700 Subject: mm, migrate: increment fail count on ENOMEM If page migration fails due to -ENOMEM, nr_failed should still be incremented for proper statistics. This was encountered recently when all page migration vmstats showed 0, and inferred that migrate_pages() was never called, although in reality the first page migration failed because compaction_alloc() failed to find a migration target. This patch increments nr_failed so the vmstat is properly accounted on ENOMEM. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1605191510230.32658@chino.kir.corp.google.com Signed-off-by: David Rientjes Cc: Vlastimil Babka Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 53ab6398e7a2..9baf41c877ff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1171,6 +1171,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, switch(rc) { case -ENOMEM: + nr_failed++; goto out; case -EAGAIN: retry++; -- cgit v1.2.3 From 55834c59098d0c5a97b0f3247e55832b67facdcf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 20 May 2016 16:59:11 -0700 Subject: mm: kasan: initial memory quarantine implementation Quarantine isolates freed objects in a separate queue. The objects are returned to the allocator later, which helps to detect use-after-free errors. When the object is freed, its state changes from KASAN_STATE_ALLOC to KASAN_STATE_QUARANTINE. The object is poisoned and put into quarantine instead of being returned to the allocator, therefore every subsequent access to that object triggers a KASAN error, and the error handler is able to say where the object has been allocated and deallocated. When it's time for the object to leave quarantine, its state becomes KASAN_STATE_FREE and it's returned to the allocator. From now on the allocator may reuse it for another allocation. Before that happens, it's still possible to detect a use-after free on that object (it retains the allocation/deallocation stacks). When the allocator reuses this object, the shadow is unpoisoned and old allocation/deallocation stacks are wiped. Therefore a use of this object, even an incorrect one, won't trigger ASan warning. Without the quarantine, it's not guaranteed that the objects aren't reused immediately, that's why the probability of catching a use-after-free is lower than with quarantine in place. Quarantine isolates freed objects in a separate queue. The objects are returned to the allocator later, which helps to detect use-after-free errors. Freed objects are first added to per-cpu quarantine queues. When a cache is destroyed or memory shrinking is requested, the objects are moved into the global quarantine queue. Whenever a kmalloc call allows memory reclaiming, the oldest objects are popped out of the global queue until the total size of objects in quarantine is less than 3/4 of the maximum quarantine size (which is a fraction of installed physical memory). As long as an object remains in the quarantine, KASAN is able to report accesses to it, so the chance of reporting a use-after-free is increased. Once the object leaves quarantine, the allocator may reuse it, in which case the object is unpoisoned and KASAN can't detect incorrect accesses to it. Right now quarantine support is only enabled in SLAB allocator. Unification of KASAN features in SLAB and SLUB will be done later. This patch is based on the "mm: kasan: quarantine" patch originally prepared by Dmitry Chernenkov. A number of improvements have been suggested by Andrey Ryabinin. [glider@google.com: v9] Link: http://lkml.kernel.org/r/1462987130-144092-1-git-send-email-glider@google.com Signed-off-by: Alexander Potapenko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Andrey Ryabinin Cc: Steven Rostedt Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 13 ++- mm/kasan/Makefile | 1 + mm/kasan/kasan.c | 57 ++++++++-- mm/kasan/kasan.h | 21 +++- mm/kasan/quarantine.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/report.c | 1 + mm/mempool.c | 2 +- mm/slab.c | 12 ++- mm/slab.h | 2 + mm/slab_common.c | 2 + 10 files changed, 387 insertions(+), 15 deletions(-) create mode 100644 mm/kasan/quarantine.c (limited to 'mm') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 737371b56044..611927f5870d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -50,6 +50,8 @@ void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags); +void kasan_cache_shrink(struct kmem_cache *cache); +void kasan_cache_destroy(struct kmem_cache *cache); void kasan_poison_slab(struct page *page); void kasan_unpoison_object_data(struct kmem_cache *cache, void *object); @@ -63,7 +65,8 @@ void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size, void kasan_krealloc(const void *object, size_t new_size, gfp_t flags); void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); -void kasan_slab_free(struct kmem_cache *s, void *object); +bool kasan_slab_free(struct kmem_cache *s, void *object); +void kasan_poison_slab_free(struct kmem_cache *s, void *object); struct kasan_cache { int alloc_meta_offset; @@ -88,6 +91,8 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, unsigned long *flags) {} +static inline void kasan_cache_shrink(struct kmem_cache *cache) {} +static inline void kasan_cache_destroy(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct page *page) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -105,7 +110,11 @@ static inline void kasan_krealloc(const void *object, size_t new_size, static inline void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) {} -static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} +static inline bool kasan_slab_free(struct kmem_cache *s, void *object) +{ + return false; +} +static inline void kasan_poison_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } static inline void kasan_free_shadow(const struct vm_struct *vm) {} diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 131daadf40e4..1548749a3d45 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -8,3 +8,4 @@ CFLAGS_REMOVE_kasan.o = -pg CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) obj-y := kasan.o report.o kasan_init.o +obj-$(CONFIG_SLAB) += quarantine.o diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 38f1dd79acdb..8df666bb23be 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -388,6 +388,16 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, } #endif +void kasan_cache_shrink(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + +void kasan_cache_destroy(struct kmem_cache *cache) +{ + quarantine_remove_cache(cache); +} + void kasan_poison_slab(struct page *page) { kasan_poison_shadow(page_address(page), @@ -482,7 +492,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -void kasan_slab_free(struct kmem_cache *cache, void *object) +void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -491,18 +501,43 @@ void kasan_slab_free(struct kmem_cache *cache, void *object) if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) return; + kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); +} + +bool kasan_slab_free(struct kmem_cache *cache, void *object) +{ #ifdef CONFIG_SLAB - if (cache->flags & SLAB_KASAN) { - struct kasan_free_meta *free_info = - get_free_info(cache, object); + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + return false; + + if (likely(cache->flags & SLAB_KASAN)) { struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - alloc_info->state = KASAN_STATE_FREE; - set_track(&free_info->track, GFP_NOWAIT); + struct kasan_free_meta *free_info = + get_free_info(cache, object); + + switch (alloc_info->state) { + case KASAN_STATE_ALLOC: + alloc_info->state = KASAN_STATE_QUARANTINE; + quarantine_put(free_info, cache); + set_track(&free_info->track, GFP_NOWAIT); + kasan_poison_slab_free(cache, object); + return true; + case KASAN_STATE_QUARANTINE: + case KASAN_STATE_FREE: + pr_err("Double free"); + dump_stack(); + break; + default: + break; + } } + return false; +#else + kasan_poison_slab_free(cache, object); + return false; #endif - - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); } void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, @@ -511,6 +546,9 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size, unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(object == NULL)) return; @@ -541,6 +579,9 @@ void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) unsigned long redzone_start; unsigned long redzone_end; + if (flags & __GFP_RECLAIM) + quarantine_reduce(); + if (unlikely(ptr == NULL)) return; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30a2f0ba0e09..7f7ac51d7faf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -62,6 +62,7 @@ struct kasan_global { enum kasan_state { KASAN_STATE_INIT, KASAN_STATE_ALLOC, + KASAN_STATE_QUARANTINE, KASAN_STATE_FREE }; @@ -79,9 +80,14 @@ struct kasan_alloc_meta { u32 reserved; }; +struct qlist_node { + struct qlist_node *next; +}; struct kasan_free_meta { - /* Allocator freelist pointer, unused by KASAN. */ - void **freelist; + /* This field is used while the object is in the quarantine. + * Otherwise it might be used for the allocator freelist. + */ + struct qlist_node quarantine_link; struct kasan_track track; }; @@ -105,4 +111,15 @@ static inline bool kasan_report_enabled(void) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); +#ifdef CONFIG_SLAB +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); +void quarantine_reduce(void); +void quarantine_remove_cache(struct kmem_cache *cache); +#else +static inline void quarantine_put(struct kasan_free_meta *info, + struct kmem_cache *cache) { } +static inline void quarantine_reduce(void) { } +static inline void quarantine_remove_cache(struct kmem_cache *cache) { } +#endif + #endif diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c new file mode 100644 index 000000000000..4973505a9bdd --- /dev/null +++ b/mm/kasan/quarantine.c @@ -0,0 +1,291 @@ +/* + * KASAN quarantine. + * + * Author: Alexander Potapenko + * Copyright (C) 2016 Google, Inc. + * + * Based on code by Dmitry Chernenkov. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kasan.h" + +/* Data structure and operations for quarantine queues. */ + +/* + * Each queue is a signle-linked list, which also stores the total size of + * objects inside of it. + */ +struct qlist_head { + struct qlist_node *head; + struct qlist_node *tail; + size_t bytes; +}; + +#define QLIST_INIT { NULL, NULL, 0 } + +static bool qlist_empty(struct qlist_head *q) +{ + return !q->head; +} + +static void qlist_init(struct qlist_head *q) +{ + q->head = q->tail = NULL; + q->bytes = 0; +} + +static void qlist_put(struct qlist_head *q, struct qlist_node *qlink, + size_t size) +{ + if (unlikely(qlist_empty(q))) + q->head = qlink; + else + q->tail->next = qlink; + q->tail = qlink; + qlink->next = NULL; + q->bytes += size; +} + +static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) +{ + if (unlikely(qlist_empty(from))) + return; + + if (qlist_empty(to)) { + *to = *from; + qlist_init(from); + return; + } + + to->tail->next = from->head; + to->tail = from->tail; + to->bytes += from->bytes; + + qlist_init(from); +} + +static void qlist_move(struct qlist_head *from, struct qlist_node *last, + struct qlist_head *to, size_t size) +{ + if (unlikely(last == from->tail)) { + qlist_move_all(from, to); + return; + } + if (qlist_empty(to)) + to->head = from->head; + else + to->tail->next = from->head; + to->tail = last; + from->head = last->next; + last->next = NULL; + from->bytes -= size; + to->bytes += size; +} + + +/* + * The object quarantine consists of per-cpu queues and a global queue, + * guarded by quarantine_lock. + */ +static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); + +static struct qlist_head global_quarantine; +static DEFINE_SPINLOCK(quarantine_lock); + +/* Maximum size of the global queue. */ +static unsigned long quarantine_size; + +/* + * The fraction of physical memory the quarantine is allowed to occupy. + * Quarantine doesn't support memory shrinker with SLAB allocator, so we keep + * the ratio low to avoid OOM. + */ +#define QUARANTINE_FRACTION 32 + +#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) +#define QUARANTINE_PERCPU_SIZE (1 << 20) + +static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) +{ + return virt_to_head_page(qlink)->slab_cache; +} + +static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache) +{ + struct kasan_free_meta *free_info = + container_of(qlink, struct kasan_free_meta, + quarantine_link); + + return ((void *)free_info) - cache->kasan_info.free_meta_offset; +} + +static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache) +{ + void *object = qlink_to_object(qlink, cache); + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + unsigned long flags; + + local_irq_save(flags); + alloc_info->state = KASAN_STATE_FREE; + ___cache_free(cache, object, _THIS_IP_); + local_irq_restore(flags); +} + +static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache) +{ + struct qlist_node *qlink; + + if (unlikely(qlist_empty(q))) + return; + + qlink = q->head; + while (qlink) { + struct kmem_cache *obj_cache = + cache ? cache : qlink_to_cache(qlink); + struct qlist_node *next = qlink->next; + + qlink_free(qlink, obj_cache); + qlink = next; + } + qlist_init(q); +} + +void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head *q; + struct qlist_head temp = QLIST_INIT; + + local_irq_save(flags); + + q = this_cpu_ptr(&cpu_quarantine); + qlist_put(q, &info->quarantine_link, cache->size); + if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) + qlist_move_all(q, &temp); + + local_irq_restore(flags); + + if (unlikely(!qlist_empty(&temp))) { + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_all(&temp, &global_quarantine); + spin_unlock_irqrestore(&quarantine_lock, flags); + } +} + +void quarantine_reduce(void) +{ + size_t new_quarantine_size; + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + size_t size_to_free = 0; + struct qlist_node *last; + + if (likely(READ_ONCE(global_quarantine.bytes) <= + READ_ONCE(quarantine_size))) + return; + + spin_lock_irqsave(&quarantine_lock, flags); + + /* + * Update quarantine size in case of hotplug. Allocate a fraction of + * the installed memory to quarantine minus per-cpu queue limits. + */ + new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + QUARANTINE_FRACTION; + new_quarantine_size -= QUARANTINE_PERCPU_SIZE * num_online_cpus(); + WRITE_ONCE(quarantine_size, new_quarantine_size); + + last = global_quarantine.head; + while (last) { + struct kmem_cache *cache = qlink_to_cache(last); + + size_to_free += cache->size; + if (!last->next || size_to_free > + global_quarantine.bytes - QUARANTINE_LOW_SIZE) + break; + last = last->next; + } + qlist_move(&global_quarantine, last, &to_free, size_to_free); + + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, NULL); +} + +static void qlist_move_cache(struct qlist_head *from, + struct qlist_head *to, + struct kmem_cache *cache) +{ + struct qlist_node *prev = NULL, *curr; + + if (unlikely(qlist_empty(from))) + return; + + curr = from->head; + while (curr) { + struct qlist_node *qlink = curr; + struct kmem_cache *obj_cache = qlink_to_cache(qlink); + + if (obj_cache == cache) { + if (unlikely(from->head == qlink)) { + from->head = curr->next; + prev = curr; + } else + prev->next = curr->next; + if (unlikely(from->tail == qlink)) + from->tail = curr->next; + from->bytes -= cache->size; + qlist_put(to, qlink, cache->size); + } else { + prev = curr; + } + curr = curr->next; + } +} + +static void per_cpu_remove_cache(void *arg) +{ + struct kmem_cache *cache = arg; + struct qlist_head to_free = QLIST_INIT; + struct qlist_head *q; + + q = this_cpu_ptr(&cpu_quarantine); + qlist_move_cache(q, &to_free, cache); + qlist_free_all(&to_free, cache); +} + +void quarantine_remove_cache(struct kmem_cache *cache) +{ + unsigned long flags; + struct qlist_head to_free = QLIST_INIT; + + on_each_cpu(per_cpu_remove_cache, cache, 1); + + spin_lock_irqsave(&quarantine_lock, flags); + qlist_move_cache(&global_quarantine, &to_free, cache); + spin_unlock_irqrestore(&quarantine_lock, flags); + + qlist_free_all(&to_free, cache); +} diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 60869a5a0124..b3c122ddd454 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -151,6 +151,7 @@ static void object_err(struct kmem_cache *cache, struct page *page, print_track(&alloc_info->track); break; case KASAN_STATE_FREE: + case KASAN_STATE_QUARANTINE: pr_err("Object freed, allocated with size %u bytes\n", alloc_info->alloc_size); free_info = get_free_info(cache, object); diff --git a/mm/mempool.c b/mm/mempool.c index 9b7a14a791cc..9e075f829d0d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -105,7 +105,7 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { if (pool->alloc == mempool_alloc_slab) - kasan_slab_free(pool->pool_data, element); + kasan_poison_slab_free(pool->pool_data, element); if (pool->alloc == mempool_kmalloc) kasan_kfree(element); if (pool->alloc == mempool_alloc_pages) diff --git a/mm/slab.c b/mm/slab.c index c11bf5007952..28864c022430 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3547,9 +3547,17 @@ free_done: static inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { - struct array_cache *ac = cpu_cache_get(cachep); + /* Put the object into the quarantine, don't touch it for now. */ + if (kasan_slab_free(cachep, objp)) + return; + + ___cache_free(cachep, objp, caller); +} - kasan_slab_free(cachep, objp); +void ___cache_free(struct kmem_cache *cachep, void *objp, + unsigned long caller) +{ + struct array_cache *ac = cpu_cache_get(cachep); check_irq_off(); kmemleak_free_recursive(objp, cachep->flags); diff --git a/mm/slab.h b/mm/slab.h index 5969769fbee6..dedb1a920fb8 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -462,4 +462,6 @@ void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); + #endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index 3239bfd758e6..a65dad7fdcd1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -715,6 +715,7 @@ void kmem_cache_destroy(struct kmem_cache *s) get_online_cpus(); get_online_mems(); + kasan_cache_destroy(s); mutex_lock(&slab_mutex); s->refcount--; @@ -753,6 +754,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); + kasan_cache_shrink(cachep); ret = __kmem_cache_shrink(cachep, false); put_online_mems(); put_online_cpus(); -- cgit v1.2.3 From 4ebb31a42ffa03912447fe1aabbdb28242f909ba Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Fri, 20 May 2016 16:59:14 -0700 Subject: mm, kasan: don't call kasan_krealloc() from ksize(). Instead of calling kasan_krealloc(), which replaces the memory allocation stack ID (if stack depot is used), just unpoison the whole memory chunk. Signed-off-by: Alexander Potapenko Acked-by: Andrey Ryabinin Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Christoph Lameter Cc: Konstantin Serebryany Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slub.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 28864c022430..cc8bbc1e6bc9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4501,7 +4501,7 @@ size_t ksize(const void *objp) /* We assume that ksize callers could use the whole allocated area, * so we need to unpoison this area. */ - kasan_krealloc(objp, size, GFP_NOWAIT); + kasan_unpoison_shadow(objp, size); return size; } diff --git a/mm/slub.c b/mm/slub.c index cf1faa4d3992..825ff4505336 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3635,8 +3635,9 @@ size_t ksize(const void *object) { size_t size = __ksize(object); /* We assume that ksize callers could use whole allocated area, - so we need unpoison this area. */ - kasan_krealloc(object, size, GFP_NOWAIT); + * so we need to unpoison this area. + */ + kasan_unpoison_shadow(object, size); return size; } EXPORT_SYMBOL(ksize); -- cgit v1.2.3 From 936bb4bbbb832f81055328b84e5afe1fc7246a8d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 20 May 2016 16:59:20 -0700 Subject: mm/kasan: print name of mem[set,cpy,move]() caller in report When bogus memory access happens in mem[set,cpy,move]() it's usually caller's fault. So don't blame mem[set,cpy,move]() in bug report, blame the caller instead. Before: BUG: KASAN: out-of-bounds access in memset+0x23/0x40 at
After: BUG: KASAN: out-of-bounds access in at
Link: http://lkml.kernel.org/r/1462538722-1574-2-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.c | 64 ++++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 8df666bb23be..e5beb40d97b1 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -273,32 +273,36 @@ static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size) return memory_is_poisoned_n(addr, size); } - -static __always_inline void check_memory_region(unsigned long addr, - size_t size, bool write) +static __always_inline void check_memory_region_inline(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) { if (unlikely(size == 0)) return; if (unlikely((void *)addr < kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) { - kasan_report(addr, size, write, _RET_IP_); + kasan_report(addr, size, write, ret_ip); return; } if (likely(!memory_is_poisoned(addr, size))) return; - kasan_report(addr, size, write, _RET_IP_); + kasan_report(addr, size, write, ret_ip); } -void __asan_loadN(unsigned long addr, size_t size); -void __asan_storeN(unsigned long addr, size_t size); +static void check_memory_region(unsigned long addr, + size_t size, bool write, + unsigned long ret_ip) +{ + check_memory_region_inline(addr, size, write, ret_ip); +} #undef memset void *memset(void *addr, int c, size_t len) { - __asan_storeN((unsigned long)addr, len); + check_memory_region((unsigned long)addr, len, true, _RET_IP_); return __memset(addr, c, len); } @@ -306,8 +310,8 @@ void *memset(void *addr, int c, size_t len) #undef memmove void *memmove(void *dest, const void *src, size_t len) { - __asan_loadN((unsigned long)src, len); - __asan_storeN((unsigned long)dest, len); + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); return __memmove(dest, src, len); } @@ -315,8 +319,8 @@ void *memmove(void *dest, const void *src, size_t len) #undef memcpy void *memcpy(void *dest, const void *src, size_t len) { - __asan_loadN((unsigned long)src, len); - __asan_storeN((unsigned long)dest, len); + check_memory_region((unsigned long)src, len, false, _RET_IP_); + check_memory_region((unsigned long)dest, len, true, _RET_IP_); return __memcpy(dest, src, len); } @@ -690,22 +694,22 @@ void __asan_unregister_globals(struct kasan_global *globals, size_t size) } EXPORT_SYMBOL(__asan_unregister_globals); -#define DEFINE_ASAN_LOAD_STORE(size) \ - void __asan_load##size(unsigned long addr) \ - { \ - check_memory_region(addr, size, false); \ - } \ - EXPORT_SYMBOL(__asan_load##size); \ - __alias(__asan_load##size) \ - void __asan_load##size##_noabort(unsigned long); \ - EXPORT_SYMBOL(__asan_load##size##_noabort); \ - void __asan_store##size(unsigned long addr) \ - { \ - check_memory_region(addr, size, true); \ - } \ - EXPORT_SYMBOL(__asan_store##size); \ - __alias(__asan_store##size) \ - void __asan_store##size##_noabort(unsigned long); \ +#define DEFINE_ASAN_LOAD_STORE(size) \ + void __asan_load##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, false, _RET_IP_);\ + } \ + EXPORT_SYMBOL(__asan_load##size); \ + __alias(__asan_load##size) \ + void __asan_load##size##_noabort(unsigned long); \ + EXPORT_SYMBOL(__asan_load##size##_noabort); \ + void __asan_store##size(unsigned long addr) \ + { \ + check_memory_region_inline(addr, size, true, _RET_IP_); \ + } \ + EXPORT_SYMBOL(__asan_store##size); \ + __alias(__asan_store##size) \ + void __asan_store##size##_noabort(unsigned long); \ EXPORT_SYMBOL(__asan_store##size##_noabort) DEFINE_ASAN_LOAD_STORE(1); @@ -716,7 +720,7 @@ DEFINE_ASAN_LOAD_STORE(16); void __asan_loadN(unsigned long addr, size_t size) { - check_memory_region(addr, size, false); + check_memory_region(addr, size, false, _RET_IP_); } EXPORT_SYMBOL(__asan_loadN); @@ -726,7 +730,7 @@ EXPORT_SYMBOL(__asan_loadN_noabort); void __asan_storeN(unsigned long addr, size_t size) { - check_memory_region(addr, size, true); + check_memory_region(addr, size, true, _RET_IP_); } EXPORT_SYMBOL(__asan_storeN); -- cgit v1.2.3 From 64f8ebaf115bcddc4aaa902f981c57ba6506bc42 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 20 May 2016 16:59:28 -0700 Subject: mm/kasan: add API to check memory regions Memory access coded in an assembly won't be seen by KASAN as a compiler can instrument only C code. Add kasan_check_[read,write]() API which is going to be used to check a certain memory range. Link: http://lkml.kernel.org/r/1462538722-1574-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- include/linux/kasan-checks.h | 12 ++++++++++++ mm/kasan/kasan.c | 12 ++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 include/linux/kasan-checks.h (limited to 'mm') diff --git a/MAINTAINERS b/MAINTAINERS index 374ffa2d81b7..8b92445561b6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6242,7 +6242,7 @@ S: Maintained F: arch/*/include/asm/kasan.h F: arch/*/mm/kasan_init* F: Documentation/kasan.txt -F: include/linux/kasan.h +F: include/linux/kasan*.h F: lib/test_kasan.c F: mm/kasan/ F: scripts/Makefile.kasan diff --git a/include/linux/kasan-checks.h b/include/linux/kasan-checks.h new file mode 100644 index 000000000000..b7f8aced7870 --- /dev/null +++ b/include/linux/kasan-checks.h @@ -0,0 +1,12 @@ +#ifndef _LINUX_KASAN_CHECKS_H +#define _LINUX_KASAN_CHECKS_H + +#ifdef CONFIG_KASAN +void kasan_check_read(const void *p, unsigned int size); +void kasan_check_write(const void *p, unsigned int size); +#else +static inline void kasan_check_read(const void *p, unsigned int size) { } +static inline void kasan_check_write(const void *p, unsigned int size) { } +#endif + +#endif diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index e5beb40d97b1..18b6a2b8d183 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -299,6 +299,18 @@ static void check_memory_region(unsigned long addr, check_memory_region_inline(addr, size, write, ret_ip); } +void kasan_check_read(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, false, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_read); + +void kasan_check_write(const void *p, unsigned int size) +{ + check_memory_region((unsigned long)p, size, true, _RET_IP_); +} +EXPORT_SYMBOL(kasan_check_write); + #undef memset void *memset(void *addr, int c, size_t len) { -- cgit v1.2.3 From a42094676f076534bf4998625456fe0bb99c1f1e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 20 May 2016 16:59:36 -0700 Subject: zsmalloc: use first_page rather than page Clean up function parameter "struct page". Many functions of zsmalloc expect that page paramter is "first_page" so use "first_page" rather than "page" for code readability. Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 62 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fe47fbba995a..c3e55a4bcc78 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -413,26 +413,28 @@ static int is_last_page(struct page *page) return PagePrivate2(page); } -static void get_zspage_mapping(struct page *page, unsigned int *class_idx, +static void get_zspage_mapping(struct page *first_page, + unsigned int *class_idx, enum fullness_group *fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - m = (unsigned long)page->mapping; + m = (unsigned long)first_page->mapping; *fullness = m & FULLNESS_MASK; *class_idx = (m >> FULLNESS_BITS) & CLASS_IDX_MASK; } -static void set_zspage_mapping(struct page *page, unsigned int class_idx, +static void set_zspage_mapping(struct page *first_page, + unsigned int class_idx, enum fullness_group fullness) { unsigned long m; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | (fullness & FULLNESS_MASK); - page->mapping = (struct address_space *)m; + first_page->mapping = (struct address_space *)m; } /* @@ -625,14 +627,14 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) * the pool (not yet implemented). This function returns fullness * status of the given page. */ -static enum fullness_group get_fullness_group(struct page *page) +static enum fullness_group get_fullness_group(struct page *first_page) { int inuse, max_objects; enum fullness_group fg; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - inuse = page->inuse; - max_objects = page->objects; + inuse = first_page->inuse; + max_objects = first_page->objects; if (inuse == 0) fg = ZS_EMPTY; @@ -652,12 +654,12 @@ static enum fullness_group get_fullness_group(struct page *page) * have. This functions inserts the given zspage into the freelist * identified by . */ -static void insert_zspage(struct page *page, struct size_class *class, +static void insert_zspage(struct page *first_page, struct size_class *class, enum fullness_group fullness) { struct page **head; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; @@ -667,7 +669,7 @@ static void insert_zspage(struct page *page, struct size_class *class, head = &class->fullness_list[fullness]; if (!*head) { - *head = page; + *head = first_page; return; } @@ -675,21 +677,21 @@ static void insert_zspage(struct page *page, struct size_class *class, * We want to see more ZS_FULL pages and less almost * empty/full. Put pages with higher ->inuse first. */ - list_add_tail(&page->lru, &(*head)->lru); - if (page->inuse >= (*head)->inuse) - *head = page; + list_add_tail(&first_page->lru, &(*head)->lru); + if (first_page->inuse >= (*head)->inuse) + *head = first_page; } /* * This function removes the given zspage from the freelist identified * by . */ -static void remove_zspage(struct page *page, struct size_class *class, +static void remove_zspage(struct page *first_page, struct size_class *class, enum fullness_group fullness) { struct page **head; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; @@ -698,11 +700,11 @@ static void remove_zspage(struct page *page, struct size_class *class, BUG_ON(!*head); if (list_empty(&(*head)->lru)) *head = NULL; - else if (*head == page) + else if (*head == first_page) *head = (struct page *)list_entry((*head)->lru.next, struct page, lru); - list_del_init(&page->lru); + list_del_init(&first_page->lru); zs_stat_dec(class, fullness == ZS_ALMOST_EMPTY ? CLASS_ALMOST_EMPTY : CLASS_ALMOST_FULL, 1); } @@ -717,21 +719,21 @@ static void remove_zspage(struct page *page, struct size_class *class, * fullness group. */ static enum fullness_group fix_fullness_group(struct size_class *class, - struct page *page) + struct page *first_page) { int class_idx; enum fullness_group currfg, newfg; - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - get_zspage_mapping(page, &class_idx, &currfg); - newfg = get_fullness_group(page); + get_zspage_mapping(first_page, &class_idx, &currfg); + newfg = get_fullness_group(first_page); if (newfg == currfg) goto out; - remove_zspage(page, class, currfg); - insert_zspage(page, class, newfg); - set_zspage_mapping(page, class_idx, newfg); + remove_zspage(first_page, class, currfg); + insert_zspage(first_page, class, newfg); + set_zspage_mapping(first_page, class_idx, newfg); out: return newfg; @@ -1234,11 +1236,11 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) return true; } -static bool zspage_full(struct page *page) +static bool zspage_full(struct page *first_page) { - BUG_ON(!is_first_page(page)); + BUG_ON(!is_first_page(first_page)); - return page->inuse == page->objects; + return first_page->inuse == first_page->objects; } unsigned long zs_get_total_pages(struct zs_pool *pool) -- cgit v1.2.3 From 830e4bc5baa9fda5d45257e9a3dbb3555c6c180e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 20 May 2016 16:59:39 -0700 Subject: zsmalloc: clean up many BUG_ON There are many BUG_ON in zsmalloc.c which is not recommened so change them as alternatives. Normal rule is as follows: 1. avoid BUG_ON if possible. Instead, use VM_BUG_ON or VM_BUG_ON_PAGE 2. use VM_BUG_ON_PAGE if we need to see struct page's fields 3. use those assertion in primitive functions so higher functions can rely on the assertion in the primitive function. 4. Don't use assertion if following instruction can trigger Oops Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 42 +++++++++++++++--------------------------- 1 file changed, 15 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index c3e55a4bcc78..dfe684cc3a03 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -418,7 +418,7 @@ static void get_zspage_mapping(struct page *first_page, enum fullness_group *fullness) { unsigned long m; - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); m = (unsigned long)first_page->mapping; *fullness = m & FULLNESS_MASK; @@ -430,7 +430,7 @@ static void set_zspage_mapping(struct page *first_page, enum fullness_group fullness) { unsigned long m; - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); m = ((class_idx & CLASS_IDX_MASK) << FULLNESS_BITS) | (fullness & FULLNESS_MASK); @@ -631,7 +631,8 @@ static enum fullness_group get_fullness_group(struct page *first_page) { int inuse, max_objects; enum fullness_group fg; - BUG_ON(!is_first_page(first_page)); + + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); inuse = first_page->inuse; max_objects = first_page->objects; @@ -659,7 +660,7 @@ static void insert_zspage(struct page *first_page, struct size_class *class, { struct page **head; - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; @@ -691,13 +692,13 @@ static void remove_zspage(struct page *first_page, struct size_class *class, { struct page **head; - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); if (fullness >= _ZS_NR_FULLNESS_GROUPS) return; head = &class->fullness_list[fullness]; - BUG_ON(!*head); + VM_BUG_ON_PAGE(!*head, first_page); if (list_empty(&(*head)->lru)) *head = NULL; else if (*head == first_page) @@ -724,8 +725,6 @@ static enum fullness_group fix_fullness_group(struct size_class *class, int class_idx; enum fullness_group currfg, newfg; - BUG_ON(!is_first_page(first_page)); - get_zspage_mapping(first_page, &class_idx, &currfg); newfg = get_fullness_group(first_page); if (newfg == currfg) @@ -811,7 +810,7 @@ static void *location_to_obj(struct page *page, unsigned long obj_idx) unsigned long obj; if (!page) { - BUG_ON(obj_idx); + VM_BUG_ON(obj_idx); return NULL; } @@ -844,7 +843,7 @@ static unsigned long obj_to_head(struct size_class *class, struct page *page, void *obj) { if (class->huge) { - VM_BUG_ON(!is_first_page(page)); + VM_BUG_ON_PAGE(!is_first_page(page), page); return page_private(page); } else return *(unsigned long *)obj; @@ -894,8 +893,8 @@ static void free_zspage(struct page *first_page) { struct page *nextp, *tmp, *head_extra; - BUG_ON(!is_first_page(first_page)); - BUG_ON(first_page->inuse); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + VM_BUG_ON_PAGE(first_page->inuse, first_page); head_extra = (struct page *)page_private(first_page); @@ -921,7 +920,8 @@ static void init_zspage(struct page *first_page, struct size_class *class) unsigned long off = 0; struct page *page = first_page; - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); + while (page) { struct page *next_page; struct link_free *link; @@ -1238,7 +1238,7 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage) static bool zspage_full(struct page *first_page) { - BUG_ON(!is_first_page(first_page)); + VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); return first_page->inuse == first_page->objects; } @@ -1276,14 +1276,12 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, struct page *pages[2]; void *ret; - BUG_ON(!handle); - /* * Because we use per-cpu mapping areas shared among the * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. */ - BUG_ON(in_interrupt()); + WARN_ON_ONCE(in_interrupt()); /* From now on, migration cannot move the object */ pin_tag(handle); @@ -1327,8 +1325,6 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) struct size_class *class; struct mapping_area *area; - BUG_ON(!handle); - obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); get_zspage_mapping(get_first_page(page), &class_idx, &fg); @@ -1448,8 +1444,6 @@ static void obj_free(struct zs_pool *pool, struct size_class *class, unsigned long f_objidx, f_offset; void *vaddr; - BUG_ON(!obj); - obj &= ~OBJ_ALLOCATED_TAG; obj_to_location(obj, &f_page, &f_objidx); first_page = get_first_page(f_page); @@ -1549,7 +1543,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src, kunmap_atomic(d_addr); kunmap_atomic(s_addr); s_page = get_next_page(s_page); - BUG_ON(!s_page); s_addr = kmap_atomic(s_page); d_addr = kmap_atomic(d_page); s_size = class->size - written; @@ -1559,7 +1552,6 @@ static void zs_object_copy(unsigned long dst, unsigned long src, if (d_off >= PAGE_SIZE) { kunmap_atomic(d_addr); d_page = get_next_page(d_page); - BUG_ON(!d_page); d_addr = kmap_atomic(d_page); d_size = class->size - written; d_off = 0; @@ -1694,8 +1686,6 @@ static enum fullness_group putback_zspage(struct zs_pool *pool, { enum fullness_group fullness; - BUG_ON(!is_first_page(first_page)); - fullness = get_fullness_group(first_page); insert_zspage(first_page, class, fullness); set_zspage_mapping(first_page, class->index, fullness); @@ -1759,8 +1749,6 @@ static void __zs_compact(struct zs_pool *pool, struct size_class *class) spin_lock(&class->lock); while ((src_page = isolate_source_page(class))) { - BUG_ON(!is_first_page(src_page)); - if (!zs_can_compact(class)) break; -- cgit v1.2.3 From 251cbb951b831acd8451d75b40696834f07c29c5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 20 May 2016 16:59:42 -0700 Subject: zsmalloc: reorder function parameters Clean up function parameter ordering to order higher data structure first. Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index dfe684cc3a03..18535abfad40 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -569,7 +569,7 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(const char *name, struct zs_pool *pool) +static int zs_pool_stat_create(struct zs_pool *pool, const char *name) { struct dentry *entry; @@ -609,7 +609,7 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(const char *name, struct zs_pool *pool) +static inline int zs_pool_stat_create(struct zs_pool *pool, const char *name) { return 0; } @@ -655,8 +655,9 @@ static enum fullness_group get_fullness_group(struct page *first_page) * have. This functions inserts the given zspage into the freelist * identified by . */ -static void insert_zspage(struct page *first_page, struct size_class *class, - enum fullness_group fullness) +static void insert_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; @@ -687,8 +688,9 @@ static void insert_zspage(struct page *first_page, struct size_class *class, * This function removes the given zspage from the freelist identified * by . */ -static void remove_zspage(struct page *first_page, struct size_class *class, - enum fullness_group fullness) +static void remove_zspage(struct size_class *class, + enum fullness_group fullness, + struct page *first_page) { struct page **head; @@ -730,8 +732,8 @@ static enum fullness_group fix_fullness_group(struct size_class *class, if (newfg == currfg) goto out; - remove_zspage(first_page, class, currfg); - insert_zspage(first_page, class, newfg); + remove_zspage(class, currfg, first_page); + insert_zspage(class, newfg, first_page); set_zspage_mapping(first_page, class_idx, newfg); out: @@ -915,7 +917,7 @@ static void free_zspage(struct page *first_page) } /* Initialize a newly allocated zspage */ -static void init_zspage(struct page *first_page, struct size_class *class) +static void init_zspage(struct size_class *class, struct page *first_page) { unsigned long off = 0; struct page *page = first_page; @@ -1003,7 +1005,7 @@ static struct page *alloc_zspage(struct size_class *class, gfp_t flags) prev_page = page; } - init_zspage(first_page, class); + init_zspage(class, first_page); first_page->freelist = location_to_obj(first_page, 0); /* Maximum number of objects we can store in this zspage */ @@ -1348,8 +1350,8 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_unmap_object); -static unsigned long obj_malloc(struct page *first_page, - struct size_class *class, unsigned long handle) +static unsigned long obj_malloc(struct size_class *class, + struct page *first_page, unsigned long handle) { unsigned long obj; struct link_free *link; @@ -1426,7 +1428,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) class->size, class->pages_per_zspage)); } - obj = obj_malloc(first_page, class, handle); + obj = obj_malloc(class, first_page, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, first_page); record_obj(handle, obj); @@ -1499,8 +1501,8 @@ void zs_free(struct zs_pool *pool, unsigned long handle) } EXPORT_SYMBOL_GPL(zs_free); -static void zs_object_copy(unsigned long dst, unsigned long src, - struct size_class *class) +static void zs_object_copy(struct size_class *class, unsigned long dst, + unsigned long src) { struct page *s_page, *d_page; unsigned long s_objidx, d_objidx; @@ -1566,8 +1568,8 @@ static void zs_object_copy(unsigned long dst, unsigned long src, * Find alloced object in zspage from index object and * return handle. */ -static unsigned long find_alloced_obj(struct page *page, int index, - struct size_class *class) +static unsigned long find_alloced_obj(struct size_class *class, + struct page *page, int index) { unsigned long head; int offset = 0; @@ -1617,7 +1619,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, int ret = 0; while (1) { - handle = find_alloced_obj(s_page, index, class); + handle = find_alloced_obj(class, s_page, index); if (!handle) { s_page = get_next_page(s_page); if (!s_page) @@ -1634,8 +1636,8 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, } used_obj = handle_to_obj(handle); - free_obj = obj_malloc(d_page, class, handle); - zs_object_copy(free_obj, used_obj, class); + free_obj = obj_malloc(class, d_page, handle); + zs_object_copy(class, free_obj, used_obj); index++; /* * record_obj updates handle's value to free_obj and it will @@ -1664,7 +1666,7 @@ static struct page *isolate_target_page(struct size_class *class) for (i = 0; i < _ZS_NR_FULLNESS_GROUPS; i++) { page = class->fullness_list[i]; if (page) { - remove_zspage(page, class, i); + remove_zspage(class, i, page); break; } } @@ -1687,7 +1689,7 @@ static enum fullness_group putback_zspage(struct zs_pool *pool, enum fullness_group fullness; fullness = get_fullness_group(first_page); - insert_zspage(first_page, class, fullness); + insert_zspage(class, fullness, first_page); set_zspage_mapping(first_page, class->index, fullness); if (fullness == ZS_EMPTY) { @@ -1712,7 +1714,7 @@ static struct page *isolate_source_page(struct size_class *class) if (!page) continue; - remove_zspage(page, class, i); + remove_zspage(class, i, page); break; } @@ -1949,7 +1951,7 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) pool->flags = flags; - if (zs_pool_stat_create(name, pool)) + if (zs_pool_stat_create(pool, name)) goto err; /* -- cgit v1.2.3 From 1ee4716585ed80b7917ba3c5aa38e5e0d677d583 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Fri, 20 May 2016 16:59:45 -0700 Subject: zsmalloc: remove unused pool param in obj_free Let's remove unused pool param in obj_free Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 18535abfad40..ae288c9f7156 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1438,8 +1438,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) } EXPORT_SYMBOL_GPL(zs_malloc); -static void obj_free(struct zs_pool *pool, struct size_class *class, - unsigned long obj) +static void obj_free(struct size_class *class, unsigned long obj) { struct link_free *link; struct page *first_page, *f_page; @@ -1485,7 +1484,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle) class = pool->size_class[class_idx]; spin_lock(&class->lock); - obj_free(pool, class, obj); + obj_free(class, obj); fullness = fix_fullness_group(class, first_page); if (fullness == ZS_EMPTY) { zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( @@ -1648,7 +1647,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, free_obj |= BIT(HANDLE_PIN_BIT); record_obj(handle, free_obj); unpin_tag(handle); - obj_free(pool, class, used_obj); + obj_free(class, used_obj); } /* Remember last position in this iteration */ -- cgit v1.2.3 From d0d8da2dc49dfdfe1d788eaf4d55eb5d4964d926 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 20 May 2016 16:59:48 -0700 Subject: zsmalloc: require GFP in zs_malloc() Pass GFP flags to zs_malloc() instead of using a fixed mask supplied to zs_create_pool(), so we can be more flexible, but, more importantly, we need this to switch zram to per-cpu compression streams -- zram will try to allocate handle with preemption disabled in a fast path and switch to a slow path (using different gfp mask) if the fast one has failed. Apart from that, this also align zs_malloc() interface with zspool/zbud. [sergey.senozhatsky@gmail.com: pass GFP flags to zs_malloc() instead of using a fixed mask] Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Link: http://lkml.kernel.org/r/20160429150942.GA637@swordfish Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 4 ++-- include/linux/zsmalloc.h | 4 ++-- mm/zsmalloc.c | 24 +++++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 370c2f76016d..b09acdb753ee 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -514,7 +514,7 @@ static struct zram_meta *zram_meta_alloc(char *pool_name, u64 disksize) goto out_error; } - meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); + meta->mem_pool = zs_create_pool(pool_name); if (!meta->mem_pool) { pr_err("Error creating memory pool\n"); goto out_error; @@ -717,7 +717,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, src = uncmem; } - handle = zs_malloc(meta->mem_pool, clen); + handle = zs_malloc(meta->mem_pool, clen, GFP_NOIO | __GFP_HIGHMEM); if (!handle) { pr_err("Error allocating memory for compressed page: %u, size=%zu\n", index, clen); diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h index 34eb16098a33..57a8e98f2708 100644 --- a/include/linux/zsmalloc.h +++ b/include/linux/zsmalloc.h @@ -41,10 +41,10 @@ struct zs_pool_stats { struct zs_pool; -struct zs_pool *zs_create_pool(const char *name, gfp_t flags); +struct zs_pool *zs_create_pool(const char *name); void zs_destroy_pool(struct zs_pool *pool); -unsigned long zs_malloc(struct zs_pool *pool, size_t size); +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags); void zs_free(struct zs_pool *pool, unsigned long obj); void *zs_map_object(struct zs_pool *pool, unsigned long handle, diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ae288c9f7156..aba39a291523 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -247,7 +247,6 @@ struct zs_pool { struct size_class **size_class; struct kmem_cache *handle_cachep; - gfp_t flags; /* allocation flags used when growing pool */ atomic_long_t pages_allocated; struct zs_pool_stats stats; @@ -295,10 +294,10 @@ static void destroy_handle_cache(struct zs_pool *pool) kmem_cache_destroy(pool->handle_cachep); } -static unsigned long alloc_handle(struct zs_pool *pool) +static unsigned long alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, - pool->flags & ~__GFP_HIGHMEM); + gfp & ~__GFP_HIGHMEM); } static void free_handle(struct zs_pool *pool, unsigned long handle) @@ -324,7 +323,12 @@ static void *zs_zpool_create(const char *name, gfp_t gfp, const struct zpool_ops *zpool_ops, struct zpool *zpool) { - return zs_create_pool(name, gfp); + /* + * Ignore global gfp flags: zs_malloc() may be invoked from + * different contexts and its caller must provide a valid + * gfp mask. + */ + return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) @@ -335,7 +339,7 @@ static void zs_zpool_destroy(void *pool) static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { - *handle = zs_malloc(pool, size); + *handle = zs_malloc(pool, size, gfp); return *handle ? 0 : -1; } static void zs_zpool_free(void *pool, unsigned long handle) @@ -1391,7 +1395,7 @@ static unsigned long obj_malloc(struct size_class *class, * otherwise 0. * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ -unsigned long zs_malloc(struct zs_pool *pool, size_t size) +unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; @@ -1400,7 +1404,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return 0; - handle = alloc_handle(pool); + handle = alloc_handle(pool, gfp); if (!handle) return 0; @@ -1413,7 +1417,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size) if (!first_page) { spin_unlock(&class->lock); - first_page = alloc_zspage(class, pool->flags); + first_page = alloc_zspage(class, gfp); if (unlikely(!first_page)) { free_handle(pool, handle); return 0; @@ -1878,7 +1882,7 @@ static int zs_register_shrinker(struct zs_pool *pool) * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ -struct zs_pool *zs_create_pool(const char *name, gfp_t flags) +struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; @@ -1948,8 +1952,6 @@ struct zs_pool *zs_create_pool(const char *name, gfp_t flags) prev_class = class; } - pool->flags = flags; - if (zs_pool_stat_create(pool, name)) goto err; -- cgit v1.2.3 From 200867af4dedfe7cb707f96773684de1d1fd21e6 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 20 May 2016 16:59:54 -0700 Subject: mm/zswap: use workqueue to destroy pool Add a work_struct to struct zswap_pool, and change __zswap_pool_empty to use the workqueue instead of using call_rcu(). When zswap destroys a pool no longer in use, it uses call_rcu() to perform the destruction/freeing. Since that executes in softirq context, it must not sleep. However, actually destroying the pool involves freeing the per-cpu compressors (which requires locking the cpu_add_remove_lock mutex) and freeing the zpool, for which the implementation may sleep (e.g. zsmalloc calls kmem_cache_destroy, which locks the slab_mutex). So if either mutex is currently taken, or any other part of the compressor or zpool implementation sleeps, it will result in a BUG(). It's not easy to reproduce this when changing zswap's params normally. In testing with a loaded system, this does not fail: $ cd /sys/module/zswap/parameters $ echo lz4 > compressor ; echo zsmalloc > zpool nor does this: $ while true ; do > echo lzo > compressor ; echo zbud > zpool > sleep 1 > echo lz4 > compressor ; echo zsmalloc > zpool > sleep 1 > done although it's still possible either of those might fail, depending on whether anything else besides zswap has locked the mutexes. However, changing a parameter with no delay immediately causes the schedule while atomic BUG: $ while true ; do > echo lzo > compressor ; echo lz4 > compressor > done This is essentially the same as Yu Zhao's proposed patch to zsmalloc, but moved to zswap, to cover compressor and zpool freeing. Fixes: f1c54846ee45 ("zswap: dynamic pool creation") Signed-off-by: Dan Streetman Reported-by: Yu Zhao Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index de0f119b1780..275b22cc8df4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -117,7 +117,7 @@ struct zswap_pool { struct crypto_comp * __percpu *tfm; struct kref kref; struct list_head list; - struct rcu_head rcu_head; + struct work_struct work; struct notifier_block notifier; char tfm_name[CRYPTO_MAX_ALG_NAME]; }; @@ -658,9 +658,11 @@ static int __must_check zswap_pool_get(struct zswap_pool *pool) return kref_get_unless_zero(&pool->kref); } -static void __zswap_pool_release(struct rcu_head *head) +static void __zswap_pool_release(struct work_struct *work) { - struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head); + struct zswap_pool *pool = container_of(work, typeof(*pool), work); + + synchronize_rcu(); /* nobody should have been able to get a kref... */ WARN_ON(kref_get_unless_zero(&pool->kref)); @@ -680,7 +682,9 @@ static void __zswap_pool_empty(struct kref *kref) WARN_ON(pool == zswap_pool_current()); list_del_rcu(&pool->list); - call_rcu(&pool->rcu_head, __zswap_pool_release); + + INIT_WORK(&pool->work, __zswap_pool_release); + schedule_work(&pool->work); spin_unlock(&zswap_pools_lock); } -- cgit v1.2.3 From d34f615720d17c49b6779f6fcd5cb7eb82231a38 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Fri, 20 May 2016 16:59:56 -0700 Subject: mm/zsmalloc: don't fail if can't create debugfs info Change the return type of zs_pool_stat_create() to void, and remove the logic to abort pool creation if the stat debugfs dir/file could not be created. The debugfs stat file is for debugging/information only, and doesn't affect operation of zsmalloc; there is no reason to abort creating the pool if the stat file can't be created. This was seen with zswap, which used the same name for all pool creations, which caused zsmalloc to fail to create a second pool for zswap if CONFIG_ZSMALLOC_STAT was enabled. Signed-off-by: Dan Streetman Reviewed-by: Sergey Senozhatsky Cc: Dan Streetman Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index aba39a291523..72698db958e7 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -573,17 +573,17 @@ static const struct file_operations zs_stat_size_ops = { .release = single_release, }; -static int zs_pool_stat_create(struct zs_pool *pool, const char *name) +static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { struct dentry *entry; if (!zs_stat_root) - return -ENODEV; + return; entry = debugfs_create_dir(name, zs_stat_root); if (!entry) { pr_warn("debugfs dir <%s> creation failed\n", name); - return -ENOMEM; + return; } pool->stat_dentry = entry; @@ -592,10 +592,8 @@ static int zs_pool_stat_create(struct zs_pool *pool, const char *name) if (!entry) { pr_warn("%s: debugfs file entry <%s> creation failed\n", name, "classes"); - return -ENOMEM; + return; } - - return 0; } static void zs_pool_stat_destroy(struct zs_pool *pool) @@ -613,9 +611,8 @@ static void __exit zs_stat_exit(void) { } -static inline int zs_pool_stat_create(struct zs_pool *pool, const char *name) +static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) { - return 0; } static inline void zs_pool_stat_destroy(struct zs_pool *pool) @@ -623,7 +620,6 @@ static inline void zs_pool_stat_destroy(struct zs_pool *pool) } #endif - /* * For each size class, zspages are divided into different groups * depending on how "full" they are. This was done so that we could @@ -1952,8 +1948,8 @@ struct zs_pool *zs_create_pool(const char *name) prev_class = class; } - if (zs_pool_stat_create(pool, name)) - goto err; + /* debug only, don't abort if it fails */ + zs_pool_stat_create(pool, name); /* * Not critical, we still can use the pool -- cgit v1.2.3 From 57578c2ea2cb2e0d362a9212ac83cf90221d4883 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:01:54 -0700 Subject: raxix-tree: introduce CONFIG_RADIX_TREE_MULTIORDER I've been receiving increasingly concerned notes from 0day about how much my recent changes have been bloating the radix tree. Make it happier by only including multiorder support if CONFIG_TRANSPARENT_HUGEPAGES is set. This is an independent Kconfig option, so other radix tree users can also set it if they have a need. Signed-off-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig | 3 +++ lib/radix-tree.c | 26 ++++++++++++++++++-------- mm/Kconfig | 1 + tools/testing/radix-tree/linux/kernel.h | 1 + 4 files changed, 23 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/lib/Kconfig b/lib/Kconfig index 61d55bd0ed89..d79909dc01ec 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -362,6 +362,9 @@ config INTERVAL_TREE for more information. +config RADIX_TREE_MULTIORDER + bool + config ASSOCIATIVE_ARRAY bool help diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 1624c4117961..799f341977d0 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -484,6 +484,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = node->slots[offset]; } +#ifdef CONFIG_RADIX_TREE_MULTIORDER /* Insert pointers to the canonical entry */ if ((shift - order) > 0) { int i, n = 1 << (shift - order); @@ -499,6 +500,7 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, node->count++; } } +#endif if (nodep) *nodep = node; @@ -1469,6 +1471,20 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, return deleted; } +static inline void delete_sibling_entries(struct radix_tree_node *node, + void *ptr, unsigned offset) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + int i; + for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { + if (node->slots[offset + i] != ptr) + break; + node->slots[offset + i] = NULL; + node->count--; + } +#endif +} + /** * radix_tree_delete_item - delete an item from a radix tree * @root: radix tree root @@ -1484,7 +1500,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, unsigned long index, void *item) { struct radix_tree_node *node; - unsigned int offset, i; + unsigned int offset; void **slot; void *entry; int tag; @@ -1513,13 +1529,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, radix_tree_tag_clear(root, index, tag); } - /* Delete any sibling slots pointing to this slot */ - for (i = 1; offset + i < RADIX_TREE_MAP_SIZE; i++) { - if (node->slots[offset + i] != ptr_to_indirect(slot)) - break; - node->slots[offset + i] = NULL; - node->count--; - } + delete_sibling_entries(node, ptr_to_indirect(slot), offset); node->slots[offset] = NULL; node->count--; diff --git a/mm/Kconfig b/mm/Kconfig index 1a6a28ebcb8b..2664c118b5d2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -404,6 +404,7 @@ config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE select COMPACTION + select RADIX_TREE_MULTIORDER help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. diff --git a/tools/testing/radix-tree/linux/kernel.h b/tools/testing/radix-tree/linux/kernel.h index 31fe2c77d7ae..8ea0ed450810 100644 --- a/tools/testing/radix-tree/linux/kernel.h +++ b/tools/testing/radix-tree/linux/kernel.h @@ -9,6 +9,7 @@ #include "../../include/linux/compiler.h" +#define CONFIG_RADIX_TREE_MULTIORDER #define CONFIG_SHMEM #define CONFIG_SWAP -- cgit v1.2.3 From d604c324524bf61c68182bb27db64656a78fe911 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 20 May 2016 17:03:45 -0700 Subject: radix-tree: introduce radix_tree_replace_clear_tags() In addition to replacing the entry, we also clear all associated tags. This is really a one-off special for page_cache_tree_delete() which had far too much detailed knowledge about how the radix tree works. For efficiency, factor node_tag_clear() out of radix_tree_tag_clear() It can be used by radix_tree_delete_item() as well as radix_tree_replace_clear_tags(). Signed-off-by: Matthew Wilcox Cc: Konstantin Khlebnikov Cc: Kirill Shutemov Cc: Jan Kara Cc: Neil Brown Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 9 ++++-- lib/radix-tree.c | 76 ++++++++++++++++++++++++++++------------------ mm/filemap.c | 23 ++------------ 3 files changed, 56 insertions(+), 52 deletions(-) (limited to 'mm') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index bad63105e37e..11c8e7cc3920 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -281,9 +281,12 @@ bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); -unsigned int -radix_tree_gang_lookup(struct radix_tree_root *root, void **results, - unsigned long first_index, unsigned int max_items); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry); +unsigned int radix_tree_gang_lookup(struct radix_tree_root *root, + void **results, unsigned long first_index, + unsigned int max_items); unsigned int radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, unsigned long *indices, unsigned long first_index, unsigned int max_items); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9d9b4b9af4b6..c7114d233b38 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -740,6 +740,26 @@ void *radix_tree_tag_set(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_tag_set); +static void node_tag_clear(struct radix_tree_root *root, + struct radix_tree_node *node, + unsigned int tag, unsigned int offset) +{ + while (node) { + if (!tag_get(node, tag, offset)) + return; + tag_clear(node, tag, offset); + if (any_tag_set(node, tag)) + return; + + offset = node->offset; + node = node->parent; + } + + /* clear the root's tag bit */ + if (root_tag_get(root, tag)) + root_tag_clear(root, tag); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -776,28 +796,9 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, offset = radix_tree_descend(parent, &node, offset); } - if (node == NULL) - goto out; + if (node) + node_tag_clear(root, parent, tag, offset); - index >>= shift; - - while (parent) { - if (!tag_get(parent, tag, offset)) - goto out; - tag_clear(parent, tag, offset); - if (any_tag_set(parent, tag)) - goto out; - - index >>= RADIX_TREE_MAP_SHIFT; - offset = index & RADIX_TREE_MAP_MASK; - parent = parent->parent; - } - - /* clear the root's tag bit */ - if (root_tag_get(root, tag)) - root_tag_clear(root, tag); - -out: return node; } EXPORT_SYMBOL(radix_tree_tag_clear); @@ -1525,14 +1526,9 @@ void *radix_tree_delete_item(struct radix_tree_root *root, offset = get_slot_offset(node, slot); - /* - * Clear all tags associated with the item to be deleted. - * This way of doing it would be inefficient, but seldom is any set. - */ - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (tag_get(node, tag, offset)) - radix_tree_tag_clear(root, index, tag); - } + /* Clear all tags associated with the item to be deleted. */ + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; @@ -1559,6 +1555,28 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_delete); +struct radix_tree_node *radix_tree_replace_clear_tags( + struct radix_tree_root *root, + unsigned long index, void *entry) +{ + struct radix_tree_node *node; + void **slot; + + __radix_tree_lookup(root, index, &node, &slot); + + if (node) { + unsigned int tag, offset = get_slot_offset(node, slot); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + node_tag_clear(root, node, tag, offset); + } else { + /* Clear root node tags */ + root->gfp_mask &= __GFP_BITS_MASK; + } + + radix_tree_replace_slot(slot, entry); + return node; +} + /** * radix_tree_tagged - test whether any items in the tree are tagged * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index b418405903bc..9665b1d4f318 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -114,14 +114,11 @@ static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { struct radix_tree_node *node; - unsigned long index; - unsigned int offset; - unsigned int tag; - void **slot; VM_BUG_ON(!PageLocked(page)); - __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); + node = radix_tree_replace_clear_tags(&mapping->page_tree, page->index, + shadow); if (shadow) { mapping->nrexceptional++; @@ -135,23 +132,9 @@ static void page_cache_tree_delete(struct address_space *mapping, } mapping->nrpages--; - if (!node) { - /* Clear direct pointer tags in root node */ - mapping->page_tree.gfp_mask &= __GFP_BITS_MASK; - radix_tree_replace_slot(slot, shadow); + if (!node) return; - } - - /* Clear tree tags for the removed page */ - index = page->index; - offset = index & RADIX_TREE_MAP_MASK; - for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { - if (test_bit(offset, node->tags[tag])) - radix_tree_tag_clear(&mapping->page_tree, index, tag); - } - /* Delete page, swap shadow entry */ - radix_tree_replace_slot(slot, shadow); workingset_node_pages_dec(node); if (shadow) workingset_node_shadows_inc(node); -- cgit v1.2.3 From dee410792419aaa8bc3e3b35d2ccb6515835916d Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sat, 14 May 2016 12:20:44 -0700 Subject: /dev/dax, core: file operations and dax-mmap The "Device DAX" core enables dax mappings of performance / feature differentiated memory. An open mapping or file handle keeps the backing struct device live, but new mappings are only possible while the device is enabled. Faults are handled under rcu_read_lock to synchronize with the enabled state of the device. Similar to the filesystem-dax case the backing memory may optionally have struct page entries. However, unlike fs-dax there is no support for private mappings, or mappings that are not backed by media (see use of zero-page in fs-dax). Mappings are always guaranteed to match the alignment of the dax_region. If the dax_region is configured to have a 2MB alignment, all mappings are guaranteed to be backed by a pmd entry. Contrast this determinism with the fs-dax case where pmd mappings are opportunistic. If userspace attempts to force a misaligned mapping, the driver will fail the mmap attempt. See dax_dev_check_vma() for other scenarios that are rejected, like MAP_PRIVATE mappings. Cc: Hannes Reinecke Cc: Jeff Moyer Cc: Christoph Hellwig Cc: Andrew Morton Cc: Dave Hansen Cc: Ross Zwisler Acked-by: "Paul E. McKenney" Reviewed-by: Johannes Thumshirn Signed-off-by: Dan Williams --- drivers/dax/Kconfig | 1 + drivers/dax/dax.c | 322 ++++++++++++++++++++++++++++++++++++++++++++++++++++ mm/huge_memory.c | 1 + mm/hugetlb.c | 1 + 4 files changed, 325 insertions(+) (limited to 'mm') diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig index 86ffbaa891ad..cedab7572de3 100644 --- a/drivers/dax/Kconfig +++ b/drivers/dax/Kconfig @@ -1,6 +1,7 @@ menuconfig DEV_DAX tristate "DAX: direct access to differentiated memory" default m if NVDIMM_DAX + depends on TRANSPARENT_HUGEPAGE help Support raw access to differentiated (persistence, bandwidth, latency...) memory via an mmap(2) capable character diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 4c22a40f2335..b891a129b275 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -49,6 +49,7 @@ struct dax_region { * @region - parent region * @dev - device backing the character device * @kref - enable this data to be tracked in filp->private_data + * @alive - !alive + rcu grace period == no new mappings can be established * @id - child id in the region * @num_resources - number of physical address extents in this device * @res - array of physical address ranges @@ -57,6 +58,7 @@ struct dax_dev { struct dax_region *region; struct device *dev; struct kref kref; + bool alive; int id; int num_resources; struct resource res[0]; @@ -150,6 +152,16 @@ static void unregister_dax_dev(void *_dev) dev_dbg(dev, "%s\n", __func__); + /* + * Note, rcu is not protecting the liveness of dax_dev, rcu is + * ensuring that any fault handlers that might have seen + * dax_dev->alive == true, have completed. Any fault handlers + * that start after synchronize_rcu() has started will abort + * upon seeing dax_dev->alive == false. + */ + dax_dev->alive = false; + synchronize_rcu(); + get_device(dev); device_unregister(dev); ida_simple_remove(&dax_region->ida, dax_dev->id); @@ -173,6 +185,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, memcpy(dax_dev->res, res, sizeof(*res) * count); dax_dev->num_resources = count; kref_init(&dax_dev->kref); + dax_dev->alive = true; dax_dev->region = dax_region; kref_get(&dax_region->kref); @@ -217,9 +230,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res, } EXPORT_SYMBOL_GPL(devm_create_dax_dev); +/* return an unmapped area aligned to the dax region specified alignment */ +static unsigned long dax_dev_get_unmapped_area(struct file *filp, + unsigned long addr, unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + unsigned long off, off_end, off_align, len_align, addr_align, align; + struct dax_dev *dax_dev = filp ? filp->private_data : NULL; + struct dax_region *dax_region; + + if (!dax_dev || addr) + goto out; + + dax_region = dax_dev->region; + align = dax_region->align; + off = pgoff << PAGE_SHIFT; + off_end = off + len; + off_align = round_up(off, align); + + if ((off_end <= off_align) || ((off_end - off_align) < align)) + goto out; + + len_align = len + align; + if ((off + len_align) < off) + goto out; + + addr_align = current->mm->get_unmapped_area(filp, addr, len_align, + pgoff, flags); + if (!IS_ERR_VALUE(addr_align)) { + addr_align += (off - addr_align) & (align - 1); + return addr_align; + } + out: + return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags); +} + +static int __match_devt(struct device *dev, const void *data) +{ + const dev_t *devt = data; + + return dev->devt == *devt; +} + +static struct device *dax_dev_find(dev_t dev_t) +{ + return class_find_device(dax_class, NULL, &dev_t, __match_devt); +} + +static int dax_dev_open(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = NULL; + struct device *dev; + + dev = dax_dev_find(inode->i_rdev); + if (!dev) + return -ENXIO; + + device_lock(dev); + dax_dev = dev_get_drvdata(dev); + if (dax_dev) { + dev_dbg(dev, "%s\n", __func__); + filp->private_data = dax_dev; + kref_get(&dax_dev->kref); + inode->i_flags = S_DAX; + } + device_unlock(dev); + + if (!dax_dev) { + put_device(dev); + return -ENXIO; + } + return 0; +} + +static int dax_dev_release(struct inode *inode, struct file *filp) +{ + struct dax_dev *dax_dev = filp->private_data; + struct device *dev = dax_dev->dev; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + dax_dev_put(dax_dev); + put_device(dev); + + return 0; +} + +static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma, + const char *func) +{ + struct dax_region *dax_region = dax_dev->region; + struct device *dev = dax_dev->dev; + unsigned long mask; + + if (!dax_dev->alive) + return -ENXIO; + + /* prevent private / writable mappings from being established */ + if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) { + dev_info(dev, "%s: %s: fail, attempted private mapping\n", + current->comm, func); + return -EINVAL; + } + + mask = dax_region->align - 1; + if (vma->vm_start & mask || vma->vm_end & mask) { + dev_info(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n", + current->comm, func, vma->vm_start, vma->vm_end, + mask); + return -EINVAL; + } + + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV + && (vma->vm_flags & VM_DONTCOPY) == 0) { + dev_info(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n", + current->comm, func); + return -EINVAL; + } + + if (!vma_is_dax(vma)) { + dev_info(dev, "%s: %s: fail, vma is not DAX capable\n", + current->comm, func); + return -EINVAL; + } + + return 0; +} + +static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, + unsigned long size) +{ + struct resource *res; + phys_addr_t phys; + int i; + + for (i = 0; i < dax_dev->num_resources; i++) { + res = &dax_dev->res[i]; + phys = pgoff * PAGE_SIZE + res->start; + if (phys >= res->start && phys <= res->end) + break; + pgoff -= PHYS_PFN(resource_size(res)); + } + + if (i < dax_dev->num_resources) { + res = &dax_dev->res[i]; + if (phys + size - 1 <= res->end) + return phys; + } + + return -1; +} + +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + unsigned long vaddr = (unsigned long) vmf->virtual_address; + struct device *dev = dax_dev->dev; + struct dax_region *dax_region; + int rc = VM_FAULT_SIGBUS; + phys_addr_t phys; + pfn_t pfn; + + if (check_vma(dax_dev, vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dax_dev->region; + if (dax_region->align > PAGE_SIZE) { + dev_dbg(dev, "%s: alignment > fault size\n", __func__); + return VM_FAULT_SIGBUS; + } + + phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + vmf->pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + rc = vm_insert_mixed(vma, vaddr, pfn); + + if (rc == -ENOMEM) + return VM_FAULT_OOM; + if (rc < 0 && rc != -EBUSY) + return VM_FAULT_SIGBUS; + + return VM_FAULT_NOPAGE; +} + +static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + int rc; + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + current->comm, (vmf->flags & FAULT_FLAG_WRITE) + ? "write" : "read", vma->vm_start, vma->vm_end); + rcu_read_lock(); + rc = __dax_dev_fault(dax_dev, vma, vmf); + rcu_read_unlock(); + + return rc; +} + +static int __dax_dev_pmd_fault(struct dax_dev *dax_dev, + struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, + unsigned int flags) +{ + unsigned long pmd_addr = addr & PMD_MASK; + struct device *dev = dax_dev->dev; + struct dax_region *dax_region; + phys_addr_t phys; + pgoff_t pgoff; + pfn_t pfn; + + if (check_vma(dax_dev, vma, __func__)) + return VM_FAULT_SIGBUS; + + dax_region = dax_dev->region; + if (dax_region->align > PMD_SIZE) { + dev_dbg(dev, "%s: alignment > fault size\n", __func__); + return VM_FAULT_SIGBUS; + } + + /* dax pmd mappings require pfn_t_devmap() */ + if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) != (PFN_DEV|PFN_MAP)) { + dev_dbg(dev, "%s: alignment > fault size\n", __func__); + return VM_FAULT_SIGBUS; + } + + pgoff = linear_page_index(vma, pmd_addr); + phys = pgoff_to_phys(dax_dev, pgoff, PAGE_SIZE); + if (phys == -1) { + dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__, + pgoff); + return VM_FAULT_SIGBUS; + } + + pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); + + return vmf_insert_pfn_pmd(vma, addr, pmd, pfn, + flags & FAULT_FLAG_WRITE); +} + +static int dax_dev_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + int rc; + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__, + current->comm, (flags & FAULT_FLAG_WRITE) + ? "write" : "read", vma->vm_start, vma->vm_end); + + rcu_read_lock(); + rc = __dax_dev_pmd_fault(dax_dev, vma, addr, pmd, flags); + rcu_read_unlock(); + + return rc; +} + +static void dax_dev_vm_open(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + kref_get(&dax_dev->kref); +} + +static void dax_dev_vm_close(struct vm_area_struct *vma) +{ + struct file *filp = vma->vm_file; + struct dax_dev *dax_dev = filp->private_data; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + dax_dev_put(dax_dev); +} + +static const struct vm_operations_struct dax_dev_vm_ops = { + .fault = dax_dev_fault, + .pmd_fault = dax_dev_pmd_fault, + .open = dax_dev_vm_open, + .close = dax_dev_vm_close, +}; + +static int dax_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct dax_dev *dax_dev = filp->private_data; + int rc; + + dev_dbg(dax_dev->dev, "%s\n", __func__); + + rc = check_vma(dax_dev, vma, __func__); + if (rc) + return rc; + + kref_get(&dax_dev->kref); + vma->vm_ops = &dax_dev_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; + +} + static const struct file_operations dax_fops = { .llseek = noop_llseek, .owner = THIS_MODULE, + .open = dax_dev_open, + .release = dax_dev_release, + .get_unmapped_area = dax_dev_get_unmapped_area, + .mmap = dax_dev_mmap, }; static int __init dax_init(void) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 86f9f8b82f8e..52ea012d8a80 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1013,6 +1013,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); return VM_FAULT_NOPAGE; } +EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 19d0d08b396f..b14e98129b07 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -624,6 +624,7 @@ pgoff_t linear_hugepage_index(struct vm_area_struct *vma, { return vma_hugecache_offset(hstate_vma(vma), vma, address); } +EXPORT_SYMBOL_GPL(linear_hugepage_index); /* * Return the size of the pages allocated when backing a VMA. In the majority -- cgit v1.2.3 From bd28b14591b98f696bc9f94c5ba2e598ca487dfd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 May 2016 17:21:27 -0700 Subject: x86: remove more uaccess_32.h complexity I'm looking at trying to possibly merge the 32-bit and 64-bit versions of the x86 uaccess.h implementation, but first this needs to be cleaned up. For example, the 32-bit version of "__copy_from_user_inatomic()" is mostly the special cases for the constant size, and it's actually almost never relevant. Most users aren't actually using a constant size anyway, and the few cases that do small constant copies are better off just using __get_user() instead. So get rid of the unnecessary complexity. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_32.h | 26 -------------------------- kernel/events/uprobes.c | 3 +-- kernel/futex.c | 2 +- mm/maccess.c | 3 +-- 4 files changed, 3 insertions(+), 31 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 537cc883ea29..4b32da24faaf 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -65,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - /* Avoid zeroing the tail if the copy fails.. - * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, - * but as the zeroing behaviour is only significant when n is not - * constant, that shouldn't be a problem. - */ - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } return __copy_from_user_ll_nozero(to, from, n); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7edc95edfaee..c01f733ff2e1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1694,8 +1694,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) diff --git a/kernel/futex.c b/kernel/futex.c index c20f06f38ef3..ee25f5ba4aca 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -729,7 +729,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; diff --git a/mm/maccess.c b/mm/maccess.c index d159b1c96e48..78f9274dd49d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_disable(); do { - ret = __copy_from_user_inatomic(dst++, - (const void __user __force *)src++, 1); + ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); dst[-1] = '\0'; -- cgit v1.2.3 From 1383399d7be029281997889df23150fa6c16be6e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 23 May 2016 16:22:29 -0700 Subject: mm: memcontrol: fix possible css ref leak on oom mem_cgroup_oom may be invoked multiple times while a process is handling a page fault, in which case current->memcg_in_oom will be overwritten leaking the previously taken css reference. Link: http://lkml.kernel.org/r/1464019330-7579-1-git-send-email-vdavydov@virtuozzo.com Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b3f16ab4b431..cf428d7b9a03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1604,7 +1604,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom) + if (!current->memcg_may_oom || current->memcg_in_oom) return; /* * We are in the middle of the charge context here, so we -- cgit v1.2.3 From dc0ef0df7b6a90892ec41933212ac701152a254c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:27 -0700 Subject: mm: make mmap_sem for write waits killable for mm syscalls This is a follow up work for oom_reaper [1]. As the async OOM killing depends on oom_sem for read we would really appreciate if a holder for write didn't stood in the way. This patchset is changing many of down_write calls to be killable to help those cases when the writer is blocked and waiting for readers to release the lock and so help __oom_reap_task to process the oom victim. Most of the patches are really trivial because the lock is help from a shallow syscall paths where we can return EINTR trivially and allow the current task to die (note that EINTR will never get to the userspace as the task has fatal signal pending). Others seem to be easy as well as the callers are already handling fatal errors and bail and return to userspace which should be sufficient to handle the failure gracefully. I am not familiar with all those code paths so a deeper review is really appreciated. As this work is touching more areas which are not directly connected I have tried to keep the CC list as small as possible and people who I believed would be familiar are CCed only to the specific patches (all should have received the cover though). This patchset is based on linux-next and it depends on down_write_killable for rw_semaphores which got merged into tip locking/rwsem branch and it is merged into this next tree. I guess it would be easiest to route these patches via mmotm because of the dependency on the tip tree but if respective maintainers prefer other way I have no objections. I haven't covered all the mmap_write(mm->mmap_sem) instances here $ git grep "down_write(.*\)" next/master | wc -l 98 $ git grep "down_write(.*\)" | wc -l 62 I have tried to cover those which should be relatively easy to review in this series because this alone should be a nice improvement. Other places can be changed on top. [0] http://lkml.kernel.org/r/1456752417-9626-1-git-send-email-mhocko@kernel.org [1] http://lkml.kernel.org/r/1452094975-551-1-git-send-email-mhocko@kernel.org [2] http://lkml.kernel.org/r/1456750705-7141-1-git-send-email-mhocko@kernel.org This patch (of 18): This is the first step in making mmap_sem write waiters killable. It focuses on the trivial ones which are taking the lock early after entering the syscall and they are not changing state before. Therefore it is very easy to change them to use down_write_killable and immediately return with -EINTR. This will allow the waiter to pass away without blocking the mmap_sem which might be required to make a forward progress. E.g. the oom reaper will need the lock for reading to dismantle the OOM victim address space. The only tricky function in this patch is vm_mmap_pgoff which has many call sites via vm_mmap. To reduce the risk keep vm_mmap with the original non-killable semantic for now. vm_munmap callers do not bother checking the return value so open code it into the munmap syscall path for now for simplicity. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: David Rientjes Cc: Dave Hansen Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 5 +++-- mm/madvise.c | 8 +++++--- mm/mlock.c | 16 ++++++++++------ mm/mmap.c | 27 +++++++++++++++++++++++---- mm/mprotect.c | 3 ++- mm/mremap.c | 3 ++- mm/nommu.c | 2 +- mm/util.c | 12 +++++++++--- 8 files changed, 55 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index f6f3353b0868..bff7fd702331 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -442,9 +442,10 @@ extern u64 hwpoison_filter_flags_value; extern u64 hwpoison_filter_memcg; extern u32 hwpoison_filter_enable; -extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, +extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long); + unsigned long, unsigned long, + bool); extern void set_pageblock_order(void); unsigned long reclaim_clean_pages_from_list(struct zone *zone, diff --git a/mm/madvise.c b/mm/madvise.c index 07427d3fcead..93fb63e88b5e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -707,10 +707,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return error; write = madvise_need_mmap_write(behavior); - if (write) - down_write(¤t->mm->mmap_sem); - else + if (write) { + if (down_write_killable(¤t->mm->mmap_sem)) + return -EINTR; + } else { down_read(¤t->mm->mmap_sem); + } /* * If the interval [start,end) covers some unmapped address diff --git a/mm/mlock.c b/mm/mlock.c index 96f001041928..ef8dc9f395c4 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -617,7 +617,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return error; } -static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) +static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags) { unsigned long locked; unsigned long lock_limit; @@ -635,7 +635,8 @@ static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) lock_limit >>= PAGE_SHIFT; locked = len >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); + if (down_write_killable(¤t->mm->mmap_sem)) + return -EINTR; locked += current->mm->locked_vm; @@ -678,7 +679,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; - down_write(¤t->mm->mmap_sem); + if (down_write_killable(¤t->mm->mmap_sem)) + return -EINTR; ret = apply_vma_lock_flags(start, len, 0); up_write(¤t->mm->mmap_sem); @@ -748,9 +750,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; - ret = -ENOMEM; - down_write(¤t->mm->mmap_sem); + if (down_write_killable(¤t->mm->mmap_sem)) + return -EINTR; + ret = -ENOMEM; if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || capable(CAP_IPC_LOCK)) ret = apply_mlockall_flags(flags); @@ -765,7 +768,8 @@ SYSCALL_DEFINE0(munlockall) { int ret; - down_write(¤t->mm->mmap_sem); + if (down_write_killable(¤t->mm->mmap_sem)) + return -EINTR; ret = apply_mlockall_flags(0); up_write(¤t->mm->mmap_sem); return ret; diff --git a/mm/mmap.c b/mm/mmap.c index b9274a0c82c9..11e1f2ca72af 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -178,7 +178,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) unsigned long min_brk; bool populate; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; #ifdef CONFIG_COMPAT_BRK /* @@ -1332,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); out_fput: if (file) fput(file); @@ -2493,6 +2494,10 @@ int vm_munmap(unsigned long start, size_t len) int ret; struct mm_struct *mm = current->mm; + /* + * XXX convert to down_write_killable as soon as all users are able + * to handle the error. + */ down_write(&mm->mmap_sem); ret = do_munmap(mm, start, len); up_write(&mm->mmap_sem); @@ -2502,8 +2507,15 @@ EXPORT_SYMBOL(vm_munmap); SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { + int ret; + struct mm_struct *mm = current->mm; + profile_munmap(addr); - return vm_munmap(addr, len); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_munmap(mm, addr, len); + up_write(&mm->mmap_sem); + return ret; } @@ -2535,7 +2547,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (pgoff + (size >> PAGE_SHIFT) < pgoff) return ret; - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + vma = find_vma(mm, start); if (!vma || !(vma->vm_flags & VM_SHARED)) @@ -2700,6 +2714,11 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) unsigned long ret; bool populate; + /* + * XXX not all users are chcecking the return value, convert + * to down_write_killable after they are able to cope with + * error + */ down_write(&mm->mmap_sem); ret = do_brk(addr, len); populate = ((mm->def_flags & VM_LOCKED) != 0); diff --git a/mm/mprotect.c b/mm/mprotect.c index b650c5412f58..5019a1ef2848 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -379,7 +379,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, reqprot = prot; - down_write(¤t->mm->mmap_sem); + if (down_write_killable(¤t->mm->mmap_sem)) + return -EINTR; vma = find_vma(current->mm, start); error = -ENOMEM; diff --git a/mm/mremap.c b/mm/mremap.c index 9dc499977924..1f157adfdaf9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -503,7 +503,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (!new_len) return ret; - down_write(¤t->mm->mmap_sem); + if (down_write_killable(¤t->mm->mmap_sem)) + return -EINTR; if (flags & MREMAP_FIXED) { ret = mremap_to(addr, old_len, new_addr, new_len, diff --git a/mm/nommu.c b/mm/nommu.c index c8bd59a03c71..b74512746aae 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); if (file) fput(file); diff --git a/mm/util.c b/mm/util.c index 8a1b3a1fb595..03b237746850 100644 --- a/mm/util.c +++ b/mm/util.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff) + unsigned long flag, unsigned long pgoff, bool killable) { unsigned long ret; struct mm_struct *mm = current->mm; @@ -297,7 +297,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = security_mmap_file(file, prot, flag); if (!ret) { - down_write(&mm->mmap_sem); + if (killable) { + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + } else { + down_write(&mm->mmap_sem); + } ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate); up_write(&mm->mmap_sem); @@ -307,6 +312,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } +/* XXX are all callers checking an error */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) @@ -316,7 +322,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, if (unlikely(offset_in_page(offset))) return -EINVAL; - return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); + return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false); } EXPORT_SYMBOL(vm_mmap); -- cgit v1.2.3 From 9fbeb5ab59a2b2a09cca2eb68283e7a090d4b98d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:30 -0700 Subject: mm: make vm_mmap killable All the callers of vm_mmap seem to check for the failure already and bail out in one way or another on the error which means that we can change it to use killable version of vm_mmap_pgoff and return -EINTR if the current task gets killed while waiting for mmap_sem. This also means that vm_mmap_pgoff can be killable by default and drop the additional parameter. This will help in the OOM conditions when the oom victim might be stuck waiting for the mmap_sem for write which in turn can block oom_reaper which relies on the mmap_sem for read to make a forward progress and reclaim the address space of the victim. Please note that load_elf_binary is ignoring vm_mmap error for current->personality & MMAP_PAGE_ZERO case but that shouldn't be a problem because the address is not used anywhere and we never return to the userspace if we got killed. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Oleg Nesterov Cc: Andrea Arcangeli Cc: Al Viro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/internal.h | 3 +-- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/util.c | 13 ++++--------- 5 files changed, 8 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index b530c99e8e81..d5eb8dddd7c0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2013,7 +2013,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} /* These take the mm semaphore themselves */ extern unsigned long vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); -extern unsigned long vm_mmap(struct file *, unsigned long, +extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/internal.h b/mm/internal.h index bff7fd702331..a37e5b6f9d25 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -444,8 +444,7 @@ extern u32 hwpoison_filter_enable; extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, - bool); + unsigned long, unsigned long); extern void set_pageblock_order(void); unsigned long reclaim_clean_pages_from_list(struct zone *zone, diff --git a/mm/mmap.c b/mm/mmap.c index 11e1f2ca72af..420088682d4a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1333,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); out_fput: if (file) fput(file); diff --git a/mm/nommu.c b/mm/nommu.c index b74512746aae..c8bd59a03c71 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true); + retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); if (file) fput(file); diff --git a/mm/util.c b/mm/util.c index 03b237746850..917e0e3d0f8e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flag, unsigned long pgoff, bool killable) + unsigned long flag, unsigned long pgoff) { unsigned long ret; struct mm_struct *mm = current->mm; @@ -297,12 +297,8 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, ret = security_mmap_file(file, prot, flag); if (!ret) { - if (killable) { - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - } else { - down_write(&mm->mmap_sem); - } + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, &populate); up_write(&mm->mmap_sem); @@ -312,7 +308,6 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, return ret; } -/* XXX are all callers checking an error */ unsigned long vm_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long offset) @@ -322,7 +317,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, if (unlikely(offset_in_page(offset))) return -EINVAL; - return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false); + return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); } EXPORT_SYMBOL(vm_mmap); -- cgit v1.2.3 From ae7987835643e470cb220e6685bd36d92179ef9c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:33 -0700 Subject: mm: make vm_munmap killable Almost all current users of vm_munmap are ignoring the return value and so they do not handle potential error. This means that some VMAs might stay behind. This patch doesn't try to solve those potential problems. Quite contrary it adds a new failure mode by using down_write_killable in vm_munmap. This should be safer than other failure modes, though, because the process is guaranteed to die as soon as it leaves the kernel and exit_mmap will clean the whole address space. This will help in the OOM conditions when the oom victim might be stuck waiting for the mmap_sem for write which in turn can block oom_reaper which relies on the mmap_sem for read to make a forward progress and reclaim the address space of the victim. Signed-off-by: Michal Hocko Cc: Oleg Nesterov Cc: "Kirill A. Shutemov" Cc: Konstantin Khlebnikov Cc: Andrea Arcangeli Cc: Alexander Viro Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 420088682d4a..ca292a7c2b68 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2494,11 +2494,9 @@ int vm_munmap(unsigned long start, size_t len) int ret; struct mm_struct *mm = current->mm; - /* - * XXX convert to down_write_killable as soon as all users are able - * to handle the error. - */ - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_munmap(mm, start, len); up_write(&mm->mmap_sem); return ret; -- cgit v1.2.3 From 2d6c928241add2848e4eebfce407e95164229976 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 23 May 2016 16:25:42 -0700 Subject: mm: make vm_brk killable Now that all the callers handle vm_brk failure we can change it wait for mmap_sem killable to help oom_reaper to not get blocked just because vm_brk gets blocked behind mmap_sem readers. Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/mmap.c | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index d5eb8dddd7c0..2835d598d258 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2011,7 +2011,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* These take the mm semaphore themselves */ -extern unsigned long vm_brk(unsigned long, unsigned long); +extern unsigned long __must_check vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, diff --git a/mm/mmap.c b/mm/mmap.c index ca292a7c2b68..d3d9a94ca031 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2712,12 +2712,9 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) unsigned long ret; bool populate; - /* - * XXX not all users are chcecking the return value, convert - * to down_write_killable after they are able to cope with - * error - */ - down_write(&mm->mmap_sem); + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + ret = do_brk(addr, len); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); -- cgit v1.2.3 From 957949243bac5dc25e2a651f17059f54f184913e Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 26 May 2016 15:16:08 -0700 Subject: mm: make CONFIG_DEFERRED_STRUCT_PAGE_INIT depends on !FLATMEM explicitly Per the suggestion from Michal Hocko [1], DEFERRED_STRUCT_PAGE_INIT requires some ordering wrt other initialization operations, e.g. page_ext_init has to happen after the whole memmap is initialized properly. For SPARSEMEM this requires to wait for page_alloc_init_late. Other memory models (e.g. flatmem) might have different initialization layouts (page_ext_init_flatmem). Currently DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG which in turn depends on SPARSEMEM || X86_64_ACPI_NUMA depends on ARCH_ENABLE_MEMORY_HOTPLUG and X86_64_ACPI_NUMA depends on NUMA which in turn disable FLATMEM memory model: config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA so FLATMEM is ruled out via dependency maze. Be explicit and disable FLATMEM for DEFERRED_STRUCT_PAGE_INIT so that we do not reintroduce subtle initialization bugs [1] http://lkml.kernel.org/r/20160523073157.GD2278@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1464027356-32282-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 2664c118b5d2..22fa8189e4fc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -649,6 +649,7 @@ config DEFERRED_STRUCT_PAGE_INIT default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT depends on MEMORY_HOTPLUG + depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable -- cgit v1.2.3 From 9725759a96efb1ce56a1b93455ac0ab1901c5327 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 26 May 2016 15:16:11 -0700 Subject: mm: kasan: remove unused 'reserved' field from struct kasan_alloc_meta Commit cd11016e5f52 ("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB") added 'reserved' field, but never used it. Link: http://lkml.kernel.org/r/1464021054-2307-1-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/kasan.h | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7f7ac51d7faf..fb87923552ef 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -77,7 +77,6 @@ struct kasan_alloc_meta { struct kasan_track track; u32 state : 2; /* enum kasan_state */ u32 alloc_size : 30; - u32 reserved; }; struct qlist_node { -- cgit v1.2.3 From 1ebab2db065e99eed9ab2304d3b7ad25c9568612 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 26 May 2016 15:16:19 -0700 Subject: memcg: fix mem_cgroup_out_of_memory() return value. mem_cgroup_out_of_memory() is returning "true" if it finds a TIF_MEMDIE task after an eligible task was found, "false" if it found a TIF_MEMDIE task before an eligible task is found. This difference confuses memory_max_write() which checks the return value of mem_cgroup_out_of_memory(). Since memory_max_write() wants to continue looping, mem_cgroup_out_of_memory() should return "true" in this case. This patch sets a dummy pointer in order to return "true". Link: http://lkml.kernel.org/r/1463753327-5170-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Acked-by: Vladimir Davydov Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cf428d7b9a03..f6477a9dbe7a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1302,6 +1302,8 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); + /* Set a dummy value to return "true". */ + chosen = (void *) 1; goto unlock; case OOM_SCAN_OK: break; -- cgit v1.2.3 From 4abaac9b733ea44fcf0d561ec1813e0394e61c9d Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Thu, 26 May 2016 15:16:27 -0700 Subject: update "mm/zsmalloc: don't fail if can't create debugfs info" Some updates to commit d34f615720d1 ("mm/zsmalloc: don't fail if can't create debugfs info"): - add pr_warn to all stat failure cases - do not prevent module loading on stat failure Link: http://lkml.kernel.org/r/1463671123-5479-1-git-send-email-ddstreet@ieee.org Signed-off-by: Dan Streetman Reviewed-by: Ganesh Mahendran Acked-by: Minchan Kim Cc: Sergey Senozhatsky Cc: Dan Streetman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 72698db958e7..b6d4f258cb53 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -45,6 +45,8 @@ * */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -483,16 +485,16 @@ static inline unsigned long zs_stat_get(struct size_class *class, #ifdef CONFIG_ZSMALLOC_STAT -static int __init zs_stat_init(void) +static void __init zs_stat_init(void) { - if (!debugfs_initialized()) - return -ENODEV; + if (!debugfs_initialized()) { + pr_warn("debugfs not available, stat dir not created\n"); + return; + } zs_stat_root = debugfs_create_dir("zsmalloc", NULL); if (!zs_stat_root) - return -ENOMEM; - - return 0; + pr_warn("debugfs 'zsmalloc' stat dir creation failed\n"); } static void __exit zs_stat_exit(void) @@ -577,8 +579,10 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { struct dentry *entry; - if (!zs_stat_root) + if (!zs_stat_root) { + pr_warn("no root stat dir, not creating <%s> stat dir\n", name); return; + } entry = debugfs_create_dir(name, zs_stat_root); if (!entry) { @@ -592,7 +596,8 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name) if (!entry) { pr_warn("%s: debugfs file entry <%s> creation failed\n", name, "classes"); - return; + debugfs_remove_recursive(pool->stat_dentry); + pool->stat_dentry = NULL; } } @@ -602,9 +607,8 @@ static void zs_pool_stat_destroy(struct zs_pool *pool) } #else /* CONFIG_ZSMALLOC_STAT */ -static int __init zs_stat_init(void) +static void __init zs_stat_init(void) { - return 0; } static void __exit zs_stat_exit(void) @@ -2011,17 +2015,10 @@ static int __init zs_init(void) zpool_register_driver(&zs_zpool_driver); #endif - ret = zs_stat_init(); - if (ret) { - pr_err("zs stat initialization failed\n"); - goto stat_fail; - } + zs_stat_init(); + return 0; -stat_fail: -#ifdef CONFIG_ZPOOL - zpool_unregister_driver(&zs_zpool_driver); -#endif notifier_fail: zs_unregister_cpu_notifier(); -- cgit v1.2.3 From 5930122683dff58f0846b0f0405b4bd598a3ba6a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 27 May 2016 10:19:30 -0400 Subject: switch xattr_handler->set() to passing dentry and inode separately preparation for similar switch in ->setxattr() (see the next commit for rationale). Signed-off-by: Al Viro --- fs/9p/acl.c | 6 +++--- fs/9p/xattr.c | 5 +++-- fs/btrfs/xattr.c | 12 +++++------- fs/ceph/xattr.c | 7 ++++--- fs/cifs/xattr.c | 9 +++++---- fs/ext2/xattr_security.c | 7 ++++--- fs/ext2/xattr_trusted.c | 7 ++++--- fs/ext2/xattr_user.c | 9 +++++---- fs/ext4/xattr_security.c | 7 ++++--- fs/ext4/xattr_trusted.c | 7 ++++--- fs/ext4/xattr_user.c | 9 +++++---- fs/f2fs/xattr.c | 12 ++++++------ fs/gfs2/xattr.c | 6 +++--- fs/hfsplus/xattr.c | 12 ++++++------ fs/hfsplus/xattr.h | 2 +- fs/hfsplus/xattr_security.c | 7 ++++--- fs/hfsplus/xattr_trusted.c | 7 ++++--- fs/hfsplus/xattr_user.c | 7 ++++--- fs/jffs2/security.c | 7 ++++--- fs/jffs2/xattr_trusted.c | 7 ++++--- fs/jffs2/xattr_user.c | 7 ++++--- fs/jfs/xattr.c | 14 ++++++-------- fs/nfs/nfs4proc.c | 19 +++++++++---------- fs/ocfs2/xattr.c | 23 +++++++++++++---------- fs/orangefs/xattr.c | 10 ++++++---- fs/posix_acl.c | 6 +++--- fs/reiserfs/xattr_security.c | 9 +++++---- fs/reiserfs/xattr_trusted.c | 9 +++++---- fs/reiserfs/xattr_user.c | 9 +++++---- fs/ubifs/xattr.c | 7 +++---- fs/xattr.c | 6 ++++-- fs/xfs/xfs_xattr.c | 9 +++++---- include/linux/xattr.h | 4 ++-- mm/shmem.c | 7 ++++--- 34 files changed, 156 insertions(+), 135 deletions(-) (limited to 'mm') diff --git a/fs/9p/acl.c b/fs/9p/acl.c index eb3589edf485..0576eaeb60b9 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -239,13 +239,13 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, } static int v9fs_xattr_set_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; - struct inode *inode = d_inode(dentry); v9ses = v9fs_dentry2v9ses(dentry); /* diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 18c62bae9591..a6bd349bab23 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -147,8 +147,9 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { const char *full_name = xattr_full_name(handler, name); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3bfb252206c7..d1a177a3dbe8 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -380,23 +380,21 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, - int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { name = xattr_full_name(handler, name); - return btrfs_set_prop(d_inode(dentry), name, value, size, flags); + return btrfs_set_prop(inode, name, value, size, flags); } static const struct xattr_handler btrfs_security_xattr_handler = { diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 0d66722c6a52..2baa6939dfe6 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1051,12 +1051,13 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; - return __ceph_setxattr(d_inode(dentry), name, value, size, flags); + return __ceph_setxattr(inode, name, value, size, flags); } const struct xattr_handler ceph_other_xattr_handler = { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index c8b77aa24a1d..5e23f64c0804 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -39,8 +39,9 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; static int cifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { int rc = -EOPNOTSUPP; unsigned int xid; @@ -99,12 +100,12 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (value && pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - size, d_inode(dentry), + size, inode, full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(d_inode(dentry))->time = 0; + CIFS_I(inode)->time = 0; kfree(pacl); } #endif /* CONFIG_CIFS_ACL */ diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 7fd3b867ce65..7b9e9c1842d5 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -18,10 +18,11 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 0f85705ff519..65049b71af13 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -25,10 +25,11 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 1fafd27037cc..fb2f992ae763 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -29,13 +29,14 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(inode, EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 123a7d010efe..a8921112030d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -22,10 +22,11 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 60652fa24cbc..c7765c735714 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -29,10 +29,11 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 17a446ffecd3..ca20e423034b 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -30,12 +30,13 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - if (!test_opt(dentry->d_sb, XATTR_USER)) + if (!test_opt(inode->i_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(inode, EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 00ea56797258..e3decae3acfb 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -50,10 +50,11 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); + struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); switch (handler->flags) { case F2FS_XATTR_INDEX_USER: @@ -69,7 +70,7 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, default: return -EINVAL; } - return f2fs_setxattr(d_inode(dentry), handler->flags, name, + return f2fs_setxattr(inode, handler->flags, name, value, size, NULL, flags); } @@ -95,11 +96,10 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, const void *value, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index f42ab53bd30d..3a2853504084 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1251,10 +1251,10 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 4f118d282a7a..d37bb88dc746 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -424,7 +424,7 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen) { @@ -437,8 +437,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); - res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, - flags); + res = __hfsplus_setxattr(inode, xattr_name, value, size, flags); kfree(xattr_name); return res; } @@ -864,8 +863,9 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { /* * Don't allow setting properly prefixed attributes @@ -880,7 +880,7 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler, * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ - return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); + return __hfsplus_setxattr(inode, name, buffer, size, flags); } const struct xattr_handler hfsplus_xattr_osx_handler = { diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index d04ba6f58df2..68f6b539371f 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -21,7 +21,7 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); -int hfsplus_setxattr(struct dentry *dentry, const char *name, +int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index ae2ca8c2e335..37b3efa733ef 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,10 +23,11 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index eae2947060aa..94519d6c627d 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -21,10 +21,11 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 3c9eec3e4c7b..fae6c0ea0030 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -21,10 +21,11 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return hfsplus_setxattr(dentry, name, buffer, size, flags, + return hfsplus_setxattr(inode, name, buffer, size, flags, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index 3ed9a4b49778..c2332e30f218 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,10 +57,11 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 4ebecff1d922..5d6030826c52 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -25,10 +25,11 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index bce249e1b277..9d027b4abcf9 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,10 +25,11 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *buffer, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) { - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(inode, JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index beb182b503b3..0bf3c33aedff 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -943,11 +943,10 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - name = xattr_full_name(handler, name); return __jfs_xattr_set(inode, name, value, size, flags); } @@ -962,11 +961,10 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - if (is_known_namespace(name)) return -EOPNOTSUPP; return __jfs_xattr_set(inode, name, value, size, flags); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 084e8570da18..2e802ec47b8a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4993,12 +4993,11 @@ static int nfs4_do_set_security_label(struct inode *inode, } static int -nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) +nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -6255,11 +6254,11 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); + return nfs4_proc_set_acl(inode, buf, buflen); } static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, @@ -6277,12 +6276,12 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, const char *key, - const void *buf, size_t buflen, - int flags) + struct dentry *unused, struct inode *inode, + const char *key, const void *buf, + size_t buflen, int flags) { if (security_ismaclabel(key)) - return nfs4_set_security_label(dentry, buf, buflen); + return nfs4_set_security_label(inode, buf, buflen); return -EOPNOTSUPP; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ad16995c9e7a..d2053853951e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7254,10 +7254,11 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7325,10 +7326,11 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7354,15 +7356,16 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 99c19545752c..5893ddde0e4b 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -448,13 +448,14 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_DEFAULT_PREFIX, name, buffer, @@ -478,13 +479,14 @@ static int orangefs_xattr_get_default(const struct xattr_handler *handler, } static int orangefs_xattr_set_trusted(const struct xattr_handler *handler, - struct dentry *dentry, + struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { - return orangefs_inode_setxattr(dentry->d_inode, + return orangefs_inode_setxattr(inode, ORANGEFS_XATTR_NAME_TRUSTED_PREFIX, name, buffer, diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 2c60f17e7d92..8a4a266beff3 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -822,10 +822,10 @@ posix_acl_xattr_get(const struct xattr_handler *handler, static int posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; int ret; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 86aeb9dd805a..e4cbb7719906 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -20,13 +20,14 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, } static int -security_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +security_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (IS_PRIVATE(d_inode(dentry))) + if (IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 31837f031f59..f15a5f9e84ce 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -19,13 +19,14 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, } static int -trusted_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +trusted_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(inode)) return -EPERM; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index f7c39731684b..dc59df43b2db 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -17,12 +17,13 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *buffer, size_t size, int flags) +user_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) { - if (!reiserfs_xattrs_user(dentry->d_sb)) + if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(d_inode(dentry), + return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 6c277eb6aef9..b5fc27969e9d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -579,11 +579,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, } static int ubifs_xattr_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *dentry, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct inode *inode = d_inode(dentry); - dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name, inode->i_ino, dentry, size); diff --git a/fs/xattr.c b/fs/xattr.c index fc81e771488a..b16d07889700 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -754,7 +754,8 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, value, size, flags); + return handler->set(handler, dentry, d_inode(dentry), name, value, + size, flags); } /* @@ -769,7 +770,8 @@ generic_removexattr(struct dentry *dentry, const char *name) handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); if (IS_ERR(handler)) return PTR_ERR(handler); - return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); + return handler->set(handler, dentry, d_inode(dentry), name, NULL, + 0, XATTR_REPLACE); } EXPORT_SYMBOL(generic_getxattr); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index d111f691f313..2773b155cb56 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -74,11 +74,12 @@ xfs_forget_acl( } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, - const char *name, const void *value, size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { int xflags = handler->flags; - struct xfs_inode *ip = XFS_I(d_inode(dentry)); + struct xfs_inode *ip = XFS_I(inode); int error; /* Convert Linux syscall to XFS internal ATTR flags */ @@ -92,7 +93,7 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, error = xfs_attr_set(ip, (unsigned char *)name, (void *)value, size, xflags); if (!error) - xfs_forget_acl(d_inode(dentry), name, xflags); + xfs_forget_acl(inode, name, xflags); return error; } diff --git a/include/linux/xattr.h b/include/linux/xattr.h index 1cc4c578deb9..76beb206741a 100644 --- a/include/linux/xattr.h +++ b/include/linux/xattr.h @@ -33,8 +33,8 @@ struct xattr_handler { struct inode *inode, const char *name, void *buffer, size_t size); int (*set)(const struct xattr_handler *, struct dentry *dentry, - const char *name, const void *buffer, size_t size, - int flags); + struct inode *inode, const char *name, const void *buffer, + size_t size, int flags); }; const char *xattr_full_name(const struct xattr_handler *, const char *); diff --git a/mm/shmem.c b/mm/shmem.c index e418a995427d..a36144909b28 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2645,10 +2645,11 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, } static int shmem_xattr_handler_set(const struct xattr_handler *handler, - struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) + struct dentry *unused, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) { - struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + struct shmem_inode_info *info = SHMEM_I(inode); name = xattr_full_name(handler, name); return simple_xattr_set(&info->xattrs, name, value, size, flags); -- cgit v1.2.3 From edd9f7230f591b7988533b1cafb07f3c03555f19 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Fri, 27 May 2016 14:27:21 -0700 Subject: mm: oom: do not reap task if there are live threads in threadgroup If the current process is exiting, we don't invoke oom killer, instead we give it access to memory reserves and try to reap its mm in case nobody is going to use it. There's a mistake in the code performing this check - we just ignore any process of the same thread group no matter if it is exiting or not - see try_oom_reaper. Fix it. Link: http://lkml.kernel.org/r/1464087628-7318-1-git-send-email-vdavydov@virtuozzo.com Fixes: 3ef22dfff239 ("oom, oom_reaper: try to reap tasks which skip regular OOM killer path")Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5bb2f7698ad7..326dd14938f0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -611,8 +611,6 @@ void try_oom_reaper(struct task_struct *tsk) if (!process_shares_mm(p, mm)) continue; - if (same_thread_group(p, tsk)) - continue; if (fatal_signal_pending(p)) continue; -- cgit v1.2.3 From fe53ca54270a757f0a28ee6bf3a54d952b550ed0 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 27 May 2016 14:27:30 -0700 Subject: mm: use early_pfn_to_nid in page_ext_init page_ext_init() checks suitable pages with pfn_to_nid(), but pfn_to_nid() depends on memmap which will not be setup fully until page_alloc_init_late() is done. Use early_pfn_to_nid() instead of pfn_to_nid() so that page extension could be still used early even though CONFIG_ DEFERRED_STRUCT_PAGE_INIT is enabled and catch early page allocation call sites. Suggested by Joonsoo Kim [1], this fix basically undoes the change introduced by commit b8f1a75d61d840 ("mm: call page_ext_init() after all struct pages are initialized") and fixes the same problem with a better approach. [1] http://lkml.kernel.org/r/CAAmzW4OUmyPwQjvd7QUfc6W1Aic__TyAuH80MLRZNMxKy0-wPQ@mail.gmail.com Link: http://lkml.kernel.org/r/1464198689-23458-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/main.c | 3 +-- mm/page_ext.c | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/init/main.c b/init/main.c index bc0f9e0bcf22..4c17fda5c2ff 100644 --- a/init/main.c +++ b/init/main.c @@ -607,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void) initrd_start = 0; } #endif + page_ext_init(); debug_objects_mem_init(); kmemleak_init(); setup_per_cpu_pageset(); @@ -1003,8 +1004,6 @@ static noinline void __init kernel_init_freeable(void) sched_init_smp(); page_alloc_init_late(); - /* Initialize page ext after all struct pages are initializaed */ - page_ext_init(); do_basic_setup(); diff --git a/mm/page_ext.c b/mm/page_ext.c index 2d864e64f7fe..44a4c029c8e7 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -390,8 +390,10 @@ void __init page_ext_init(void) * We know some arch can have a nodes layout such as * -------------pfn--------------> * N0 | N1 | N2 | N0 | N1 | N2|.... + * + * Take into account DEFERRED_STRUCT_PAGE_INIT. */ - if (pfn_to_nid(pfn) != nid) + if (early_pfn_to_nid(pfn) != nid) continue; if (init_section_page_ext(pfn, nid)) goto oom; -- cgit v1.2.3 From f65e91df25aa426289cbcb580ca3183e24979fb1 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 27 May 2016 14:27:32 -0700 Subject: mm: use early_pfn_to_nid in register_page_bootmem_info_node register_page_bootmem_info_node() is invoked in mem_init(), so it will be called before page_alloc_init_late() if DEFERRED_STRUCT_PAGE_INIT is enabled. But, pfn_to_nid() depends on memmap which won't be fully setup until page_alloc_init_late() is done, so replace pfn_to_nid() by early_pfn_to_nid(). Link: http://lkml.kernel.org/r/1464210007-30930-1-git-send-email-yang.shi@linaro.org Signed-off-by: Yang Shi Cc: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index caf2a14c37ad..b8ee0806415f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -300,7 +300,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) * multiple nodes we check that this pfn does not already * reside in some other nodes. */ - if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) + if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node)) register_page_bootmem_info_section(pfn); } } -- cgit v1.2.3 From e2fe14564d3316d1625ed20bf1083995f4960893 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 27 May 2016 14:27:35 -0700 Subject: oom_reaper: close race with exiting task Tetsuo has reported: Out of memory: Kill process 443 (oleg's-test) score 855 or sacrifice child Killed process 443 (oleg's-test) total-vm:493248kB, anon-rss:423880kB, file-rss:4kB, shmem-rss:0kB sh invoked oom-killer: gfp_mask=0x24201ca(GFP_HIGHUSER_MOVABLE|__GFP_COLD), order=0, oom_score_adj=0 sh cpuset=/ mems_allowed=0 CPU: 2 PID: 1 Comm: sh Not tainted 4.6.0-rc7+ #51 Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013 Call Trace: dump_stack+0x85/0xc8 dump_header+0x5b/0x394 oom_reaper: reaped process 443 (oleg's-test), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB In other words: __oom_reap_task exit_mm atomic_inc_not_zero tsk->mm = NULL mmput atomic_dec_and_test # > 0 exit_oom_victim # New victim will be # selected # no TIF_MEMDIE task so we can select a new one unmap_page_range # to release the memory The race exists even without the oom_reaper because anybody who pins the address space and gets preempted might race with exit_mm but oom_reaper made this race more probable. We can address the oom_reaper part by using oom_lock for __oom_reap_task because this would guarantee that a new oom victim will not be selected if the oom reaper might race with the exit path. This doesn't solve the original issue, though, because somebody else still might be pinning mm_users and so __mmput won't be called to release the memory but that is not really realiably solvable because the task will get away from the oom sight as soon as it is unhashed from the task_list and so we cannot guarantee a new victim won't be selected. [akpm@linux-foundation.org: fix use of unused `mm', Per Stephen] [akpm@linux-foundation.org: coding-style fixes] Fixes: aac453635549 ("mm, oom: introduce oom reaper") Link: http://lkml.kernel.org/r/1464271493-20008-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 326dd14938f0..dfb1ab61fb23 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -443,12 +443,28 @@ static bool __oom_reap_task(struct task_struct *tsk) { struct mmu_gather tlb; struct vm_area_struct *vma; - struct mm_struct *mm; + struct mm_struct *mm = NULL; struct task_struct *p; struct zap_details details = {.check_swap_entries = true, .ignore_dirty = true}; bool ret = true; + /* + * We have to make sure to not race with the victim exit path + * and cause premature new oom victim selection: + * __oom_reap_task exit_mm + * atomic_inc_not_zero + * mmput + * atomic_dec_and_test + * exit_oom_victim + * [...] + * out_of_memory + * select_bad_process + * # no TIF_MEMDIE task selects new victim + * unmap_page_range # frees some memory + */ + mutex_lock(&oom_lock); + /* * Make sure we find the associated mm_struct even when the particular * thread has already terminated and cleared its mm. @@ -457,19 +473,19 @@ static bool __oom_reap_task(struct task_struct *tsk) */ p = find_lock_task_mm(tsk); if (!p) - return true; + goto unlock_oom; mm = p->mm; if (!atomic_inc_not_zero(&mm->mm_users)) { task_unlock(p); - return true; + goto unlock_oom; } task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { ret = false; - goto out; + goto unlock_oom; } tlb_gather_mmu(&tlb, mm, 0, -1); @@ -511,13 +527,15 @@ static bool __oom_reap_task(struct task_struct *tsk) * to release its memory. */ set_bit(MMF_OOM_REAPED, &mm->flags); -out: +unlock_oom: + mutex_unlock(&oom_lock); /* * Drop our reference but make sure the mmput slow path is called from a * different context because we shouldn't risk we get stuck there and * put the oom_reaper out of the way. */ - mmput_async(mm); + if (mm) + mmput_async(mm); return ret; } -- cgit v1.2.3 From 0798d3c022dc63eb0ec02b511e1f76ca8411ef8e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 27 May 2016 14:27:38 -0700 Subject: mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap() If page_move_anon_rmap() is refiling a pmd-splitted THP mapped in a tail page from a pte, the "address" must be THP aligned in order for the page->index bugcheck to pass in the CONFIG_DEBUG_VM=y builds. Link: http://lkml.kernel.org/r/1464253620-106404-1-git-send-email-kirill.shutemov@linux.intel.com Fixes: 6d0a07edd17c ("mm: thp: calculate the mapcount correctly for THP pages during WP faults") Signed-off-by: Kirill A. Shutemov Reported-by: Mika Westerberg Tested-by: Mika Westerberg Reviewed-by: Andrea Arcangeli Cc: [4.5] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 8a839935b18c..0ea5d9071b32 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1098,6 +1098,8 @@ void page_move_anon_rmap(struct page *page, VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_VMA(!anon_vma, vma); + if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page)) + address &= HPAGE_PMD_MASK; VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page); anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; -- cgit v1.2.3 From badbda53e505089062e194c614e6f23450bc98b2 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Fri, 27 May 2016 14:27:41 -0700 Subject: mm/cma: silence warnings due to max() usage pageblock_order can be (at least) an unsigned int or an unsigned long depending on the kernel config and architecture, so use max_t(unsigned long, ...) when comparing it. fixes these warnings: In file included from include/asm-generic/bug.h:13:0, from arch/powerpc/include/asm/bug.h:127, from include/linux/bug.h:4, from include/linux/mmdebug.h:4, from include/linux/mm.h:8, from include/linux/memblock.h:18, from mm/cma.c:28: mm/cma.c: In function 'cma_init_reserved_mem': include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast (void) (&_max1 == &_max2); ^ mm/cma.c:186:27: note: in expansion of macro 'max' alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); ^ mm/cma.c: In function 'cma_declare_contiguous': include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast (void) (&_max1 == &_max2); ^ include/linux/kernel.h:747:9: note: in definition of macro 'max' typeof(y) _max2 = (y); ^ mm/cma.c:270:29: note: in expansion of macro 'max' (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); ^ include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast (void) (&_max1 == &_max2); ^ include/linux/kernel.h:747:21: note: in definition of macro 'max' typeof(y) _max2 = (y); ^ mm/cma.c:270:29: note: in expansion of macro 'max' (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); ^ [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20160526150748.5be38a4f@canb.auug.org.au Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index ea506eb18cd6..bd0e1412475e 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, return -EINVAL; /* ensure minimal alignment required by mm core */ - alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + alignment = PAGE_SIZE << + max_t(unsigned long, MAX_ORDER - 1, pageblock_order); /* alignment should be aligned with order_per_bit */ if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) @@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base, * migratetype page by page allocator's buddy algorithm. In the case, * you couldn't get a contiguous memory, which is not what we want. */ - alignment = max(alignment, - (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + alignment = max(alignment, (phys_addr_t)PAGE_SIZE << + max_t(unsigned long, MAX_ORDER - 1, pageblock_order)); base = ALIGN(base, alignment); size = ALIGN(size, alignment); limit &= ~(alignment - 1); -- cgit v1.2.3 From cbedbac3e66121ddbac363776c23119f8eaeefda Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 27 May 2016 14:27:43 -0700 Subject: mm/memcontrol.c: fix the margin computation in mem_cgroup_margin() mem_cgroup_margin() might return (memory.limit - memory_count) when the memsw.limit is in excess. This doesn't happen usually because we do not allow excess on hard limits and (memory.limit <= memsw.limit), but __GFP_NOFAIL charges can force the charge and cause the excess when no memory is really swappable (swap is full or no anonymous memory is left). [mhocko@suse.com: rewrote changelog] Link: http://lkml.kernel.org/r/20160525155122.GK20132@dhcp22.suse.cz Link: http://lkml.kernel.org/r/1464068266-27736-1-git-send-email-roy.qing.li@gmail.com Signed-off-by: Li RongQing Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f6477a9dbe7a..485c688a7fa8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1108,6 +1108,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) limit = READ_ONCE(memcg->memsw.limit); if (count <= limit) margin = min(margin, limit - count); + else + margin = 0; } return margin; -- cgit v1.2.3 From 7cf7806ce1e30f1691cf340f70b807acbdf419ef Mon Sep 17 00:00:00 2001 From: Li RongQing Date: Fri, 27 May 2016 14:27:46 -0700 Subject: mm/memcontrol.c: move comments for get_mctgt_type() to proper position Move the comments for get_mctgt_type() to be before get_mctgt_type() implementation. Link: http://lkml.kernel.org/r/1463644638-7446-1-git-send-email-roy.qing.li@gmail.com Signed-off-by: Li RongQing Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 485c688a7fa8..925b431f3f03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4309,24 +4309,6 @@ static int mem_cgroup_do_precharge(unsigned long count) return 0; } -/** - * get_mctgt_type - get target type of moving charge - * @vma: the vma the pte to be checked belongs - * @addr: the address corresponding to the pte to be checked - * @ptent: the pte to be checked - * @target: the pointer the target page or swap ent will be stored(can be NULL) - * - * Returns - * 0(MC_TARGET_NONE): if the pte is not a target for move charge. - * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for - * move charge. if @target is not NULL, the page is stored in target->page - * with extra refcnt got(Callers should handle it). - * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a - * target for charge migration. if @target is not NULL, the entry is stored - * in target->ent. - * - * Called with pte lock held. - */ union mc_target { struct page *page; swp_entry_t ent; @@ -4515,6 +4497,25 @@ out: return ret; } +/** + * get_mctgt_type - get target type of moving charge + * @vma: the vma the pte to be checked belongs + * @addr: the address corresponding to the pte to be checked + * @ptent: the pte to be checked + * @target: the pointer the target page or swap ent will be stored(can be NULL) + * + * Returns + * 0(MC_TARGET_NONE): if the pte is not a target for move charge. + * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for + * move charge. if @target is not NULL, the page is stored in target->page + * with extra refcnt got(Callers should handle it). + * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a + * target for charge migration. if @target is not NULL, the entry is stored + * in target->ent. + * + * Called with pte lock held. + */ + static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, unsigned long addr, pte_t ptent, union mc_target *target) { -- cgit v1.2.3 From 11e685672a0861ce136cc4e7f6fdd11e5390b1fa Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Fri, 27 May 2016 14:27:49 -0700 Subject: mm: disable DEFERRED_STRUCT_PAGE_INIT on !NO_BOOTMEM When we have !NO_BOOTMEM, the deferred page struct initialization doesn't work well because the pages reserved in bootmem are released to the page allocator uncoditionally. It causes memory corruption and system crash eventually. As Mel suggested, the bootmem is retiring slowly. We fix the issue by simply hiding DEFERRED_STRUCT_PAGE_INIT when bootmem is enabled. Link: http://lkml.kernel.org/r/1460602170-5821-1-git-send-email-gwshan@linux.vnet.ibm.com Signed-off-by: Gavin Shan Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 22fa8189e4fc..3e2daef3c946 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -648,7 +648,7 @@ config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - depends on MEMORY_HOTPLUG + depends on NO_BOOTMEM && MEMORY_HOTPLUG depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a -- cgit v1.2.3 From 7ded384a12688c2a86b618da16bc87713404dfcc Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 27 May 2016 15:23:32 -0700 Subject: mm: fix section mismatch warning The register_page_bootmem_info_node() function needs to be marked __init in order to avoid a new warning introduced by commit f65e91df25aa ("mm: use early_pfn_to_nid in register_page_bootmem_info_node"). Otherwise you'll get a warning about how a non-init function calls early_pfn_to_nid (which is __meminit) Cc: Yang Shi Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 2 +- mm/memory_hotplug.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 20d8a5d4d133..5145620ba48a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -182,7 +182,7 @@ static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) #endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE -extern void register_page_bootmem_info_node(struct pglist_data *pgdat); +extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat); #else static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b8ee0806415f..e3cbdcaff2a5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -263,7 +263,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ -void register_page_bootmem_info_node(struct pglist_data *pgdat) +void __init register_page_bootmem_info_node(struct pglist_data *pgdat) { unsigned long i, pfn, end_pfn, nr_pages; int node = pgdat->node_id; -- cgit v1.2.3 From 5d22fc25d4fc8096d2d7df27ea1893d4e055e764 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 27 May 2016 15:57:31 -0700 Subject: mm: remove more IS_ERR_VALUE abuses The do_brk() and vm_brk() return value was "unsigned long" and returned the starting address on success, and an error value on failure. The reasons are entirely historical, and go back to it basically behaving like the mmap() interface does. However, nobody actually wanted that interface, and it causes totally pointless IS_ERR_VALUE() confusion. What every single caller actually wants is just the simpler integer return of zero for success and negative error number on failure. So just convert to that much clearer and more common calling convention, and get rid of all the IS_ERR_VALUE() uses wrt vm_brk(). Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 17 ++++++++--------- fs/binfmt_aout.c | 18 +++++++----------- fs/binfmt_elf.c | 11 +++++------ include/linux/mm.h | 2 +- mm/mmap.c | 16 ++++++++-------- mm/nommu.c | 2 +- 6 files changed, 30 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index f5e737ff0022..cb26f18d43af 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -116,12 +116,12 @@ static struct linux_binfmt aout_format = { .min_coredump = PAGE_SIZE }; -static unsigned long set_brk(unsigned long start, unsigned long end) +static int set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); if (end <= start) - return start; + return 0; return vm_brk(start, end - start); } @@ -321,7 +321,7 @@ static int load_aout_binary(struct linux_binprm *bprm) error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) + if (error) return error; error = read_code(bprm->file, text_addr, 32, @@ -350,7 +350,7 @@ static int load_aout_binary(struct linux_binprm *bprm) if (!bprm->file->f_op->mmap || (fd_offset & ~PAGE_MASK) != 0) { error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (IS_ERR_VALUE(error)) + if (error) return error; read_code(bprm->file, N_TXTADDR(ex), fd_offset, @@ -378,7 +378,7 @@ static int load_aout_binary(struct linux_binprm *bprm) beyond_if: error = set_brk(current->mm->start_brk, current->mm->brk); - if (IS_ERR_VALUE(error)) + if (error) return error; set_binfmt(&aout_format); @@ -441,7 +441,7 @@ static int load_aout_library(struct file *file) } #endif retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (IS_ERR_VALUE(retval)) + if (retval) goto out; read_code(file, start_addr, N_TXTOFF(ex), @@ -461,9 +461,8 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - error = vm_brk(start_addr + len, bss - len); - retval = error; - if (error != start_addr + len) + retval = vm_brk(start_addr + len, bss - len); + if (retval) goto out; } retval = 0; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 2fab9f130e51..64b331ae3428 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -127,11 +127,8 @@ static int set_brk(unsigned long start, unsigned long end) { start = PAGE_ALIGN(start); end = PAGE_ALIGN(end); - if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; + if (end > start) + return vm_brk(start, end - start); } return 0; } @@ -275,7 +272,7 @@ static int load_aout_binary(struct linux_binprm * bprm) map_size = ex.a_text+ex.a_data; #endif error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) + if (error) return error; error = read_code(bprm->file, text_addr, pos, @@ -298,7 +295,7 @@ static int load_aout_binary(struct linux_binprm * bprm) if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { error = vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - if (IS_ERR_VALUE(error)) + if (error) return error; read_code(bprm->file, N_TXTADDR(ex), fd_offset, @@ -382,7 +379,7 @@ static int load_aout_library(struct file *file) file); } retval = vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - if (IS_ERR_VALUE(retval)) + if (retval) goto out; read_code(file, start_addr, N_TXTOFF(ex), @@ -402,9 +399,8 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - error = vm_brk(start_addr + len, bss - len); - retval = error; - if (error != start_addr + len) + retval = vm_brk(start_addr + len, bss - len); + if (retval) goto out; } retval = 0; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 938fc4ede764..e158b22ef32f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -96,10 +96,9 @@ static int set_brk(unsigned long start, unsigned long end) start = ELF_PAGEALIGN(start); end = ELF_PAGEALIGN(end); if (end > start) { - unsigned long addr; - addr = vm_brk(start, end - start); - if (BAD_ADDR(addr)) - return addr; + int error = vm_brk(start, end - start); + if (error) + return error; } current->mm->start_brk = current->mm->brk = end; return 0; @@ -629,7 +628,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, /* Map the last of the bss segment */ error = vm_brk(elf_bss, last_bss - elf_bss); - if (BAD_ADDR(error)) + if (error) goto out; } @@ -1178,7 +1177,7 @@ static int load_elf_library(struct file *file) bss = eppnt->p_memsz + eppnt->p_vaddr; if (bss > len) { error = vm_brk(len, bss - len); - if (BAD_ADDR(error)) + if (error) goto out_free_ph; } error = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index a00ec816233a..5df5feb49575 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2018,7 +2018,7 @@ static inline void mm_populate(unsigned long addr, unsigned long len) {} #endif /* These take the mm semaphore themselves */ -extern unsigned long __must_check vm_brk(unsigned long, unsigned long); +extern int __must_check vm_brk(unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, diff --git a/mm/mmap.c b/mm/mmap.c index d3d9a94ca031..de2c1769cc68 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -168,7 +168,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static unsigned long do_brk(unsigned long addr, unsigned long len); +static int do_brk(unsigned long addr, unsigned long len); SYSCALL_DEFINE1(brk, unsigned long, brk) { @@ -224,7 +224,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) goto out; /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) + if (do_brk(oldbrk, newbrk-oldbrk) < 0) goto out; set_brk: @@ -2625,7 +2625,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm) * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static unsigned long do_brk(unsigned long addr, unsigned long len) +static int do_brk(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -2636,7 +2636,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len) len = PAGE_ALIGN(len); if (!len) - return addr; + return 0; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; @@ -2703,13 +2703,13 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; - return addr; + return 0; } -unsigned long vm_brk(unsigned long addr, unsigned long len) +int vm_brk(unsigned long addr, unsigned long len) { struct mm_struct *mm = current->mm; - unsigned long ret; + int ret; bool populate; if (down_write_killable(&mm->mmap_sem)) @@ -2718,7 +2718,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) ret = do_brk(addr, len); populate = ((mm->def_flags & VM_LOCKED) != 0); up_write(&mm->mmap_sem); - if (populate) + if (populate && !ret) mm_populate(addr, len); return ret; } diff --git a/mm/nommu.c b/mm/nommu.c index c8bd59a03c71..c2e58880207f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1682,7 +1682,7 @@ void exit_mmap(struct mm_struct *mm) } } -unsigned long vm_brk(unsigned long addr, unsigned long len) +int vm_brk(unsigned long addr, unsigned long len) { return -ENOMEM; } -- cgit v1.2.3