From 68f23b89067fdf187763e75a56087550624fdbee Mon Sep 17 00:00:00 2001
From: Theodore Ts'o
Date: Thu, 30 Jan 2020 22:11:04 -0800
Subject: memcg: fix a crash in wb_workfn when a device disappears

Without memcg, there is a one-to-one mapping between the bdi and
bdi_writeback structures.  In this world, things are fairly
straightforward; the first thing bdi_unregister() does is to shut down
the bdi_writeback structure (or wb), and part of that shutdown ensures
that no other work is queued against the wb, and that the wb is fully
drained.

With memcg, however, there is a one-to-many relationship between the
bdi and bdi_writeback structures; that is, there are multiple wb
objects which can all point to a single bdi.  There is a refcount which
prevents the bdi object from being released (and hence, unregistered).
So in theory, bdi_unregister() *should* only get called once its
refcount goes to zero (bdi_put will drop the refcount, and when it is
zero, release_bdi gets called, which calls bdi_unregister).

Unfortunately, del_gendisk() in block/genhd.c never got the memo about
the Brave New memcg World, and calls bdi_unregister directly.  It does
this without informing the file system, or the memcg code, or anything
else.  This causes the root wb associated with the bdi to be
unregistered, but none of the memcg-specific wb's are shut down.  So
when one of these wb's is woken up to do delayed work, it tries to
dereference its wb->bdi->dev to fetch the device name, but
unfortunately bdi->dev is now NULL, thanks to the bdi_unregister()
called by del_gendisk().  As a result, *boom*.

Fortunately, it looks like the rest of the writeback path is perfectly
happy with bdi->dev and bdi->owner being NULL, so the simplest fix is
to create a bdi_dev_name() function which can handle bdi->dev being
NULL.  This also allows us to bulletproof the writeback tracepoints to
prevent them from dereferencing a NULL pointer and crashing the kernel
if one is tracing with memcg enabled, and an iSCSI device dies or a USB
storage stick is pulled.

The most common way of triggering this will be hot-removal of a device
while writeback with memcg enabled is going on.  It was triggering
several times a day in a heavily loaded production environment.

Google Bug Id: 145475544

Link: https://lore.kernel.org/r/20191227194829.150110-1-tytso@mit.edu
Link: http://lkml.kernel.org/r/20191228005211.163952-1-tytso@mit.edu
Signed-off-by: Theodore Ts'o
Cc: Chris Mason
Cc: Tejun Heo
Cc: Jens Axboe
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/backing-dev.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 97967ce06de3..f88197c1ffc2 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -504,4 +505,13 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
				  (1 << WB_async_congested));
 }
 
+extern const char *bdi_unknown_name;
+
+static inline const char *bdi_dev_name(struct backing_dev_info *bdi)
+{
+	if (!bdi || !bdi->dev)
+		return bdi_unknown_name;
+	return dev_name(bdi->dev);
+}
+
 #endif /* _LINUX_BACKING_DEV_H */
--
cgit v1.2.3


From dd3e7cba16274831f5a69f071ed3cf13ffb352ea Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Thu, 30 Jan 2020 22:11:47 -0800
Subject: ocfs2/dlm: move BITS_TO_BYTES() to bitops.h for wider use

There are already users of the BITS_TO_BYTES() macro, and there will be
more.
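As a rough illustration only (a stand-alone userspace sketch, not part
of this patch), the floor vs. ceiling difference matters for the
100 Mbps bnx2x example discussed below:

/*
 * Illustrative sketch -- not part of the patch.  The ceiling-based
 * macro mirrors the DIV_ROUND_UP()-based definition added to bitops.h;
 * BITS_TO_BYTES_FLOOR() stands in for the old bnx2x macro.
 */
#include <stdio.h>

#define BITS_TO_BYTES_FLOOR(x)	((x) / 8)		/* old bnx2x behaviour */
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define BITS_TO_BYTES(nr)	DIV_ROUND_UP(nr, 8)	/* new, ceiling-based */

int main(void)
{
	/* bits transmitted in one microsecond at 100 Mbps */
	unsigned long long bits_per_usec = 100000000ULL / 1000000; /* 100 */

	printf("floor:   %llu bytes\n", BITS_TO_BYTES_FLOOR(bits_per_usec)); /* 12 */
	printf("ceiling: %llu bytes\n", BITS_TO_BYTES(bits_per_usec));       /* 13 */
	return 0;
}
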
Move it to bitops.h for wider use.

In the case of ocfs2 the replacement is identical.

As for bnx2x, there are two places where the floor version is used.  In
the first case it is used to calculate the number of structures that
can fit into one memory page.  Here the ceiling variant is obviously
correct, and the original code might have had a potential bug whenever
the number of bits % 8 is not 0.  In the second case the macro is used
to calculate the number of bytes transmitted in one microsecond.  This
works without any change for all speeds that are a multiple of 1 Gbps;
for the rest, the new code gives the ceiling value.  For instance,
100 Mbps gives 13 bytes, while the old code gives 12 bytes and the
arithmetically correct value is 12.5 bytes.  Further, that value is
used to set up a timer threshold, which in any case has its own margins
due to its finite resolution.  I don't see an issue here with slightly
shifting the thresholds for low-speed connections; the card is supposed
to utilize the highest available rate, which is usually 10 Gbps.

Link: http://lkml.kernel.org/r/20200108121316.22411-1-andriy.shevchenko@linux.intel.com
Signed-off-by: Andy Shevchenko
Reviewed-by: Joseph Qi
Acked-by: Sudarsana Reddy Kalluru
Cc: Mark Fasheh
Cc: Joel Becker
Cc: Junxiao Bi
Cc: Changwei Ge
Cc: Gang He
Cc: Jun Piao
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h | 1 -
 fs/ocfs2/dlm/dlmcommon.h                         | 4 ----
 include/linux/bitops.h                           | 1 +
 3 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h
index 066765fbef06..0a59a09ef82f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_init.h
@@ -296,7 +296,6 @@ static inline void bnx2x_dcb_config_qm(struct bnx2x *bp, enum cos_mode mode,
  * possible, the driver should only write the valid vnics into the internal
  * ram according to the appropriate port mode.
*/ -#define BITS_TO_BYTES(x) ((x)/8) /* CMNG constants, as derived from system spec calculations */ diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index aaf24548b02a..0463dce65bb2 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -688,10 +688,6 @@ struct dlm_begin_reco __be32 pad2; }; - -#define BITS_PER_BYTE 8 -#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE) - struct dlm_query_join_request { u8 node_idx; diff --git a/include/linux/bitops.h b/include/linux/bitops.h index e479067c202c..6c7c4133c25c 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -13,6 +13,7 @@ #define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) +#define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) extern unsigned int __sw_hweight8(unsigned int w); extern unsigned int __sw_hweight16(unsigned int w); -- cgit v1.2.3 From ddf8f376d137ba41ca67347a0b80ba0c357a1018 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 30 Jan 2020 22:12:07 -0800 Subject: mm/filemap.c: clean up filemap_write_and_wait() At some point filemap_write_and_wait() and filemap_write_and_wait_range() got the exact same implementation with the exception of the range being specified in *_range() Similar to other functions in fs.h which call *_range(..., 0, LLONG_MAX), change filemap_write_and_wait() to be a static inline which calls filemap_write_and_wait_range() Link: http://lkml.kernel.org/r/20191129160713.30892-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Reviewed-by: Nikolay Borisov Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 +++++- mm/filemap.c | 34 ++++++---------------------------- 2 files changed, 11 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 40be2ccb87f3..41584f50af0d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2737,7 +2737,6 @@ static inline int filemap_fdatawait(struct address_space *mapping) extern bool filemap_range_has_page(struct address_space *, loff_t lstart, loff_t lend); -extern int filemap_write_and_wait(struct address_space *mapping); extern int filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); extern int __filemap_fdatawrite_range(struct address_space *mapping, @@ -2747,6 +2746,11 @@ extern int filemap_fdatawrite_range(struct address_space *mapping, extern int filemap_check_errors(struct address_space *mapping); extern void __filemap_set_wb_err(struct address_space *mapping, int err); +static inline int filemap_write_and_wait(struct address_space *mapping) +{ + return filemap_write_and_wait_range(mapping, 0, LLONG_MAX); +} + extern int __must_check file_fdatawait_range(struct file *file, loff_t lstart, loff_t lend); extern int __must_check file_check_and_advance_wb_err(struct file *file); diff --git a/mm/filemap.c b/mm/filemap.c index bf6aa30be58d..1784478270e1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -632,33 +632,6 @@ static bool mapping_needs_writeback(struct address_space *mapping) return mapping->nrpages; } -int filemap_write_and_wait(struct address_space *mapping) -{ - int err = 0; - - if (mapping_needs_writeback(mapping)) { - err = filemap_fdatawrite(mapping); - /* - * Even if the above returned error, the pages may be - * written partially (e.g. -ENOSPC), so we wait for it. - * But the -EIO is special case, it may indicate the worst - * thing (e.g. 
bug) happened, so we avoid waiting for it. - */ - if (err != -EIO) { - int err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } else { - /* Clear any previously stored errors */ - filemap_check_errors(mapping); - } - } else { - err = filemap_check_errors(mapping); - } - return err; -} -EXPORT_SYMBOL(filemap_write_and_wait); - /** * filemap_write_and_wait_range - write out & wait on a file range * @mapping: the address_space for the pages @@ -680,7 +653,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, if (mapping_needs_writeback(mapping)) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); - /* See comment of filemap_write_and_wait() */ + /* + * Even if the above returned error, the pages may be + * written partially (e.g. -ENOSPC), so we wait for it. + * But the -EIO is special case, it may indicate the worst + * thing (e.g. bug) happened, so we avoid waiting for it. + */ if (err != -EIO) { int err2 = filemap_fdatawait_range(mapping, lstart, lend); -- cgit v1.2.3 From 07d8026995287c2a2f03e28c69cdd8152fa69107 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 30 Jan 2020 22:12:28 -0800 Subject: mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit An upcoming patch changes and complicates the refcounting and especially the "put page" aspects of it. In order to keep everything clean, refactor the devmap page release routines: * Rename put_devmap_managed_page() to page_is_devmap_managed(), and limit the functionality to "read only": return a bool, with no side effects. * Add a new routine, put_devmap_managed_page(), to handle decrementing the refcount for ZONE_DEVICE pages. * Change callers (just release_pages() and put_page()) to check page_is_devmap_managed() before calling the new put_devmap_managed_page() routine. This is a performance point: put_page() is a hot path, so we need to avoid non- inline function calls where possible. * Rename __put_devmap_managed_page() to free_devmap_managed_page(), and limit the functionality to unconditionally freeing a devmap page. This is originally based on a separate patch by Ira Weiny, which applied to an early version of the put_user_page() experiments. Since then, Jérôme Glisse suggested the refactoring described above. Link: http://lkml.kernel.org/r/20200107224558.2362728-5-jhubbard@nvidia.com Signed-off-by: Ira Weiny Signed-off-by: John Hubbard Suggested-by: Jérôme Glisse Reviewed-by: Dan Williams Reviewed-by: Jan Kara Cc: Christoph Hellwig Cc: Kirill A. 
Shutemov Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Björn Töpel Cc: Daniel Vetter Cc: Hans Verkuil Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Jonathan Corbet Cc: Leon Romanovsky Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 18 +++++++++++++----- mm/memremap.c | 15 +-------------- mm/swap.c | 27 ++++++++++++++++++++++++++- 3 files changed, 40 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1233bf45164d..3b88618e361a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -947,9 +947,10 @@ static inline bool is_zone_device_page(const struct page *page) #endif #ifdef CONFIG_DEV_PAGEMAP_OPS -void __put_devmap_managed_page(struct page *page); +void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); -static inline bool put_devmap_managed_page(struct page *page) + +static inline bool page_is_devmap_managed(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; @@ -958,7 +959,6 @@ static inline bool put_devmap_managed_page(struct page *page) switch (page->pgmap->type) { case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_FS_DAX: - __put_devmap_managed_page(page); return true; default: break; @@ -966,11 +966,17 @@ static inline bool put_devmap_managed_page(struct page *page) return false; } +void put_devmap_managed_page(struct page *page); + #else /* CONFIG_DEV_PAGEMAP_OPS */ -static inline bool put_devmap_managed_page(struct page *page) +static inline bool page_is_devmap_managed(struct page *page) { return false; } + +static inline void put_devmap_managed_page(struct page *page) +{ +} #endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool is_device_private_page(const struct page *page) @@ -1023,8 +1029,10 @@ static inline void put_page(struct page *page) * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ - if (put_devmap_managed_page(page)) + if (page_is_devmap_managed(page)) { + put_devmap_managed_page(page); return; + } if (put_page_testzero(page)) __put_page(page); diff --git a/mm/memremap.c b/mm/memremap.c index f915d074ac20..4c723d2049d5 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -411,20 +411,8 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, EXPORT_SYMBOL_GPL(get_dev_pagemap); #ifdef CONFIG_DEV_PAGEMAP_OPS -void __put_devmap_managed_page(struct page *page) +void free_devmap_managed_page(struct page *page) { - int count = page_ref_dec_return(page); - - /* still busy */ - if (count > 1) - return; - - /* only triggered by the dev_pagemap shutdown path */ - if (count == 0) { - __put_page(page); - return; - } - /* notify page idle for dax */ if (!is_device_private_page(page)) { wake_up_var(&page->_refcount); @@ -461,5 +449,4 @@ void __put_devmap_managed_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); } -EXPORT_SYMBOL(__put_devmap_managed_page); #endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/mm/swap.c b/mm/swap.c index 5341ae93861f..cf39d24ada2a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -813,8 +813,10 @@ void release_pages(struct page **pages, int nr) * processing, and instead, expect a call to * put_page_testzero(). 
*/ - if (put_devmap_managed_page(page)) + if (page_is_devmap_managed(page)) { + put_devmap_managed_page(page); continue; + } } page = compound_head(page); @@ -1102,3 +1104,26 @@ void __init swap_setup(void) * _really_ don't want to cluster much more */ } + +#ifdef CONFIG_DEV_PAGEMAP_OPS +void put_devmap_managed_page(struct page *page) +{ + int count; + + if (WARN_ON_ONCE(!page_is_devmap_managed(page))) + return; + + count = page_ref_dec_return(page); + + /* + * devmap page refcounts are 1-based, rather than 0-based: if + * refcount is 1, then the page is free and the refcount is + * stable because nobody holds a reference on the page. + */ + if (count == 1) + free_devmap_managed_page(page); + else if (!count) + __put_page(page); +} +EXPORT_SYMBOL(put_devmap_managed_page); +#endif -- cgit v1.2.3 From eddb1c228f7951d399240a0cc57455dccc7f8777 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 30 Jan 2020 22:12:54 -0800 Subject: mm/gup: introduce pin_user_pages*() and FOLL_PIN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce pin_user_pages*() variations of get_user_pages*() calls, and also pin_longterm_pages*() variations. For now, these are placeholder calls, until the various call sites are converted to use the correct get_user_pages*() or pin_user_pages*() API. These variants will eventually all set FOLL_PIN, which is also introduced, and thoroughly documented. pin_user_pages() pin_user_pages_remote() pin_user_pages_fast() All pages that are pinned via the above calls, must be unpinned via put_user_page(). The underlying rules are: * FOLL_PIN is a gup-internal flag, so the call sites should not directly set it. That behavior is enforced with assertions. * Call sites that want to indicate that they are going to do DirectIO ("DIO") or something with similar characteristics, should call a get_user_pages()-like wrapper call that sets FOLL_PIN. These wrappers will: * Start with "pin_user_pages" instead of "get_user_pages". That makes it easy to find and audit the call sites. * Set FOLL_PIN * For pages that are received via FOLL_PIN, those pages must be returned via put_user_page(). Thanks to Jan Kara and Vlastimil Babka for explaining the 4 cases in this documentation. (I've reworded it and expanded upon it.) Link: http://lkml.kernel.org/r/20200107224558.2362728-12-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Mike Rapoport [Documentation] Reviewed-by: Jérôme Glisse Cc: Jonathan Corbet Cc: Ira Weiny Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Björn Töpel Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dan Williams Cc: Hans Verkuil Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Kirill A. 
Shutemov Cc: Leon Romanovsky Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/index.rst | 1 + Documentation/core-api/pin_user_pages.rst | 232 ++++++++++++++++++++++++++++++ include/linux/mm.h | 63 ++++++-- mm/gup.c | 164 ++++++++++++++++++--- 4 files changed, 426 insertions(+), 34 deletions(-) create mode 100644 Documentation/core-api/pin_user_pages.rst (limited to 'include/linux') diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index bc0c727d7fd8..a501dc1c90d0 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -31,6 +31,7 @@ Core utilities generic-radix-tree memory-allocation mm-api + pin_user_pages gfp_mask-from-fs-io timekeeping boot-time-mm diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst new file mode 100644 index 000000000000..71849830cd48 --- /dev/null +++ b/Documentation/core-api/pin_user_pages.rst @@ -0,0 +1,232 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================================== +pin_user_pages() and related calls +==================================================== + +.. contents:: :local: + +Overview +======== + +This document describes the following functions:: + + pin_user_pages() + pin_user_pages_fast() + pin_user_pages_remote() + +Basic description of FOLL_PIN +============================= + +FOLL_PIN and FOLL_LONGTERM are flags that can be passed to the get_user_pages*() +("gup") family of functions. FOLL_PIN has significant interactions and +interdependencies with FOLL_LONGTERM, so both are covered here. + +FOLL_PIN is internal to gup, meaning that it should not appear at the gup call +sites. This allows the associated wrapper functions (pin_user_pages*() and +others) to set the correct combination of these flags, and to check for problems +as well. + +FOLL_LONGTERM, on the other hand, *is* allowed to be set at the gup call sites. +This is in order to avoid creating a large number of wrapper functions to cover +all combinations of get*(), pin*(), FOLL_LONGTERM, and more. Also, the +pin_user_pages*() APIs are clearly distinct from the get_user_pages*() APIs, so +that's a natural dividing line, and a good point to make separate wrapper calls. +In other words, use pin_user_pages*() for DMA-pinned pages, and +get_user_pages*() for other cases. There are four cases described later on in +this document, to further clarify that concept. + +FOLL_PIN and FOLL_GET are mutually exclusive for a given gup call. However, +multiple threads and call sites are free to pin the same struct pages, via both +FOLL_PIN and FOLL_GET. It's just the call site that needs to choose one or the +other, not the struct page(s). + +The FOLL_PIN implementation is nearly the same as FOLL_GET, except that FOLL_PIN +uses a different reference counting technique. + +FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying that is, +FOLL_LONGTERM is a specific case, more restrictive case of FOLL_PIN. + +Which flags are set by each wrapper +=================================== + +For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup +flags the caller provides. The caller is required to pass in a non-null struct +pages* array, and the function then pin pages by incrementing each by a special +value. For now, that value is +1, just like get_user_pages*().:: + + Function + -------- + pin_user_pages FOLL_PIN is always set internally by this function. 
+ pin_user_pages_fast FOLL_PIN is always set internally by this function. + pin_user_pages_remote FOLL_PIN is always set internally by this function. + +For these get_user_pages*() functions, FOLL_GET might not even be specified. +Behavior is a little more complex than above. If FOLL_GET was *not* specified, +but the caller passed in a non-null struct pages* array, then the function +sets FOLL_GET for you, and proceeds to pin pages by incrementing the refcount +of each page by +1.:: + + Function + -------- + get_user_pages FOLL_GET is sometimes set internally by this function. + get_user_pages_fast FOLL_GET is sometimes set internally by this function. + get_user_pages_remote FOLL_GET is sometimes set internally by this function. + +Tracking dma-pinned pages +========================= + +Some of the key design constraints, and solutions, for tracking dma-pinned +pages: + +* An actual reference count, per struct page, is required. This is because + multiple processes may pin and unpin a page. + +* False positives (reporting that a page is dma-pinned, when in fact it is not) + are acceptable, but false negatives are not. + +* struct page may not be increased in size for this, and all fields are already + used. + +* Given the above, we can overload the page->_refcount field by using, sort of, + the upper bits in that field for a dma-pinned count. "Sort of", means that, + rather than dividing page->_refcount into bit fields, we simple add a medium- + large value (GUP_PIN_COUNTING_BIAS, initially chosen to be 1024: 10 bits) to + page->_refcount. This provides fuzzy behavior: if a page has get_page() called + on it 1024 times, then it will appear to have a single dma-pinned count. + And again, that's acceptable. + +This also leads to limitations: there are only 31-10==21 bits available for a +counter that increments 10 bits at a time. + +TODO: for 1GB and larger huge pages, this is cutting it close. That's because +when pin_user_pages() follows such pages, it increments the head page by "1" +(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for +pin_user_pages()) for each tail page. So if you have a 1GB huge page: + +* There are 256K (18 bits) worth of 4 KB tail pages. +* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is, + 10 bits at a time) +* There are 21 - 18 == 3 bits available to count. Except that there aren't, + because you need to allow for a few normal get_page() calls on the head page, + as well. Fortunately, the approach of using addition, rather than "hard" + bitfields, within page->_refcount, allows for sharing these bits gracefully. + But we're still looking at about 8 references. + +This, however, is a missing feature more than anything else, because it's easily +solved by addressing an obvious inefficiency in the original get_user_pages() +approach of retrieving pages: stop treating all the pages as if they were +PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of +this, so some work is required. Once that's in place, this limitation mostly +disappears from view, because there will be ample refcounting range available. + +* Callers must specifically request "dma-pinned tracking of pages". In other + words, just calling get_user_pages() will not suffice; a new set of functions, + pin_user_page() and related, must be used. 
+ +FOLL_PIN, FOLL_GET, FOLL_LONGTERM: when to use which flags +========================================================== + +Thanks to Jan Kara, Vlastimil Babka and several other -mm people, for describing +these categories: + +CASE 1: Direct IO (DIO) +----------------------- +There are GUP references to pages that are serving +as DIO buffers. These buffers are needed for a relatively short time (so they +are not "long term"). No special synchronization with page_mkclean() or +munmap() is provided. Therefore, flags to set at the call site are: :: + + FOLL_PIN + +...but rather than setting FOLL_PIN directly, call sites should use one of +the pin_user_pages*() routines that set FOLL_PIN. + +CASE 2: RDMA +------------ +There are GUP references to pages that are serving as DMA +buffers. These buffers are needed for a long time ("long term"). No special +synchronization with page_mkclean() or munmap() is provided. Therefore, flags +to set at the call site are: :: + + FOLL_PIN | FOLL_LONGTERM + +NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's +because DAX pages do not have a separate page cache, and so "pinning" implies +locking down file system blocks, which is not (yet) supported in that way. + +CASE 3: Hardware with page faulting support +------------------------------------------- +Here, a well-written driver doesn't normally need to pin pages at all. However, +if the driver does choose to do so, it can register MMU notifiers for the range, +and will be called back upon invalidation. Either way (avoiding page pinning, or +using MMU notifiers to unpin upon request), there is proper synchronization with +both filesystem and mm (page_mkclean(), munmap(), etc). + +Therefore, neither flag needs to be set. + +In this case, ideally, neither get_user_pages() nor pin_user_pages() should be +called. Instead, the software should be written so that it does not pin pages. +This allows mm and filesystems to operate more efficiently and reliably. + +CASE 4: Pinning for struct page manipulation only +------------------------------------------------- +Here, normal GUP calls are sufficient, so neither flag needs to be set. + +page_dma_pinned(): the whole point of pinning +============================================= + +The whole point of marking pages as "DMA-pinned" or "gup-pinned" is to be able +to query, "is this page DMA-pinned?" That allows code such as page_mkclean() +(and file system writeback code in general) to make informed decisions about +what to do when a page cannot be unmapped due to such pins. + +What to do in those cases is the subject of a years-long series of discussions +and debates (see the References at the end of this document). It's a TODO item +here: fill in the details once that's worked out. Meanwhile, it's safe to say +that having this available: :: + + static inline bool page_dma_pinned(struct page *page) + +...is a prerequisite to solving the long-running gup+DMA problem. + +Another way of thinking about FOLL_GET, FOLL_PIN, and FOLL_LONGTERM +=================================================================== + +Another way of thinking about these flags is as a progression of restrictions: +FOLL_GET is for struct page manipulation, without affecting the data that the +struct page refers to. FOLL_PIN is a *replacement* for FOLL_GET, and is for +short term pins on pages whose data *will* get accessed. As such, FOLL_PIN is +a "more severe" form of pinning. 
And finally, FOLL_LONGTERM is an even more +restrictive case that has FOLL_PIN as a prerequisite: this is for pages that +will be pinned longterm, and whose data will be accessed. + +Unit testing +============ +This file:: + + tools/testing/selftests/vm/gup_benchmark.c + +has the following new calls to exercise the new pin*() wrapper functions: + +* PIN_FAST_BENCHMARK (./gup_benchmark -a) +* PIN_BENCHMARK (./gup_benchmark -b) + +You can monitor how many total dma-pinned pages have been acquired and released +since the system was booted, via two new /proc/vmstat entries: :: + + /proc/vmstat/nr_foll_pin_requested + /proc/vmstat/nr_foll_pin_requested + +Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is +because there is a noticeable performance drop in put_user_page(), when they +are activated. + +References +========== + +* `Some slow progress on get_user_pages() (Apr 2, 2019) `_ +* `DMA and get_user_pages() (LPC: Dec 12, 2018) `_ +* `The trouble with get_user_pages() (Apr 30, 2018) `_ + +John Hubbard, October, 2019 diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b88618e361a..79ca557349c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1042,16 +1042,14 @@ static inline void put_page(struct page *page) * put_user_page() - release a gup-pinned page * @page: pointer to page to be released * - * Pages that were pinned via get_user_pages*() must be released via - * either put_user_page(), or one of the put_user_pages*() routines - * below. This is so that eventually, pages that are pinned via - * get_user_pages*() can be separately tracked and uniquely handled. In - * particular, interactions with RDMA and filesystems need special - * handling. + * Pages that were pinned via pin_user_pages*() must be released via either + * put_user_page(), or one of the put_user_pages*() routines. This is so that + * eventually such pages can be separately tracked and uniquely handled. In + * particular, interactions with RDMA and filesystems need special handling. * * put_user_page() and put_page() are not interchangeable, despite this early * implementation that makes them look the same. put_user_page() calls must - * be perfectly matched up with get_user_page() calls. + * be perfectly matched up with pin*() calls. 
*/ static inline void put_user_page(struct page *page) { @@ -1509,9 +1507,16 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); +long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); +long pin_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, @@ -1519,6 +1524,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); +int pin_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, @@ -2583,13 +2590,15 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ +#define FOLL_PIN 0x40000 /* pages must be released via put_user_page() */ /* - * NOTE on FOLL_LONGTERM: + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each + * other. Here is what they mean, and how to use them: * * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. This is contrasted with - * iov_iter_get_pages() where usages which are transient. + * period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. * * FIXME: For pages which are part of a filesystem, mappings are subject to the * lifetime enforced by the filesystem and we need guarantees that longterm @@ -2604,11 +2613,39 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * Currently only get_user_pages() and get_user_pages_fast() support this flag * and calls to get_user_pages_[un]locked are specifically not allowed. This * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY + * FAULT_FLAG_ALLOW_RETRY. * - * In the CMA case: longterm pins in a CMA region would unnecessarily fragment - * that region. And so CMA attempts to migrate the page before pinning when + * In the CMA case: long term pins in a CMA region would unnecessarily fragment + * that region. And so, CMA attempts to migrate the page before pinning, when * FOLL_LONGTERM is specified. + * + * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, + * but an additional pin counting system) will be invoked. This is intended for + * anything that gets a page reference and then touches page data (for example, + * Direct IO). 
This lets the filesystem know that some non-file-system entity is + * potentially changing the pages' data. In contrast to FOLL_GET (whose pages + * are released via put_page()), FOLL_PIN pages must be released, ultimately, by + * a call to put_user_page(). + * + * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different + * and separate refcounting mechanisms, however, and that means that each has + * its own acquire and release mechanisms: + * + * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. + * + * FOLL_PIN: pin_user_pages*() to acquire, and put_user_pages to release. + * + * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. + * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based + * calls applied to them, and that's perfectly OK. This is a constraint on the + * callers, not on the pages.) + * + * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never + * directly by the caller. That's in order to help avoid mismatches when + * releasing pages: get_user_pages*() pages must be released via put_page(), + * while pin_user_pages*() pages must be released via put_user_page(). + * + * Please see Documentation/vm/pin_user_pages.rst for more information. */ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) diff --git a/mm/gup.c b/mm/gup.c index 904bd8b9be1b..cc7e78f4a960 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -194,6 +194,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, spinlock_t *ptl; pte_t *ptep, pte; + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == + (FOLL_PIN | FOLL_GET))) + return ERR_PTR(-EINVAL); retry: if (unlikely(pmd_bad(*pmd))) return no_page_table(vma, flags); @@ -811,7 +815,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, start = untagged_addr(start); - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); /* * If FOLL_FORCE is set then do not force a full fault as the hinting @@ -1035,7 +1039,16 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk, BUG_ON(*locked != 1); } - if (pages) + /* + * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior + * is to set FOLL_GET if the caller wants pages[] filled in (but has + * carelessly failed to specify FOLL_GET), so keep doing that, but only + * for FOLL_GET, not for the newer FOLL_PIN. + * + * FOLL_PIN always expects pages to be non-null, but no need to assert + * that here, as any failures will be obvious enough. + */ + if (pages && !(flags & FOLL_PIN)) flags |= FOLL_GET; pages_done = 0; @@ -1606,11 +1619,19 @@ static __always_inline long __gup_longterm_locked(struct task_struct *tsk, * should use get_user_pages because it cannot pass * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault. 
*/ +#ifdef CONFIG_MMU long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked) { + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that with an assertion: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; + /* * Parts of FOLL_LONGTERM behavior are incompatible with * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on @@ -1636,6 +1657,16 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, } EXPORT_SYMBOL(get_user_pages_remote); +#else /* CONFIG_MMU */ +long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + return 0; +} +#endif /* !CONFIG_MMU */ + /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task @@ -1647,6 +1678,13 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas) { + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that with an assertion: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; + return __gup_longterm_locked(current, current->mm, start, nr_pages, pages, vmas, gup_flags | FOLL_TOUCH); } @@ -2389,30 +2427,15 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages, return ret; } -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, - unsigned int gup_flags, struct page **pages) +static int internal_get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, + struct page **pages) { unsigned long addr, len, end; int nr = 0, ret = 0; if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM | - FOLL_FORCE))) + FOLL_FORCE | FOLL_PIN))) return -EINVAL; start = untagged_addr(start) & PAGE_MASK; @@ -2452,4 +2475,103 @@ int get_user_pages_fast(unsigned long start, int nr_pages, return ret; } + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number requested. + * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns + * -errno. 
+ */ +int get_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; + + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); +} EXPORT_SYMBOL_GPL(get_user_pages_fast); + +/** + * pin_user_pages_fast() - pin user pages in memory without taking locks + * + * For now, this is a placeholder function, until various call sites are + * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, + * this is identical to get_user_pages_fast(). + * + * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * is NOT intended for Case 2 (RDMA: long-term pins). + */ +int pin_user_pages_fast(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) +{ + /* + * This is a placeholder, until the pin functionality is activated. + * Until then, just behave like the corresponding get_user_pages*() + * routine. + */ + return get_user_pages_fast(start, nr_pages, gup_flags, pages); +} +EXPORT_SYMBOL_GPL(pin_user_pages_fast); + +/** + * pin_user_pages_remote() - pin pages of a remote process (task != current) + * + * For now, this is a placeholder function, until various call sites are + * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, + * this is identical to get_user_pages_remote(). + * + * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * is NOT intended for Case 2 (RDMA: long-term pins). + */ +long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *locked) +{ + /* + * This is a placeholder, until the pin functionality is activated. + * Until then, just behave like the corresponding get_user_pages*() + * routine. + */ + return get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags, pages, + vmas, locked); +} +EXPORT_SYMBOL(pin_user_pages_remote); + +/** + * pin_user_pages() - pin user pages in memory for use by other devices + * + * For now, this is a placeholder function, until various call sites are + * converted to use the correct get_user_pages*() or pin_user_pages*() API. So, + * this is identical to get_user_pages(). + * + * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It + * is NOT intended for Case 2 (RDMA: long-term pins). + */ +long pin_user_pages(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas) +{ + /* + * This is a placeholder, until the pin functionality is activated. + * Until then, just behave like the corresponding get_user_pages*() + * routine. + */ + return get_user_pages(start, nr_pages, gup_flags, pages, vmas); +} +EXPORT_SYMBOL(pin_user_pages); -- cgit v1.2.3 From f1f6a7dd9b53aafd81b696b9017036e7b08e57ea Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Thu, 30 Jan 2020 22:13:35 -0800 Subject: mm, tree-wide: rename put_user_page*() to unpin_user_page*() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to provide a clearer, more symmetric API for pinning and unpinning DMA pages. This way, pin_user_pages*() calls match up with unpin_user_pages*() calls, and the API is a lot closer to being self-explanatory. 
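
For illustration only (not part of this patch), a minimal sketch of the
intended acquire/release pairing after the rename.  The helper name
below is hypothetical; the pin_user_pages_fast() and
unpin_user_pages_dirty_lock() signatures are the ones declared in
include/linux/mm.h by this series:

#include <linux/mm.h>

/* Hypothetical caller, for illustration -- not an existing kernel function. */
static int example_pin_and_dirty(unsigned long uaddr, int nr_pages,
				 struct page **pages)
{
	int pinned;

	/* Acquire: pin_user_pages_fast() sets FOLL_PIN internally. */
	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned <= 0)
		return pinned ? pinned : -EFAULT;

	/* ... DMA or direct-IO into the pinned pages would happen here ... */

	/* Release: the symmetric counterpart; also marks the pages dirty. */
	unpin_user_pages_dirty_lock(pages, pinned, true);

	return 0;
}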
Link: http://lkml.kernel.org/r/20200107224558.2362728-23-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Jan Kara Cc: Alex Williamson Cc: Aneesh Kumar K.V Cc: Björn Töpel Cc: Christoph Hellwig Cc: Daniel Vetter Cc: Dan Williams Cc: Hans Verkuil Cc: Ira Weiny Cc: Jason Gunthorpe Cc: Jason Gunthorpe Cc: Jens Axboe Cc: Jerome Glisse Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Leon Romanovsky Cc: Mauro Carvalho Chehab Cc: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/core-api/pin_user_pages.rst | 2 +- arch/powerpc/mm/book3s64/iommu_api.c | 4 ++-- drivers/gpu/drm/via/via_dmablit.c | 4 ++-- drivers/infiniband/core/umem.c | 2 +- drivers/infiniband/hw/hfi1/user_pages.c | 2 +- drivers/infiniband/hw/mthca/mthca_memfree.c | 6 +++--- drivers/infiniband/hw/qib/qib_user_pages.c | 2 +- drivers/infiniband/hw/qib/qib_user_sdma.c | 6 +++--- drivers/infiniband/hw/usnic/usnic_uiom.c | 2 +- drivers/infiniband/sw/siw/siw_mem.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 4 ++-- drivers/platform/goldfish/goldfish_pipe.c | 4 ++-- drivers/vfio/vfio_iommu_type1.c | 2 +- fs/io_uring.c | 4 ++-- include/linux/mm.h | 26 +++++++++++------------ mm/gup.c | 32 ++++++++++++++--------------- mm/process_vm_access.c | 4 ++-- net/xdp/xdp_umem.c | 2 +- 18 files changed, 55 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 71849830cd48..1d490155ecd7 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -219,7 +219,7 @@ since the system was booted, via two new /proc/vmstat entries: :: /proc/vmstat/nr_foll_pin_requested Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is -because there is a noticeable performance drop in put_user_page(), when they +because there is a noticeable performance drop in unpin_user_page(), when they are activated. 
References diff --git a/arch/powerpc/mm/book3s64/iommu_api.c b/arch/powerpc/mm/book3s64/iommu_api.c index a86547822034..eba73ebd8ae5 100644 --- a/arch/powerpc/mm/book3s64/iommu_api.c +++ b/arch/powerpc/mm/book3s64/iommu_api.c @@ -168,7 +168,7 @@ good_exit: free_exit: /* free the references taken */ - put_user_pages(mem->hpages, pinned); + unpin_user_pages(mem->hpages, pinned); vfree(mem->hpas); kfree(mem); @@ -214,7 +214,7 @@ static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem) if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY) SetPageDirty(page); - put_user_page(page); + unpin_user_page(page); mem->hpas[i] = 0; } diff --git a/drivers/gpu/drm/via/via_dmablit.c b/drivers/gpu/drm/via/via_dmablit.c index 37c5e572993a..719d036c9384 100644 --- a/drivers/gpu/drm/via/via_dmablit.c +++ b/drivers/gpu/drm/via/via_dmablit.c @@ -188,8 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t *vsg) kfree(vsg->desc_pages); /* fall through */ case dr_via_pages_locked: - put_user_pages_dirty_lock(vsg->pages, vsg->num_pages, - (vsg->direction == DMA_FROM_DEVICE)); + unpin_user_pages_dirty_lock(vsg->pages, vsg->num_pages, + (vsg->direction == DMA_FROM_DEVICE)); /* fall through */ case dr_via_pages_alloc: vfree(vsg->pages); diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index aae5bfed7f3b..c3769a5f096d 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -54,7 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) { page = sg_page_iter_page(&sg_iter); - put_user_pages_dirty_lock(&page, 1, umem->writable && dirty); + unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty); } sg_free_table(&umem->sg_head); diff --git a/drivers/infiniband/hw/hfi1/user_pages.c b/drivers/infiniband/hw/hfi1/user_pages.c index 9a94761765c0..3b505006c0a6 100644 --- a/drivers/infiniband/hw/hfi1/user_pages.c +++ b/drivers/infiniband/hw/hfi1/user_pages.c @@ -118,7 +118,7 @@ int hfi1_acquire_user_pages(struct mm_struct *mm, unsigned long vaddr, size_t np void hfi1_release_user_pages(struct mm_struct *mm, struct page **p, size_t npages, bool dirty) { - put_user_pages_dirty_lock(p, npages, dirty); + unpin_user_pages_dirty_lock(p, npages, dirty); if (mm) { /* during close after signal, mm can be NULL */ atomic64_sub(npages, &mm->pinned_vm); diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c index 8269ab040c21..78a48aea3faf 100644 --- a/drivers/infiniband/hw/mthca/mthca_memfree.c +++ b/drivers/infiniband/hw/mthca/mthca_memfree.c @@ -482,7 +482,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); if (ret < 0) { - put_user_page(pages[0]); + unpin_user_page(pages[0]); goto out; } @@ -490,7 +490,7 @@ int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar, mthca_uarc_virt(dev, uar, i)); if (ret) { pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); - put_user_page(sg_page(&db_tab->page[i].mem)); + unpin_user_page(sg_page(&db_tab->page[i].mem)); goto out; } @@ -556,7 +556,7 @@ void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar, if (db_tab->page[i].uvirt) { mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1); pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE); - put_user_page(sg_page(&db_tab->page[i].mem)); + 
unpin_user_page(sg_page(&db_tab->page[i].mem)); } } diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c index 7fc4b5f81fcd..342e3172ca40 100644 --- a/drivers/infiniband/hw/qib/qib_user_pages.c +++ b/drivers/infiniband/hw/qib/qib_user_pages.c @@ -40,7 +40,7 @@ static void __qib_release_user_pages(struct page **p, size_t num_pages, int dirty) { - put_user_pages_dirty_lock(p, num_pages, dirty); + unpin_user_pages_dirty_lock(p, num_pages, dirty); } /** diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index 1a3cc2957e3a..a67599b5a550 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -317,7 +317,7 @@ static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd, * the caller can ignore this page. */ if (put) { - put_user_page(page); + unpin_user_page(page); } else { /* coalesce case */ kunmap(page); @@ -631,7 +631,7 @@ static void qib_user_sdma_free_pkt_frag(struct device *dev, kunmap(pkt->addr[i].page); if (pkt->addr[i].put_page) - put_user_page(pkt->addr[i].page); + unpin_user_page(pkt->addr[i].page); else __free_page(pkt->addr[i].page); } else if (pkt->addr[i].kvaddr) { @@ -706,7 +706,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, /* if error, return all pages not managed by pkt */ free_pages: while (i < j) - put_user_page(pages[i++]); + unpin_user_page(pages[i++]); done: return ret; diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 600896727d34..bd9f944b68fc 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -75,7 +75,7 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) for_each_sg(chunk->page_list, sg, chunk->nents, i) { page = sg_page(sg); pa = sg_phys(sg); - put_user_pages_dirty_lock(&page, 1, dirty); + unpin_user_pages_dirty_lock(&page, 1, dirty); usnic_dbg("pa: %pa\n", &pa); } kfree(chunk); diff --git a/drivers/infiniband/sw/siw/siw_mem.c b/drivers/infiniband/sw/siw/siw_mem.c index e53b07dcfed5..e2061dc0b043 100644 --- a/drivers/infiniband/sw/siw/siw_mem.c +++ b/drivers/infiniband/sw/siw/siw_mem.c @@ -63,7 +63,7 @@ struct siw_mem *siw_mem_id2obj(struct siw_device *sdev, int stag_index) static void siw_free_plist(struct siw_page_chunk *chunk, int num_pages, bool dirty) { - put_user_pages_dirty_lock(chunk->plist, num_pages, dirty); + unpin_user_pages_dirty_lock(chunk->plist, num_pages, dirty); } void siw_umem_release(struct siw_umem *umem, bool dirty) diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 162a2633b1e3..13b65ed9e74c 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -349,8 +349,8 @@ int videobuf_dma_free(struct videobuf_dmabuf *dma) BUG_ON(dma->sglen); if (dma->pages) { - put_user_pages_dirty_lock(dma->pages, dma->nr_pages, - dma->direction == DMA_FROM_DEVICE); + unpin_user_pages_dirty_lock(dma->pages, dma->nr_pages, + dma->direction == DMA_FROM_DEVICE); kfree(dma->pages); dma->pages = NULL; } diff --git a/drivers/platform/goldfish/goldfish_pipe.c b/drivers/platform/goldfish/goldfish_pipe.c index 2a5901efecde..1ab207ec9c94 100644 --- a/drivers/platform/goldfish/goldfish_pipe.c +++ b/drivers/platform/goldfish/goldfish_pipe.c @@ -360,8 +360,8 @@ static int transfer_max_buffers(struct goldfish_pipe *pipe, *consumed_size = 
pipe->command_buffer->rw_params.consumed_size; - put_user_pages_dirty_lock(pipe->pages, pages_count, - !is_write && *consumed_size > 0); + unpin_user_pages_dirty_lock(pipe->pages, pages_count, + !is_write && *consumed_size > 0); mutex_unlock(&pipe->lock); return 0; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 18bfc2fc8e6d..a177bf2c6683 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -310,7 +310,7 @@ static int put_pfn(unsigned long pfn, int prot) if (!is_invalid_reserved_pfn(pfn)) { struct page *page = pfn_to_page(pfn); - put_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE); + unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE); return 1; } return 0; diff --git a/fs/io_uring.c b/fs/io_uring.c index 54f664e8b9b8..1806afddfea5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -6005,7 +6005,7 @@ static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; for (j = 0; j < imu->nr_bvecs; j++) - put_user_page(imu->bvec[j].bv_page); + unpin_user_page(imu->bvec[j].bv_page); if (ctx->account_mem) io_unaccount_mem(ctx->user, imu->nr_bvecs); @@ -6150,7 +6150,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, * release any pages we did get */ if (pret > 0) - put_user_pages(pages, pret); + unpin_user_pages(pages, pret); if (ctx->account_mem) io_unaccount_mem(ctx->user, nr_pages); kvfree(imu->bvec); diff --git a/include/linux/mm.h b/include/linux/mm.h index 79ca557349c6..fc543eb45de1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1039,27 +1039,27 @@ static inline void put_page(struct page *page) } /** - * put_user_page() - release a gup-pinned page + * unpin_user_page() - release a gup-pinned page * @page: pointer to page to be released * * Pages that were pinned via pin_user_pages*() must be released via either - * put_user_page(), or one of the put_user_pages*() routines. This is so that - * eventually such pages can be separately tracked and uniquely handled. In + * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so + * that eventually such pages can be separately tracked and uniquely handled. In * particular, interactions with RDMA and filesystems need special handling. * - * put_user_page() and put_page() are not interchangeable, despite this early - * implementation that makes them look the same. put_user_page() calls must + * unpin_user_page() and put_page() are not interchangeable, despite this early + * implementation that makes them look the same. unpin_user_page() calls must * be perfectly matched up with pin*() calls. 
*/ -static inline void put_user_page(struct page *page) +static inline void unpin_user_page(struct page *page) { put_page(page); } -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, - bool make_dirty); +void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty); -void put_user_pages(struct page **pages, unsigned long npages); +void unpin_user_pages(struct page **pages, unsigned long npages); #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS @@ -2590,7 +2590,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via put_user_page() */ +#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ /* * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each @@ -2625,7 +2625,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * Direct IO). This lets the filesystem know that some non-file-system entity is * potentially changing the pages' data. In contrast to FOLL_GET (whose pages * are released via put_page()), FOLL_PIN pages must be released, ultimately, by - * a call to put_user_page(). + * a call to unpin_user_page(). * * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different * and separate refcounting mechanisms, however, and that means that each has @@ -2633,7 +2633,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. * - * FOLL_PIN: pin_user_pages*() to acquire, and put_user_pages to release. + * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. * * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based @@ -2643,7 +2643,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never * directly by the caller. That's in order to help avoid mismatches when * releasing pages: get_user_pages*() pages must be released via put_page(), - * while pin_user_pages*() pages must be released via put_user_page(). + * while pin_user_pages*() pages must be released via unpin_user_page(). * * Please see Documentation/vm/pin_user_pages.rst for more information. */ diff --git a/mm/gup.c b/mm/gup.c index cc7e78f4a960..e13f4d211475 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -45,7 +45,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) } /** - * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages + * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages * @pages: array of pages to be maybe marked dirty, and definitely released. * @npages: number of pages in the @pages array. * @make_dirty: whether to mark the pages dirty @@ -55,19 +55,19 @@ static inline struct page *try_get_compound_head(struct page *page, int refs) * * For each page in the @pages array, make that page (or its head page, if a * compound page) dirty, if @make_dirty is true, and if the page was previously - * listed as clean. 
In any case, releases all pages using put_user_page(), - * possibly via put_user_pages(), for the non-dirty case. + * listed as clean. In any case, releases all pages using unpin_user_page(), + * possibly via unpin_user_pages(), for the non-dirty case. * - * Please see the put_user_page() documentation for details. + * Please see the unpin_user_page() documentation for details. * * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is * required, then the caller should a) verify that this is really correct, * because _lock() is usually required, and b) hand code it: - * set_page_dirty_lock(), put_user_page(). + * set_page_dirty_lock(), unpin_user_page(). * */ -void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, - bool make_dirty) +void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, + bool make_dirty) { unsigned long index; @@ -78,7 +78,7 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, */ if (!make_dirty) { - put_user_pages(pages, npages); + unpin_user_pages(pages, npages); return; } @@ -106,21 +106,21 @@ void put_user_pages_dirty_lock(struct page **pages, unsigned long npages, */ if (!PageDirty(page)) set_page_dirty_lock(page); - put_user_page(page); + unpin_user_page(page); } } -EXPORT_SYMBOL(put_user_pages_dirty_lock); +EXPORT_SYMBOL(unpin_user_pages_dirty_lock); /** - * put_user_pages() - release an array of gup-pinned pages. + * unpin_user_pages() - release an array of gup-pinned pages. * @pages: array of pages to be marked dirty and released. * @npages: number of pages in the @pages array. * - * For each page in the @pages array, release the page using put_user_page(). + * For each page in the @pages array, release the page using unpin_user_page(). * - * Please see the put_user_page() documentation for details. + * Please see the unpin_user_page() documentation for details. */ -void put_user_pages(struct page **pages, unsigned long npages) +void unpin_user_pages(struct page **pages, unsigned long npages) { unsigned long index; @@ -130,9 +130,9 @@ void put_user_pages(struct page **pages, unsigned long npages) * single operation to the head page should suffice. 
*/ for (index = 0; index < npages; index++) - put_user_page(pages[index]); + unpin_user_page(pages[index]); } -EXPORT_SYMBOL(put_user_pages); +EXPORT_SYMBOL(unpin_user_pages); #ifdef CONFIG_MMU static struct page *no_page_table(struct vm_area_struct *vma, diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index fd20ab675b85..de41e830cdac 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -126,8 +126,8 @@ static int process_vm_rw_single_vec(unsigned long addr, pa += pinned_pages * PAGE_SIZE; /* If vm_write is set, the pages need to be made dirty: */ - put_user_pages_dirty_lock(process_pages, pinned_pages, - vm_write); + unpin_user_pages_dirty_lock(process_pages, pinned_pages, + vm_write); } return rc; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b17ce9a5534d..fa7bb5e060d0 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -212,7 +212,7 @@ static int xdp_umem_map_pages(struct xdp_umem *umem) static void xdp_umem_unpin_pages(struct xdp_umem *umem) { - put_user_pages_dirty_lock(umem->pgs, umem->npgs, true); + unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); kfree(umem->pgs); umem->pgs = NULL; -- cgit v1.2.3 From 3f9903b9ca5e981b5862d7b10086d0e8caa20298 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 30 Jan 2020 22:14:01 -0800 Subject: mm: remove the memory isolate notifier Luckily, we have no users left, so we can get rid of it. Cleanup set_migratetype_isolate() a little bit. Link: http://lkml.kernel.org/r/20191114131911.11783-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Greg Kroah-Hartman Acked-by: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Pavel Tatashin Cc: Dan Williams Cc: Oscar Salvador Cc: Qian Cai Cc: Anshuman Khandual Cc: Pingfan Liu Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 19 ------------------- include/linux/memory.h | 27 --------------------------- mm/page_isolation.c | 38 ++++---------------------------------- 3 files changed, 4 insertions(+), 80 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 799b43191dea..064643927827 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -70,20 +70,6 @@ void unregister_memory_notifier(struct notifier_block *nb) } EXPORT_SYMBOL(unregister_memory_notifier); -static ATOMIC_NOTIFIER_HEAD(memory_isolate_chain); - -int register_memory_isolate_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_register(&memory_isolate_chain, nb); -} -EXPORT_SYMBOL(register_memory_isolate_notifier); - -void unregister_memory_isolate_notifier(struct notifier_block *nb) -{ - atomic_notifier_chain_unregister(&memory_isolate_chain, nb); -} -EXPORT_SYMBOL(unregister_memory_isolate_notifier); - static void memory_block_release(struct device *dev) { struct memory_block *mem = to_memory_block(dev); @@ -175,11 +161,6 @@ int memory_notify(unsigned long val, void *v) return blocking_notifier_call_chain(&memory_chain, val, v); } -int memory_isolate_notify(unsigned long val, void *v) -{ - return atomic_notifier_call_chain(&memory_isolate_chain, val, v); -} - /* * The probe routines leave the pages uninitialized, just as the bootmem code * does. 
Make sure we do not access them, but instead use only information from diff --git a/include/linux/memory.h b/include/linux/memory.h index 4c75dae8dd29..de6bccb28f07 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -55,19 +55,6 @@ struct memory_notify { int status_change_nid; }; -/* - * During pageblock isolation, count the number of pages within the - * range [start_pfn, start_pfn + nr_pages) which are owned by code - * in the notifier chain. - */ -#define MEM_ISOLATE_COUNT (1<<0) - -struct memory_isolate_notify { - unsigned long start_pfn; /* Start of range to check */ - unsigned int nr_pages; /* # pages in range to check */ - unsigned int pages_found; /* # pages owned found by callbacks */ -}; - struct notifier_block; struct mem_section; @@ -94,27 +81,13 @@ static inline int memory_notify(unsigned long val, void *v) { return 0; } -static inline int register_memory_isolate_notifier(struct notifier_block *nb) -{ - return 0; -} -static inline void unregister_memory_isolate_notifier(struct notifier_block *nb) -{ -} -static inline int memory_isolate_notify(unsigned long val, void *v) -{ - return 0; -} #else extern int register_memory_notifier(struct notifier_block *nb); extern void unregister_memory_notifier(struct notifier_block *nb); -extern int register_memory_isolate_notifier(struct notifier_block *nb); -extern void unregister_memory_isolate_notifier(struct notifier_block *nb); int create_memory_block_devices(unsigned long start, unsigned long size); void remove_memory_block_devices(unsigned long start, unsigned long size); extern void memory_dev_init(void); extern int memory_notify(unsigned long val, void *v); -extern int memory_isolate_notify(unsigned long val, void *v); extern struct memory_block *find_memory_block(struct mem_section *); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04ee1663cdbe..21af88b718aa 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -18,9 +18,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) { struct zone *zone; - unsigned long flags, pfn; - struct memory_isolate_notify arg; - int notifier_ret; + unsigned long flags; int ret = -EBUSY; zone = page_zone(page); @@ -35,41 +33,11 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ if (is_migrate_isolate_page(page)) goto out; - pfn = page_to_pfn(page); - arg.start_pfn = pfn; - arg.nr_pages = pageblock_nr_pages; - arg.pages_found = 0; - - /* - * It may be possible to isolate a pageblock even if the - * migratetype is not MIGRATE_MOVABLE. The memory isolation - * notifier chain is used by balloon drivers to return the - * number of pages in a range that are held by the balloon - * driver to shrink memory. If all the pages are accounted for - * by balloons, are free, or on the LRU, isolation can continue. - * Later, for example, when memory hotplug notifier runs, these - * pages reported as "can be isolated" should be isolated(freed) - * by the balloon driver through the memory notifier chain. - */ - notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); - notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret) - goto out; /* * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. 
*/ - if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, - isol_flags)) - ret = 0; - - /* - * immobile means "not-on-lru" pages. If immobile is larger than - * removable-by-driver pages reported by notifier, we'll fail. - */ - -out: - if (!ret) { + if (!has_unmovable_pages(zone, page, 0, migratetype, isol_flags)) { unsigned long nr_pages; int mt = get_pageblock_migratetype(page); @@ -79,8 +47,10 @@ out: NULL); __mod_zone_freepage_state(zone, -nr_pages, mt); + ret = 0; } +out: spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(zone); -- cgit v1.2.3 From fe4c86c916d9151113372369f322e7436167e6f3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 30 Jan 2020 22:14:04 -0800 Subject: mm: remove "count" parameter from has_unmovable_pages() Now that the memory isolate notifier is gone, the parameter is always 0. Drop it and cleanup has_unmovable_pages(). Link: http://lkml.kernel.org/r/20191114131911.11783-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Michal Hocko Cc: Oscar Salvador Cc: Anshuman Khandual Cc: Qian Cai Cc: Pingfan Liu Cc: Stephen Rothwell Cc: Dan Williams Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Mel Gorman Cc: Mike Rapoport Cc: Wei Yang Cc: Alexander Duyck Cc: Alexander Potapenko Cc: Arun KS Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 4 ++-- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 21 +++++++-------------- mm/page_isolation.c | 2 +- 4 files changed, 11 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 6861df759fad..148e65a9c606 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,8 +33,8 @@ static inline bool is_migrate_isolate(int migratetype) #define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 -bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - int migratetype, int flags); +bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, + int flags); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0ddff29079c3..c3dc6b27fd6d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1182,7 +1182,7 @@ static bool is_pageblock_removable_nolock(unsigned long pfn) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, + return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE, MEMORY_OFFLINE); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe8a529f7fd8..3aec60ddd6d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8180,17 +8180,15 @@ void *__init alloc_large_system_hash(const char *tablename, /* * This function checks whether pageblock includes unmovable pages or not. - * If @count is not zero, it is okay to include less @count unmovable pages * * PageLRU check without isolation or lru_lock could race so that * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable * check without lock_page also may miss some movable non-lru pages at * race condition. So you can't expect this function should be exact. 
*/ -bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - int migratetype, int flags) +bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, + int flags) { - unsigned long found; unsigned long iter = 0; unsigned long pfn = page_to_pfn(page); const char *reason = "unmovable page"; @@ -8216,13 +8214,11 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, goto unmovable; } - for (found = 0; iter < pageblock_nr_pages; iter++) { - unsigned long check = pfn + iter; - - if (!pfn_valid_within(check)) + for (; iter < pageblock_nr_pages; iter++) { + if (!pfn_valid_within(pfn + iter)) continue; - page = pfn_to_page(check); + page = pfn_to_page(pfn + iter); if (PageReserved(page)) goto unmovable; @@ -8271,11 +8267,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if ((flags & MEMORY_OFFLINE) && PageHWPoison(page)) continue; - if (__PageMovable(page)) + if (__PageMovable(page) || PageLRU(page)) continue; - if (!PageLRU(page)) - found++; /* * If there are RECLAIMABLE pages, we need to check * it. But now, memory offline itself doesn't call @@ -8289,8 +8283,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, * is set to both of a memory hole page and a _used_ kernel * page at boot. */ - if (found > count) - goto unmovable; + goto unmovable; } return false; unmovable: diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 21af88b718aa..1f8b9dfecbe8 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -37,7 +37,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, 0, migratetype, isol_flags)) { + if (!has_unmovable_pages(zone, page, migratetype, isol_flags)) { unsigned long nr_pages; int mt = get_pageblock_migratetype(page); -- cgit v1.2.3 From 02634a44b8aba2d4f16ea09d3c17400d9320327e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 30 Jan 2020 22:14:20 -0800 Subject: mm/memblock: define memblock_physmem_add() On the s390 platform memblock.physmem array is being built by directly calling into memblock_add_range() which is a low level function not intended to be used outside of memblock. Hence lets conditionally add helper functions for physmem array when HAVE_MEMBLOCK_PHYS_MAP is enabled. Also use MAX_NUMNODES instead of 0 as node ID similar to memblock_add() and memblock_reserve(). Make memblock_add_range() a static function as it is no longer getting used outside of memblock. 
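As a rough illustration, a hypothetical platform selecting HAVE_MEMBLOCK_PHYS_MAP would now register a detected range through the public helpers instead of reaching into memblock_add_range() (a sketch mirroring the s390 change in this patch; register_detected_block() is a made-up name):

    static void __init register_detected_block(phys_addr_t start, phys_addr_t end)
    {
            /* add to memblock.memory as before */
            memblock_add(start, end - start);
            /* new helper, adds the range to memblock.physmem */
            memblock_physmem_add(start, end - start);
    }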
Link: http://lkml.kernel.org/r/1578283835-21969-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Mike Rapoport Acked-by: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Martin Schwidefsky Cc: Collin Walling Cc: Gerald Schaefer Cc: Philipp Rudo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/kernel/setup.c | 12 +++--------- include/linux/memblock.h | 7 +++---- mm/memblock.c | 14 +++++++++++++- 3 files changed, 19 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 87a467dff5eb..7d113e208f65 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -759,14 +759,6 @@ static void __init free_mem_detect_info(void) memblock_free(start, size); } -static void __init memblock_physmem_add(phys_addr_t start, phys_addr_t size) -{ - memblock_dbg("memblock_physmem_add: [%#016llx-%#016llx]\n", - start, start + size - 1); - memblock_add_range(&memblock.memory, start, size, 0, 0); - memblock_add_range(&memblock.physmem, start, size, 0, 0); -} - static const char * __init get_mem_info_source(void) { switch (mem_detect.info_source) { @@ -791,8 +783,10 @@ static void __init memblock_add_mem_detect_info(void) get_mem_info_source(), mem_detect.info_source); /* keep memblock lists close to the kernel */ memblock_set_bottom_up(true); - for_each_mem_detect_block(i, &start, &end) + for_each_mem_detect_block(i, &start, &end) { + memblock_add(start, end - start); memblock_physmem_add(start, end - start); + } memblock_set_bottom_up(false); memblock_dump_all(); } diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b38bbefabfab..079d17d96410 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -113,6 +113,9 @@ int memblock_add(phys_addr_t base, phys_addr_t size); int memblock_remove(phys_addr_t base, phys_addr_t size); int memblock_free(phys_addr_t base, phys_addr_t size); int memblock_reserve(phys_addr_t base, phys_addr_t size); +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +int memblock_physmem_add(phys_addr_t base, phys_addr_t size); +#endif void memblock_trim_memory(phys_addr_t align); bool memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); @@ -127,10 +130,6 @@ void reset_node_managed_pages(pg_data_t *pgdat); void reset_all_zones_managed_pages(void); /* Low level functions */ -int memblock_add_range(struct memblock_type *type, - phys_addr_t base, phys_addr_t size, - int nid, enum memblock_flags flags); - void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags, struct memblock_type *type_a, struct memblock_type *type_b, phys_addr_t *out_start, diff --git a/mm/memblock.c b/mm/memblock.c index 4bc2c7d8bf42..fc0d4db1d646 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -575,7 +575,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type, * Return: * 0 on success, -errno on failure. 
*/ -int __init_memblock memblock_add_range(struct memblock_type *type, +static int __init_memblock memblock_add_range(struct memblock_type *type, phys_addr_t base, phys_addr_t size, int nid, enum memblock_flags flags) { @@ -830,6 +830,18 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0); } +#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP +int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size) +{ + phys_addr_t end = base + size - 1; + + memblock_dbg("%s: [%pa-%pa] %pS\n", __func__, + &base, &end, (void *)_RET_IP_); + + return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0); +} +#endif + /** * memblock_setclr_flag - set or clear flag for a memory region * @base: base address of the region -- cgit v1.2.3 From bd5c2344f9eb1ebf7ff2501ddb13d83151939780 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 30 Jan 2020 22:14:54 -0800 Subject: mm/memory_hotplug: pass in nid to online_pages() Patch series "mm/memory_hotplug: pass in nid to online_pages()". Simplify onlining code and get rid of find_memory_block(). Pass in the nid from the memory block we are trying to online directly, instead of manually looking it up. This patch (of 2): No need to lookup the memory block, we can directly pass in the nid. Link: http://lkml.kernel.org/r/20200113113354.6341-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Andrew Morton Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Cc: Anshuman Khandual Cc: Dan Williams Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 6 +++--- include/linux/memory_hotplug.h | 3 ++- mm/memory_hotplug.c | 13 ++----------- 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 064643927827..15659306ad69 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -206,7 +206,7 @@ static bool pages_correctly_probed(unsigned long start_pfn) */ static int memory_block_action(unsigned long start_section_nr, unsigned long action, - int online_type) + int online_type, int nid) { unsigned long start_pfn; unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block; @@ -219,7 +219,7 @@ memory_block_action(unsigned long start_section_nr, unsigned long action, if (!pages_correctly_probed(start_pfn)) return -EBUSY; - ret = online_pages(start_pfn, nr_pages, online_type); + ret = online_pages(start_pfn, nr_pages, online_type, nid); break; case MEM_OFFLINE: ret = offline_pages(start_pfn, nr_pages); @@ -245,7 +245,7 @@ static int memory_block_change_state(struct memory_block *mem, mem->state = MEM_GOING_OFFLINE; ret = memory_block_action(mem->start_section_nr, to_state, - mem->online_type); + mem->online_type, mem->nid); mem->state = ret ? 
from_state_req : to_state; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ba0dca6aac6e..ffa6ad12d84a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -94,7 +94,8 @@ extern int zone_grow_free_lists(struct zone *zone, unsigned long new_nr_pages); extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ -extern int online_pages(unsigned long, unsigned long, int); +extern int online_pages(unsigned long pfn, unsigned long nr_pages, + int online_type, int nid); extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, unsigned long *valid_start, unsigned long *valid_end); extern unsigned long __offline_isolated_pages(unsigned long start_pfn, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c3dc6b27fd6d..36d80915ddc2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -783,27 +783,18 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn, return default_zone_for_pfn(nid, start_pfn, nr_pages); } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, + int online_type, int nid) { unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; - int nid; int ret; struct memory_notify arg; - struct memory_block *mem; mem_hotplug_begin(); - /* - * We can't use pfn_to_nid() because nid might be stored in struct page - * which is not yet initialized. Instead, we find nid from memory block. - */ - mem = find_memory_block(__pfn_to_section(pfn)); - nid = mem->nid; - put_device(&mem->dev); - /* associate pfn range with the zone */ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages); move_pfn_range_to_zone(zone, pfn, nr_pages, NULL); -- cgit v1.2.3 From 4a55c0474a92d5c418bcbbe122368de0910aeac2 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Thu, 30 Jan 2020 22:14:57 -0800 Subject: mm/hotplug: silence a lockdep splat with printk() It is not that hard to trigger lockdep splats by calling printk from under zone->lock. Most of them are false positives caused by lock chains introduced early in the boot process and they do not cause any real problems (although most of the early boot lock dependencies could happen after boot as well). There are some console drivers which do allocate from the printk context as well and those should be fixed. In any case, false positives are not that trivial to workaround and it is far from optimal to lose lockdep functionality for something that is a non-issue. So change has_unmovable_pages() so that it no longer calls dump_page() itself - instead it returns a "struct page *" of the unmovable page back to the caller so that in the case of a has_unmovable_pages() failure, the caller can call dump_page() after releasing zone->lock. Also, make dump_page() is able to report a CMA page as well, so the reason string from has_unmovable_pages() can be removed. Even though has_unmovable_pages doesn't hold any reference to the returned page this should be reasonably safe for the purpose of reporting the page (dump_page) because it cannot be hotremoved in the context of memory unplug. The state of the page might change but that is the case even with the existing code as zone->lock only plays role for free pages. 
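A minimal sketch of the deferred-reporting pattern described above (illustrative only; the actual change lands in set_migratetype_isolate() further down):

    struct page *unmovable;

    spin_lock_irqsave(&zone->lock, flags);
    unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags);
    if (!unmovable)
            ret = 0;        /* pageblock can be isolated */
    spin_unlock_irqrestore(&zone->lock, flags);

    /* report only after zone->lock is dropped, so printk() cannot deadlock */
    if (ret && (isol_flags & REPORT_FAILURE) && unmovable)
            dump_page(unmovable, "unmovable page");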
While at it, remove a similar but unnecessary debug-only printk() as well. A sample of one of those lockdep splats is, WARNING: possible circular locking dependency detected ------------------------------------------------------ test.sh/8653 is trying to acquire lock: ffffffff865a4460 (console_owner){-.-.}, at: console_unlock+0x207/0x750 but task is already holding lock: ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at: __offline_isolated_pages+0x179/0x3e0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&(&zone->lock)->rlock){-.-.}: __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 _raw_spin_lock+0x2f/0x40 rmqueue_bulk.constprop.21+0xb6/0x1160 get_page_from_freelist+0x898/0x22c0 __alloc_pages_nodemask+0x2f3/0x1cd0 alloc_pages_current+0x9c/0x110 allocate_slab+0x4c6/0x19c0 new_slab+0x46/0x70 ___slab_alloc+0x58b/0x960 __slab_alloc+0x43/0x70 __kmalloc+0x3ad/0x4b0 __tty_buffer_request_room+0x100/0x250 tty_insert_flip_string_fixed_flag+0x67/0x110 pty_write+0xa2/0xf0 n_tty_write+0x36b/0x7b0 tty_write+0x284/0x4c0 __vfs_write+0x50/0xa0 vfs_write+0x105/0x290 redirected_tty_write+0x6a/0xc0 do_iter_write+0x248/0x2a0 vfs_writev+0x106/0x1e0 do_writev+0xd4/0x180 __x64_sys_writev+0x45/0x50 do_syscall_64+0xcc/0x76c entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #2 (&(&port->lock)->rlock){-.-.}: __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 _raw_spin_lock_irqsave+0x3a/0x50 tty_port_tty_get+0x20/0x60 tty_port_default_wakeup+0xf/0x30 tty_port_tty_wakeup+0x39/0x40 uart_write_wakeup+0x2a/0x40 serial8250_tx_chars+0x22e/0x440 serial8250_handle_irq.part.8+0x14a/0x170 serial8250_default_handle_irq+0x5c/0x90 serial8250_interrupt+0xa6/0x130 __handle_irq_event_percpu+0x78/0x4f0 handle_irq_event_percpu+0x70/0x100 handle_irq_event+0x5a/0x8b handle_edge_irq+0x117/0x370 do_IRQ+0x9e/0x1e0 ret_from_intr+0x0/0x2a cpuidle_enter_state+0x156/0x8e0 cpuidle_enter+0x41/0x70 call_cpuidle+0x5e/0x90 do_idle+0x333/0x370 cpu_startup_entry+0x1d/0x1f start_secondary+0x290/0x330 secondary_startup_64+0xb6/0xc0 -> #1 (&port_lock_key){-.-.}: __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 _raw_spin_lock_irqsave+0x3a/0x50 serial8250_console_write+0x3e4/0x450 univ8250_console_write+0x4b/0x60 console_unlock+0x501/0x750 vprintk_emit+0x10d/0x340 vprintk_default+0x1f/0x30 vprintk_func+0x44/0xd4 printk+0x9f/0xc5 -> #0 (console_owner){-.-.}: check_prev_add+0x107/0xea0 validate_chain+0x8fc/0x1200 __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 console_unlock+0x269/0x750 vprintk_emit+0x10d/0x340 vprintk_default+0x1f/0x30 vprintk_func+0x44/0xd4 printk+0x9f/0xc5 __offline_isolated_pages.cold.52+0x2f/0x30a offline_isolated_pages_cb+0x17/0x30 walk_system_ram_range+0xda/0x160 __offline_pages+0x79c/0xa10 offline_pages+0x11/0x20 memory_subsys_offline+0x7e/0xc0 device_offline+0xd5/0x110 state_store+0xc6/0xe0 dev_attr_store+0x3f/0x60 sysfs_kf_write+0x89/0xb0 kernfs_fop_write+0x188/0x240 __vfs_write+0x50/0xa0 vfs_write+0x105/0x290 ksys_write+0xc6/0x160 __x64_sys_write+0x43/0x50 do_syscall_64+0xcc/0x76c entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Chain exists of: console_owner --> &(&port->lock)->rlock --> &(&zone->lock)->rlock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&(&zone->lock)->rlock); lock(&(&port->lock)->rlock); lock(&(&zone->lock)->rlock); lock(console_owner); *** DEADLOCK *** 9 locks held by test.sh/8653: #0: ffff88839ba7d408 (sb_writers#4){.+.+}, at: vfs_write+0x25f/0x290 #1: ffff888277618880 (&of->mutex){+.+.}, at: 
kernfs_fop_write+0x128/0x240 #2: ffff8898131fc218 (kn->count#115){.+.+}, at: kernfs_fop_write+0x138/0x240 #3: ffffffff86962a80 (device_hotplug_lock){+.+.}, at: lock_device_hotplug_sysfs+0x16/0x50 #4: ffff8884374f4990 (&dev->mutex){....}, at: device_offline+0x70/0x110 #5: ffffffff86515250 (cpu_hotplug_lock.rw_sem){++++}, at: __offline_pages+0xbf/0xa10 #6: ffffffff867405f0 (mem_hotplug_lock.rw_sem){++++}, at: percpu_down_write+0x87/0x2f0 #7: ffff88883fff3c58 (&(&zone->lock)->rlock){-.-.}, at: __offline_isolated_pages+0x179/0x3e0 #8: ffffffff865a4920 (console_lock){+.+.}, at: vprintk_emit+0x100/0x340 stack backtrace: Hardware name: HPE ProLiant DL560 Gen10/ProLiant DL560 Gen10, BIOS U34 05/21/2019 Call Trace: dump_stack+0x86/0xca print_circular_bug.cold.31+0x243/0x26e check_noncircular+0x29e/0x2e0 check_prev_add+0x107/0xea0 validate_chain+0x8fc/0x1200 __lock_acquire+0x5b3/0xb40 lock_acquire+0x126/0x280 console_unlock+0x269/0x750 vprintk_emit+0x10d/0x340 vprintk_default+0x1f/0x30 vprintk_func+0x44/0xd4 printk+0x9f/0xc5 __offline_isolated_pages.cold.52+0x2f/0x30a offline_isolated_pages_cb+0x17/0x30 walk_system_ram_range+0xda/0x160 __offline_pages+0x79c/0xa10 offline_pages+0x11/0x20 memory_subsys_offline+0x7e/0xc0 device_offline+0xd5/0x110 state_store+0xc6/0xe0 dev_attr_store+0x3f/0x60 sysfs_kf_write+0x89/0xb0 kernfs_fop_write+0x188/0x240 __vfs_write+0x50/0xa0 vfs_write+0x105/0x290 ksys_write+0xc6/0x160 __x64_sys_write+0x43/0x50 do_syscall_64+0xcc/0x76c entry_SYSCALL_64_after_hwframe+0x49/0xbe Link: http://lkml.kernel.org/r/20200117181200.20299-1-cai@lca.pw Signed-off-by: Qian Cai Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Sergey Senozhatsky Cc: Petr Mladek Cc: Steven Rostedt (VMware) Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 4 ++-- mm/debug.c | 10 +++++++++- mm/page_alloc.c | 23 ++++++++++------------- mm/page_isolation.c | 11 ++++++++++- 4 files changed, 31 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 148e65a9c606..572458016331 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,8 +33,8 @@ static inline bool is_migrate_isolate(int migratetype) #define MEMORY_OFFLINE 0x1 #define REPORT_FAILURE 0x2 -bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, - int flags); +struct page *has_unmovable_pages(struct zone *zone, struct page *page, + int migratetype, int flags); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); diff --git a/mm/debug.c b/mm/debug.c index 4a9ded0d3504..ecccd9f17801 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -46,6 +46,13 @@ void __dump_page(struct page *page, const char *reason) { struct address_space *mapping; bool page_poisoned = PagePoisoned(page); + /* + * Accessing the pageblock without the zone lock. It could change to + * "isolate" again in the meantime, but since we are just dumping the + * state for debugging, it should be fine to accept a bit of + * inaccuracy here due to racing. 
+ */ + bool page_cma = is_migrate_cma_page(page); int mapcount; char *type = ""; @@ -92,7 +99,8 @@ void __dump_page(struct page *page, const char *reason) } BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); - pr_warn("%sflags: %#lx(%pGp)\n", type, page->flags, &page->flags); + pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags, + page_cma ? " CMA" : ""); hex_only: print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3aec60ddd6d6..2a1a816c7992 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8185,13 +8185,17 @@ void *__init alloc_large_system_hash(const char *tablename, * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable * check without lock_page also may miss some movable non-lru pages at * race condition. So you can't expect this function should be exact. + * + * Returns a page without holding a reference. If the caller wants to + * dereference that page (e.g., dumping), it has to make sure that that it + * cannot get removed (e.g., via memory unplug) concurrently. + * */ -bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, - int flags) +struct page *has_unmovable_pages(struct zone *zone, struct page *page, + int migratetype, int flags) { unsigned long iter = 0; unsigned long pfn = page_to_pfn(page); - const char *reason = "unmovable page"; /* * TODO we could make this much more efficient by not checking every @@ -8208,9 +8212,8 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, * so consider them movable here. */ if (is_migrate_cma(migratetype)) - return false; + return NULL; - reason = "CMA page"; goto unmovable; } @@ -8285,12 +8288,10 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int migratetype, */ goto unmovable; } - return false; + return NULL; unmovable: WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE); - if (flags & REPORT_FAILURE) - dump_page(pfn_to_page(pfn + iter), reason); - return true; + return pfn_to_page(pfn + iter); } #ifdef CONFIG_CONTIG_ALLOC @@ -8694,10 +8695,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) BUG_ON(!PageBuddy(page)); order = page_order(page); offlined_pages += 1 << order; -#ifdef CONFIG_DEBUG_VM - pr_info("remove from free list %lx %d %lx\n", - pfn, 1 << order, end_pfn); -#endif del_page_from_free_area(page, &zone->free_area[order]); pfn += (1 << order); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 1f8b9dfecbe8..e70586523ca3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -17,6 +17,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags) { + struct page *unmovable = NULL; struct zone *zone; unsigned long flags; int ret = -EBUSY; @@ -37,7 +38,8 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, migratetype, isol_flags)) { + unmovable = has_unmovable_pages(zone, page, migratetype, isol_flags); + if (!unmovable) { unsigned long nr_pages; int mt = get_pageblock_migratetype(page); @@ -54,6 +56,13 @@ out: spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(zone); + else if ((isol_flags & REPORT_FAILURE) && unmovable) + /* + * printk() with zone->lock held will guarantee to trigger a + * lockdep splat, so defer it here. 
+ */ + dump_page(unmovable, "unmovable page"); + return ret; } -- cgit v1.2.3 From 26b56e116a69e70cc13976a1b0b818036f539f53 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Thu, 30 Jan 2020 22:15:10 -0800 Subject: include/linux/mm.h: clean up obsolete check on space in page->flags The check was intended to make sure we don't overrun page flags. But it's obsolete because it doesn't include LAST_CPUPID_WIDTH nor KASAN_TAG_WIDTH. Just remove check since we already have it covered in linux/page-flags-layout.h (near the end of the file). Link: http://lkml.kernel.org/r/20191208183508.89177-1-yuzhao@google.com Signed-off-by: Yu Zhao Reviewed-by: David Hildenbrand Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index fc543eb45de1..ddfc217bc026 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -916,10 +916,6 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) -#if SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS -#error SECTIONS_WIDTH+NODES_WIDTH+ZONES_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS -#endif - #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) -- cgit v1.2.3 From ca023a92c8f753a78c37cc8290bd8e3c54f1a936 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 30 Jan 2020 22:15:13 -0800 Subject: include/linux/mm.h: remove dead code totalram_pages_set() totalram_pages_set() was introduced in commit ca79b0c211af ("mm: convert totalram_pages and totalhigh_pages variables to atomic"), but no one uses it. Link: http://lkml.kernel.org/r/20191218005543.24146-1-richardw.yang@linux.intel.com Signed-off-by: Wei Yang Reviewed-by: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index ddfc217bc026..73a044ed6981 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -70,11 +70,6 @@ static inline void totalram_pages_add(long count) atomic_long_add(count, &_totalram_pages); } -static inline void totalram_pages_set(long val) -{ - atomic_long_set(&_totalram_pages, val); -} - extern void * high_memory; extern int page_cluster; -- cgit v1.2.3 From 068964541db6dbacce26e75b390e87e63ad1c100 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 30 Jan 2020 22:15:16 -0800 Subject: include/linux/memory.h: drop fields 'hw' and 'phys_callback' from struct memory_block memory_block structure elements 'hw' and 'phys_callback' are not getting used. This was originally added with commit 3947be1969a9 ("[PATCH] memory hotplug: sysfs and add/remove functions") but never seem to have been used. Just drop them now. 
Link: http://lkml.kernel.org/r/1576728650-13867-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Dan Williams Reviewed-by: David Hildenbrand Cc: Michal Hocko Cc: Pavel Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory.h b/include/linux/memory.h index de6bccb28f07..0b8d791b6669 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -29,8 +29,6 @@ struct memory_block { int section_count; /* serialized by mem_sysfs_mutex */ int online_type; /* for passing data to online routine */ int phys_device; /* to which fru does this belong? */ - void *hw; /* optional pointer to fw/hw data */ - int (*phys_callback)(struct memory_block *); struct device dev; int nid; /* NID for this memory block */ }; -- cgit v1.2.3 From 0a3c57729768e08de690ba872dd52ee9c3c11e8b Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Thu, 30 Jan 2020 22:15:19 -0800 Subject: mm: fix comments related to node reclaim As zone reclaim has been replaced by node reclaim, this patch fixes related comments. Link: http://lkml.kernel.org/r/20191126141346.GA22665@haolee.github.io Signed-off-by: Hao Lee Reviewed-by: Andrew Morton Cc: Anshuman Khandual Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 2 +- include/uapi/linux/sysctl.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5334ad8fc7bd..c2bc309d1634 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -758,7 +758,7 @@ typedef struct pglist_data { #ifdef CONFIG_NUMA /* - * zone reclaim becomes active if more unmapped pages exist. + * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 87aa2a6d9125..27c1ed2822e6 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -195,7 +195,7 @@ enum VM_MIN_UNMAPPED=32, /* Set min percent of unmapped pages */ VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ - VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ + VM_MIN_SLAB=35, /* Percent pages ignored by node reclaim */ }; -- cgit v1.2.3 From 23331e4893614deb555c65cdf115c8a28ed32471 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 30 Jan 2020 22:15:28 -0800 Subject: include/linux/units.h: add helpers for kelvin to/from Celsius conversion Patch series "add header file for kelvin to/from Celsius conversion helpers", v4. There are several helper macros to convert kelvin to/from Celsius in for thermal drivers. These are useful for any other drivers or subsystems, but it's odd to include just for the helpers. This adds a new that provides the equivalent inline functions for any drivers or subsystems, and switches all the users of conversion helpers in to use helpers. This patch (of 12): There are several helper macros to convert kelvin to/from Celsius in for thermal drivers. These are useful for any other drivers or subsystems, but it's odd to include just for the helpers. This adds a new that provides the equivalent inline functions for any drivers or subsystems. It is intended to replace the helpers in . 
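As an illustration of the new helpers (a hedged sketch with made-up values; the helper names come straight from the header added below):

    #include <linux/units.h>

    /* a sensor reading of 3002 deci-kelvin, i.e. 300.2 K */
    long dk = 3002;

    long c    = deci_kelvin_to_celsius(dk);        /* 27 degrees Celsius */
    long mc   = deci_kelvin_to_millicelsius(dk);   /* 27050 millidegrees Celsius */
    long back = celsius_to_deci_kelvin(c);         /* and back to deci-kelvin */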
Link: http://lkml.kernel.org/r/1576386975-7941-2-git-send-email-akinobu.mita@gmail.com Signed-off-by: Akinobu Mita Reviewed-by: Andy Shevchenko Cc: Sujith Thomas Cc: Darren Hart Cc: Zhang Rui Cc: Daniel Lezcano Cc: Amit Kucheria Cc: Jean Delvare Cc: Guenter Roeck Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Kalle Valo Cc: Stanislaw Gruszka Cc: Johannes Berg Cc: Emmanuel Grumbach Cc: Luca Coelho Cc: Jonathan Cameron Cc: Hartmut Knaack Cc: Lars-Peter Clausen Cc: Peter Meerwald-Stadler Cc: Andy Shevchenko Cc: Jonathan Cameron Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) create mode 100644 include/linux/units.h (limited to 'include/linux') diff --git a/include/linux/units.h b/include/linux/units.h new file mode 100644 index 000000000000..aaf716364ec3 --- /dev/null +++ b/include/linux/units.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_UNITS_H +#define _LINUX_UNITS_H + +#include + +#define ABSOLUTE_ZERO_MILLICELSIUS -273150 + +static inline long milli_kelvin_to_millicelsius(long t) +{ + return t + ABSOLUTE_ZERO_MILLICELSIUS; +} + +static inline long millicelsius_to_milli_kelvin(long t) +{ + return t - ABSOLUTE_ZERO_MILLICELSIUS; +} + +#define MILLIDEGREE_PER_DEGREE 1000 +#define MILLIDEGREE_PER_DECIDEGREE 100 + +static inline long kelvin_to_millicelsius(long t) +{ + return milli_kelvin_to_millicelsius(t * MILLIDEGREE_PER_DEGREE); +} + +static inline long millicelsius_to_kelvin(long t) +{ + t = millicelsius_to_milli_kelvin(t); + + return DIV_ROUND_CLOSEST(t, MILLIDEGREE_PER_DEGREE); +} + +static inline long deci_kelvin_to_celsius(long t) +{ + t = milli_kelvin_to_millicelsius(t * MILLIDEGREE_PER_DECIDEGREE); + + return DIV_ROUND_CLOSEST(t, MILLIDEGREE_PER_DEGREE); +} + +static inline long celsius_to_deci_kelvin(long t) +{ + t = millicelsius_to_milli_kelvin(t * MILLIDEGREE_PER_DEGREE); + + return DIV_ROUND_CLOSEST(t, MILLIDEGREE_PER_DECIDEGREE); +} + +/** + * deci_kelvin_to_millicelsius_with_offset - convert Kelvin to Celsius + * @t: temperature value in decidegrees Kelvin + * @offset: difference between Kelvin and Celsius in millidegrees + * + * Return: temperature value in millidegrees Celsius + */ +static inline long deci_kelvin_to_millicelsius_with_offset(long t, long offset) +{ + return t * MILLIDEGREE_PER_DECIDEGREE - offset; +} + +static inline long deci_kelvin_to_millicelsius(long t) +{ + return milli_kelvin_to_millicelsius(t * MILLIDEGREE_PER_DECIDEGREE); +} + +static inline long millicelsius_to_deci_kelvin(long t) +{ + t = millicelsius_to_milli_kelvin(t); + + return DIV_ROUND_CLOSEST(t, MILLIDEGREE_PER_DECIDEGREE); +} + +static inline long kelvin_to_celsius(long t) +{ + return t + DIV_ROUND_CLOSEST(ABSOLUTE_ZERO_MILLICELSIUS, + MILLIDEGREE_PER_DEGREE); +} + +static inline long celsius_to_kelvin(long t) +{ + return t - DIV_ROUND_CLOSEST(ABSOLUTE_ZERO_MILLICELSIUS, + MILLIDEGREE_PER_DEGREE); +} + +#endif /* _LINUX_UNITS_H */ -- cgit v1.2.3 From cdf309fb38ad419aadb8f5856a088864688fad90 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Thu, 30 Jan 2020 22:15:57 -0800 Subject: thermal: remove kelvin to/from Celsius conversion helpers from This removes the kelvin to/from Celsius conversion helper macros in which were switched to the inline helper functions in . 
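For callers, the switch is a direct drop-in (illustrative; assumes the file already includes linux/units.h):

    /* before */
    celsius = DECI_KELVIN_TO_CELSIUS(temp);
    /* after */
    celsius = deci_kelvin_to_celsius(temp);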
Link: http://lkml.kernel.org/r/1576386975-7941-9-git-send-email-akinobu.mita@gmail.com Signed-off-by: Akinobu Mita Reviewed-by: Andy Shevchenko Cc: Sujith Thomas Cc: Darren Hart Cc: Andy Shevchenko Cc: Zhang Rui Cc: Daniel Lezcano Cc: Amit Kucheria Cc: Jean Delvare Cc: Guenter Roeck Cc: Keith Busch Cc: Jens Axboe Cc: Christoph Hellwig Cc: Sagi Grimberg Cc: Emmanuel Grumbach Cc: Hartmut Knaack Cc: Johannes Berg Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kalle Valo Cc: Lars-Peter Clausen Cc: Luca Coelho Cc: Peter Meerwald-Stadler Cc: Stanislaw Gruszka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/thermal.h | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index d9111aebb97d..126913c6a53b 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -32,17 +32,6 @@ /* use value, which < 0K, to indicate an invalid/uninitialized temperature */ #define THERMAL_TEMP_INVALID -274000 -/* Unit conversion macros */ -#define DECI_KELVIN_TO_CELSIUS(t) ({ \ - long _t = (t); \ - ((_t-2732 >= 0) ? (_t-2732+5)/10 : (_t-2732-5)/10); \ -}) -#define CELSIUS_TO_DECI_KELVIN(t) ((t)*10+2732) -#define DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, off) (((t) - (off)) * 100) -#define DECI_KELVIN_TO_MILLICELSIUS(t) DECI_KELVIN_TO_MILLICELSIUS_WITH_OFFSET(t, 2732) -#define MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, off) (((t) / 100) + (off)) -#define MILLICELSIUS_TO_DECI_KELVIN(t) MILLICELSIUS_TO_DECI_KELVIN_WITH_OFFSET(t, 2732) - /* Default Thermal Governor */ #if defined(CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE) #define DEFAULT_THERMAL_GOVERNOR "step_wise" -- cgit v1.2.3 From 803521b149c8c71a712445a32bd9316e074df76a Mon Sep 17 00:00:00 2001 From: Mikhail Zaslonko Date: Thu, 30 Jan 2020 22:16:30 -0800 Subject: lib/zlib: add zlib_deflate_dfltcc_enabled() function Add a new function to zlib.h checking if s390 Deflate-Conversion facility is installed and enabled. Link: http://lkml.kernel.org/r/20200103223334.20669-6-zaslonko@linux.ibm.com Signed-off-by: Mikhail Zaslonko Cc: Chris Mason Cc: Christian Borntraeger Cc: David Sterba Cc: Eduard Shishkin Cc: Heiko Carstens Cc: Ilya Leoshkevich Cc: Josef Bacik Cc: Richard Purdie Cc: Vasily Gorbik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zlib.h | 6 ++++++ lib/zlib_deflate/deflate.c | 6 ++++++ lib/zlib_deflate/deflate_syms.c | 1 + lib/zlib_dfltcc/dfltcc.h | 11 +++++++++++ lib/zlib_dfltcc/dfltcc_util.h | 9 --------- 5 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/zlib.h b/include/linux/zlib.h index 92dbbd3f6c75..c757d848a758 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -191,6 +191,12 @@ extern int zlib_deflate_workspacesize (int windowBits, int memLevel); exceed those passed here. */ +extern int zlib_deflate_dfltcc_enabled (void); +/* + Returns 1 if Deflate-Conversion facility is installed and enabled, + otherwise 0. 
+*/ + /* extern int deflateInit (z_streamp strm, int level); diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index 0d60d81a2637..8a878d0d892c 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -59,6 +59,7 @@ #define DEFLATE_RESET_HOOK(strm) do {} while (0) #define DEFLATE_HOOK(strm, flush, bstate) 0 #define DEFLATE_NEED_CHECKSUM(strm) 1 +#define DEFLATE_DFLTCC_ENABLED() 0 #endif /* =========================================================================== @@ -1138,3 +1139,8 @@ int zlib_deflate_workspacesize(int windowBits, int memLevel) + zlib_deflate_head_memsize(memLevel) + zlib_deflate_overlay_memsize(memLevel); } + +int zlib_deflate_dfltcc_enabled(void) +{ + return DEFLATE_DFLTCC_ENABLED(); +} diff --git a/lib/zlib_deflate/deflate_syms.c b/lib/zlib_deflate/deflate_syms.c index 72fe4b73be53..24b740b99678 100644 --- a/lib/zlib_deflate/deflate_syms.c +++ b/lib/zlib_deflate/deflate_syms.c @@ -12,6 +12,7 @@ #include EXPORT_SYMBOL(zlib_deflate_workspacesize); +EXPORT_SYMBOL(zlib_deflate_dfltcc_enabled); EXPORT_SYMBOL(zlib_deflate); EXPORT_SYMBOL(zlib_deflateInit2); EXPORT_SYMBOL(zlib_deflateEnd); diff --git a/lib/zlib_dfltcc/dfltcc.h b/lib/zlib_dfltcc/dfltcc.h index be70c807b62f..2a2fac1d050a 100644 --- a/lib/zlib_dfltcc/dfltcc.h +++ b/lib/zlib_dfltcc/dfltcc.h @@ -3,6 +3,8 @@ #define DFLTCC_H #include "../zlib_deflate/defutil.h" +#include +#include /* * Tuning parameters. @@ -14,6 +16,8 @@ #define DFLTCC_DHT_MIN_SAMPLE_SIZE 4096 #define DFLTCC_RIBM 0 +#define DFLTCC_FACILITY 151 + /* * Parameter Block for Query Available Functions. */ @@ -113,6 +117,11 @@ typedef enum { } dfltcc_inflate_action; dfltcc_inflate_action dfltcc_inflate(z_streamp strm, int flush, int *ret); +static inline int is_dfltcc_enabled(void) +{ +return (zlib_dfltcc_support != ZLIB_DFLTCC_DISABLED && + test_facility(DFLTCC_FACILITY)); +} #define DEFLATE_RESET_HOOK(strm) \ dfltcc_reset((strm), sizeof(deflate_state)) @@ -121,6 +130,8 @@ dfltcc_inflate_action dfltcc_inflate(z_streamp strm, #define DEFLATE_NEED_CHECKSUM(strm) (!dfltcc_can_deflate((strm))) +#define DEFLATE_DFLTCC_ENABLED() is_dfltcc_enabled() + #define INFLATE_RESET_HOOK(strm) \ dfltcc_reset((strm), sizeof(struct inflate_state)) diff --git a/lib/zlib_dfltcc/dfltcc_util.h b/lib/zlib_dfltcc/dfltcc_util.h index 7c0d3bdc50a9..4a46b5009f0d 100644 --- a/lib/zlib_dfltcc/dfltcc_util.h +++ b/lib/zlib_dfltcc/dfltcc_util.h @@ -3,8 +3,6 @@ #define DFLTCC_UTIL_H #include -#include -#include /* * C wrapper for the DEFLATE CONVERSION CALL instruction. @@ -24,7 +22,6 @@ typedef enum { #define HBT_CIRCULAR (1 << 7) #define HB_BITS 15 #define HB_SIZE (1 << HB_BITS) -#define DFLTCC_FACILITY 151 static inline dfltcc_cc dfltcc( int fn, @@ -101,12 +98,6 @@ static inline int dfltcc_are_params_ok( (strategy == Z_DEFAULT_STRATEGY); } -static inline int is_dfltcc_enabled(void) -{ - return (zlib_dfltcc_support != ZLIB_DFLTCC_DISABLED && - test_facility(DFLTCC_FACILITY)); -} - char *oesc_msg(char *buf, int oesc); #endif /* DFLTCC_UTIL_H */ -- cgit v1.2.3 From d5767057c9a76a29f073dad66b7fa12a90e8c748 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 30 Jan 2020 22:16:40 -0800 Subject: uapi: rename ext2_swab() to swab() and share globally in swab.h ext2_swab() is defined locally in lib/find_bit.c However it is not specific to ext2, neither to bitmaps. 
There are many potential users of it, so rename it to just swab() and move to include/uapi/linux/swab.h ABI guarantees that size of unsigned long corresponds to BITS_PER_LONG, therefore drop unneeded cast. Link: http://lkml.kernel.org/r/20200103202846.21616-1-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Allison Randal Cc: Joe Perches Cc: Thomas Gleixner Cc: William Breathitt Gray Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swab.h | 1 + include/uapi/linux/swab.h | 10 ++++++++++ lib/find_bit.c | 16 ++-------------- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swab.h b/include/linux/swab.h index e466fd159c85..bcff5149861a 100644 --- a/include/linux/swab.h +++ b/include/linux/swab.h @@ -7,6 +7,7 @@ # define swab16 __swab16 # define swab32 __swab32 # define swab64 __swab64 +# define swab __swab # define swahw32 __swahw32 # define swahb32 __swahb32 # define swab16p __swab16p diff --git a/include/uapi/linux/swab.h b/include/uapi/linux/swab.h index 23cd84868cc3..fa7f97da5b76 100644 --- a/include/uapi/linux/swab.h +++ b/include/uapi/linux/swab.h @@ -4,6 +4,7 @@ #include #include +#include #include /* @@ -132,6 +133,15 @@ static inline __attribute_const__ __u32 __fswahb32(__u32 val) __fswab64(x)) #endif +static __always_inline unsigned long __swab(const unsigned long y) +{ +#if BITS_PER_LONG == 64 + return __swab64(y); +#else /* BITS_PER_LONG == 32 */ + return __swab32(y); +#endif +} + /** * __swahw32 - return a word-swapped 32-bit value * @x: value to wordswap diff --git a/lib/find_bit.c b/lib/find_bit.c index e35a76b291e6..06e503c339f3 100644 --- a/lib/find_bit.c +++ b/lib/find_bit.c @@ -149,18 +149,6 @@ EXPORT_SYMBOL(find_last_bit); #ifdef __BIG_ENDIAN -/* include/linux/byteorder does not support "unsigned long" type */ -static inline unsigned long ext2_swab(const unsigned long y) -{ -#if BITS_PER_LONG == 64 - return (unsigned long) __swab64((u64) y); -#elif BITS_PER_LONG == 32 - return (unsigned long) __swab32((u32) y); -#else -#error BITS_PER_LONG not defined -#endif -} - #if !defined(find_next_bit_le) || !defined(find_next_zero_bit_le) static inline unsigned long _find_next_bit_le(const unsigned long *addr1, const unsigned long *addr2, unsigned long nbits, @@ -177,7 +165,7 @@ static inline unsigned long _find_next_bit_le(const unsigned long *addr1, tmp ^= invert; /* Handle 1st word. */ - tmp &= ext2_swab(BITMAP_FIRST_WORD_MASK(start)); + tmp &= swab(BITMAP_FIRST_WORD_MASK(start)); start = round_down(start, BITS_PER_LONG); while (!tmp) { @@ -191,7 +179,7 @@ static inline unsigned long _find_next_bit_le(const unsigned long *addr1, tmp ^= invert; } - return min(start + __ffs(ext2_swab(tmp)), nbits); + return min(start + __ffs(swab(tmp)), nbits); } #endif -- cgit v1.2.3 From 2b755626cad6d9dcbad09d022ba7bdfc39ed773d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 30 Jan 2020 22:17:32 -0800 Subject: include/linux/io-mapping.h-mapping: use PHYS_PFN() macro in io_mapping_map_atomic_wc() Use PHYS_PFN() macro in io_mapping_map_atomic_wc() instead of open coded variant. 
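Assuming PHYS_PFN() is the usual pfn.h helper that shifts a physical address by PAGE_SHIFT, the change is a one-liner (sketch of the before/after, matching the diff below):

    /* open-coded */
    pfn = (unsigned long)(phys_addr >> PAGE_SHIFT);
    /* with the helper */
    pfn = PHYS_PFN(phys_addr);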
Link: http://lkml.kernel.org/r/20191209165624.56351-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/io-mapping.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index 6e125e9b4187..837058bc1c9f 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -28,6 +28,7 @@ struct io_mapping { #ifdef CONFIG_HAVE_ATOMIC_IOMAP +#include #include /* * For small address space machines, mapping large objects @@ -64,12 +65,10 @@ io_mapping_map_atomic_wc(struct io_mapping *mapping, unsigned long offset) { resource_size_t phys_addr; - unsigned long pfn; BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - pfn = (unsigned long) (phys_addr >> PAGE_SHIFT); - return iomap_atomic_prot_pfn(pfn, mapping->prot); + return iomap_atomic_prot_pfn(PHYS_PFN(phys_addr), mapping->prot); } static inline void -- cgit v1.2.3