From f1fffbd44722cec9b8dd54d5cc86bd081ce39217 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 7 Mar 2019 16:27:07 -0800 Subject: linux/fs.h: move member alignment check next to definition of struct filename Instead of doing this compile-time check in some slightly arbitrary user of struct filename, put it next to the definition. Link: http://lkml.kernel.org/r/20190208203015.29702-3-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Cc: Alexander Viro Cc: Kees Cook Cc: Luc Van Oostenryck Cc: Masahiro Yamada Cc: Nick Desaulniers Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namei.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 914178cdbe94..d604f6b3bcc3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -39,7 +39,6 @@ #include #include #include -#include #include "internal.h" #include "mount.h" @@ -131,7 +130,6 @@ getname_flags(const char __user *filename, int flags, int *empty) struct filename *result; char *kname; int len; - BUILD_BUG_ON(offsetof(struct filename, iname) % sizeof(long) != 0); result = audit_reusename(filename); if (result) -- cgit v1.2.3 From afe1a715e8b699ab029eaa2afe83ee63a6c62810 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Thu, 7 Mar 2019 16:28:00 -0800 Subject: btrfs: implement btrfs_debug* in terms of helper macro First, the btrfs_debug macros open-code (one possible definition of) DYNAMIC_DEBUG_BRANCH, so they don't benefit from the CONFIG_JUMP_LABEL optimization. Second, a planned change of struct _ddebug (to reduce its size on 64 bit machines) requires that all descriptors in a translation unit use distinct identifiers. Using the new _dynamic_func_call_no_desc helper macro from dynamic_debug.h takes care of both of these. No functional change. Link: http://lkml.kernel.org/r/20190212214150.4807-12-linux@rasmusvillemoes.dk Signed-off-by: Rasmus Villemoes Acked-by: David Sterba Acked-by: Jason Baron Cc: Greg Kroah-Hartman Cc: Ingo Molnar Cc: Petr Mladek Cc: "Rafael J . Wysocki" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/ctree.h | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7a2a2621f0d9..94618a028730 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3415,31 +3415,17 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); #if defined(CONFIG_DYNAMIC_DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args); \ -} while (0) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args); \ -} while (0) + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) #define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, \ - ##args);\ -} while (0) -#define btrfs_debug_rl(fs_info, fmt, args...) 
\ -do { \ - DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ - if (unlikely(descriptor.flags & _DPRINTK_FLAGS_PRINT)) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, \ - ##args); \ -} while (0) + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) #elif defined(DEBUG) #define btrfs_debug(fs_info, fmt, args...) \ btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -- cgit v1.2.3 From c141175d011f18252abb9aa8b018c4e93c71d64b Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Thu, 7 Mar 2019 16:28:46 -0800 Subject: epoll: make sure all elements in ready list are in FIFO order Patch series "use rwlock in order to reduce ep_poll_callback() contention", v3. The last patch targets the contention problem in ep_poll_callback(), which can be reliably reproduced by generating events (writes to a pipe or eventfd) from many threads while a consumer thread polls. The following are some microbenchmark results based on the test [1], which starts threads that each generate N events. The test ends when all events are successfully fetched by the poller thread:

spinlock
========
threads  events/ms  run-time ms
      8       6402        12495
     16       7045        22709
     32       7395        43268

rwlock + xchg
=============
threads  events/ms  run-time ms
      8      10038         7969
     16      12178        13138
     32      13223        24199

According to the results, the bandwidth of delivered events increases significantly, and the execution time drops accordingly. This patch (of 4): All incoming events are stored in FIFO order, and the same should apply to ->ovflist, which is originally a stack, i.e. LIFO. Thus, to keep correct FIFO order, ->ovflist should be reversed by adding elements to the head of the ready list rather than to the tail. Link: http://lkml.kernel.org/r/20190103150104.17128-2-rpenyaev@suse.de Signed-off-by: Roman Penyaev Reviewed-by: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a5d219d920e7..60f6b2712f5e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -722,7 +722,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { - list_add_tail(&epi->rdllink, &ep->rdllist); + /* + * ->ovflist is LIFO, so we have to reverse it in order + * to keep it in FIFO order. + */ + list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } -- cgit v1.2.3 From c3e320b61581ef7919269ca242ff13951ccfc763 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Thu, 7 Mar 2019 16:28:49 -0800 Subject: epoll: unify awaking of wakeup source on ep_poll_callback() path The original comment "Activate ep->ws since epi->ws may get deactivated at any time" sounds alarming, but it is incorrect: the path where we check epi->ws is the path where insertion into ovflist happens, i.e. ep_scan_ready_list() has taken ep->mtx and waits for this callback to finish, so ep_modify() (which unregisters the wakeup source) must wait for ep_scan_ready_list().
In this patch I simply call ep_pm_stay_awake_rcu(), which is a bit extra for this path (it is indirectly protected by the main ep->mtx, so even RCU is not strictly needed), but I do not want to create another naked __ep_pm_stay_awake() variant only for this particular case, so the RCU variant is simply better for all cases. Link: http://lkml.kernel.org/r/20190103150104.17128-4-rpenyaev@suse.de Signed-off-by: Roman Penyaev Cc: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 60f6b2712f5e..53c3d4677dd4 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1162,14 +1162,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v if (epi->next == EP_UNACTIVE_PTR) { epi->next = READ_ONCE(ep->ovflist); WRITE_ONCE(ep->ovflist, epi); - if (epi->ws) { - /* - * Activate ep->ws since epi->ws may get - * deactivated at any time. - */ - __pm_stay_awake(ep->ws); - } - + ep_pm_stay_awake_rcu(epi); } goto out_unlock; } -- cgit v1.2.3 From a218cc4914209ac14476cb32769b31a556355b22 Mon Sep 17 00:00:00 2001 From: Roman Penyaev Date: Thu, 7 Mar 2019 16:28:53 -0800 Subject: epoll: use rwlock in order to reduce ep_poll_callback() contention The goal of this patch is to reduce contention in ep_poll_callback(), which can be called concurrently from different CPUs under high event rates with many fds per epoll. The problem can be reliably reproduced by generating events (writes to a pipe or eventfd) from many threads while a consumer thread polls. In other words, this patch increases the bandwidth of events which can be delivered from sources to the poller by adding poll items to the list in a lockless way. The main change is the replacement of the spinlock with a rwlock, which is taken on read in ep_poll_callback(), with poll items then added to the tail of the list using the xchg atomic instruction. The write lock is taken everywhere else in order to stop list modifications and guarantee that list updates are fully completed (I assume that the write side of a rwlock does not starve; the qrwlock implementation seems to provide this guarantee). The following are some microbenchmark results based on the test [1], which starts threads that each generate N events. The test ends when all events are successfully fetched by the poller thread:

spinlock
========
threads  events/ms  run-time ms
      8       6402        12495
     16       7045        22709
     32       7395        43268

rwlock + xchg
=============
threads  events/ms  run-time ms
      8      10038         7969
     16      12178        13138
     32      13223        24199

According to the results, the bandwidth of delivered events increases significantly, and the execution time drops accordingly. This patch was tested with different sorts of microbenchmarks and with artificial delays (e.g. "udelay(get_random_int() & 0xff)") introduced in the kernel on paths where items are added to lists. [1] https://github.com/rouming/test-tools/blob/master/stress-epoll.c Link: http://lkml.kernel.org/r/20190103150104.17128-5-rpenyaev@suse.de Signed-off-by: Roman Penyaev Cc: Davidlohr Bueso Cc: Jason Baron Cc: Al Viro Cc: "Paul E.
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 158 ++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 122 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 53c3d4677dd4..4a0e98d87fcc 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -50,10 +50,10 @@ * * 1) epmutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->wq.lock (spinlock) + * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a spinlock (ep->wq.lock) because we manipulate objects + * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -85,7 +85,7 @@ * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global - * mutex "epmutex" (together with "ep->wq.lock") to have it working, + * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee @@ -182,8 +182,6 @@ struct epitem { * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. - * - * Access to it is protected by the lock inside wq. */ struct eventpoll { /* @@ -203,13 +201,16 @@ struct eventpoll { /* List of ready file descriptors */ struct list_head rdllist; + /* Lock which protects rdllist and ovflist */ + rwlock_t lock; + /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out - * holding ->wq.lock. + * holding ->lock. */ struct epitem *ovflist; @@ -697,17 +698,17 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * because we want the "sproc" callback to be able to do it * in a lockless way. */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, &txlist); WRITE_ONCE(ep->ovflist, NULL); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); /* * Now call the callback function. */ res = (*sproc)(ep, &txlist, priv); - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -749,11 +750,11 @@ static __poll_t ep_scan_ready_list(struct eventpoll *ep, * the ->poll() wait list (delayed after we release the lock). */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); if (!ep_locked) mutex_unlock(&ep->mtx); @@ -793,10 +794,10 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) rb_erase_cached(&epi->rbn, &ep->rbr); - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -846,7 +847,7 @@ static void ep_free(struct eventpoll *ep) * Walks through the whole tree by freeing each "struct epitem". 
At this * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit - * us during this operation. So we can avoid the lock on "ep->wq.lock". + * us during this operation. So we can avoid the lock on "ep->lock". * We do not need to lock ep->mtx, either, we only do it to prevent * a lockdep warning. */ @@ -1027,6 +1028,7 @@ static int ep_alloc(struct eventpoll **pep) goto free_uid; mutex_init(&ep->mtx); + rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1116,21 +1118,107 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, } #endif /* CONFIG_CHECKPOINT_RESTORE */ +/** + * Adds a new entry to the tail of the list in a lockless way, i.e. + * multiple CPUs are allowed to call this function concurrently. + * + * Beware: it is necessary to prevent any other modifications of the + * existing list until all changes are completed, in other words + * concurrent list_add_tail_lockless() calls should be protected + * with a read lock, where the write lock acts as a barrier which + * makes sure all list_add_tail_lockless() calls are fully + * completed. + * + * Also an element can be locklessly added to the list only in one + * direction, i.e. either to the tail or to the head, otherwise + * concurrent access will corrupt the list. + * + * Returns %false if the element has already been added to the list, %true + * otherwise. + */ +static inline bool list_add_tail_lockless(struct list_head *new, + struct list_head *head) +{ + struct list_head *prev; + + /* + * This is a simple 'new->next = head' operation, but cmpxchg() + * is used in order to detect that the same element has just been + * added to the list from another CPU: the winner observes + * new->next == new. + */ + if (cmpxchg(&new->next, new, head) != new) + return false; + + /* + * Initially ->next of a new element must be updated with the head + * (we are inserting to the tail) and only then pointers are atomically + * exchanged. XCHG guarantees memory ordering, thus ->next should be + * updated before pointers are actually swapped and pointers are + * swapped before prev->next is updated. + */ + + prev = xchg(&head->prev, new); + + /* + * It is safe to modify prev->next and new->prev, because a new element + * is added only to the tail and new->next is updated before XCHG. + */ + + prev->next = new; + new->prev = prev; + + return true; +} + +/** + * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, + * i.e. multiple CPUs are allowed to call this function concurrently. + * + * Returns %false if the epi element has already been chained, %true otherwise. + */ +static inline bool chain_epi_lockless(struct epitem *epi) +{ + struct eventpoll *ep = epi->ep; + + /* Check that the same epi has not just been chained from another CPU */ + if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) + return false; + + /* Atomically exchange tail */ + epi->next = xchg(&ep->ovflist, epi); + + return true; +} + /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. + * + * This callback takes a read lock in order not to contend with concurrent + * events from other file descriptors, thus all modifications to ->rdllist + * or ->ovflist are lockless.
The read lock is paired with the write lock from + * ep_scan_ready_list(), which stops all list modifications and guarantees + * that the list state is seen correctly. + * + * Another thing worth mentioning is that ep_poll_callback() can be called + * concurrently for the same @epi from different CPUs if the poll table was + * initialized with several wait queue entries. Concurrent wakeups from + * different CPUs of a single wait queue are serialized by wq.lock, but the + * case when multiple wait queues are used should be detected accordingly. + * This is detected using a cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; - unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); + unsigned long flags; int ewake = 0; - spin_lock_irqsave(&ep->wq.lock, flags); + read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1159,17 +1247,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (epi->next == EP_UNACTIVE_PTR) { - epi->next = READ_ONCE(ep->ovflist); - WRITE_ONCE(ep->ovflist, epi); + if (epi->next == EP_UNACTIVE_PTR && + chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); - } goto out_unlock; } /* If this file is already in the ready list we exit soon */ - if (!ep_is_linked(epi)) { - list_add_tail(&epi->rdllink, &ep->rdllist); + if (!ep_is_linked(epi) && + list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) { ep_pm_stay_awake_rcu(epi); } @@ -1194,13 +1280,13 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v break; } } - wake_up_locked(&ep->wq); + wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: - spin_unlock_irqrestore(&ep->wq.lock, flags); + read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1485,7 +1571,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, goto error_remove_epi; /* We have to drop the new item inside our item list to keep track of it */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1497,12 +1583,12 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); atomic_long_inc(&ep->user->epoll_watches); @@ -1528,10 +1614,10 @@ error_unregister: * list, since that is used/cleaned only inside a section bound by "mtx". * And ep_insert() is called with "mtx" held. */ - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); @@ -1575,9 +1661,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll().
- * We need this because we did not take ep->wq.lock while + * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take - * ep->wq.lock). + * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also @@ -1596,18 +1682,18 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) - wake_up_locked(&ep->wq); + wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ @@ -1768,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, */ timed_out = 1; - spin_lock_irq(&ep->wq.lock); + write_lock_irq(&ep->lock); eavail = ep_events_available(ep); - spin_unlock_irq(&ep->wq.lock); + write_unlock_irq(&ep->lock); goto send_events; } -- cgit v1.2.3 From faf1c3152032275370f35dc757501ae0c47ded53 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Mar 2019 16:28:56 -0800 Subject: fs/binfmt_elf.c: don't be afraid of overflow The number of ELF program headers is 16-bit by spec, so the total size comfortably fits into an "unsigned int". Space savings: 7 bytes!

add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-7 (-7)
Function                 old     new   delta
load_elf_phdrs           137     130      -7

Link: http://lkml.kernel.org/r/20190204202715.GA27482@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 54207327f98f..fd4b618c412e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -418,8 +418,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; - int retval, size, err = -1; + int retval, err = -1; loff_t pos = elf_ex->e_phoff; + unsigned int size; /* * If the size of this structure has changed, then punt, since @@ -429,13 +430,9 @@ static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, goto out; /* Sanity check the number of program headers... */ - if (elf_ex->e_phnum < 1 || - elf_ex->e_phnum > 65536U / sizeof(struct elf_phdr)) - goto out; - /* ...and their total size.
*/ size = sizeof(struct elf_phdr) * elf_ex->e_phnum; - if (size > ELF_MIN_ALIGN) + if (size == 0 || size > 65536 || size > ELF_MIN_ALIGN) goto out; elf_phdata = kmalloc(size, GFP_KERNEL); -- cgit v1.2.3 From 93f044e282b6abc13cdbffc91f909e197e700302 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Mar 2019 16:28:59 -0800 Subject: fs/binfmt_elf.c: use list_for_each_entry() [adobriyan@gmail.com: fixup compilation] Link: http://lkml.kernel.org/r/20190205064334.GA2152@avx2 Link: http://lkml.kernel.org/r/20190204202800.GB27482@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fd4b618c412e..51ff26b51528 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2030,7 +2030,6 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, const kernel_siginfo_t *siginfo, struct pt_regs *regs) { - struct list_head *t; struct core_thread *ct; struct elf_thread_status *ets; @@ -2047,10 +2046,9 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, list_add(&ets->list, &info->thread_list); } - list_for_each(t, &info->thread_list) { + list_for_each_entry(ets, &info->thread_list, list) { int sz; - ets = list_entry(t, struct elf_thread_status, list); sz = elf_dump_thread_status(siginfo->si_signo, ets); info->thread_status_size += sz; } @@ -2114,20 +2112,17 @@ static size_t get_note_info_size(struct elf_note_info *info) static int write_note_info(struct elf_note_info *info, struct coredump_params *cprm) { + struct elf_thread_status *ets; int i; - struct list_head *t; for (i = 0; i < info->numnote; i++) if (!writenote(info->notes + i, cprm)) return 0; /* write out the thread status notes section */ - list_for_each(t, &info->thread_list) { - struct elf_thread_status *tmp = - list_entry(t, struct elf_thread_status, list); - - for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], cprm)) + list_for_each_entry(ets, &info->thread_list, list) { + for (i = 0; i < ets->num_notes; i++) + if (!writenote(&ets->notes[i], cprm)) return 0; } -- cgit v1.2.3 From 49ac981965e0032c22e44791a694a83511ebd8fe Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Mar 2019 16:29:03 -0800 Subject: fs/binfmt_elf.c: spread const a little Link: http://lkml.kernel.org/r/20190204202830.GC27482@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 51ff26b51528..7d09d125f148 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -57,8 +57,6 @@ #endif static int load_elf_binary(struct linux_binprm *bprm); -static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, - int, int, unsigned long); #ifdef CONFIG_USELIB static int load_elf_library(struct file *); @@ -347,7 +345,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, #ifndef elf_map static unsigned long elf_map(struct file *filep, unsigned long addr, - struct elf_phdr *eppnt, int prot, int type, + const struct elf_phdr *eppnt, int prot, int type, unsigned long total_size) { unsigned long map_addr; @@ -387,7 +385,7 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, #endif /* !elf_map */ -static unsigned long 
total_mapping_size(struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) { int i, first_idx = -1, last_idx = -1; @@ -414,7 +412,7 @@ static unsigned long total_mapping_size(struct elf_phdr *cmds, int nr) * header pointed to by elf_ex, into a newly allocated array. The caller is * responsible for freeing the allocated data. Returns an ERR_PTR upon failure. */ -static struct elf_phdr *load_elf_phdrs(struct elfhdr *elf_ex, +static struct elf_phdr *load_elf_phdrs(const struct elfhdr *elf_ex, struct file *elf_file) { struct elf_phdr *elf_phdata = NULL; -- cgit v1.2.3 From 60d6d04ca3abb34d5e89f030dbea440d9715a168 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Mar 2019 16:29:09 -0800 Subject: autofs: add ignore mount option Add an autofs file system mount option that can be used to provide a generic indicator to applications that the mount entry should be ignored when displaying mount information. In other OSes that provide autofs and that provide a mount list to user space based on the kernel mount list, a no-op mount option ("ignore" is the one used on the most common OS) is allowed so that autofs file system users can optionally use it. The idea is that it can be used by user space programs to exclude autofs mounts from consideration when reading the mounts list. Prior to the change to link /etc/mtab to /proc/self/mounts, all I needed to do to achieve this was to use mount(2) and not update the mtab, but that no longer works. I know the symlinking happened a long time ago, and I considered doing this then, but at the time I couldn't remember the commonly used option name and thought persuading the various utility maintainers would be too hard. But now I have a RHEL request to do this for compatibility with a widely used product, so I want to go ahead with it and try to enlist the help of some utility package maintainers. Clearly, without the option nothing can be done, so it's at least a start.
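To illustrate the intended use on the consumer side, here is a minimal userspace sketch (not part of this patch; the listing format is arbitrary, and only the "autofs" fstype and the "ignore" option string come from the change below) of how a mount-table reader could honor the option via the standard mntent API:

	/* Sketch: list mounts, skipping autofs entries that opt out via "ignore". */
	#include <mntent.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		FILE *f = setmntent("/proc/self/mounts", "r");
		struct mntent *m;

		if (!f)
			return 1;
		while ((m = getmntent(f)) != NULL) {
			if (strcmp(m->mnt_type, "autofs") == 0 &&
			    hasmntopt(m, "ignore"))
				continue;	/* autofs asked us to hide this entry */
			printf("%s on %s type %s (%s)\n",
			       m->mnt_fsname, m->mnt_dir, m->mnt_type, m->mnt_opts);
		}
		endmntent(f);
		return 0;
	}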
Link: http://lkml.kernel.org/r/154725123970.11260.6113771566924907275.stgit@pluto-themaw-net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 1 + fs/autofs/inode.c | 9 ++++++++- include/uapi/linux/auto_fs.h | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 3e59f0ed777b..b735f2b1e462 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -105,6 +105,7 @@ struct autofs_wait_queue { #define AUTOFS_SBI_CATATONIC 0x0001 #define AUTOFS_SBI_STRICTEXPIRE 0x0002 +#define AUTOFS_SBI_IGNORE 0x0004 struct autofs_sb_info { u32 magic; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 078992eee299..8647ecaa89fc 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -89,6 +89,8 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) seq_printf(m, ",strictexpire"); + if (sbi->flags & AUTOFS_SBI_IGNORE) + seq_printf(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); @@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = { }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire}; + Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire, + Opt_ignore}; static const match_table_t tokens = { {Opt_fd, "fd=%u"}, @@ -124,6 +127,7 @@ static const match_table_t tokens = { {Opt_direct, "direct"}, {Opt_offset, "offset"}, {Opt_strictexpire, "strictexpire"}, + {Opt_ignore, "ignore"}, {Opt_err, NULL} }; @@ -206,6 +210,9 @@ static int parse_options(char *options, case Opt_strictexpire: sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; break; + case Opt_ignore: + sbi->flags |= AUTOFS_SBI_IGNORE; + break; default: return 1; } diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 082119630b49..1f7925afad2d 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 4 +#define AUTOFS_PROTO_SUBVERSION 5 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed -- cgit v1.2.3 From 874d22d62bc6dca7ceec48c44d247af0109e0b5b Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Mar 2019 16:29:12 -0800 Subject: fs/autofs/inode.c: use seq_puts() for simple strings in autofs_show_options() Fix a checkpatch.pl WARNING about the use of seq_printf() to print simple strings in autofs_show_options(); use seq_puts() in these cases.
Link: http://lkml.kernel.org/r/154889012613.4863.12231175554744203482.stgit@pluto-themaw-net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 8647ecaa89fc..80597b88718b 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -82,20 +82,20 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",maxproto=%d", sbi->max_proto); if (autofs_type_offset(sbi->type)) - seq_printf(m, ",offset"); + seq_puts(m, ",offset"); else if (autofs_type_direct(sbi->type)) - seq_printf(m, ",direct"); + seq_puts(m, ",direct"); else - seq_printf(m, ",indirect"); + seq_puts(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) - seq_printf(m, ",strictexpire"); + seq_puts(m, ",strictexpire"); if (sbi->flags & AUTOFS_SBI_IGNORE) - seq_printf(m, ",ignore"); + seq_puts(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); else - seq_printf(m, ",pipe_ino=-1"); + seq_puts(m, ",pipe_ino=-1"); #endif return 0; } -- cgit v1.2.3 From 660c9fc72e06d7a46d96d2cb8524a26565072a76 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 7 Mar 2019 16:29:16 -0800 Subject: autofs: clear O_NONBLOCK on the pipe autofs does not expect the pipe it is given to have O_NONBLOCK set; specifically, if __kernel_write() in autofs_write() returns -EAGAIN, this is treated as a fatal error and the pipe is closed. For safety, autofs should therefore clear the O_NONBLOCK flag. Releases of systemd prior to 8th February 2019 used pipe2(p, O_NONBLOCK|O_CLOEXEC) and thus (inadvertently) set this flag. Link: http://lkml.kernel.org/r/154993550902.3321.1183632970046073478.stgit@pluto-themaw-net Signed-off-by: NeilBrown Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index b735f2b1e462..70c132acdab1 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -216,6 +216,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) return -EINVAL; /* We want a packet pipe */ pipe->f_flags |= O_DIRECT; + /* We don't expect -EAGAIN */ + pipe->f_flags &= ~O_NONBLOCK; return 0; } -- cgit v1.2.3 From 67ceb1eca0acc045c9ef170a05f58fd710063967 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 7 Mar 2019 16:29:19 -0800 Subject: fat: enable .splice_write to support splice on O_DIRECT file Currently, splice() on an O_DIRECT-opened FAT file returns -EFAULT. That is because the default .splice_write, namely default_file_splice_write(), constructs an ITER_KVEC iov_iter, and dio_refill_pages() in the dio path cannot handle it. Fix it by implementing .splice_write through iter_file_splice_write(). Spotted by xfstests generic/091.
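A hedged reproduction sketch of the failure mode fixed here (simplified from what generic/091 exercises; the mount point is an assumption and error handling is abbreviated):

	/* Sketch: splice pipe contents into an O_DIRECT file on a FAT mount.
	 * Without .splice_write = iter_file_splice_write, the splice() below
	 * came back with -EFAULT. */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096] = { 0 };
		int p[2];
		int fd = open("/mnt/vfat/testfile",	/* assumed vfat mount */
			      O_WRONLY | O_CREAT | O_DIRECT, 0644);

		if (fd < 0 || pipe(p) < 0)
			return 1;
		if (write(p[1], buf, sizeof(buf)) != sizeof(buf))
			return 1;
		if (splice(p[0], NULL, fd, NULL, sizeof(buf), 0) < 0) {
			perror("splice");	/* -EFAULT before this patch */
			return 1;
		}
		return 0;
	}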
Link: http://lkml.kernel.org/r/20190210094754.56355-1-houtao1@huawei.com Signed-off-by: Hou Tao Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fat/file.c b/fs/fat/file.c index 13935ee99e1e..b3bed32946b1 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -214,6 +214,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fallocate = fat_fallocate, }; -- cgit v1.2.3 From 26e152252e92999b975fe935c666a090c46905f7 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 7 Mar 2019 16:29:23 -0800 Subject: fs/exec.c: replace opencoded set_mask_bits() Link: http://lkml.kernel.org/r/1548275584-18096-2-git-send-email-vgupta@synopsys.com Link: http://lkml.kernel.org/g/20150807115710.GA16897@redhat.com Signed-off-by: Vineet Gupta Reviewed-by: Anthony Yznaga Acked-by: Oleg Nesterov Cc: Alexander Viro Cc: Peter Zijlstra (Intel) Cc: Chris Wilson Cc: Ingo Molnar Cc: Jani Nikula Cc: Miklos Szeredi Cc: Theodore Ts'o Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 74f3672146a7..bf0ace3841ad 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1944,15 +1944,10 @@ EXPORT_SYMBOL(set_binfmt); */ void set_dumpable(struct mm_struct *mm, int value) { - unsigned long old, new; - if (WARN_ON((unsigned)value > SUID_DUMP_ROOT)) return; - do { - old = READ_ONCE(mm->flags); - new = (old & ~MMF_DUMPABLE_MASK) | value; - } while (cmpxchg(&mm->flags, old, new) != old); + set_mask_bits(&mm->flags, MMF_DUMPABLE_MASK, value); } SYSCALL_DEFINE3(execve, -- cgit v1.2.3 From 6eb3c3d0a52dca337e327ae8868ca1f44a712e02 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 Mar 2019 16:29:26 -0800 Subject: exec: increase BINPRM_BUF_SIZE to 256 Large enterprise clients often run applications out of networked file systems where the IT-mandated layout of project volumes can end up leading to paths that are longer than 128 characters. Bumping this up to the next power of two solves this problem in all but the most egregious case while still fitting into a 512-byte slab. [oleg@redhat.com: update comment, per Kees] Link: http://lkml.kernel.org/r/20181112160956.GA28472@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Ben Woodard Reviewed-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/uapi/linux/binfmts.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index bf0ace3841ad..2e0033348d8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) /* * Fill the binprm structure from the inode. - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * Check permissions, then read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example).
*/ diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h index 4abad03a8853..689025d9c185 100644 --- a/include/uapi/linux/binfmts.h +++ b/include/uapi/linux/binfmts.h @@ -16,6 +16,6 @@ struct pt_regs; #define MAX_ARG_STRINGS 0x7FFFFFFF /* sizeof(linux_binprm->buf) */ -#define BINPRM_BUF_SIZE 128 +#define BINPRM_BUF_SIZE 256 #endif /* _UAPI_LINUX_BINFMTS_H */ -- cgit v1.2.3
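To make the raised limit concrete, here is an illustrative probe (a sketch under assumed paths, not from the patch): it builds a script whose #! line is roughly 160 characters, which exceeded the old 128-byte bprm->buf (the interpreter path was truncated, so execve() failed with ENOENT) but fits comfortably in the new 256-byte buffer.

	/* Sketch: exercise the #! length limit raised by this patch. */
	#include <stdio.h>
	#include <sys/stat.h>
	#include <unistd.h>

	int main(void)
	{
		char dir[200], interp[220];
		const char *script = "/tmp/longshebang.sh";
		FILE *f;

		/* ~155-char directory name; purely illustrative */
		snprintf(dir, sizeof(dir), "/tmp/%0150d", 0);
		mkdir(dir, 0755);			/* may already exist */
		snprintf(interp, sizeof(interp), "%s/sh", dir);
		unlink(interp);
		symlink("/bin/sh", interp);		/* long path to a real shell */

		f = fopen(script, "w");
		if (!f)
			return 1;
		fprintf(f, "#!%s\necho ok\n", interp);	/* ~160-char first line */
		fclose(f);
		chmod(script, 0755);

		execl(script, script, (char *)NULL);
		/* kernels with the 128-byte buffer truncate the line -> ENOENT here */
		perror("execve");
		return 1;
	}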