From b9a49520679e98700d3d89689cc91c08a1c88c1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 19 Jan 2025 00:55:32 +0100 Subject: rcuref: Plug slowpath race in rcuref_put() Kernel test robot reported an "imbalanced put" in the rcuref_put() slow path, which turned out to be a false positive. Consider the following race: ref = 0 (via rcuref_init(ref, 1)) T1 T2 rcuref_put(ref) -> atomic_add_negative_release(-1, ref) # ref -> 0xffffffff -> rcuref_put_slowpath(ref) rcuref_get(ref) -> atomic_add_negative_relaxed(1, &ref->refcnt) -> return true; # ref -> 0 rcuref_put(ref) -> atomic_add_negative_release(-1, ref) # ref -> 0xffffffff -> rcuref_put_slowpath() -> cnt = atomic_read(&ref->refcnt); # cnt -> 0xffffffff / RCUREF_NOREF -> atomic_try_cmpxchg_release(&ref->refcnt, &cnt, RCUREF_DEAD)) # ref -> 0xe0000000 / RCUREF_DEAD -> return true -> cnt = atomic_read(&ref->refcnt); # cnt -> 0xe0000000 / RCUREF_DEAD -> if (cnt > RCUREF_RELEASED) # 0xe0000000 > 0xc0000000 -> WARN_ONCE(cnt >= RCUREF_RELEASED, "rcuref - imbalanced put()") The problem is the additional read in the slow path (after it decremented to RCUREF_NOREF) which can happen after the counter has been marked RCUREF_DEAD. Prevent this by reusing the return value of the decrement. Now every "final" put uses RCUREF_NOREF in the slow path and attempts the final cmpxchg() to RCUREF_DEAD. [ bigeasy: Add changelog ] Fixes: ee1ee6db07795 ("atomics: Provide rcuref - scalable reference counting") Reported-by: kernel test robot Debugged-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Reviewed-by: Sebastian Andrzej Siewior Cc: stable@vger.kernel.org Closes: https://lore.kernel.org/oe-lkp/202412311453.9d7636a2-lkp@intel.com --- include/linux/rcuref.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h index 2c8bfd0f1b6b..6322d8c1c6b4 100644 --- a/include/linux/rcuref.h +++ b/include/linux/rcuref.h @@ -71,27 +71,30 @@ static inline __must_check bool rcuref_get(rcuref_t *ref) return rcuref_get_slowpath(ref); } -extern __must_check bool rcuref_put_slowpath(rcuref_t *ref); +extern __must_check bool rcuref_put_slowpath(rcuref_t *ref, unsigned int cnt); /* * Internal helper. Do not invoke directly. */ static __always_inline __must_check bool __rcuref_put(rcuref_t *ref) { + int cnt; + RCU_LOCKDEP_WARN(!rcu_read_lock_held() && preemptible(), "suspicious rcuref_put_rcusafe() usage"); /* * Unconditionally decrease the reference count. The saturation and * dead zones provide enough tolerance for this. */ - if (likely(!atomic_add_negative_release(-1, &ref->refcnt))) + cnt = atomic_sub_return_release(1, &ref->refcnt); + if (likely(cnt >= 0)) return false; /* * Handle the last reference drop and cases inside the saturation * and dead zones. */ - return rcuref_put_slowpath(ref); + return rcuref_put_slowpath(ref, cnt); } /** -- cgit v1.2.3 From 36b62df5683c315ba58c950f1a9c771c796c30ec Mon Sep 17 00:00:00 2001 From: Jiayuan Chen Date: Wed, 22 Jan 2025 18:09:14 +0800 Subject: bpf: Fix wrong copied_seq calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'sk->copied_seq' was updated in the tcp_eat_skb() function when the action of a BPF program was SK_REDIRECT. For other actions, like SK_PASS, the update logic for 'sk->copied_seq' was moved to tcp_bpf_recvmsg_parser() to ensure the accuracy of the 'fionread' feature. It works for a single stream_verdict scenario, as it also modified sk_data_ready->sk_psock_verdict_data_ready->tcp_read_skb to remove updating 'sk->copied_seq'. However, for programs where both stream_parser and stream_verdict are active (strparser purpose), tcp_read_sock() was used instead of tcp_read_skb() (sk_data_ready->strp_data_ready->tcp_read_sock). tcp_read_sock() now still updates 'sk->copied_seq', leading to duplicate updates. In summary, for strparser + SK_PASS, copied_seq is redundantly calculated in both tcp_read_sock() and tcp_bpf_recvmsg_parser(). The issue causes incorrect copied_seq calculations, which prevent correct data reads from the recv() interface in user-land. We do not want to add new proto_ops to implement a new version of tcp_read_sock, as this would introduce code complexity [1]. We could have added noack and copied_seq to desc, and then called ops->read_sock. However, unfortunately, other modules didn’t fully initialize desc to zero. So, for now, we are directly calling tcp_read_sock_noack() in tcp_bpf.c. [1]: https://lore.kernel.org/bpf/20241218053408.437295-1-mrpre@163.com Fixes: e5c6de5fa025 ("bpf, sockmap: Incorrectly handling copied_seq") Suggested-by: Jakub Sitnicki Signed-off-by: Jiayuan Chen Signed-off-by: Martin KaFai Lau Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://patch.msgid.link/20250122100917.49845-3-mrpre@163.com --- include/linux/skmsg.h | 2 ++ include/net/tcp.h | 8 ++++++++ net/core/skmsg.c | 7 +++++++ net/ipv4/tcp.c | 29 ++++++++++++++++++++++++----- net/ipv4/tcp_bpf.c | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 77 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2cbe0c22a32f..0b9095a281b8 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -91,6 +91,8 @@ struct sk_psock { struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) struct strparser strp; + u32 copied_seq; + u32 ingress_bytes; #endif struct sk_buff_head ingress_skb; struct list_head ingress_msg; diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b2b04835688..9c044fb9ab26 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -729,6 +729,9 @@ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor); struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off); void tcp_read_done(struct sock *sk, size_t len); @@ -2599,6 +2602,11 @@ struct sk_psock; #ifdef CONFIG_BPF_SYSCALL int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); +#ifdef CONFIG_BPF_STREAM_PARSER +struct strparser; +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor); +#endif /* CONFIG_BPF_STREAM_PARSER */ #endif /* CONFIG_BPF_SYSCALL */ #ifdef CONFIG_INET diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 61f3f3d4e528..0ddc4c718833 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -549,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, return num_sge; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) + psock->ingress_bytes += len; +#endif copied = len; msg->sg.start = 0; msg->sg.size = copied; @@ -1144,6 +1147,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) if (!ret) sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED); + if (sk_is_tcp(sk)) { + psock->strp.cb.read_sock = tcp_bpf_strp_read_sock; + psock->copied_seq = tcp_sk(sk)->copied_seq; + } return ret; } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d704bda6c41..285678d8ce07 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1565,12 +1565,13 @@ EXPORT_SYMBOL(tcp_recv_skb); * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ -int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - u32 seq = tp->copied_seq; + u32 seq = *copied_seq; u32 offset; int copied = 0; @@ -1624,9 +1625,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_eat_recv_skb(sk, skb); if (!desc->count) break; - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); } - WRITE_ONCE(tp->copied_seq, seq); + WRITE_ONCE(*copied_seq, seq); + + if (noack) + goto out; tcp_rcv_space_adjust(sk); @@ -1635,10 +1639,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } +out: return copied; } + +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __tcp_read_sock(sk, desc, recv_actor, false, + &tcp_sk(sk)->copied_seq); +} EXPORT_SYMBOL(tcp_read_sock); +int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack, + u32 *copied_seq) +{ + return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq); +} + int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 47f65b1b70ca..ba581785adb4 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops) ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP; } +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) +int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct sock *sk = strp->sk; + struct sk_psock *psock; + struct tcp_sock *tp; + int copied = 0; + + tp = tcp_sk(sk); + rcu_read_lock(); + psock = sk_psock(sk); + if (WARN_ON_ONCE(!psock)) { + desc->error = -EINVAL; + goto out; + } + + psock->ingress_bytes = 0; + copied = tcp_read_sock_noack(sk, desc, recv_actor, true, + &psock->copied_seq); + if (copied < 0) + goto out; + /* recv_actor may redirect skb to another socket (SK_REDIRECT) or + * just put skb into ingress queue of current socket (SK_PASS). + * For SK_REDIRECT, we need to ack the frame immediately but for + * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser(). + */ + tp->copied_seq = psock->copied_seq - psock->ingress_bytes; + tcp_rcv_space_adjust(sk); + __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes); +out: + rcu_read_unlock(); + return copied; +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4; -- cgit v1.2.3 From a1f7b7ff0e10ae574d388131596390157222f986 Mon Sep 17 00:00:00 2001 From: Pierre-Louis Bossart Date: Mon, 10 Feb 2025 10:17:27 +0200 Subject: PCI: pci_ids: add INTEL_HDA_PTL_H Add Intel PTL-H audio Device ID. Signed-off-by: Pierre-Louis Bossart Signed-off-by: Peter Ujfalusi Reviewed-by: Kai Vehmanen Reviewed-by: Bard Liao Acked-by: Bjorn Helgaas Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250210081730.22916-2-peter.ujfalusi@linux.intel.com --- include/linux/pci_ids.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index de5deb1a0118..1a2594a38199 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -3134,6 +3134,7 @@ #define PCI_DEVICE_ID_INTEL_HDA_LNL_P 0xa828 #define PCI_DEVICE_ID_INTEL_S21152BB 0xb152 #define PCI_DEVICE_ID_INTEL_HDA_BMG 0xe2f7 +#define PCI_DEVICE_ID_INTEL_HDA_PTL_H 0xe328 #define PCI_DEVICE_ID_INTEL_HDA_PTL 0xe428 #define PCI_DEVICE_ID_INTEL_HDA_CML_R 0xf0c8 #define PCI_DEVICE_ID_INTEL_HDA_RKL_S 0xf1c8 -- cgit v1.2.3 From 1d0013962d220b166d9f7c9fe2746f1542e459a3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Feb 2025 22:23:59 +0000 Subject: netfs: Fix a number of read-retry hangs Fix a number of hangs in the netfslib read-retry code, including: (1) netfs_reissue_read() doubles up the getting of references on subrequests, thereby leaking the subrequest and causing inode eviction to wait indefinitely. This can lead to the kernel reporting a hang in the filesystem's evict_inode(). Fix this by removing the get from netfs_reissue_read() and adding one to netfs_retry_read_subrequests() to deal with the one place that didn't double up. (2) The loop in netfs_retry_read_subrequests() that retries a sequence of failed subrequests doesn't record whether or not it retried the one that the "subreq" pointer points to when it leaves the loop. It may not if renegotiation/repreparation of the subrequests means that fewer subrequests are needed to span the cumulative range of the sequence. Because it doesn't record this, the piece of code that discards now-superfluous subrequests doesn't know whether it should discard the one "subreq" points to - and so it doesn't. Fix this by noting whether the last subreq it examines is superfluous and if it is, then getting rid of it and all subsequent subrequests. If that one one wasn't superfluous, then we would have tried to go round the previous loop again and so there can be no further unretried subrequests in the sequence. (3) netfs_retry_read_subrequests() gets yet an extra ref on any additional subrequests it has to get because it ran out of ones it could reuse to to renegotiation/repreparation shrinking the subrequests. Fix this by removing that extra ref. (4) In netfs_retry_reads(), it was using wait_on_bit() to wait for NETFS_SREQ_IN_PROGRESS to be cleared on all subrequests in the sequence - but netfs_read_subreq_terminated() is now using a wait queue on the request instead and so this wait will never finish. Fix this by waiting on the wait queue instead. To make this work, a new flag, NETFS_RREQ_RETRYING, is now set around the wait loop to tell the wake-up code to wake up the wait queue rather than requeuing the request's work item. Note that this flag replaces the NETFS_RREQ_NEED_RETRY flag which is no longer used. (5) Whilst not strictly anything to do with the hang, netfs_retry_read_subrequests() was also doubly incrementing the subreq_counter and re-setting the debug index, leaving a gap in the trace. This is also fixed. One of these hangs was observed with 9p and with cifs. Others were forced by manual code injection into fs/afs/file.c. Firstly, afs_prepare_read() was created to provide an changing pattern of maximum subrequest sizes: static int afs_prepare_read(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; if (!S_ISREG(subreq->rreq->inode->i_mode)) return 0; if (subreq->retry_count < 20) rreq->io_streams[0].sreq_max_len = umax(200, 2222 - subreq->retry_count * 40); else rreq->io_streams[0].sreq_max_len = 3333; return 0; } and pointed to by afs_req_ops. Then the following: struct netfs_io_subrequest *subreq = op->fetch.subreq; if (subreq->error == 0 && S_ISREG(subreq->rreq->inode->i_mode) && subreq->retry_count < 20) { subreq->transferred = subreq->already_done; __clear_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); __set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); afs_fetch_data_notify(op); return; } was inserted into afs_fetch_data_success() at the beginning and struct netfs_io_subrequest given an extra field, "already_done" that was set to the value in "subreq->transferred" by netfs_reissue_read(). When reading a 4K file, the subrequests would get gradually smaller, a new subrequest would be allocated around the 3rd retry and then eventually be rendered superfluous when the 20th retry was hit and the limit on the first subrequest was eased. Fixes: e2d46f2ec332 ("netfs: Change the read result collector to only use one work item") Signed-off-by: David Howells Link: https://lore.kernel.org/r/20250212222402.3618494-2-dhowells@redhat.com Tested-by: Marc Dionne Tested-by: Steve French cc: Ihor Solodrai cc: Eric Van Hensbergen cc: Latchesar Ionkov cc: Dominique Martinet cc: Christian Schoenebeck cc: Paulo Alcantara cc: Jeff Layton cc: v9fs@lists.linux.dev cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Christian Brauner --- fs/netfs/read_collect.c | 6 ++++-- fs/netfs/read_retry.c | 40 ++++++++++++++++++++++++++++++---------- include/linux/netfs.h | 2 +- include/trace/events/netfs.h | 4 +++- 4 files changed, 38 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index f65affa5a9e4..636cc5a98ef5 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -470,7 +470,8 @@ void netfs_read_collection_worker(struct work_struct *work) */ void netfs_wake_read_collector(struct netfs_io_request *rreq) { - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { if (!work_pending(&rreq->work)) { netfs_get_request(rreq, netfs_rreq_trace_get_work); if (!queue_work(system_unbound_wq, &rreq->work)) @@ -586,7 +587,8 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ /* If we are at the head of the queue, wake up the collector. */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) netfs_wake_read_collector(rreq); netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 2290af0d51ac..8316c4533a51 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -14,7 +14,6 @@ static void netfs_reissue_read(struct netfs_io_request *rreq, { __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); __set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); subreq->rreq->netfs_ops->issue_read(subreq); } @@ -48,6 +47,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) __clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); subreq->retry_count++; netfs_reset_iter(subreq); + netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); } } @@ -75,7 +75,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) struct iov_iter source; unsigned long long start, len; size_t part; - bool boundary = false; + bool boundary = false, subreq_superfluous = false; /* Go through the subreqs and find the next span of contiguous * buffer that we then rejig (cifs, for example, needs the @@ -116,8 +116,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) /* Work through the sublist. */ subreq = from; list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) { - if (!len) + if (!len) { + subreq_superfluous = true; break; + } subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start - subreq->transferred; subreq->len = len + subreq->transferred; @@ -154,19 +156,21 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_read(rreq, subreq); - if (subreq == to) + if (subreq == to) { + subreq_superfluous = false; break; + } } /* If we managed to use fewer subreqs, we can discard the * excess; if we used the same number, then we're done. */ if (!len) { - if (subreq == to) + if (!subreq_superfluous) continue; list_for_each_entry_safe_from(subreq, tmp, &stream->subrequests, rreq_link) { - trace_netfs_sreq(subreq, netfs_sreq_trace_discard); + trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); if (subreq == to) @@ -187,14 +191,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) subreq->source = NETFS_DOWNLOAD_FROM_SERVER; subreq->start = start; subreq->len = len; - subreq->debug_index = atomic_inc_return(&rreq->subreq_counter); subreq->stream_nr = stream->stream_nr; subreq->retry_count = 1; trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index, refcount_read(&subreq->ref), netfs_sreq_trace_new); - netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); list_add(&subreq->rreq_link, &to->rreq_link); to = list_next_entry(to, rreq_link); @@ -256,14 +258,32 @@ void netfs_retry_reads(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; + DEFINE_WAIT(myself); + + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + + finish_wait(&rreq->waitq, &myself); } + clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); netfs_retry_read_subrequests(rreq); diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 071d05d81d38..c86a11cfc4a3 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -278,7 +278,7 @@ struct netfs_io_request { #define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */ #define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */ #define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */ -#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */ +#define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */ #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark * write to cache on read */ const struct netfs_request_ops *netfs_ops; diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h index 6e699cadcb29..f880835f7695 100644 --- a/include/trace/events/netfs.h +++ b/include/trace/events/netfs.h @@ -99,7 +99,7 @@ EM(netfs_sreq_trace_limited, "LIMIT") \ EM(netfs_sreq_trace_need_clear, "N-CLR") \ EM(netfs_sreq_trace_partial_read, "PARTR") \ - EM(netfs_sreq_trace_need_retry, "NRTRY") \ + EM(netfs_sreq_trace_need_retry, "ND-RT") \ EM(netfs_sreq_trace_prepare, "PREP ") \ EM(netfs_sreq_trace_prep_failed, "PRPFL") \ EM(netfs_sreq_trace_progress, "PRGRS") \ @@ -108,7 +108,9 @@ EM(netfs_sreq_trace_short, "SHORT") \ EM(netfs_sreq_trace_split, "SPLIT") \ EM(netfs_sreq_trace_submit, "SUBMT") \ + EM(netfs_sreq_trace_superfluous, "SPRFL") \ EM(netfs_sreq_trace_terminated, "TERM ") \ + EM(netfs_sreq_trace_wait_for, "_WAIT") \ EM(netfs_sreq_trace_write, "WRITE") \ EM(netfs_sreq_trace_write_skip, "SKIP ") \ E_(netfs_sreq_trace_write_term, "WTERM") -- cgit v1.2.3 From 02d954c0fdf91845169cdacc7405b120f90afe01 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 10 Feb 2025 16:32:50 +0100 Subject: sched: Compact RSEQ concurrency IDs with reduced threads and affinity When a process reduces its number of threads or clears bits in its CPU affinity mask, the mm_cid allocation should eventually converge towards smaller values. However, the change introduced by: commit 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") adds a per-mm/CPU recent_cid which is never unset unless a thread migrates. This is a tradeoff between: A) Preserving cache locality after a transition from many threads to few threads, or after reducing the hamming weight of the allowed CPU mask. B) Making the mm_cid upper bounds wrt nr threads and allowed CPU mask easy to document and understand. C) Allowing applications to eventually react to mm_cid compaction after reduction of the nr threads or allowed CPU mask, making the tracking of mm_cid compaction easier by shrinking it back towards 0 or not. D) Making sure applications that periodically reduce and then increase again the nr threads or allowed CPU mask still benefit from good cache locality with mm_cid. Introduce the following changes: * After shrinking the number of threads or reducing the number of allowed CPUs, reduce the value of max_nr_cid so expansion of CID allocation will preserve cache locality if the number of threads or allowed CPUs increase again. * Only re-use a recent_cid if it is within the max_nr_cid upper bound, else find the first available CID. Fixes: 7e019dcc470f ("sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads") Signed-off-by: Mathieu Desnoyers Signed-off-by: Gabriele Monaco Signed-off-by: Peter Zijlstra (Intel) Tested-by: Gabriele Monaco Link: https://lkml.kernel.org/r/20250210153253.460471-2-gmonaco@redhat.com --- include/linux/mm_types.h | 7 ++++--- kernel/sched/sched.h | 25 ++++++++++++++++++++++--- 2 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6b27db7f9496..0234f14f2aa6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -875,10 +875,11 @@ struct mm_struct { */ unsigned int nr_cpus_allowed; /** - * @max_nr_cid: Maximum number of concurrency IDs allocated. + * @max_nr_cid: Maximum number of allowed concurrency + * IDs allocated. * - * Track the highest number of concurrency IDs allocated for the - * mm. + * Track the highest number of allowed concurrency IDs + * allocated for the mm. */ atomic_t max_nr_cid; /** diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b93c8c3dc05a..c8512a9fb022 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3698,10 +3698,28 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) { struct cpumask *cidmask = mm_cidmask(mm); struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid; - int cid = __this_cpu_read(pcpu_cid->recent_cid); + int cid, max_nr_cid, allowed_max_nr_cid; + /* + * After shrinking the number of threads or reducing the number + * of allowed cpus, reduce the value of max_nr_cid so expansion + * of cid allocation will preserve cache locality if the number + * of threads or allowed cpus increase again. + */ + max_nr_cid = atomic_read(&mm->max_nr_cid); + while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed), + atomic_read(&mm->mm_users))), + max_nr_cid > allowed_max_nr_cid) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */ + if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) { + max_nr_cid = allowed_max_nr_cid; + break; + } + } /* Try to re-use recent cid. This improves cache locality. */ - if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask)) + cid = __this_cpu_read(pcpu_cid->recent_cid); + if (!mm_cid_is_unset(cid) && cid < max_nr_cid && + !cpumask_test_and_set_cpu(cid, cidmask)) return cid; /* * Expand cid allocation if the maximum number of concurrency @@ -3709,8 +3727,9 @@ static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm) * and number of threads. Expanding cid allocation as much as * possible improves cache locality. */ - cid = atomic_read(&mm->max_nr_cid); + cid = max_nr_cid; while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) { + /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1)) continue; if (!cpumask_test_and_set_cpu(cid, cidmask)) -- cgit v1.2.3 From 84e009042d0f3dfe91bec60bcd208ee3f866cbcd Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Mon, 17 Feb 2025 17:08:27 +0100 Subject: nvme-tcp: add basic support for the C2HTermReq PDU Previously, the NVMe/TCP host driver did not handle the C2HTermReq PDU, instead printing "unsupported pdu type (3)" when received. This patch adds support for processing the C2HTermReq PDU, allowing the driver to print the Fatal Error Status field. Example of output: nvme nvme4: Received C2HTermReq (FES = Invalid PDU Header Field) Signed-off-by: Maurizio Lombardi Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch --- drivers/nvme/host/tcp.c | 43 +++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme-tcp.h | 2 ++ 2 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 841238f38fdd..038b35238c26 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -763,6 +763,40 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, return 0; } +static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue, + struct nvme_tcp_term_pdu *pdu) +{ + u16 fes; + const char *msg; + u32 plen = le32_to_cpu(pdu->hdr.plen); + + static const char * const msg_table[] = { + [NVME_TCP_FES_INVALID_PDU_HDR] = "Invalid PDU Header Field", + [NVME_TCP_FES_PDU_SEQ_ERR] = "PDU Sequence Error", + [NVME_TCP_FES_HDR_DIGEST_ERR] = "Header Digest Error", + [NVME_TCP_FES_DATA_OUT_OF_RANGE] = "Data Transfer Out Of Range", + [NVME_TCP_FES_R2T_LIMIT_EXCEEDED] = "R2T Limit Exceeded", + [NVME_TCP_FES_UNSUPPORTED_PARAM] = "Unsupported Parameter", + }; + + if (plen < NVME_TCP_MIN_C2HTERM_PLEN || + plen > NVME_TCP_MAX_C2HTERM_PLEN) { + dev_err(queue->ctrl->ctrl.device, + "Received a malformed C2HTermReq PDU (plen = %u)\n", + plen); + return; + } + + fes = le16_to_cpu(pdu->fes); + if (fes && fes < ARRAY_SIZE(msg_table)) + msg = msg_table[fes]; + else + msg = "Unknown"; + + dev_err(queue->ctrl->ctrl.device, + "Received C2HTermReq (FES = %s)\n", msg); +} + static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -784,6 +818,15 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return 0; hdr = queue->pdu; + if (unlikely(hdr->type == nvme_tcp_c2h_term)) { + /* + * C2HTermReq never includes Header or Data digests. + * Skip the checks. + */ + nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu); + return -EINVAL; + } + if (queue->hdr_digest) { ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); if (unlikely(ret)) diff --git a/include/linux/nvme-tcp.h b/include/linux/nvme-tcp.h index e07e8978d691..e435250fcb4d 100644 --- a/include/linux/nvme-tcp.h +++ b/include/linux/nvme-tcp.h @@ -13,6 +13,8 @@ #define NVME_TCP_ADMIN_CCSZ SZ_8K #define NVME_TCP_DIGEST_LENGTH 4 #define NVME_TCP_MIN_MAXH2CDATA 4096 +#define NVME_TCP_MIN_C2HTERM_PLEN 24 +#define NVME_TCP_MAX_C2HTERM_PLEN 152 enum nvme_tcp_pfv { NVME_TCP_PFV_1_0 = 0x0, -- cgit v1.2.3 From d422247d14a53fe825b1778edf104167d8fd8f3f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 13 Feb 2025 15:49:59 +0900 Subject: nvme: Cleanup the definition of the controller config register fields Reorganized the enum used to define the fields of the contrller configuration (CC) register in include/linux/nvme.h to: 1) Group together all the values defined for each field. 2) Add the missing field masks definitions. 3) Add comments to describe the enum and each field. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- include/linux/nvme.h | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index fe3b60818fdc..2dc05b1c3283 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -199,28 +199,54 @@ enum { #define NVME_NVM_IOSQES 6 #define NVME_NVM_IOCQES 4 +/* + * Controller Configuration (CC) register (Offset 14h) + */ enum { + /* Enable (EN): bit 0 */ NVME_CC_ENABLE = 1 << 0, NVME_CC_EN_SHIFT = 0, + + /* Bits 03:01 are reserved (NVMe Base Specification rev 2.1) */ + + /* I/O Command Set Selected (CSS): bits 06:04 */ NVME_CC_CSS_SHIFT = 4, - NVME_CC_MPS_SHIFT = 7, - NVME_CC_AMS_SHIFT = 11, - NVME_CC_SHN_SHIFT = 14, - NVME_CC_IOSQES_SHIFT = 16, - NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_NVM = 0 << NVME_CC_CSS_SHIFT, NVME_CC_CSS_CSI = 6 << NVME_CC_CSS_SHIFT, - NVME_CC_CSS_MASK = 7 << NVME_CC_CSS_SHIFT, + + /* Memory Page Size (MPS): bits 10:07 */ + NVME_CC_MPS_SHIFT = 7, + NVME_CC_MPS_MASK = 0xf << NVME_CC_MPS_SHIFT, + + /* Arbitration Mechanism Selected (AMS): bits 13:11 */ + NVME_CC_AMS_SHIFT = 11, + NVME_CC_AMS_MASK = 7 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_RR = 0 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_WRRU = 1 << NVME_CC_AMS_SHIFT, NVME_CC_AMS_VS = 7 << NVME_CC_AMS_SHIFT, + + /* Shutdown Notification (SHN): bits 15:14 */ + NVME_CC_SHN_SHIFT = 14, + NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NONE = 0 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_NORMAL = 1 << NVME_CC_SHN_SHIFT, NVME_CC_SHN_ABRUPT = 2 << NVME_CC_SHN_SHIFT, - NVME_CC_SHN_MASK = 3 << NVME_CC_SHN_SHIFT, + + /* I/O Submission Queue Entry Size (IOSQES): bits 19:16 */ + NVME_CC_IOSQES_SHIFT = 16, + NVME_CC_IOSQES_MASK = 0xf << NVME_CC_IOSQES_SHIFT, NVME_CC_IOSQES = NVME_NVM_IOSQES << NVME_CC_IOSQES_SHIFT, + + /* I/O Completion Queue Entry Size (IOCQES): bits 23:20 */ + NVME_CC_IOCQES_SHIFT = 20, + NVME_CC_IOCQES_MASK = 0xf << NVME_CC_IOCQES_SHIFT, NVME_CC_IOCQES = NVME_NVM_IOCQES << NVME_CC_IOCQES_SHIFT, + + /* Controller Ready Independent of Media Enable (CRIME): bit 24 */ NVME_CC_CRIME = 1 << 24, + + /* Bits 25:31 are reserved (NVMe Base Specification rev 2.1) */ }; enum { -- cgit v1.2.3 From 5bbd6e863b15a85221e49b9bdb2d5d8f0bb91f3d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 1 Feb 2025 15:00:02 -0500 Subject: SUNRPC: Prevent looping due to rpc_signal_task() races If rpc_signal_task() is called while a task is in an rpc_call_done() callback function, and the latter calls rpc_restart_call(), the task can end up looping due to the RPC_TASK_SIGNALLED flag being set without the tk_rpc_status being set. Removing the redundant mechanism for signalling the task fixes the looping behaviour. Reported-by: Li Lingfeng Fixes: 39494194f93b ("SUNRPC: Fix races with rpc_killall_tasks()") Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Anna Schumaker --- include/linux/sunrpc/sched.h | 3 +-- include/trace/events/sunrpc.h | 3 +-- net/sunrpc/sched.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index fec1e8a1570c..eac57914dcf3 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -158,7 +158,6 @@ enum { RPC_TASK_NEED_XMIT, RPC_TASK_NEED_RECV, RPC_TASK_MSG_PIN_WAIT, - RPC_TASK_SIGNALLED, }; #define rpc_test_and_set_running(t) \ @@ -171,7 +170,7 @@ enum { #define RPC_IS_ACTIVATED(t) test_bit(RPC_TASK_ACTIVE, &(t)->tk_runstate) -#define RPC_SIGNALLED(t) test_bit(RPC_TASK_SIGNALLED, &(t)->tk_runstate) +#define RPC_SIGNALLED(t) (READ_ONCE(task->tk_rpc_status) == -ERESTARTSYS) /* * Task priorities. diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index b13dc275ef4a..851841336ee6 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -360,8 +360,7 @@ TRACE_EVENT(rpc_request, { (1UL << RPC_TASK_ACTIVE), "ACTIVE" }, \ { (1UL << RPC_TASK_NEED_XMIT), "NEED_XMIT" }, \ { (1UL << RPC_TASK_NEED_RECV), "NEED_RECV" }, \ - { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }, \ - { (1UL << RPC_TASK_SIGNALLED), "SIGNALLED" }) + { (1UL << RPC_TASK_MSG_PIN_WAIT), "MSG_PIN_WAIT" }) DECLARE_EVENT_CLASS(rpc_task_running, diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cef623ea1506..9b45fbdc90ca 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -864,8 +864,6 @@ void rpc_signal_task(struct rpc_task *task) if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) return; trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) rpc_wake_up_queued_task(queue, task); -- cgit v1.2.3 From 4b5a28b38c4a0106c64416a1b2042405166b26ce Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 18 Feb 2025 05:49:30 -0800 Subject: net: Add non-RCU dev_getbyhwaddr() helper Add dedicated helper for finding devices by hardware address when holding rtnl_lock, similar to existing dev_getbyhwaddr_rcu(). This prevents PROVE_LOCKING warnings when rtnl_lock is held but RCU read lock is not. Extract common address comparison logic into dev_addr_cmp(). The context about this change could be found in the following discussion: Link: https://lore.kernel.org/all/20250206-scarlet-ermine-of-improvement-1fcac5@leitao/ Cc: kuniyu@amazon.com Cc: ushankar@purestorage.com Suggested-by: Eric Dumazet Signed-off-by: Breno Leitao Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250218-arm_fix_selftest-v5-1-d3d6892db9e1@debian.org Signed-off-by: Jakub Kicinski --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index c0a86afb85da..94b7d4eca003 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3275,6 +3275,8 @@ static inline struct net_device *first_net_device_rcu(struct net *net) } int netdev_boot_setup_check(struct net_device *dev); +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *hwaddr); struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, const char *hwaddr); struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type); diff --git a/net/core/dev.c b/net/core/dev.c index fafd2f4b5d5d..72459dd02f38 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1121,6 +1121,12 @@ out: return ret; } +static bool dev_addr_cmp(struct net_device *dev, unsigned short type, + const char *ha) +{ + return dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len); +} + /** * dev_getbyhwaddr_rcu - find a device by its hardware address * @net: the applicable net namespace @@ -1129,7 +1135,7 @@ out: * * Search for an interface by MAC address. Returns NULL if the device * is not found or a pointer to the device. - * The caller must hold RCU or RTNL. + * The caller must hold RCU. * The returned device has not had its ref count increased * and the caller must therefore be careful about locking * @@ -1141,14 +1147,39 @@ struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type, struct net_device *dev; for_each_netdev_rcu(net, dev) - if (dev->type == type && - !memcmp(dev->dev_addr, ha, dev->addr_len)) + if (dev_addr_cmp(dev, type, ha)) return dev; return NULL; } EXPORT_SYMBOL(dev_getbyhwaddr_rcu); +/** + * dev_getbyhwaddr() - find a device by its hardware address + * @net: the applicable net namespace + * @type: media type of device + * @ha: hardware address + * + * Similar to dev_getbyhwaddr_rcu(), but the owner needs to hold + * rtnl_lock. + * + * Context: rtnl_lock() must be held. + * Return: pointer to the net_device, or NULL if not found + */ +struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, + const char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + for_each_netdev(net, dev) + if (dev_addr_cmp(dev, type, ha)) + return dev; + + return NULL; +} +EXPORT_SYMBOL(dev_getbyhwaddr); + struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type) { struct net_device *dev, *ret = NULL; -- cgit v1.2.3 From 6bc7e4eb0499562ccd291712fd7be0d1a5aad00a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 18 Feb 2025 19:29:40 +0100 Subject: Revert "net: skb: introduce and use a single page frag cache" After the previous commit is finally safe to revert commit dbae2b062824 ("net: skb: introduce and use a single page frag cache"): do it here. The intended goal of such change was to counter a performance regression introduced by commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs"). Unfortunately, the blamed commit introduces another regression for the virtio_net driver. Such a driver calls napi_alloc_skb() with a tiny size, so that the whole head frag could fit a 512-byte block. The single page frag cache uses a 1K fragment for such allocation, and the additional overhead, under small UDP packets flood, makes the page allocator a bottleneck. Thanks to commit bf9f1baa279f ("net: add dedicated kmem_cache for typical/small skb->head"), this revert does not re-introduce the original regression. Actually, in the relevant test on top of this revert, I measure a small but noticeable positive delta, just above noise level. The revert itself required some additional mangling due to recent updates in the affected code. Suggested-by: Eric Dumazet Fixes: dbae2b062824 ("net: skb: introduce and use a single page frag cache") Reviewed-by: Eric Dumazet Signed-off-by: Paolo Abeni --- include/linux/netdevice.h | 1 - net/core/dev.c | 17 ++++++++ net/core/skbuff.c | 104 +++------------------------------------------- 3 files changed, 22 insertions(+), 100 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 94b7d4eca003..ab550a89b9bf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4117,7 +4117,6 @@ void netif_receive_skb_list(struct list_head *head); gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb); void napi_gro_flush(struct napi_struct *napi, bool flush_old); struct sk_buff *napi_get_frags(struct napi_struct *napi); -void napi_get_frags_check(struct napi_struct *napi); gro_result_t napi_gro_frags(struct napi_struct *napi); static inline void napi_free_frags(struct napi_struct *napi) diff --git a/net/core/dev.c b/net/core/dev.c index 72459dd02f38..1b252e9459fd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6991,6 +6991,23 @@ netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi) list_add_rcu(&napi->dev_list, higher); /* adds after higher */ } +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +static void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void netif_napi_add_weight_locked(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f5a6d50570c4..7b03b64fdcb2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -223,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) -#if PAGE_SIZE == SZ_4K - -#define NAPI_HAS_SMALL_PAGE_FRAG 1 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) - -/* specialized page frag allocator using a single order 0 page - * and slicing it into 1K sized fragment. Constrained to systems - * with a very limited amount of 1K fragments fitting a single - * page - to avoid excessive truesize underestimation - */ - -struct page_frag_1k { - void *va; - u16 offset; - bool pfmemalloc; -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) -{ - struct page *page; - int offset; - - offset = nc->offset - SZ_1K; - if (likely(offset >= 0)) - goto use_frag; - - page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); - if (!page) - return NULL; - - nc->va = page_address(page); - nc->pfmemalloc = page_is_pfmemalloc(page); - offset = PAGE_SIZE - SZ_1K; - page_ref_add(page, offset / SZ_1K); - -use_frag: - nc->offset = offset; - return nc->va + offset; -} -#else - -/* the small page is actually unused in this build; add dummy helpers - * to please the compiler and avoid later preprocessor's conditionals - */ -#define NAPI_HAS_SMALL_PAGE_FRAG 0 -#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false - -struct page_frag_1k { -}; - -static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) -{ - return NULL; -} - -#endif - struct napi_alloc_cache { local_lock_t bh_lock; struct page_frag_cache page; - struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -293,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = { .bh_lock = INIT_LOCAL_LOCK(bh_lock), }; -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -816,11 +741,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. - * When the small frag allocator is available, prefer it over kmalloc - * for small fragments */ - if ((!NAPI_HAS_SMALL_PAGE_FRAG && - len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) || + if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -830,32 +752,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len) goto skb_success; } + len = SKB_HEAD_ALIGN(len); + if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; local_lock_nested_bh(&napi_alloc_cache.bh_lock); nc = this_cpu_ptr(&napi_alloc_cache); - if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { - /* we are artificially inflating the allocation size, but - * that is not as bad as it may look like, as: - * - 'len' less than GRO_MAX_HEAD makes little sense - * - On most systems, larger 'len' values lead to fragment - * size above 512 bytes - * - kmalloc would use the kmalloc-1k slab for such values - * - Builds with smaller GRO_MAX_HEAD will very likely do - * little networking, as that implies no WiFi and no - * tunnels support, and 32 bits arches. - */ - len = SZ_1K; - data = page_frag_alloc_1k(&nc->page_small, gfp_mask); - pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); - } else { - len = SKB_HEAD_ALIGN(len); - - data = page_frag_alloc(&nc->page, len, gfp_mask); - pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); - } + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!data)) -- cgit v1.2.3 From b4c173dfbb6c78568578ff18f9e8822d7bd0e31b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Feb 2025 11:02:58 +0100 Subject: fuse: don't truncate cached, mutated symlink Fuse allows the value of a symlink to change and this property is exploited by some filesystems (e.g. CVMFS). It has been observed, that sometimes after changing the symlink contents, the value is truncated to the old size. This is caused by fuse_getattr() racing with fuse_reverse_inval_inode(). fuse_reverse_inval_inode() updates the fuse_inode's attr_version, which results in fuse_change_attributes() exiting before updating the cached attributes This is okay, as the cached attributes remain invalid and the next call to fuse_change_attributes() will likely update the inode with the correct values. The reason this causes problems is that cached symlinks will be returned through page_get_link(), which truncates the symlink to inode->i_size. This is correct for filesystems that don't mutate symlinks, but in this case it causes bad behavior. The solution is to just remove this truncation. This can cause a regression in a filesystem that relies on supplying a symlink larger than the file size, but this is unlikely. If that happens we'd need to make this behavior conditional. Reported-by: Laura Promberger Tested-by: Sam Lewis Signed-off-by: Miklos Szeredi Link: https://lore.kernel.org/r/20250220100258.793363-1-mszeredi@redhat.com Reviewed-by: Bernd Schubert Signed-off-by: Christian Brauner --- fs/fuse/dir.c | 2 +- fs/namei.c | 24 +++++++++++++++++++----- include/linux/fs.h | 2 ++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 198862b086ff..3805f9b06c9d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1636,7 +1636,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) diff --git a/fs/namei.c b/fs/namei.c index 3ab9440c5b93..ecb7b95c2ca3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5356,10 +5356,9 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) EXPORT_SYMBOL(vfs_get_link); /* get the link contents into pagecache */ -const char *page_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback) +static char *__page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - char *kaddr; struct page *page; struct address_space *mapping = inode->i_mapping; @@ -5378,8 +5377,23 @@ const char *page_get_link(struct dentry *dentry, struct inode *inode, } set_delayed_call(callback, page_put_link, page); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); - kaddr = page_address(page); - nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); + return page_address(page); +} + +const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + return __page_get_link(dentry, inode, callback); +} +EXPORT_SYMBOL_GPL(page_get_link_raw); + +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + char *kaddr = __page_get_link(dentry, inode, callback); + + if (!IS_ERR(kaddr)) + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 2c3b2f8a621f..9346adf28f7b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3452,6 +3452,8 @@ extern const struct file_operations generic_ro_fops; extern int readlink_copy(char __user *, int, const char *, int); extern int page_readlink(struct dentry *, char __user *, int); +extern const char *page_get_link_raw(struct dentry *, struct inode *, + struct delayed_call *); extern const char *page_get_link(struct dentry *, struct inode *, struct delayed_call *); extern void page_put_link(void *); -- cgit v1.2.3 From 8510edf191d2df0822ea22d6226e4eef87562271 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Tue, 18 Feb 2025 20:02:08 +0800 Subject: mm/filemap: fix miscalculated file range for filemap_fdatawrite_range_kick() iocb->ki_pos has been updated with the number of written bytes since generic_perform_write(). Besides __filemap_fdatawrite_range() accepts the inclusive end of the data range. Fixes: 1d4457576570 ("mm: call filemap_fdatawrite_range_kick() after IOCB_DONTCACHE issue") Signed-off-by: Jingbo Xu Link: https://lore.kernel.org/r/20250218120209.88093-2-jefflexu@linux.alibaba.com Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- include/linux/fs.h | 4 ++-- mm/filemap.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 9346adf28f7b..2788df98080f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2975,8 +2975,8 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count) } else if (iocb->ki_flags & IOCB_DONTCACHE) { struct address_space *mapping = iocb->ki_filp->f_mapping; - filemap_fdatawrite_range_kick(mapping, iocb->ki_pos, - iocb->ki_pos + count); + filemap_fdatawrite_range_kick(mapping, iocb->ki_pos - count, + iocb->ki_pos - 1); } return count; diff --git a/mm/filemap.c b/mm/filemap.c index 804d7365680c..d4564a79eb35 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -445,7 +445,7 @@ EXPORT_SYMBOL(filemap_fdatawrite_range); * filemap_fdatawrite_range_kick - start writeback on a range * @mapping: target address_space * @start: index to start writeback on - * @end: last (non-inclusive) index for writeback + * @end: last (inclusive) index for writeback * * This is a non-integrity writeback helper, to start writing back folios * for the indicated range. -- cgit v1.2.3 From 889c57066ceee5e9172232da0608a8ac053bb6e5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 25 Feb 2025 10:21:41 +0800 Subject: block: make segment size limit workable for > 4K PAGE_SIZE Using PAGE_SIZE as a minimum expected DMA segment size in consideration of devices which have a max DMA segment size of < 64k when used on 64k PAGE_SIZE systems leads to devices not being able to probe such as eMMC and Exynos UFS controller [0] [1] you can end up with a probe failure as follows: WARNING: CPU: 2 PID: 397 at block/blk-settings.c:339 blk_validate_limits+0x364/0x3c0 Ensure we use min(max_seg_size, seg_boundary_mask + 1) as the new min segment size when max segment size is < PAGE_SIZE for 16k and 64k base page size systems. If anyone need to backport this patch, the following commits are depended: commit 6aeb4f836480 ("block: remove bio_add_pc_page") commit 02ee5d69e3ba ("block: remove blk_rq_bio_prep") commit b7175e24d6ac ("block: add a dma mapping iterator") Link: https://lore.kernel.org/linux-block/20230612203314.17820-1-bvanassche@acm.org/ # [0] Link: https://lore.kernel.org/linux-block/1d55e942-5150-de4c-3a02-c3d066f87028@acm.org/ # [1] Cc: Yi Zhang Cc: John Garry Cc: Keith Busch Tested-by: Paul Bunyan Reviewed-by: Daniel Gomez Reviewed-by: Luis Chamberlain Reviewed-by: Bart Van Assche Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250225022141.2154581-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-settings.c | 14 +++++++++++--- block/blk.h | 9 +++++++-- include/linux/blkdev.h | 1 + 4 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/block/blk-merge.c b/block/blk-merge.c index c7c85e10cf9c..1d1589c35297 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -329,7 +329,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim, if (nsegs < lim->max_segments && bytes + bv.bv_len <= max_bytes && - bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + bv.bv_offset + bv.bv_len <= lim->min_segment_size) { nsegs++; bytes += bv.bv_len; } else { diff --git a/block/blk-settings.c b/block/blk-settings.c index c44dadc35e1e..b9c6f0ec1c49 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -246,6 +246,7 @@ int blk_validate_limits(struct queue_limits *lim) { unsigned int max_hw_sectors; unsigned int logical_block_sectors; + unsigned long seg_size; int err; /* @@ -303,7 +304,7 @@ int blk_validate_limits(struct queue_limits *lim) max_hw_sectors = min_not_zero(lim->max_hw_sectors, lim->max_dev_sectors); if (lim->max_user_sectors) { - if (lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + if (lim->max_user_sectors < BLK_MIN_SEGMENT_SIZE / SECTOR_SIZE) return -EINVAL; lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); } else if (lim->io_opt > (BLK_DEF_MAX_SECTORS_CAP << SECTOR_SHIFT)) { @@ -341,7 +342,7 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->seg_boundary_mask) lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + if (WARN_ON_ONCE(lim->seg_boundary_mask < BLK_MIN_SEGMENT_SIZE - 1)) return -EINVAL; /* @@ -362,10 +363,17 @@ int blk_validate_limits(struct queue_limits *lim) */ if (!lim->max_segment_size) lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + if (WARN_ON_ONCE(lim->max_segment_size < BLK_MIN_SEGMENT_SIZE)) return -EINVAL; } + /* setup min segment size for building new segment in fast path */ + if (lim->seg_boundary_mask > lim->max_segment_size - 1) + seg_size = lim->max_segment_size; + else + seg_size = lim->seg_boundary_mask + 1; + lim->min_segment_size = min_t(unsigned int, seg_size, PAGE_SIZE); + /* * We require drivers to at least do logical block aligned I/O, but * historically could not check for that due to the separate calls diff --git a/block/blk.h b/block/blk.h index 90fa5f28ccab..9cf9a0099416 100644 --- a/block/blk.h +++ b/block/blk.h @@ -14,6 +14,7 @@ struct elevator_type; #define BLK_DEV_MAX_SECTORS (LLONG_MAX >> 9) +#define BLK_MIN_SEGMENT_SIZE 4096 /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -358,8 +359,12 @@ struct bio *bio_split_zone_append(struct bio *bio, static inline bool bio_may_need_split(struct bio *bio, const struct queue_limits *lim) { - return lim->chunk_sectors || bio->bi_vcnt != 1 || - bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; + if (lim->chunk_sectors) + return true; + if (bio->bi_vcnt != 1) + return true; + return bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > + lim->min_segment_size; } /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 248416ecd01c..58ff5aca83b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -367,6 +367,7 @@ struct queue_limits { unsigned int max_sectors; unsigned int max_user_sectors; unsigned int max_segment_size; + unsigned int min_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; -- cgit v1.2.3 From 73cfc53cc3b6380eccf013049574485f64cb83ca Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 21 Feb 2025 14:57:07 +0100 Subject: objtool: Fix C jump table annotations for Clang A C jump table (such as the one used by the BPF interpreter) is a const global array of absolute code addresses, and this means that the actual values in the table may not be known until the kernel is booted (e.g., when using KASLR or when the kernel VA space is sized dynamically). When using PIE codegen, the compiler will default to placing such const global objects in .data.rel.ro (which is annotated as writable), rather than .rodata (which is annotated as read-only). As C jump tables are explicitly emitted into .rodata, this used to result in warnings for LoongArch builds (which uses PIE codegen for the entire kernel) like Warning: setting incorrect section attributes for .rodata..c_jump_table due to the fact that the explicitly specified .rodata section inherited the read-write annotation that the compiler uses for such objects when using PIE codegen. This warning was suppressed by explicitly adding the read-only annotation to the __attribute__((section(""))) string, by commit c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Unfortunately, this hack does not work on Clang's integrated assembler, which happily interprets the appended section type and permission specifiers as part of the section name, which therefore no longer matches the hard-coded pattern '.rodata..c_jump_table' that objtool expects, causing it to emit a warning kernel/bpf/core.o: warning: objtool: ___bpf_prog_run+0x20: sibling call from callable instruction with modified stack frame Work around this, by emitting C jump tables into .data.rel.ro instead, which is treated as .rodata by the linker script for all builds, not just PIE based ones. Fixes: c5b1184decc8 ("compiler.h: specify correct attribute for .rodata..c_jump_table") Tested-by: Tiezhu Yang # on LoongArch Signed-off-by: Ard Biesheuvel Link: https://lore.kernel.org/r/20250221135704.431269-6-ardb+git@google.com Signed-off-by: Josh Poimboeuf --- include/linux/compiler.h | 2 +- tools/objtool/check.c | 7 ++++--- tools/objtool/include/objtool/special.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compiler.h b/include/linux/compiler.h index b087de2f3e94..0c25f3e429bb 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -110,7 +110,7 @@ void ftrace_likely_update(struct ftrace_likely_data *f, int val, /* Unreachable code */ #ifdef CONFIG_OBJTOOL /* Annotate a C jump table to allow objtool to follow the code flow */ -#define __annotate_jump_table __section(".rodata..c_jump_table,\"a\",@progbits #") +#define __annotate_jump_table __section(".data.rel.ro.c_jump_table") #else /* !CONFIG_OBJTOOL */ #define __annotate_jump_table #endif /* CONFIG_OBJTOOL */ diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 497cb8dfb3eb..1b5a1b3ea7a9 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2471,13 +2471,14 @@ static void mark_rodata(struct objtool_file *file) * * - .rodata: can contain GCC switch tables * - .rodata.: same, if -fdata-sections is being used - * - .rodata..c_jump_table: contains C annotated jump tables + * - .data.rel.ro.c_jump_table: contains C annotated jump tables * * .rodata.str1.* sections are ignored; they don't contain jump tables. */ for_each_sec(file, sec) { - if (!strncmp(sec->name, ".rodata", 7) && - !strstr(sec->name, ".str1.")) { + if ((!strncmp(sec->name, ".rodata", 7) && + !strstr(sec->name, ".str1.")) || + !strncmp(sec->name, ".data.rel.ro", 12)) { sec->rodata = true; found = true; } diff --git a/tools/objtool/include/objtool/special.h b/tools/objtool/include/objtool/special.h index e7ee7ffccefd..e049679bb17b 100644 --- a/tools/objtool/include/objtool/special.h +++ b/tools/objtool/include/objtool/special.h @@ -10,7 +10,7 @@ #include #include -#define C_JUMP_TABLE_SECTION ".rodata..c_jump_table" +#define C_JUMP_TABLE_SECTION ".data.rel.ro.c_jump_table" struct special_alt { struct list_head list; -- cgit v1.2.3 From 9084ed79ddaaaa1ec01cd304af9fb532c26252db Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Thu, 20 Feb 2025 14:29:36 -0500 Subject: lsm,nfs: fix memory leak of lsm_context commit b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") did not preserve the lsm id for subsequent release calls, which results in a memory leak. Fix it by saving the lsm id in the nfs4_label and providing it on the subsequent release call. Fixes: b530104f50e8 ("lsm: lsm_context in security_dentry_init_security") Signed-off-by: Stephen Smalley Acked-by: Paul Moore Acked-by: Casey Schaufler Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 7 ++++--- include/linux/nfs4.h | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c25ecdb76d30..6e95db6c17e9 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -133,6 +133,7 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, if (err) return NULL; + label->lsmid = shim.id; label->label = shim.context; label->len = shim.len; return label; @@ -145,7 +146,7 @@ nfs4_label_release_security(struct nfs4_label *label) if (label) { shim.context = label->label; shim.len = label->len; - shim.id = LSM_ID_UNDEF; + shim.id = label->lsmid; security_release_secctx(&shim); } } @@ -6272,7 +6273,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf, size_t buflen) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs4_label label = {0, 0, buflen, buf}; + struct nfs4_label label = {0, 0, 0, buflen, buf}; u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL }; struct nfs_fattr fattr = { @@ -6377,7 +6378,7 @@ static int nfs4_do_set_security_label(struct inode *inode, static int nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen) { - struct nfs4_label ilabel = {0, 0, buflen, (char *)buf }; + struct nfs4_label ilabel = {0, 0, 0, buflen, (char *)buf }; struct nfs_fattr *fattr; int status; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index 71fbebfa43c7..9ac83ca88326 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -47,6 +47,7 @@ struct nfs4_acl { struct nfs4_label { uint32_t lfs; uint32_t pi; + u32 lsmid; u32 len; char *label; }; -- cgit v1.2.3 From 18912c520674ec4d920fe3826e7e4fefeecdf5ae Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 24 Feb 2025 09:44:01 -0800 Subject: tcp: devmem: don't write truncated dmabuf CMSGs to userspace Currently, we report -ETOOSMALL (err) only on the first iteration (!sent). When we get put_cmsg error after a bunch of successful put_cmsg calls, we don't signal the error at all. This might be confusing on the userspace side which will see truncated CMSGs but no MSG_CTRUNC signal. Consider the following case: - sizeof(struct cmsghdr) = 16 - sizeof(struct dmabuf_cmsg) = 24 - total cmsg size (CMSG_LEN) = 40 (16+24) When calling recvmsg with msg_controllen=60, the userspace will receive two(!) dmabuf_cmsg(s), the first one will be a valid one and the second one will be silently truncated. There is no easy way to discover the truncation besides doing something like "cm->cmsg_len != CMSG_LEN(sizeof(dmabuf_cmsg))". Introduce new put_devmem_cmsg wrapper that reports an error instead of doing the truncation. Mina suggests that it's the intended way this API should work. Note that we might now report MSG_CTRUNC when the users (incorrectly) call us with msg_control == NULL. Fixes: 8f0b3cc9a4c1 ("tcp: RX path for devmem TCP") Reviewed-by: Mina Almasry Signed-off-by: Stanislav Fomichev Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250224174401.3582695-1-sdf@fomichev.me Signed-off-by: Jakub Kicinski --- include/linux/socket.h | 2 ++ net/core/scm.c | 10 ++++++++++ net/ipv4/tcp.c | 26 ++++++++++---------------- 3 files changed, 22 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index d18cc47e89bd..c3322eb3d686 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -392,6 +392,8 @@ struct ucred { extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +extern int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data); struct timespec64; struct __kernel_timespec; diff --git a/net/core/scm.c b/net/core/scm.c index 4f6a14babe5a..733c0cbd393d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -282,6 +282,16 @@ efault: } EXPORT_SYMBOL(put_cmsg); +int put_cmsg_notrunc(struct msghdr *msg, int level, int type, int len, + void *data) +{ + /* Don't produce truncated CMSGs */ + if (!msg->msg_control || msg->msg_controllen < CMSG_LEN(len)) + return -ETOOSMALL; + + return put_cmsg(msg, level, type, len, data); +} + void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) { struct scm_timestamping64 tss; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0d704bda6c41..d74281eca14f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2438,14 +2438,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, */ memset(&dmabuf_cmsg, 0, sizeof(dmabuf_cmsg)); dmabuf_cmsg.frag_size = copy; - err = put_cmsg(msg, SOL_SOCKET, SO_DEVMEM_LINEAR, - sizeof(dmabuf_cmsg), &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_LINEAR, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } sent += copy; @@ -2499,16 +2497,12 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, offset += copy; remaining_len -= copy; - err = put_cmsg(msg, SOL_SOCKET, - SO_DEVMEM_DMABUF, - sizeof(dmabuf_cmsg), - &dmabuf_cmsg); - if (err || msg->msg_flags & MSG_CTRUNC) { - msg->msg_flags &= ~MSG_CTRUNC; - if (!err) - err = -ETOOSMALL; + err = put_cmsg_notrunc(msg, SOL_SOCKET, + SO_DEVMEM_DMABUF, + sizeof(dmabuf_cmsg), + &dmabuf_cmsg); + if (err) goto out; - } atomic_long_inc(&niov->pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); -- cgit v1.2.3 From a6aa36e957a1bfb5341986dec32d013d23228fe1 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 14 Feb 2025 13:14:34 +0900 Subject: block: Remove zone write plugs when handling native zone append writes For devices that natively support zone append operations, REQ_OP_ZONE_APPEND BIOs are not processed through zone write plugging and are immediately issued to the zoned device. This means that there is no write pointer offset tracking done for these operations and that a zone write plug is not necessary. However, when receiving a zone append BIO, we may already have a zone write plug for the target zone if that zone was previously partially written using regular write operations. In such case, since the write pointer offset of the zone write plug is not incremented by the amount of sectors appended to the zone, 2 issues arise: 1) we risk leaving the plug in the disk hash table if the zone is fully written using zone append or regular write operations, because the write pointer offset will never reach the "zone full" state. 2) Regular write operations that are issued after zone append operations will always be failed by blk_zone_wplug_prepare_bio() as the write pointer alignment check will fail, even if the user correctly accounted for the zone append operations and issued the regular writes with a correct sector. Avoid these issues by immediately removing the zone write plug of zones that are the target of zone append operations when blk_zone_plug_bio() is called. The new function blk_zone_wplug_handle_native_zone_append() implements this for devices that natively support zone append. The removal of the zone write plug using disk_remove_zone_wplug() requires aborting all plugged regular write using disk_zone_wplug_abort() as otherwise the plugged write BIOs would never be executed (with the plug removed, the completion path will never see again the zone write plug as disk_get_zone_wplug() will return NULL). Rate-limited warnings are added to blk_zone_wplug_handle_native_zone_append() and to disk_zone_wplug_abort() to signal this. Since blk_zone_wplug_handle_native_zone_append() is called in the hot path for operations that will not be plugged, disk_get_zone_wplug() is optimized under the assumption that a user issuing zone append operations is not at the same time issuing regular writes and that there are no hashed zone write plugs. The struct gendisk atomic counter nr_zone_wplugs is added to check this, with this counter incremented in disk_insert_zone_wplug() and decremented in disk_remove_zone_wplug(). To be consistent with this fix, we do not need to fill the zone write plug hash table with zone write plugs for zones that are partially written for a device that supports native zone append operations. So modify blk_revalidate_seq_zone() to return early to avoid allocating and inserting a zone write plug for partially written sequential zones if the device natively supports zone append. Reported-by: Jorgen Hansen Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Tested-by: Jorgen Hansen Link: https://lore.kernel.org/r/20250214041434.82564-1-dlemoal@kernel.org Signed-off-by: Jens Axboe --- block/blk-zoned.c | 76 +++++++++++++++++++++++++++++++++++++++++++++----- include/linux/blkdev.h | 7 +++-- 2 files changed, 73 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 761ea662ddc3..0c77244a35c9 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -410,13 +410,14 @@ static bool disk_insert_zone_wplug(struct gendisk *disk, } } hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + atomic_inc(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); return true; } -static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, - sector_t sector) +static struct blk_zone_wplug *disk_get_hashed_zone_wplug(struct gendisk *disk, + sector_t sector) { unsigned int zno = disk_zone_no(disk, sector); unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); @@ -437,6 +438,15 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, return NULL; } +static inline struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) +{ + if (!atomic_read(&disk->nr_zone_wplugs)) + return NULL; + + return disk_get_hashed_zone_wplug(disk, sector); +} + static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) { struct blk_zone_wplug *zwplug = @@ -503,6 +513,7 @@ static void disk_remove_zone_wplug(struct gendisk *disk, zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; spin_lock_irqsave(&disk->zone_wplugs_lock, flags); hlist_del_init_rcu(&zwplug->node); + atomic_dec(&disk->nr_zone_wplugs); spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); disk_put_zone_wplug(zwplug); } @@ -593,6 +604,11 @@ static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { struct bio *bio; + if (bio_list_empty(&zwplug->bio_list)) + return; + + pr_warn_ratelimited("%s: zone %u: Aborting plugged BIOs\n", + zwplug->disk->disk_name, zwplug->zone_no); while ((bio = bio_list_pop(&zwplug->bio_list))) blk_zone_wplug_bio_io_error(zwplug, bio); } @@ -1040,6 +1056,47 @@ plug: return true; } +static void blk_zone_wplug_handle_native_zone_append(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * We have native support for zone append operations, so we are not + * going to handle @bio through plugging. However, we may already have a + * zone write plug for the target zone if that zone was previously + * partially written using regular writes. In such case, we risk leaving + * the plug in the disk hash table if the zone is fully written using + * zone append operations. Avoid this by removing the zone write plug. + */ + zwplug = disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + if (likely(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * We are about to remove the zone write plug. But if the user + * (mistakenly) has issued regular writes together with native zone + * append, we must aborts the writes as otherwise the plugged BIOs would + * not be executed by the plug BIO work as disk_get_zone_wplug() will + * return NULL after the plug is removed. Aborting the plugged write + * BIOs is consistent with the fact that these writes will most likely + * fail anyway as there is no ordering guarantees between zone append + * operations and regular write operations. + */ + if (!bio_list_empty(&zwplug->bio_list)) { + pr_warn_ratelimited("%s: zone %u: Invalid mix of zone append and regular writes\n", + disk->disk_name, zwplug->zone_no); + disk_zone_wplug_abort(zwplug); + } + disk_remove_zone_wplug(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + + disk_put_zone_wplug(zwplug); +} + /** * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging * @bio: The BIO being submitted @@ -1096,8 +1153,10 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) */ switch (bio_op(bio)) { case REQ_OP_ZONE_APPEND: - if (!bdev_emulates_zone_append(bdev)) + if (!bdev_emulates_zone_append(bdev)) { + blk_zone_wplug_handle_native_zone_append(bio); return false; + } fallthrough; case REQ_OP_WRITE: case REQ_OP_WRITE_ZEROES: @@ -1284,6 +1343,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk, { unsigned int i; + atomic_set(&disk->nr_zone_wplugs, 0); disk->zone_wplugs_hash_bits = min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); @@ -1338,6 +1398,7 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) } } + WARN_ON_ONCE(atomic_read(&disk->nr_zone_wplugs)); kfree(disk->zone_wplugs_hash); disk->zone_wplugs_hash = NULL; disk->zone_wplugs_hash_bits = 0; @@ -1550,11 +1611,12 @@ static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, } /* - * We need to track the write pointer of all zones that are not - * empty nor full. So make sure we have a zone write plug for - * such zone if the device has a zone write plug hash table. + * If the device needs zone append emulation, we need to track the + * write pointer of all zones that are not empty nor full. So make sure + * we have a zone write plug for such zone if the device has a zone + * write plug hash table. */ - if (!disk->zone_wplugs_hash) + if (!queue_emulates_zone_append(disk->queue) || !disk->zone_wplugs_hash) return 0; disk_zone_wplug_sync_wp_offset(disk, zone); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 58ff5aca83b6..d37751789bf5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -196,10 +196,11 @@ struct gendisk { unsigned int zone_capacity; unsigned int last_zone_capacity; unsigned long __rcu *conv_zones_bitmap; - unsigned int zone_wplugs_hash_bits; - spinlock_t zone_wplugs_lock; + unsigned int zone_wplugs_hash_bits; + atomic_t nr_zone_wplugs; + spinlock_t zone_wplugs_lock; struct mempool_s *zone_wplugs_pool; - struct hlist_head *zone_wplugs_hash; + struct hlist_head *zone_wplugs_hash; struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ -- cgit v1.2.3 From 02410ac72ac3707936c07ede66e94360d0d65319 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Wed, 26 Feb 2025 12:06:51 +0000 Subject: mm: hugetlb: Add huge page size param to huge_ptep_get_and_clear() In order to fix a bug, arm64 needs to be told the size of the huge page for which the huge_pte is being cleared in huge_ptep_get_and_clear(). Provide for this by adding an `unsigned long sz` parameter to the function. This follows the same pattern as huge_pte_clear() and set_huge_pte_at(). This commit makes the required interface modifications to the core mm as well as all arches that implement this function (arm64, loongarch, mips, parisc, powerpc, riscv, s390, sparc). The actual arm64 bug will be fixed in a separate commit. Cc: stable@vger.kernel.org Fixes: 66b3923a1a0f ("arm64: hugetlb: add support for PTE contiguous bit") Acked-by: David Hildenbrand Reviewed-by: Alexandre Ghiti # riscv Reviewed-by: Christophe Leroy Reviewed-by: Catalin Marinas Reviewed-by: Anshuman Khandual Signed-off-by: Ryan Roberts Acked-by: Alexander Gordeev # s390 Link: https://lore.kernel.org/r/20250226120656.2400136-2-ryan.roberts@arm.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/hugetlb.h | 4 ++-- arch/arm64/mm/hugetlbpage.c | 8 +++++--- arch/loongarch/include/asm/hugetlb.h | 6 ++++-- arch/mips/include/asm/hugetlb.h | 6 ++++-- arch/parisc/include/asm/hugetlb.h | 2 +- arch/parisc/mm/hugetlbpage.c | 2 +- arch/powerpc/include/asm/hugetlb.h | 6 ++++-- arch/riscv/include/asm/hugetlb.h | 3 ++- arch/riscv/mm/hugetlbpage.c | 2 +- arch/s390/include/asm/hugetlb.h | 16 ++++++++++++---- arch/s390/mm/hugetlbpage.c | 4 ++-- arch/sparc/include/asm/hugetlb.h | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- include/asm-generic/hugetlb.h | 2 +- include/linux/hugetlb.h | 4 +++- mm/hugetlb.c | 4 ++-- 16 files changed, 46 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index c6dff3e69539..03db9cb21ace 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -42,8 +42,8 @@ extern int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT extern void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 98a2a0e64e25..06db4649af91 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -396,8 +396,8 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, __pte_clear(mm, addr, ptep); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) { int ncontig; size_t pgsize; @@ -549,6 +549,8 @@ bool __init arch_hugetlb_valid_size(unsigned long size) pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + unsigned long psize = huge_page_size(hstate_vma(vma)); + if (alternative_has_cap_unlikely(ARM64_WORKAROUND_2645198)) { /* * Break-before-make (BBM) is required for all user space mappings @@ -558,7 +560,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index c8e4057734d0..4dc4b3e04225 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -36,7 +36,8 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = ptep_get(ptep); @@ -51,8 +52,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index d0a86ce83de9..fbc71ddcf0f6 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -27,7 +27,8 @@ static inline int prepare_hugepage_range(struct file *file, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { pte_t clear; pte_t pte = *ptep; @@ -42,13 +43,14 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); /* * clear the huge pte entry firstly, so that the other smp threads will * not get old pte entry after finishing flush_tlb_page and before * setting new huge pte entry */ - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_tlb_page(vma, addr); return pte; } diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 5b3a5429f71b..21e9ace17739 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -10,7 +10,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index e9d18cf25b79..a94fe546d434 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -126,7 +126,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t entry; diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index dad2e7980f24..86326587e58d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -45,7 +45,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, + unsigned long sz) { return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 1)); } @@ -55,8 +56,9 @@ static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pte_t pte; + unsigned long sz = huge_page_size(hstate_vma(vma)); - pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, sz); flush_hugetlb_page(vma, addr); return pte; } diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index faf3624d8057..446126497768 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -28,7 +28,8 @@ void set_huge_pte_at(struct mm_struct *mm, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); + unsigned long addr, pte_t *ptep, + unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/riscv/mm/hugetlbpage.c b/arch/riscv/mm/hugetlbpage.c index 42314f093922..b4a78a4b35cf 100644 --- a/arch/riscv/mm/hugetlbpage.c +++ b/arch/riscv/mm/hugetlbpage.c @@ -293,7 +293,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { pte_t orig_pte = ptep_get(ptep); int pte_num; diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 7c52acaf9f82..663e87220e89 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -25,8 +25,16 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep); + #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned long sz) +{ + return __huge_ptep_get_and_clear(mm, addr, ptep); +} static inline void arch_clear_hugetlb_flags(struct folio *folio) { @@ -48,7 +56,7 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); + return __huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } #define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS @@ -59,7 +67,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, int changed = !pte_same(huge_ptep_get(vma->vm_mm, addr, ptep), pte); if (changed) { - huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + __huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); __set_huge_pte_at(vma->vm_mm, addr, ptep, pte); } return changed; @@ -69,7 +77,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_t pte = huge_ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __huge_ptep_get_and_clear(mm, addr, ptep); __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index d9ce199953de..2e568f175cd4 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -188,8 +188,8 @@ pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep) return __rste_to_pte(pte_val(*ptep)); } -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) +pte_t __huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) { pte_t pte = huge_ptep_get(mm, addr, ptep); pmd_t *pmdp = (pmd_t *) ptep; diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index c714ca6a05aa..e7a9cdd498dc 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -20,7 +20,7 @@ void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); + pte_t *ptep, unsigned long sz); #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index eee601a0d2cf..80504148d8a5 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -260,7 +260,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) + pte_t *ptep, unsigned long sz) { unsigned int i, nptes, orig_shift, shift; unsigned long size; diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index f42133dae68e..2afc95bf1655 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -90,7 +90,7 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, #ifndef __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) + unsigned long addr, pte_t *ptep, unsigned long sz) { return ptep_get_and_clear(mm, addr, ptep); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ec8c0ccc8f95..bf5f7256bd28 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1004,7 +1004,9 @@ static inline void hugetlb_count_sub(long l, struct mm_struct *mm) static inline pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { - return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); + unsigned long psize = huge_page_size(hstate_vma(vma)); + + return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep, psize); } #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 65068671e460..de9d49e521c1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5447,7 +5447,7 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr, if (src_ptl != dst_ptl) spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); - pte = huge_ptep_get_and_clear(mm, old_addr, src_pte); + pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) huge_pte_clear(mm, new_addr, dst_pte, sz); @@ -5622,7 +5622,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear(mm, address, ptep, sz); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); -- cgit v1.2.3 From 916b7f42b3b3b539a71c204a9b49fdc4ca92cd82 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 27 Feb 2025 15:06:31 -0800 Subject: kvm: retry nx_huge_page_recovery_thread creation A VMM may send a non-fatal signal to its threads, including vCPU tasks, at any time, and thus may signal vCPU tasks during KVM_RUN. If a vCPU task receives the signal while its trying to spawn the huge page recovery vhost task, then KVM_RUN will fail due to copy_process() returning -ERESTARTNOINTR. Rework call_once() to mark the call complete if and only if the called function succeeds, and plumb the function's true error code back to the call_once() invoker. This provides userspace with the correct, non-fatal error code so that the VMM doesn't terminate the VM on -ENOMEM, and allows subsequent KVM_RUN a succeed by virtue of retrying creation of the NX huge page task. Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson [implemented the kvm user side] Signed-off-by: Keith Busch Message-ID: <20250227230631.303431-3-kbusch@meta.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 10 ++++------ include/linux/call_once.h | 47 ++++++++++++++++++++++++++++++++++------------- 2 files changed, 38 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 18ca1ea6dc24..8160870398b9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7460,7 +7460,7 @@ static bool kvm_nx_huge_page_recovery_worker(void *data) return true; } -static void kvm_mmu_start_lpage_recovery(struct once *once) +static int kvm_mmu_start_lpage_recovery(struct once *once) { struct kvm_arch *ka = container_of(once, struct kvm_arch, nx_once); struct kvm *kvm = container_of(ka, struct kvm, arch); @@ -7472,12 +7472,13 @@ static void kvm_mmu_start_lpage_recovery(struct once *once) kvm, "kvm-nx-lpage-recovery"); if (IS_ERR(nx_thread)) - return; + return PTR_ERR(nx_thread); vhost_task_start(nx_thread); /* Make the task visible only once it is fully started. */ WRITE_ONCE(kvm->arch.nx_huge_page_recovery_thread, nx_thread); + return 0; } int kvm_mmu_post_init_vm(struct kvm *kvm) @@ -7485,10 +7486,7 @@ int kvm_mmu_post_init_vm(struct kvm *kvm) if (nx_hugepage_mitigation_hard_disabled) return 0; - call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); - if (!kvm->arch.nx_huge_page_recovery_thread) - return -ENOMEM; - return 0; + return call_once(&kvm->arch.nx_once, kvm_mmu_start_lpage_recovery); } void kvm_mmu_pre_destroy_vm(struct kvm *kvm) diff --git a/include/linux/call_once.h b/include/linux/call_once.h index 6261aa0b3fb0..13cd6469e7e5 100644 --- a/include/linux/call_once.h +++ b/include/linux/call_once.h @@ -26,20 +26,41 @@ do { \ __once_init((once), #once, &__key); \ } while (0) -static inline void call_once(struct once *once, void (*cb)(struct once *)) +/* + * call_once - Ensure a function has been called exactly once + * + * @once: Tracking struct + * @cb: Function to be called + * + * If @once has never completed successfully before, call @cb and, if + * it returns a zero or positive value, mark @once as completed. Return + * the value returned by @cb + * + * If @once has completed succesfully before, return 0. + * + * The call to @cb is implicitly surrounded by a mutex, though for + * efficiency the * function avoids taking it after the first call. + */ +static inline int call_once(struct once *once, int (*cb)(struct once *)) { - /* Pairs with atomic_set_release() below. */ - if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) - return; - - guard(mutex)(&once->lock); - WARN_ON(atomic_read(&once->state) == ONCE_RUNNING); - if (atomic_read(&once->state) != ONCE_NOT_STARTED) - return; - - atomic_set(&once->state, ONCE_RUNNING); - cb(once); - atomic_set_release(&once->state, ONCE_COMPLETED); + int r, state; + + /* Pairs with atomic_set_release() below. */ + if (atomic_read_acquire(&once->state) == ONCE_COMPLETED) + return 0; + + guard(mutex)(&once->lock); + state = atomic_read(&once->state); + if (unlikely(state != ONCE_NOT_STARTED)) + return WARN_ON_ONCE(state != ONCE_COMPLETED) ? -EINVAL : 0; + + atomic_set(&once->state, ONCE_RUNNING); + r = cb(once); + if (r < 0) + atomic_set(&once->state, ONCE_NOT_STARTED); + else + atomic_set_release(&once->state, ONCE_COMPLETED); + return r; } #endif /* _LINUX_CALL_ONCE_H */ -- cgit v1.2.3 From e04918dc594669068f5d59d567d08db531167188 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 2 Mar 2025 15:18:24 +0800 Subject: cred: Fix RCU warnings in override/revert_creds Fix RCU warnings in override_creds and revert_creds by turning the RCU pointer into a normal pointer using rcu_replace_pointer. These warnings were previously private to the cred code, but due to the move into the header file they are now polluting unrelated subsystems. Fixes: 49dffdfde462 ("cred: Add a light version of override/revert_creds()") Signed-off-by: Herbert Xu Link: https://lore.kernel.org/r/Z8QGQGW0IaSklKG7@gondor.apana.org.au Signed-off-by: Christian Brauner --- include/linux/cred.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cred.h b/include/linux/cred.h index 0c3c4b16b469..5658a3bfe803 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -172,18 +172,12 @@ static inline bool cap_ambient_invariant_ok(const struct cred *cred) static inline const struct cred *override_creds(const struct cred *override_cred) { - const struct cred *old = current->cred; - - rcu_assign_pointer(current->cred, override_cred); - return old; + return rcu_replace_pointer(current->cred, override_cred, 1); } static inline const struct cred *revert_creds(const struct cred *revert_cred) { - const struct cred *override_cred = current->cred; - - rcu_assign_pointer(current->cred, revert_cred); - return override_cred; + return rcu_replace_pointer(current->cred, revert_cred, 1); } /** -- cgit v1.2.3 From 3d252160b818045f3a152b13756f6f37ca34639d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 4 Mar 2025 13:51:38 +0000 Subject: fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex pipe_readable(), pipe_writable(), and pipe_poll() can read "pipe->head" and "pipe->tail" outside of "pipe->mutex" critical section. When the head and the tail are read individually in that order, there is a window for interruption between the two reads in which both the head and the tail can be updated by concurrent readers and writers. One of the problematic scenarios observed with hackbench running multiple groups on a large server on a particular pipe inode is as follows: pipe->head = 36 pipe->tail = 36 hackbench-118762 [057] ..... 1029.550548: pipe_write: *wakes up: pipe not full* hackbench-118762 [057] ..... 1029.550548: pipe_write: head: 36 -> 37 [tail: 36] hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next reader 118740* hackbench-118762 [057] ..... 1029.550548: pipe_write: *wake up next writer 118768* hackbench-118768 [206] ..... 1029.55055X: pipe_write: *writer wakes up* hackbench-118768 [206] ..... 1029.55055X: pipe_write: head = READ_ONCE(pipe->head) [37] ... CPU 206 interrupted (exact wakeup was not traced but 118768 did read head at 37 in traces) hackbench-118740 [057] ..... 1029.550558: pipe_read: *reader wakes up: pipe is not empty* hackbench-118740 [057] ..... 1029.550558: pipe_read: tail: 36 -> 37 [head = 37] hackbench-118740 [057] ..... 1029.550559: pipe_read: *pipe is empty; wakeup writer 118768* hackbench-118740 [057] ..... 1029.550559: pipe_read: *sleeps* hackbench-118766 [185] ..... 1029.550592: pipe_write: *New writer comes in* hackbench-118766 [185] ..... 1029.550592: pipe_write: head: 37 -> 38 [tail: 37] hackbench-118766 [185] ..... 1029.550592: pipe_write: *wakes up reader 118766* hackbench-118740 [185] ..... 1029.550598: pipe_read: *reader wakes up; pipe not empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: tail: 37 -> 38 [head: 38] hackbench-118740 [185] ..... 1029.550599: pipe_read: *pipe is empty* hackbench-118740 [185] ..... 1029.550599: pipe_read: *reader sleeps; wakeup writer 118768* ... CPU 206 switches back to writer hackbench-118768 [206] ..... 1029.550601: pipe_write: tail = READ_ONCE(pipe->tail) [38] hackbench-118768 [206] ..... 1029.550601: pipe_write: pipe_full()? (u32)(37 - 38) >= 16? Yes hackbench-118768 [206] ..... 1029.550601: pipe_write: *writer goes back to sleep* [ Tasks 118740 and 118768 can then indefinitely wait on each other. ] The unsigned arithmetic in pipe_occupancy() wraps around when "pipe->tail > pipe->head" leading to pipe_full() returning true despite the pipe being empty. The case of genuine wraparound of "pipe->head" is handled since pipe buffer has data allowing readers to make progress until the pipe->tail wraps too after which the reader will wakeup a sleeping writer, however, mistaking the pipe to be full when it is in fact empty can lead to readers and writers waiting on each other indefinitely. This issue became more problematic and surfaced as a hang in hackbench after the optimization in commit aaec5a95d596 ("pipe_read: don't wake up the writer if the pipe is still full") significantly reduced the number of spurious wakeups of writers that had previously helped mask the issue. To avoid missing any updates between the reads of "pipe->head" and "pipe->write", unionize the two with a single unsigned long "pipe->head_tail" member that can be loaded atomically. Using "pipe->head_tail" to read the head and the tail ensures the lockless checks do not miss any updates to the head or the tail and since those two are only updated under "pipe->mutex", it ensures that the head is always ahead of, or equal to the tail resulting in correct calculations. [ prateek: commit log, testing on x86 platforms. ] Reported-and-debugged-by: Swapnil Sapkal Closes: https://lore.kernel.org/lkml/e813814e-7094-4673-bc69-731af065a0eb@amd.com/ Reported-by: Alexey Gladkov Closes: https://lore.kernel.org/all/Z8Wn0nTvevLRG_4m@example.org/ Fixes: 8cefc107ca54 ("pipe: Use head and tail pointers for the ring, not cursor and length") Tested-by: Swapnil Sapkal Reviewed-by: Oleg Nesterov Tested-by: Alexey Gladkov Signed-off-by: K Prateek Nayak Signed-off-by: Linus Torvalds --- fs/pipe.c | 19 ++++++++----------- include/linux/pipe_fs_i.h | 39 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 45 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index ce1af7592780..e8e6698f3698 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -210,11 +210,10 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_readable(const struct pipe_inode_info *pipe) { - unsigned int head = READ_ONCE(pipe->head); - unsigned int tail = READ_ONCE(pipe->tail); + union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) }; unsigned int writers = READ_ONCE(pipe->writers); - return !pipe_empty(head, tail) || !writers; + return !pipe_empty(idx.head, idx.tail) || !writers; } static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, @@ -417,11 +416,10 @@ static inline int is_packetized(struct file *file) /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_writable(const struct pipe_inode_info *pipe) { - unsigned int head = READ_ONCE(pipe->head); - unsigned int tail = READ_ONCE(pipe->tail); + union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) }; unsigned int max_usage = READ_ONCE(pipe->max_usage); - return !pipe_full(head, tail, max_usage) || + return !pipe_full(idx.head, idx.tail, max_usage) || !READ_ONCE(pipe->readers); } @@ -659,7 +657,7 @@ pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - unsigned int head, tail; + union pipe_index idx; /* Epoll has some historical nasty semantics, this enables them */ WRITE_ONCE(pipe->poll_usage, true); @@ -680,19 +678,18 @@ pipe_poll(struct file *filp, poll_table *wait) * if something changes and you got it wrong, the poll * table entry will wake you up and fix it. */ - head = READ_ONCE(pipe->head); - tail = READ_ONCE(pipe->tail); + idx.head_tail = READ_ONCE(pipe->head_tail); mask = 0; if (filp->f_mode & FMODE_READ) { - if (!pipe_empty(head, tail)) + if (!pipe_empty(idx.head, idx.tail)) mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_pipe != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { - if (!pipe_full(head, tail, pipe->max_usage)) + if (!pipe_full(idx.head, idx.tail, pipe->max_usage)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 8ff23bf5a819..3cc4f8eab853 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -31,6 +31,33 @@ struct pipe_buffer { unsigned long private; }; +/* + * Really only alpha needs 32-bit fields, but + * might as well do it for 64-bit architectures + * since that's what we've historically done, + * and it makes 'head_tail' always be a simple + * 'unsigned long'. + */ +#ifdef CONFIG_64BIT +typedef unsigned int pipe_index_t; +#else +typedef unsigned short pipe_index_t; +#endif + +/* + * We have to declare this outside 'struct pipe_inode_info', + * but then we can't use 'union pipe_index' for an anonymous + * union, so we end up having to duplicate this declaration + * below. Annoying. + */ +union pipe_index { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; +}; + /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing @@ -58,8 +85,16 @@ struct pipe_buffer { struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; - unsigned int head; - unsigned int tail; + + /* This has to match the 'union pipe_index' above */ + union { + unsigned long head_tail; + struct { + pipe_index_t head; + pipe_index_t tail; + }; + }; + unsigned int max_usage; unsigned int ring_size; unsigned int nr_accounted; -- cgit v1.2.3 From 778b94d7ac17b5800aa857222911f09cc986b509 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 28 Feb 2025 11:01:53 -0600 Subject: ACPI: platform_profile: Add support for hidden choices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When two drivers don't support all the same profiles the legacy interface only exports the common profiles. This causes problems for cases where one driver uses low-power but another uses quiet because the result is that neither is exported to sysfs. To allow two drivers to disagree, add support for "hidden choices". Hidden choices are platform profiles that a driver supports to be compatible with the platform profile of another driver. Fixes: 688834743d67 ("ACPI: platform_profile: Allow multiple handlers") Reported-by: Antheas Kapenekakis Closes: https://lore.kernel.org/platform-driver-x86/e64b771e-3255-42ad-9257-5b8fc6c24ac9@gmx.de/T/#mc068042dd29df36c16c8af92664860fc4763974b Signed-off-by: Mario Limonciello Tested-by: Antheas Kapenekakis Tested-by: Derek J. Clark Acked-by: Ilpo Järvinen Link: https://patch.msgid.link/20250228170155.2623386-2-superm1@kernel.org Signed-off-by: Rafael J. Wysocki --- drivers/acpi/platform_profile.c | 94 +++++++++++++++++++++++++++++++--------- include/linux/platform_profile.h | 3 ++ 2 files changed, 76 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/platform_profile.c b/drivers/acpi/platform_profile.c index 2ad53cc6aae5..ef9444482db1 100644 --- a/drivers/acpi/platform_profile.c +++ b/drivers/acpi/platform_profile.c @@ -21,9 +21,15 @@ struct platform_profile_handler { struct device dev; int minor; unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + unsigned long hidden_choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; const struct platform_profile_ops *ops; }; +struct aggregate_choices_data { + unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + int count; +}; + static const char * const profile_names[] = { [PLATFORM_PROFILE_LOW_POWER] = "low-power", [PLATFORM_PROFILE_COOL] = "cool", @@ -73,7 +79,7 @@ static int _store_class_profile(struct device *dev, void *data) lockdep_assert_held(&profile_lock); handler = to_pprof_handler(dev); - if (!test_bit(*bit, handler->choices)) + if (!test_bit(*bit, handler->choices) && !test_bit(*bit, handler->hidden_choices)) return -EOPNOTSUPP; return handler->ops->profile_set(dev, *bit); @@ -239,21 +245,44 @@ static const struct class platform_profile_class = { /** * _aggregate_choices - Aggregate the available profile choices * @dev: The device - * @data: The available profile choices + * @arg: struct aggregate_choices_data * * Return: 0 on success, -errno on failure */ -static int _aggregate_choices(struct device *dev, void *data) +static int _aggregate_choices(struct device *dev, void *arg) { + unsigned long tmp[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + struct aggregate_choices_data *data = arg; struct platform_profile_handler *handler; - unsigned long *aggregate = data; lockdep_assert_held(&profile_lock); handler = to_pprof_handler(dev); - if (test_bit(PLATFORM_PROFILE_LAST, aggregate)) - bitmap_copy(aggregate, handler->choices, PLATFORM_PROFILE_LAST); + bitmap_or(tmp, handler->choices, handler->hidden_choices, PLATFORM_PROFILE_LAST); + if (test_bit(PLATFORM_PROFILE_LAST, data->aggregate)) + bitmap_copy(data->aggregate, tmp, PLATFORM_PROFILE_LAST); else - bitmap_and(aggregate, handler->choices, aggregate, PLATFORM_PROFILE_LAST); + bitmap_and(data->aggregate, tmp, data->aggregate, PLATFORM_PROFILE_LAST); + data->count++; + + return 0; +} + +/** + * _remove_hidden_choices - Remove hidden choices from aggregate data + * @dev: The device + * @arg: struct aggregate_choices_data + * + * Return: 0 on success, -errno on failure + */ +static int _remove_hidden_choices(struct device *dev, void *arg) +{ + struct aggregate_choices_data *data = arg; + struct platform_profile_handler *handler; + + lockdep_assert_held(&profile_lock); + handler = to_pprof_handler(dev); + bitmap_andnot(data->aggregate, handler->choices, + handler->hidden_choices, PLATFORM_PROFILE_LAST); return 0; } @@ -270,22 +299,31 @@ static ssize_t platform_profile_choices_show(struct device *dev, struct device_attribute *attr, char *buf) { - unsigned long aggregate[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + struct aggregate_choices_data data = { + .aggregate = { [0 ... BITS_TO_LONGS(PLATFORM_PROFILE_LAST) - 1] = ~0UL }, + .count = 0, + }; int err; - set_bit(PLATFORM_PROFILE_LAST, aggregate); + set_bit(PLATFORM_PROFILE_LAST, data.aggregate); scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &profile_lock) { err = class_for_each_device(&platform_profile_class, NULL, - aggregate, _aggregate_choices); + &data, _aggregate_choices); if (err) return err; + if (data.count == 1) { + err = class_for_each_device(&platform_profile_class, NULL, + &data, _remove_hidden_choices); + if (err) + return err; + } } /* no profile handler registered any more */ - if (bitmap_empty(aggregate, PLATFORM_PROFILE_LAST)) + if (bitmap_empty(data.aggregate, PLATFORM_PROFILE_LAST)) return -EINVAL; - return _commmon_choices_show(aggregate, buf); + return _commmon_choices_show(data.aggregate, buf); } /** @@ -373,7 +411,10 @@ static ssize_t platform_profile_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; + struct aggregate_choices_data data = { + .aggregate = { [0 ... BITS_TO_LONGS(PLATFORM_PROFILE_LAST) - 1] = ~0UL }, + .count = 0, + }; int ret; int i; @@ -381,13 +422,13 @@ static ssize_t platform_profile_store(struct device *dev, i = sysfs_match_string(profile_names, buf); if (i < 0 || i == PLATFORM_PROFILE_CUSTOM) return -EINVAL; - set_bit(PLATFORM_PROFILE_LAST, choices); + set_bit(PLATFORM_PROFILE_LAST, data.aggregate); scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &profile_lock) { ret = class_for_each_device(&platform_profile_class, NULL, - choices, _aggregate_choices); + &data, _aggregate_choices); if (ret) return ret; - if (!test_bit(i, choices)) + if (!test_bit(i, data.aggregate)) return -EOPNOTSUPP; ret = class_for_each_device(&platform_profile_class, NULL, &i, @@ -453,12 +494,15 @@ EXPORT_SYMBOL_GPL(platform_profile_notify); */ int platform_profile_cycle(void) { + struct aggregate_choices_data data = { + .aggregate = { [0 ... BITS_TO_LONGS(PLATFORM_PROFILE_LAST) - 1] = ~0UL }, + .count = 0, + }; enum platform_profile_option next = PLATFORM_PROFILE_LAST; enum platform_profile_option profile = PLATFORM_PROFILE_LAST; - unsigned long choices[BITS_TO_LONGS(PLATFORM_PROFILE_LAST)]; int err; - set_bit(PLATFORM_PROFILE_LAST, choices); + set_bit(PLATFORM_PROFILE_LAST, data.aggregate); scoped_cond_guard(mutex_intr, return -ERESTARTSYS, &profile_lock) { err = class_for_each_device(&platform_profile_class, NULL, &profile, _aggregate_profiles); @@ -470,14 +514,14 @@ int platform_profile_cycle(void) return -EINVAL; err = class_for_each_device(&platform_profile_class, NULL, - choices, _aggregate_choices); + &data, _aggregate_choices); if (err) return err; /* never iterate into a custom if all drivers supported it */ - clear_bit(PLATFORM_PROFILE_CUSTOM, choices); + clear_bit(PLATFORM_PROFILE_CUSTOM, data.aggregate); - next = find_next_bit_wrap(choices, + next = find_next_bit_wrap(data.aggregate, PLATFORM_PROFILE_LAST, profile + 1); @@ -532,6 +576,14 @@ struct device *platform_profile_register(struct device *dev, const char *name, return ERR_PTR(-EINVAL); } + if (ops->hidden_choices) { + err = ops->hidden_choices(drvdata, pprof->hidden_choices); + if (err) { + dev_err(dev, "platform_profile hidden_choices failed\n"); + return ERR_PTR(err); + } + } + guard(mutex)(&profile_lock); /* create class interface for individual handler */ diff --git a/include/linux/platform_profile.h b/include/linux/platform_profile.h index 8ab5b0e8eb2c..8c9df7dadd5d 100644 --- a/include/linux/platform_profile.h +++ b/include/linux/platform_profile.h @@ -33,6 +33,8 @@ enum platform_profile_option { * @probe: Callback to setup choices available to the new class device. These * choices will only be enforced when setting a new profile, not when * getting the current one. + * @hidden_choices: Callback to setup choices that are not visible to the user + * but can be set by the driver. * @profile_get: Callback that will be called when showing the current platform * profile in sysfs. * @profile_set: Callback that will be called when storing a new platform @@ -40,6 +42,7 @@ enum platform_profile_option { */ struct platform_profile_ops { int (*probe)(void *drvdata, unsigned long *choices); + int (*hidden_choices)(void *drvdata, unsigned long *choices); int (*profile_get)(struct device *dev, enum platform_profile_option *profile); int (*profile_set)(struct device *dev, enum platform_profile_option profile); }; -- cgit v1.2.3 From c27c66afc449b80f3b4b84d123358c0248f2cf63 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Mar 2025 07:08:09 -1000 Subject: fs/pipe: Fix pipe_occupancy() with 16-bit indexes The pipe_occupancy() logic implicitly relied on the natural unsigned modulo arithmetic in C, but that doesn't work for the new 'pipe_index_t' case, since any arithmetic will be done in 'int' (and here we had also made it 'unsigned int' due to the function call boundary). So make the modulo arithmetic explicit by casting the result to the proper type. Cc: Oleg Nesterov Cc: Mateusz Guzik Cc: Manfred Spraul Cc: Christian Brauner Cc: Swapnil Sapkal Cc: Alexey Gladkov Cc: K Prateek Nayak Link: https://lore.kernel.org/all/CAHk-=wjyHsGLx=rxg6PKYBNkPYAejgo7=CbyL3=HGLZLsAaJFQ@mail.gmail.com/ Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex") Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 3cc4f8eab853..1f013ed7577e 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -192,7 +192,7 @@ static inline bool pipe_empty(unsigned int head, unsigned int tail) */ static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { - return head - tail; + return (pipe_index_t)(head - tail); } /** -- cgit v1.2.3 From cfced12f5100e50d56bc587299393fd33c1169a9 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 5 Mar 2025 11:23:01 +0000 Subject: include/linux/pipe_fs_i: Add htmldoc annotation for "head_tail" member Add htmldoc annotation for the newly introduced "head_tail" member describing it to be a union of the pipe_inode_info's @head and @tail members. Reported-by: Stephen Rothwell Closes: https://lore.kernel.org/lkml/20250305204609.5e64768e@canb.auug.org.au/ Fixes: 3d252160b818 ("fs/pipe: Read pipe->{head,tail} atomically outside pipe->mutex") Signed-off-by: K Prateek Nayak Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 1f013ed7577e..05ccbc5d0129 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -65,6 +65,7 @@ union pipe_index { * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption + * @head_tail: unsigned long union of @head and @tail * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) -- cgit v1.2.3 From 0d2d0f3d93ddd6556f23c917d910becd9925ddeb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 5 Mar 2025 07:35:40 -1000 Subject: fs/pipe: remove buggy and unused 'helper' function While looking for incorrect users of the pipe head/tail fields (see commit c27c66afc449: "fs/pipe: Fix pipe_occupancy() with 16-bit indexes"), I found a bug in pipe_discard_from() that looked entirely broken. However, the fix is trivial: this buggy function isn't actually called by anything, so let's just remove it ASAP. Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 05ccbc5d0129..e572e6fc4f81 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -281,15 +281,6 @@ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, return buf->ops->try_steal(pipe, buf); } -static inline void pipe_discard_from(struct pipe_inode_info *pipe, - unsigned int old_head) -{ - unsigned int mask = pipe->ring_size - 1; - - while (pipe->head > old_head) - pipe_buf_release(pipe, &pipe->bufs[--pipe->head & mask]); -} - /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. */ #define PIPE_SIZE PAGE_SIZE -- cgit v1.2.3 From 74d42bdb3a4673b1c10d1f457184e4d3c9cb0196 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Mar 2025 07:30:42 -1000 Subject: fs/pipe: express 'pipe_empty()' in terms of 'pipe_occupancy()' That's what 'pipe_full()' does, so it's more consistent. But more importantly it gets the type limits right when the pipe head and tail are no longer necessarily 'unsigned int'. Signed-off-by: Linus Torvalds --- include/linux/pipe_fs_i.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index e572e6fc4f81..4d0a2267e6ef 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -177,23 +177,23 @@ static inline bool pipe_has_watch_queue(const struct pipe_inode_info *pipe) } /** - * pipe_empty - Return true if the pipe is empty + * pipe_occupancy - Return number of slots used in the pipe * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline bool pipe_empty(unsigned int head, unsigned int tail) +static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { - return head == tail; + return (pipe_index_t)(head - tail); } /** - * pipe_occupancy - Return number of slots used in the pipe + * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ -static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) +static inline bool pipe_empty(unsigned int head, unsigned int tail) { - return (pipe_index_t)(head - tail); + return !pipe_occupancy(head, tail); } /** -- cgit v1.2.3 From e7112524e5e885181cc5ae4d258f33b9dbe0b907 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 6 Mar 2025 08:27:51 -0800 Subject: block: Name the RQF flags enum Commit 5f89154e8e9e3445f9b59 ("block: Use enum to define RQF_x bit indexes") converted the RQF flags to an anonymous enum, which was a beneficial change. This patch goes one step further by naming the enum as "rqf_flags". This naming enables exporting these flags to BPF clients, eliminating the need to duplicate these flags in BPF code. Instead, BPF clients can now access the same kernel-side values through CO:RE (Compile Once, Run Everywhere), as shown in this example: rqf_stats = bpf_core_enum_value(enum rqf_flags, __RQF_STATS) Suggested-by: Yonghong Song Signed-off-by: Breno Leitao Link: https://lore.kernel.org/r/20250306-rqf_flags-v1-1-bbd64918b406@debian.org Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index fa2a76cc2f73..71f4f0cc3dac 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -28,7 +28,7 @@ typedef enum rq_end_io_ret (rq_end_io_fn)(struct request *, blk_status_t); typedef __u32 __bitwise req_flags_t; /* Keep rqf_name[] in sync with the definitions below */ -enum { +enum rqf_flags { /* drive already may have started this one */ __RQF_STARTED, /* request for flush sequence */ -- cgit v1.2.3 From 00a7d39898c8010bfd5ff62af31ca5db34421b38 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Mar 2025 18:25:35 -1000 Subject: fs/pipe: add simpler helpers for common cases The fix to atomically read the pipe head and tail state when not holding the pipe mutex has caused a number of headaches due to the size change of the involved types. It turns out that we don't have _that_ many places that access these fields directly and were affected, but we have more than we strictly should have, because our low-level helper functions have been designed to have intimate knowledge of how the pipes work. And as a result, that random noise of direct 'pipe->head' and 'pipe->tail' accesses makes it harder to pinpoint any actual potential problem spots remaining. For example, we didn't have a "is the pipe full" helper function, but instead had a "given these pipe buffer indexes and this pipe size, is the pipe full". That's because some low-level pipe code does actually want that much more complicated interface. But most other places literally just want a "is the pipe full" helper, and not having it meant that those places ended up being unnecessarily much too aware of this all. It would have been much better if only the very core pipe code that cared had been the one aware of this all. So let's fix it - better late than never. This just introduces the trivial wrappers for "is this pipe full or empty" and to get how many pipe buffers are used, so that instead of writing if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) the places that literally just want to know if a pipe is full can just say if (pipe_is_full(pipe)) instead. The existing trivial cases were converted with a 'sed' script. This cuts down on the places that access pipe->head and pipe->tail directly outside of the pipe code (and core splice code) quite a lot. The splice code in particular still revels in doing the direct low-level accesses, and the fuse fuse_dev_splice_write() code also seems a bit unnecessarily eager to go very low-level, but it's at least a bit better than it used to be. Signed-off-by: Linus Torvalds --- drivers/char/virtio_console.c | 4 ++-- fs/fuse/dev.c | 2 +- fs/pipe.c | 6 +++--- fs/splice.c | 20 ++++++++++---------- include/linux/pipe_fs_i.h | 27 +++++++++++++++++++++++++++ mm/filemap.c | 7 +++---- mm/shmem.c | 6 +++--- 7 files changed, 49 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 24442485e73e..18f92dd44d45 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -923,14 +923,14 @@ static ssize_t port_fops_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); ret = 0; - if (pipe_empty(pipe->head, pipe->tail)) + if (pipe_is_empty(pipe)) goto error_out; ret = wait_port_writable(port, filp->f_flags & O_NONBLOCK); if (ret < 0) goto error_out; - occupancy = pipe_occupancy(pipe->head, pipe->tail); + occupancy = pipe_buf_usage(pipe); buf = alloc_buf(port->portdev->vdev, 0, occupancy); if (!buf) { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 3c9caafca9e2..2c3a4d09e500 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1457,7 +1457,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { + if (pipe_buf_usage(pipe) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } diff --git a/fs/pipe.c b/fs/pipe.c index 5c872775a6db..4d0799e4e719 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -394,7 +394,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) wake_next_reader = true; mutex_lock(&pipe->mutex); } - if (pipe_empty(pipe->head, pipe->tail)) + if (pipe_is_empty(pipe)) wake_next_reader = false; mutex_unlock(&pipe->mutex); @@ -577,11 +577,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); mutex_lock(&pipe->mutex); - was_empty = pipe_empty(pipe->head, pipe->tail); + was_empty = pipe_is_empty(pipe); wake_next_writer = true; } out: - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) wake_next_writer = false; mutex_unlock(&pipe->mutex); diff --git a/fs/splice.c b/fs/splice.c index 28cfa63aa236..23fa5561b944 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -331,7 +331,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, int i; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); npages = DIV_ROUND_UP(len, PAGE_SIZE); @@ -527,7 +527,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des return -ERESTARTSYS; repeat: - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { if (!pipe->writers) return 0; @@ -820,7 +820,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, if (signal_pending(current)) break; - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { ret = 0; if (!pipe->writers) goto out; @@ -968,7 +968,7 @@ static ssize_t do_splice_read(struct file *in, loff_t *ppos, return 0; /* Don't try to read more the pipe has space for. */ - p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); + p_space = pipe->max_usage - pipe_buf_usage(pipe); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) @@ -1080,7 +1080,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, more = sd->flags & SPLICE_F_MORE; sd->flags |= SPLICE_F_MORE; - WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); + WARN_ON_ONCE(!pipe_is_empty(pipe)); while (len) { size_t read_len; @@ -1268,7 +1268,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) send_sig(SIGPIPE, current, 0); return -EPIPE; } - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_is_full(pipe)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; @@ -1652,13 +1652,13 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (!pipe_empty(pipe->head, pipe->tail)) + if (!pipe_is_empty(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -1688,13 +1688,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_is_full(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (pipe_is_full(pipe)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 4d0a2267e6ef..b698758000f8 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -208,6 +208,33 @@ static inline bool pipe_full(unsigned int head, unsigned int tail, return pipe_occupancy(head, tail) >= limit; } +/** + * pipe_is_full - Return true if the pipe is full + * @pipe: the pipe + */ +static inline bool pipe_is_full(const struct pipe_inode_info *pipe) +{ + return pipe_full(pipe->head, pipe->tail, pipe->max_usage); +} + +/** + * pipe_is_empty - Return true if the pipe is empty + * @pipe: the pipe + */ +static inline bool pipe_is_empty(const struct pipe_inode_info *pipe) +{ + return pipe_empty(pipe->head, pipe->tail); +} + +/** + * pipe_buf_usage - Return how many pipe buffers are in use + * @pipe: the pipe + */ +static inline unsigned int pipe_buf_usage(const struct pipe_inode_info *pipe) +{ + return pipe_occupancy(pipe->head, pipe->tail); +} + /** * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring * @pipe: The pipe to access diff --git a/mm/filemap.c b/mm/filemap.c index d4564a79eb35..2974691fdfad 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2897,8 +2897,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, size = min(size, folio_size(folio) - offset); offset %= PAGE_SIZE; - while (spliced < size && - !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (spliced < size && !pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced); @@ -2955,7 +2954,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, iocb.ki_pos = *ppos; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3015,7 +3014,7 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) goto out; } diff --git a/mm/shmem.c b/mm/shmem.c index 4ea6109a8043..20032a333d80 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3487,7 +3487,7 @@ static size_t splice_zeropage_into_pipe(struct pipe_inode_info *pipe, size = min_t(size_t, size, PAGE_SIZE - offset); - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + if (!pipe_is_full(pipe)) { struct pipe_buffer *buf = pipe_head_buf(pipe); *buf = (struct pipe_buffer) { @@ -3514,7 +3514,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, int error = 0; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); @@ -3601,7 +3601,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, total_spliced += n; *ppos += n; in->f_ra.prev_pos = *ppos; - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) break; cond_resched(); -- cgit v1.2.3