author     Jordan Rife <jordan@jrife.io>              2025-05-02 19:15:21 +0300
committer  Martin KaFai Lau <martin.lau@kernel.org>   2025-05-02 20:54:37 +0300
commit     66d454e99d71857faf249486912e381ec83760b4 (patch)
tree       a458e39bde54eb5efdff5db873f8eb9e80421306
parent     3e485e15a169fb69c07c75d9f82843dd481215fc (diff)
bpf: udp: Make sure iter->batch always contains a full bucket snapshot
Require that iter->batch always contains a full bucket snapshot. This
invariant is important to avoid skipping or repeating sockets during
iteration when combined with the next few patches. Before, there were
two cases where a call to bpf_iter_udp_batch may only capture part of
a bucket:

1. When bpf_iter_udp_realloc_batch() returns -ENOMEM [1].
2. When more sockets are added to the bucket while calling
   bpf_iter_udp_realloc_batch(), making the updated batch size
   insufficient [2].

In cases where the batch size only covers part of a bucket, it is
possible to forget which sockets were already visited, especially if we
have to process a bucket in more than two batches. This forces us to
choose between repeating or skipping sockets, so don't allow this:

1. Stop iteration and propagate -ENOMEM up to userspace if reallocation
   fails instead of continuing with a partial batch.
2. Try bpf_iter_udp_realloc_batch() with GFP_USER just as before, but
   if we still aren't able to capture the full bucket, call
   bpf_iter_udp_realloc_batch() again while holding the bucket lock to
   guarantee the bucket does not change. On the second attempt use
   GFP_NOWAIT since we hold onto the spin lock.

Introduce the udp_portaddr_for_each_entry_from macro and use it instead
of udp_portaddr_for_each_entry to make it possible to continue
iteration from an arbitrary socket. This is required by this patch in
the GFP_NOWAIT case, to fill the rest of a batch starting from the
middle of a bucket, and by a later patch which skips sockets that were
already seen.

Testing all scenarios directly is a bit difficult, but I did some
manual testing to exercise the code paths where GFP_NOWAIT is used and
where ERR_PTR(err) is returned. I used the realloc test case included
later in this series to trigger a scenario where a realloc happens
inside bpf_iter_udp_batch and made a small code tweak to force the
first realloc attempt to allocate a too-small batch, thus requiring
another attempt with GFP_NOWAIT. Some printks showed both reallocs with
the tests passing:

Apr 25 23:16:24 crow kernel: go again GFP_USER
Apr 25 23:16:24 crow kernel: go again GFP_NOWAIT

With this setup, I also forced each of the bpf_iter_udp_realloc_batch
calls to return -ENOMEM to ensure that iteration ends and that the
read() in userspace fails.

[1]: https://lore.kernel.org/bpf/CABi4-ogUtMrH8-NVB6W8Xg_F_KDLq=yy-yu-tKr2udXE2Mu1Lg@mail.gmail.com/
[2]: https://lore.kernel.org/bpf/7ed28273-a716-4638-912d-f86f965e54bb@linux.dev/

Signed-off-by: Jordan Rife <jordan@jrife.io>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
-rw-r--r--  include/linux/udp.h |  3
-rw-r--r--  net/ipv4/udp.c      | 81
2 files changed, 58 insertions(+), 26 deletions(-)
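The core of the change is that batching can now resume from the middle of a
bucket after the batch array is grown, rather than restarting from the bucket
head and re-visiting sockets. The standalone userspace sketch below is not
kernel code: struct fake_sock, fill_batch() and the plain ->next pointer are
made-up stand-ins for struct sock and its skc_portaddr_node hlist, and
locking/refcounting are omitted. It only illustrates the two-pass pattern that
the new udp_portaddr_for_each_entry_from macro enables in the real
bpf_iter_udp_batch().

/*
 * Standalone userspace sketch (not kernel code) of the resume-from-mid-bucket
 * batching pattern: fill the batch until it is full, grow the array, then
 * continue from the node where the first pass stopped instead of rescanning
 * from the bucket head.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_sock {
	int id;
	struct fake_sock *next;		/* stands in for skc_portaddr_node */
};

/* Append sockets starting at @sk until the batch holds @max entries.
 * Return the first socket that did not fit, or NULL if the whole bucket
 * was captured; the caller can grow the batch and resume from there.
 */
static struct fake_sock *fill_batch(struct fake_sock *sk,
				    struct fake_sock **batch,
				    int *end_sk, int max)
{
	for (; sk; sk = sk->next) {
		if (*end_sk >= max)
			return sk;	/* batch full */
		batch[(*end_sk)++] = sk;
	}
	return NULL;
}

int main(void)
{
	struct fake_sock socks[5], *resume;
	struct fake_sock **batch;
	int i, end_sk = 0, max = 2;

	for (i = 0; i < 5; i++) {
		socks[i].id = i;
		socks[i].next = (i < 4) ? &socks[i + 1] : NULL;
	}

	batch = calloc(max, sizeof(*batch));
	if (!batch)
		return 1;

	/* First pass: the batch is too small, so it stops mid-bucket. */
	resume = fill_batch(&socks[0], batch, &end_sk, max);
	if (resume) {
		/* Grow the batch and pick up where we left off, the same
		 * idea as the GFP_NOWAIT retry under the bucket lock.
		 */
		max = 5;
		batch = realloc(batch, max * sizeof(*batch));
		if (!batch)
			return 1;
		fill_batch(resume, batch, &end_sk, max);
	}

	for (i = 0; i < end_sk; i++)
		printf("batched sock %d\n", batch[i]->id);
	free(batch);
	return 0;
}

In the kernel, the second pass has to happen while still holding the bucket
spin lock so the bucket cannot change underneath the iterator, which is why
the retry uses GFP_NOWAIT and why, in that case, bpf_iter_udp_realloc_batch()
keeps the already-batched socket references and memcpy()s them into the new
array instead of dropping them via bpf_iter_udp_put_batch().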
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 895240177f4f..4e1a672af4c5 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -216,6 +216,9 @@ static inline void udp_allow_gso(struct sock *sk)
#define udp_portaddr_for_each_entry(__sk, list) \
hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry_from(__sk) \
+ hlist_for_each_entry_from(__sk, __sk_common.skc_portaddr_node)
+
#define udp_portaddr_for_each_entry_rcu(__sk, list) \
hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 68a77323bc51..426a8b7c5cde 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3433,8 +3433,9 @@ static struct sock *bpf_iter_udp_batch(struct seq_file *seq)
int resume_bucket, resume_offset;
struct udp_table *udptable;
unsigned int batch_sks = 0;
- bool resized = false;
+ int resizes = 0;
struct sock *sk;
+ int err = 0;
resume_bucket = state->bucket;
resume_offset = iter->offset;
@@ -3455,18 +3456,21 @@ again:
*/
iter->cur_sk = 0;
iter->end_sk = 0;
- iter->st_bucket_done = false;
+ iter->st_bucket_done = true;
batch_sks = 0;
for (; state->bucket <= udptable->mask; state->bucket++) {
struct udp_hslot *hslot2 = &udptable->hash2[state->bucket].hslot;
if (hlist_empty(&hslot2->head))
- continue;
+ goto next_bucket;
iter->offset = 0;
spin_lock_bh(&hslot2->lock);
- udp_portaddr_for_each_entry(sk, &hslot2->head) {
+ sk = hlist_entry_safe(hslot2->head.first, struct sock,
+ __sk_common.skc_portaddr_node);
+fill_batch:
+ udp_portaddr_for_each_entry_from(sk) {
if (seq_sk_match(seq, sk)) {
/* Resume from the last iterated socket at the
* offset in the bucket before iterator was stopped.
@@ -3483,33 +3487,55 @@ again:
batch_sks++;
}
}
+
+ /* Allocate a larger batch and try again. */
+ if (unlikely(resizes <= 1 && iter->end_sk &&
+ iter->end_sk != batch_sks)) {
+ resizes++;
+
+ /* First, try with GFP_USER to maximize the chances of
+ * grabbing more memory.
+ */
+ if (resizes == 1) {
+ spin_unlock_bh(&hslot2->lock);
+ err = bpf_iter_udp_realloc_batch(iter,
+ batch_sks * 3 / 2,
+ GFP_USER);
+ if (err)
+ return ERR_PTR(err);
+ /* Start over. */
+ goto again;
+ }
+
+ /* Next, hold onto the lock, so the bucket doesn't
+ * change while we get the rest of the sockets.
+ */
+ err = bpf_iter_udp_realloc_batch(iter, batch_sks,
+ GFP_NOWAIT);
+ if (err) {
+ spin_unlock_bh(&hslot2->lock);
+ return ERR_PTR(err);
+ }
+
+ /* Pick up where we left off. */
+ sk = iter->batch[iter->end_sk - 1];
+ sk = hlist_entry_safe(sk->__sk_common.skc_portaddr_node.next,
+ struct sock,
+ __sk_common.skc_portaddr_node);
+ batch_sks = iter->end_sk;
+ goto fill_batch;
+ }
+
spin_unlock_bh(&hslot2->lock);
if (iter->end_sk)
break;
+next_bucket:
+ resizes = 0;
}
- /* All done: no batch made. */
- if (!iter->end_sk)
- return NULL;
-
- if (iter->end_sk == batch_sks) {
- /* Batching is done for the current bucket; return the first
- * socket to be iterated from the batch.
- */
- iter->st_bucket_done = true;
- goto done;
- }
- if (!resized && !bpf_iter_udp_realloc_batch(iter, batch_sks * 3 / 2,
- GFP_USER)) {
- resized = true;
- /* After allocating a larger batch, retry one more time to grab
- * the whole bucket.
- */
- goto again;
- }
-done:
- return iter->batch[0];
+ WARN_ON_ONCE(iter->end_sk != batch_sks);
+ return iter->end_sk ? iter->batch[0] : NULL;
}
static void *bpf_iter_udp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -3873,7 +3899,10 @@ static int bpf_iter_udp_realloc_batch(struct bpf_udp_iter_state *iter,
if (!new_batch)
return -ENOMEM;
- bpf_iter_udp_put_batch(iter);
+ if (flags != GFP_NOWAIT)
+ bpf_iter_udp_put_batch(iter);
+
+ memcpy(new_batch, iter->batch, sizeof(*iter->batch) * iter->end_sk);
kvfree(iter->batch);
iter->batch = new_batch;
iter->max_sk = new_batch_sz;