summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Alexei Starovoitov <ast@kernel.org> 2026-06-15 07:30:21 +0300
committer Alexei Starovoitov <ast@kernel.org> 2026-06-15 07:30:21 +0300
commit 4e7c52b178a75d9e733761ccae9fc25eaa7d8583 (patch)
tree b0c88d547739abbb2cbc792edb146bd05712ff36
parent 1f32c0d619d996b395f36a920f58159949be922a (diff)
parent 70b139d0483cd42808326c36c4b63d5be4a3cccb (diff)
download linux-4e7c52b178a75d9e733761ccae9fc25eaa7d8583.tar.xz
Merge branch 'bpf-skmsg-some-fixes-for-skmsg'
Jiayuan Chen says:

====================
bpf, skmsg: some fixes for skmsg

All fixes are from previous patches sent by Weiming Shi, Zhang Cen,
Kuniyuki and Sechang Lim, which have already been reviewed by me and
John and Jakub.

https://lore.kernel.org/bpf/20260610081218.506709-2-rhkrqnwk98@gmail.com/
https://lore.kernel.org/bpf/20260520102715.3033936-1-rollkingzzc@gmail.com/
https://lore.kernel.org/bpf/20260424191602.1522411-3-bestswngs@gmail.com/
https://lore.kernel.org/bpf/20260423155807.1245644-2-bestswngs@gmail.com/
https://lore.kernel.org/bpf/20260221233234.3814768-4-kuniyu@google.com/

The automated reviewer (sashiko) may still flag a few other potential
issues on top of this series. After looking into them, they are either
already covered by the patches here, are the BPF program's own
responsibility (e.g. initializing the payload it pushes) and
intentionally left out, or only reachable under very narrow conditions
that require a specially crafted BPF program and an unusual sk_msg ring
state, so they are not practical to trigger and are left out of this
series.

I'm collecting these fixes together because the same problems have been
re-sent many times in slightly different forms, and I hope this series
can be prioritized for merging so the duplicates can finally settle.
With so many AI-generated patches floating around for these spots,
leaving them unmerged just keeps wasting maintainer review cycles on the
same issues.

v3->v4:
  Carry Kuniyuki Iwashima's reviewed-by tag.
  Drop the __GFP_ZERO patch; initializing the pushed payload is the BPF
  program's responsibility, not the kernel's (per maintainer feedback).
  https://lore.kernel.org/bpf/20260612130919.299124-1-jiayuan.chen@linux.dev/

v2->v3:
  Target to bpf-next and carry Emil's reviewed-by tag.
  Reverse xmas tree style is used suggested by Cong.
  (not all code match reverse xmas tree due to variable dependency)

v1->v2:
  fix problem when fix the conflict.
====================

Link: https://patch.msgid.link/20260615021959.140010-1-jiayuan.chen@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
-rw-r--r-- net/core/filter.c 97
-rw-r--r-- net/ipv4/udp_bpf.c 9
-rw-r--r-- tools/testing/selftests/bpf/prog_tests/sockmap_basic.c 48
-rw-r--r-- tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c 27
4 files changed, 173 insertions, 8 deletions
diff --git a/net/core/filter.c b/net/core/filter.c
index 57b00c6cc8cc..126aba56f1c0 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2654,6 +2654,38 @@ static void sk_msg_reset_curr(struct sk_msg *msg)
}
}
+static bool sk_msg_elem_is_copy(const struct sk_msg *msg, u32 i) /* query bit i of the msg->sg.copy bitmap */
+{
+ return test_bit(i, msg->sg.copy); /* true when bit i is set; msg is not modified */
+}
+
+static void sk_msg_clear_elem_copy(struct sk_msg *msg, u32 i) /* clear bit i of the msg->sg.copy bitmap */
+{
+ __clear_bit(i, msg->sg.copy); /* only bit i is touched; the sg entry in msg->sg.data[i] is left as is */
+}
+
+static void sk_msg_set_elem_copy(struct sk_msg *msg, u32 i) /* set bit i of the msg->sg.copy bitmap */
+{
+ __set_bit(i, msg->sg.copy); /* only bit i is touched; the sg entry in msg->sg.data[i] is left as is */
+}
+
+static void sk_msg_clear_copy_range(struct sk_msg *msg, u32 start, u32 end) /* clear the copy bit of every index from start up to, but not including, end */
+{
+ while (start != end) { /* start == end on entry clears nothing; index end itself is never cleared */
+ sk_msg_clear_elem_copy(msg, start);
+ sk_msg_iter_var_next(start); /* step start to the next index; macro is defined outside this diff (presumably wraps around the sg ring - confirm) */
+ }
+}
+
+static void sk_msg_sg_move(struct sk_msg *msg, u32 dst, u32 src) /* copy sg entry src over entry dst and give dst the same copy bit as src */
+{
+ msg->sg.data[dst] = msg->sg.data[src]; /* struct assignment; entry src is not reset here */
+ if (sk_msg_elem_is_copy(msg, src)) /* mirror src's copy bit onto dst, overwriting dst's previous bit */
+ sk_msg_set_elem_copy(msg, dst);
+ else
+ sk_msg_clear_elem_copy(msg, dst);
+}
+
static const struct bpf_func_proto bpf_msg_cork_bytes_proto = {
.func = bpf_msg_cork_bytes,
.gpl_only = false,
@@ -2692,7 +2724,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
* account for the headroom.
*/
bytes_sg_total = start - offset + bytes;
- if (!test_bit(i, msg->sg.copy) && bytes_sg_total <= len)
+ if (!sk_msg_elem_is_copy(msg, i) && bytes_sg_total <= len)
goto out;
/* At this point we need to linearize multiple scatterlist
@@ -2738,6 +2770,7 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
} while (i != last_sge);
sg_set_page(&msg->sg.data[first_sge], page, copy, 0);
+ sk_msg_clear_elem_copy(msg, first_sge);
/* To repair sg ring we need to shift entries. If we only
* had a single entry though we can just replace it and
@@ -2747,8 +2780,14 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
shift = last_sge > first_sge ?
last_sge - first_sge - 1 :
NR_MSG_FRAG_IDS - first_sge + last_sge - 1;
- if (!shift)
+ if (!shift) {
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
goto out;
+ }
+
+ i = first_sge;
+ sk_msg_iter_var_next(i);
+ sk_msg_clear_copy_range(msg, i, last_sge);
i = first_sge;
sk_msg_iter_var_next(i);
@@ -2762,16 +2801,18 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
if (move_from == msg->sg.end)
break;
- msg->sg.data[i] = msg->sg.data[move_from];
+ sk_msg_sg_move(msg, i, move_from);
msg->sg.data[move_from].length = 0;
msg->sg.data[move_from].page_link = 0;
msg->sg.data[move_from].offset = 0;
+ sk_msg_clear_elem_copy(msg, move_from);
sk_msg_iter_var_next(i);
} while (1);
msg->sg.end = msg->sg.end - shift > msg->sg.end ?
msg->sg.end - shift + NR_MSG_FRAG_IDS :
msg->sg.end - shift;
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
out:
sk_msg_reset_curr(msg);
msg->data = sg_virt(&msg->sg.data[first_sge]) + start - offset;
@@ -2792,8 +2833,10 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = {
BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
+ bool sge_copy = false, nsge_copy = false, nnsge_copy = false;
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
+ bool rsge_copy = false;
u8 *raw, *to, *from;
struct page *page;
@@ -2829,6 +2872,9 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
if (!space || (space == 1 && start != offset))
copy = msg->sg.data[i].length;
+ if (unlikely(copy + len < copy))
+ return -EINVAL;
+
page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP,
get_order(copy + len));
if (unlikely(!page))
@@ -2866,10 +2912,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_prev(i);
psge = sk_msg_elem(msg, i);
rsge = sk_msg_elem_cpy(msg, i);
+ rsge_copy = sk_msg_elem_is_copy(msg, i);
psge->length = start - offset;
rsge.length -= psge->length;
- rsge.offset += start;
+ rsge.offset += start - offset;
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
@@ -2891,23 +2938,34 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* Shift one or two slots as needed */
sge = sk_msg_elem_cpy(msg, new);
sg_unmark_end(&sge);
+ sge_copy = sk_msg_elem_is_copy(msg, new);
nsge = sk_msg_elem_cpy(msg, i);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
if (rsge.length) {
sk_msg_iter_var_next(i);
nnsge = sk_msg_elem_cpy(msg, i);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_next(msg, end);
}
while (i != msg->sg.end) {
msg->sg.data[i] = sge;
+ if (sge_copy)
+ sk_msg_set_elem_copy(msg, i);
+ else
+ sk_msg_clear_elem_copy(msg, i);
sge = nsge;
+ sge_copy = nsge_copy;
sk_msg_iter_var_next(i);
if (rsge.length) {
nsge = nnsge;
+ nsge_copy = nnsge_copy;
nnsge = sk_msg_elem_cpy(msg, i);
+ nnsge_copy = sk_msg_elem_is_copy(msg, i);
} else {
nsge = sk_msg_elem_cpy(msg, i);
+ nsge_copy = sk_msg_elem_is_copy(msg, i);
}
}
@@ -2915,13 +2973,18 @@ place_new:
/* Place newly allocated data buffer */
sk_mem_charge(msg->sk, len);
msg->sg.size += len;
- __clear_bit(new, msg->sg.copy);
+ sk_msg_clear_elem_copy(msg, new);
sg_set_page(&msg->sg.data[new], page, len + copy, 0);
if (rsge.length) {
get_page(sg_page(&rsge));
sk_msg_iter_var_next(new);
msg->sg.data[new] = rsge;
+ if (rsge_copy)
+ sk_msg_set_elem_copy(msg, new);
+ else
+ sk_msg_clear_elem_copy(msg, new);
}
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
sk_msg_reset_curr(msg);
sk_msg_compute_data_pointers(msg);
@@ -2947,35 +3010,46 @@ static void sk_msg_shift_left(struct sk_msg *msg, int i)
do {
prev = i;
sk_msg_iter_var_next(i);
- msg->sg.data[prev] = msg->sg.data[i];
+ sk_msg_sg_move(msg, prev, i);
} while (i != msg->sg.end);
sk_msg_iter_prev(msg, end);
+ sk_msg_clear_elem_copy(msg, msg->sg.end);
}
static void sk_msg_shift_right(struct sk_msg *msg, int i) /* move the entries from index i onwards one slot forward, each carrying its copy bit */
{
struct scatterlist tmp, sge;
+ bool tmp_copy, sge_copy; /* copy bits saved alongside the tmp and sge entry copies */
sk_msg_iter_next(msg, end); /* advance the end cursor first so the loop below stops at the new msg->sg.end (macro defined outside this diff) */
sge = sk_msg_elem_cpy(msg, i); /* save entry i; slot i itself is not overwritten by this function */
+ sge_copy = sk_msg_elem_is_copy(msg, i);
sk_msg_iter_var_next(i);
tmp = sk_msg_elem_cpy(msg, i); /* save the following entry before it is overwritten */
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
while (i != msg->sg.end) {
msg->sg.data[i] = sge; /* store the previous slot's saved entry here */
+ if (sge_copy) /* and give this slot the copy bit that belonged to that entry */
+ sk_msg_set_elem_copy(msg, i);
+ else
+ sk_msg_clear_elem_copy(msg, i);
sk_msg_iter_var_next(i);
sge = tmp; /* the entry saved one step earlier is written on the next pass */
+ sge_copy = tmp_copy;
tmp = sk_msg_elem_cpy(msg, i); /* save the next entry and its copy bit before they are overwritten */
+ tmp_copy = sk_msg_elem_is_copy(msg, i);
}
+ sk_msg_clear_elem_copy(msg, msg->sg.end); /* clear the copy bit of the slot at the new msg->sg.end */
}
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
u32 i = 0, l = 0, space, offset = 0;
- u64 last = start + len;
- int pop;
+ u64 last = (u64)start + len;
+ u32 pop;
if (unlikely(flags))
return -EINVAL;
@@ -3024,8 +3098,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
*/
if (start != offset) {
struct scatterlist *nsge, *sge = sk_msg_elem(msg, i);
+ bool sge_copy = sk_msg_elem_is_copy(msg, i);
int a = start - offset;
int b = sge->length - pop - a;
+ u32 sge_idx = i;
sk_msg_iter_var_next(i);
@@ -3038,6 +3114,10 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
sg_set_page(nsge,
sg_page(sge),
b, sge->offset + pop + a);
+ if (sge_copy)
+ sk_msg_set_elem_copy(msg, i);
+ else
+ sk_msg_clear_elem_copy(msg, i);
} else {
struct page *page, *orig;
u8 *to, *from;
@@ -3054,6 +3134,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
memcpy(to, from, a);
memcpy(to + a, from + a + pop, b);
sg_set_page(sge, page, a + b, 0);
+ sk_msg_clear_elem_copy(msg, sge_idx);
put_page(orig);
}
pop = 0;
diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
index 9f33b07b1481..ad57c4c9eaab 100644
--- a/net/ipv4/udp_bpf.c
+++ b/net/ipv4/udp_bpf.c
@@ -50,7 +50,9 @@ static int udp_msg_wait_data(struct sock *sk, struct sk_psock *psock,
sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
ret = udp_msg_has_data(sk, psock);
if (!ret) {
+ release_sock(sk);
wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
+ lock_sock(sk);
ret = udp_msg_has_data(sk, psock);
}
sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
@@ -79,6 +81,7 @@ static int udp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out;
}
+ lock_sock(sk);
msg_bytes_ready:
copied = sk_msg_recvmsg(sk, psock, msg, len, flags);
if (!copied) {
@@ -90,11 +93,17 @@ msg_bytes_ready:
if (data) {
if (psock_has_data(psock))
goto msg_bytes_ready;
+
+ release_sock(sk);
+
ret = sk_udp_recvmsg(sk, msg, len, flags);
goto out;
}
copied = -EAGAIN;
}
+
+ release_sock(sk);
+
ret = copied;
out:
sk_psock_put(sk, psock);
diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
index d2846579285f..cb3229711f93 100644
--- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
+++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c
@@ -14,6 +14,7 @@
#include "test_sockmap_pass_prog.skel.h"
#include "test_sockmap_drop_prog.skel.h"
#include "test_sockmap_change_tail.skel.h"
+#include "test_sockmap_msg_pop_data.skel.h"
#include "bpf_iter_sockmap.skel.h"
#include "sockmap_helpers.h"
@@ -666,6 +667,51 @@ out:
test_sockmap_change_tail__destroy(skel);
}
+static void test_sockmap_msg_verdict_pop_data(void) /* checks that the sk_msg program's bpf_msg_pop_data() call reports -EINVAL */
+{
+ struct test_sockmap_msg_pop_data *skel;
+ int err, map, verdict;
+ int c1 = -1, p1 = -1, sent; /* socket fds stay -1 until create_pair() assigns them */
+ int zero = 0; /* sockmap key under which c1 is stored */
+ char *buf;
+ const size_t len = 32 * 1024; /* 32768 bytes; larger than the BPF program's POP_START (0x48a3), presumably so its size guard is passed - confirm */
+
+ skel = test_sockmap_msg_pop_data__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "open_and_load"))
+ return; /* nothing has been acquired yet, so no cleanup is needed */
+
+ verdict = bpf_program__fd(skel->progs.prog_msg_pop_data);
+ map = bpf_map__fd(skel->maps.sock_map);
+
+ err = bpf_prog_attach(verdict, map, BPF_SK_MSG_VERDICT, 0); /* attach the program to the sockmap as its sk_msg verdict program */
+ if (!ASSERT_OK(err, "bpf_prog_attach"))
+ goto out;
+
+ err = create_pair(AF_INET, SOCK_STREAM, &c1, &p1);
+ if (!ASSERT_OK(err, "create_pair"))
+ goto out;
+
+ err = bpf_map_update_elem(map, &zero, &c1, BPF_NOEXIST); /* insert c1 at key 0 so its sends go through the program */
+ if (!ASSERT_OK(err, "bpf_map_update_elem"))
+ goto out_close;
+
+ buf = calloc(len, 1); /* zero-filled payload; the content is never inspected */
+ if (!ASSERT_OK_PTR(buf, "calloc"))
+ goto out_close;
+
+ sent = xsend(c1, buf, len, 0);
+ ASSERT_EQ(sent, (ssize_t)len, "xsend");
+ ASSERT_EQ(skel->data->pop_data_ret, -EINVAL, "pop_data_rejects overflow"); /* pop_data_ret is the global the BPF program stores the helper result in */
+
+ free(buf);
+
+out_close: /* also reached by fall-through on the success path */
+ close(c1);
+ close(p1);
+out:
+ test_sockmap_msg_pop_data__destroy(skel);
+}
+
static void test_sockmap_skb_verdict_peek_helper(int map)
{
int err, c1, p1, zero = 0, sent, recvd, avail;
@@ -1373,6 +1419,8 @@ void test_sockmap_basic(void)
test_sockmap_skb_verdict_fionread(false);
if (test__start_subtest("sockmap skb_verdict change tail"))
test_sockmap_skb_verdict_change_tail();
+ if (test__start_subtest("sockmap msg_verdict pop_data overflow"))
+ test_sockmap_msg_verdict_pop_data();
if (test__start_subtest("sockmap skb_verdict msg_f_peek"))
test_sockmap_skb_verdict_peek();
if (test__start_subtest("sockmap skb_verdict msg_f_peek with link"))
diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c
new file mode 100644
index 000000000000..301e65b95256
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/test_sockmap_msg_pop_data.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+struct { /* map definition placed in the ".maps" section */
+ __uint(type, BPF_MAP_TYPE_SOCKMAP);
+ __uint(max_entries, 1); /* a single slot */
+ __type(key, int);
+ __type(value, int);
+} sock_map SEC(".maps");
+
+#define POP_START 0x48a3 /* start offset passed to bpf_msg_pop_data(); also the msg->size threshold checked in prog_msg_pop_data() */
+#define POP_LEN 0xfffffffd /* POP_START + POP_LEN = 0x1000048a0, which does not fit in 32 bits */
+
+long pop_data_ret = 1; /* receives the helper's return value; the initial 1 differs from the -EINVAL the userspace test expects */
+
+SEC("sk_msg")
+int prog_msg_pop_data(struct sk_msg_md *msg) /* calls bpf_msg_pop_data() with POP_START/POP_LEN and records the result */
+{
+ if (msg->size <= POP_START) /* skip messages that do not extend past POP_START; pop_data_ret is left unchanged */
+ return SK_PASS;
+
+ pop_data_ret = bpf_msg_pop_data(msg, POP_START, POP_LEN, 0); /* flags = 0; result is stored in the global pop_data_ret */
+ return SK_PASS; /* the verdict is SK_PASS whatever the helper returned */
+}
+
+char _license[] SEC("license") = "GPL";