From e6ed54e86aae9e4f7286ce8d5c73780f91b48d1c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 3 Jun 2025 16:12:39 -0400 Subject: Bluetooth: MGMT: Fix UAF on mgmt_remove_adv_monitor_complete This reworks MGMT_OP_REMOVE_ADV_MONITOR to not use mgmt_pending_add to avoid crashes like below: ================================================================== BUG: KASAN: slab-use-after-free in mgmt_remove_adv_monitor_complete+0xe5/0x540 net/bluetooth/mgmt.c:5406 Read of size 8 at addr ffff88801c53f318 by task kworker/u5:5/5341 CPU: 0 UID: 0 PID: 5341 Comm: kworker/u5:5 Not tainted 6.15.0-syzkaller-10402-g4cb6c8af8591 #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xd2/0x2b0 mm/kasan/report.c:521 kasan_report+0x118/0x150 mm/kasan/report.c:634 mgmt_remove_adv_monitor_complete+0xe5/0x540 net/bluetooth/mgmt.c:5406 hci_cmd_sync_work+0x261/0x3a0 net/bluetooth/hci_sync.c:334 process_one_work kernel/workqueue.c:3238 [inline] process_scheduled_works+0xade/0x17b0 kernel/workqueue.c:3321 worker_thread+0x8a0/0xda0 kernel/workqueue.c:3402 kthread+0x711/0x8a0 kernel/kthread.c:464 ret_from_fork+0x3fc/0x770 arch/x86/kernel/process.c:148 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 Allocated by task 5987: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4358 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] mgmt_pending_new+0x65/0x240 net/bluetooth/mgmt_util.c:252 mgmt_pending_add+0x34/0x120 net/bluetooth/mgmt_util.c:279 
remove_adv_monitor+0x103/0x1b0 net/bluetooth/mgmt.c:5454 hci_mgmt_cmd+0x9c9/0xef0 net/bluetooth/hci_sock.c:1719 hci_sock_sendmsg+0x6ca/0xef0 net/bluetooth/hci_sock.c:1839 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:727 sock_write_iter+0x258/0x330 net/socket.c:1131 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x548/0xa90 fs/read_write.c:686 ksys_write+0x145/0x250 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 5989: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2380 [inline] slab_free mm/slub.c:4642 [inline] kfree+0x18e/0x440 mm/slub.c:4841 mgmt_pending_foreach+0xc9/0x120 net/bluetooth/mgmt_util.c:242 mgmt_index_removed+0x10d/0x2f0 net/bluetooth/mgmt.c:9366 hci_sock_bind+0xbe9/0x1000 net/bluetooth/hci_sock.c:1314 __sys_bind_socket net/socket.c:1810 [inline] __sys_bind+0x2c3/0x3e0 net/socket.c:1841 __do_sys_bind net/socket.c:1846 [inline] __se_sys_bind net/socket.c:1844 [inline] __x64_sys_bind+0x7a/0x90 net/socket.c:1844 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: 66bd095ab5d4 ("Bluetooth: advmon offload MSFT remove monitor") Closes: https://syzkaller.appspot.com/bug?extid=feb0dc579bbe30a13190 Reported-by: syzbot+feb0dc579bbe30a13190@syzkaller.appspotmail.com Tested-by: syzbot+feb0dc579bbe30a13190@syzkaller.appspotmail.com Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git 
a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 2b261e74e2c4..93fcb659f0d4 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -2400,7 +2400,6 @@ void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance); void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev, u8 instance); -void mgmt_adv_monitor_removed(struct hci_dev *hdev, u16 handle); int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip); void mgmt_adv_monitor_device_lost(struct hci_dev *hdev, u16 handle, bdaddr_t *bdaddr, u8 addr_type); -- cgit v1.2.3 From 6fe26f694c824b8a4dbf50c635bee1302e3f099c Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Tue, 20 May 2025 15:42:21 -0400 Subject: Bluetooth: MGMT: Protect mgmt_pending list with its own lock This uses a mutex to protect from concurrent access of mgmt_pending list which can cause crashes like: ================================================================== BUG: KASAN: slab-use-after-free in hci_sock_get_channel+0x60/0x68 net/bluetooth/hci_sock.c:91 Read of size 2 at addr ffff0000c48885b2 by task syz.4.334/7318 CPU: 0 UID: 0 PID: 7318 Comm: syz.4.334 Not tainted 6.15.0-rc7-syzkaller-g187899f4124a #0 PREEMPT Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 Call trace: show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:466 (C) __dump_stack+0x30/0x40 lib/dump_stack.c:94 dump_stack_lvl+0xd8/0x12c lib/dump_stack.c:120 print_address_description+0xa8/0x254 mm/kasan/report.c:408 print_report+0x68/0x84 mm/kasan/report.c:521 kasan_report+0xb0/0x110 mm/kasan/report.c:634 __asan_report_load2_noabort+0x20/0x2c mm/kasan/report_generic.c:379 hci_sock_get_channel+0x60/0x68 net/bluetooth/hci_sock.c:91 mgmt_pending_find+0x7c/0x140 net/bluetooth/mgmt_util.c:223 pending_find net/bluetooth/mgmt.c:947 [inline] remove_adv_monitor+0x44/0x1a4 net/bluetooth/mgmt.c:5445 hci_mgmt_cmd+0x780/0xc00 
net/bluetooth/hci_sock.c:1712 hci_sock_sendmsg+0x544/0xbb0 net/bluetooth/hci_sock.c:1832 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg net/socket.c:727 [inline] sock_write_iter+0x25c/0x378 net/socket.c:1131 new_sync_write fs/read_write.c:591 [inline] vfs_write+0x62c/0x97c fs/read_write.c:684 ksys_write+0x120/0x210 fs/read_write.c:736 __do_sys_write fs/read_write.c:747 [inline] __se_sys_write fs/read_write.c:744 [inline] __arm64_sys_write+0x7c/0x90 fs/read_write.c:744 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x58/0x17c arch/arm64/kernel/entry-common.c:767 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 Allocated by task 7037: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x40/0x78 mm/kasan/common.c:68 kasan_save_alloc_info+0x44/0x54 mm/kasan/generic.c:562 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x9c/0xb4 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __do_kmalloc_node mm/slub.c:4327 [inline] __kmalloc_noprof+0x2fc/0x4c8 mm/slub.c:4339 kmalloc_noprof include/linux/slab.h:909 [inline] sk_prot_alloc+0xc4/0x1f0 net/core/sock.c:2198 sk_alloc+0x44/0x3ac net/core/sock.c:2254 bt_sock_alloc+0x4c/0x300 net/bluetooth/af_bluetooth.c:148 hci_sock_create+0xa8/0x194 net/bluetooth/hci_sock.c:2202 bt_sock_create+0x14c/0x24c net/bluetooth/af_bluetooth.c:132 __sock_create+0x43c/0x91c net/socket.c:1541 sock_create net/socket.c:1599 [inline] __sys_socket_create net/socket.c:1636 [inline] __sys_socket+0xd4/0x1c0 net/socket.c:1683 __do_sys_socket net/socket.c:1697 [inline] __se_sys_socket net/socket.c:1695 [inline] __arm64_sys_socket+0x7c/0x94 net/socket.c:1695 __invoke_syscall arch/arm64/kernel/syscall.c:35 [inline] 
invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:49 el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:132 do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:151 el0_svc+0x58/0x17c arch/arm64/kernel/entry-common.c:767 el0t_64_sync_handler+0x78/0x108 arch/arm64/kernel/entry-common.c:786 el0t_64_sync+0x198/0x19c arch/arm64/kernel/entry.S:600 Freed by task 6607: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x40/0x78 mm/kasan/common.c:68 kasan_save_free_info+0x58/0x70 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x68/0x88 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2380 [inline] slab_free mm/slub.c:4642 [inline] kfree+0x17c/0x474 mm/slub.c:4841 sk_prot_free net/core/sock.c:2237 [inline] __sk_destruct+0x4f4/0x760 net/core/sock.c:2332 sk_destruct net/core/sock.c:2360 [inline] __sk_free+0x320/0x430 net/core/sock.c:2371 sk_free+0x60/0xc8 net/core/sock.c:2382 sock_put include/net/sock.h:1944 [inline] mgmt_pending_free+0x88/0x118 net/bluetooth/mgmt_util.c:290 mgmt_pending_remove+0xec/0x104 net/bluetooth/mgmt_util.c:298 mgmt_set_powered_complete+0x418/0x5cc net/bluetooth/mgmt.c:1355 hci_cmd_sync_work+0x204/0x33c net/bluetooth/hci_sync.c:334 process_one_work+0x7e8/0x156c kernel/workqueue.c:3238 process_scheduled_works kernel/workqueue.c:3319 [inline] worker_thread+0x958/0xed8 kernel/workqueue.c:3400 kthread+0x5fc/0x75c kernel/kthread.c:464 ret_from_fork+0x10/0x20 arch/arm64/kernel/entry.S:847 Fixes: a380b6cff1a2 ("Bluetooth: Add generic mgmt helper API") Closes: https://syzkaller.appspot.com/bug?extid=0a7039d5d9986ff4ecec Closes: https://syzkaller.appspot.com/bug?extid=cc0cc52e7f43dc9e6df1 Reported-by: syzbot+0a7039d5d9986ff4ecec@syzkaller.appspotmail.com Tested-by: syzbot+0a7039d5d9986ff4ecec@syzkaller.appspotmail.com Tested-by: syzbot+cc0cc52e7f43dc9e6df1@syzkaller.appspotmail.com Signed-off-by: Dmitry Antipov Signed-off-by: Luiz Augusto von Dentz --- 
include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 1 + net/bluetooth/mgmt.c | 101 +++++++++++++++++++-------------------- net/bluetooth/mgmt_util.c | 32 +++++++++++-- net/bluetooth/mgmt_util.h | 4 +- 5 files changed, 80 insertions(+), 59 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 93fcb659f0d4..f7b1a9eb9543 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -546,6 +546,7 @@ struct hci_dev { struct hci_conn_hash conn_hash; struct list_head mesh_pending; + struct mutex mgmt_pending_lock; struct list_head mgmt_pending; struct list_head reject_list; struct list_head accept_list; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index aeda2e4557d5..487c045a7ba8 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2485,6 +2485,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); + mutex_init(&hdev->mgmt_pending_lock); ida_init(&hdev->unset_handle_ida); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index feaeec2423ae..de7adb9a47f9 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1447,22 +1447,17 @@ static void settings_rsp(struct mgmt_pending_cmd *cmd, void *data) send_settings_rsp(cmd->sk, cmd->opcode, match->hdev); - list_del(&cmd->list); - if (match->sk == NULL) { match->sk = cmd->sk; sock_hold(match->sk); } - - mgmt_pending_free(cmd); } static void cmd_status_rsp(struct mgmt_pending_cmd *cmd, void *data) { u8 *status = data; - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, *status); - mgmt_pending_remove(cmd); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, *status); } static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) @@ -1476,8 +1471,6 @@ static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) if (cmd->cmd_complete) { cmd->cmd_complete(cmd, match->mgmt_status); - 
mgmt_pending_remove(cmd); - return; } @@ -1486,13 +1479,13 @@ static void cmd_complete_rsp(struct mgmt_pending_cmd *cmd, void *data) static int generic_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { - return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, + return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, cmd->param, cmd->param_len); } static int addr_cmd_complete(struct mgmt_pending_cmd *cmd, u8 status) { - return mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, + return mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, cmd->param, sizeof(struct mgmt_addr_info)); } @@ -1532,7 +1525,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, if (err) { u8 mgmt_err = mgmt_status(err); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); goto done; } @@ -1707,7 +1700,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, if (err) { u8 mgmt_err = mgmt_status(err); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); goto done; } @@ -1943,8 +1936,8 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) new_settings(hdev, NULL); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, cmd_status_rsp, - &mgmt_err); + mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, + cmd_status_rsp, &mgmt_err); return; } @@ -1954,7 +1947,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match); if (changed) new_settings(hdev, match.sk); @@ -2074,12 +2067,12 @@ static void set_le_complete(struct hci_dev *hdev, void *data, int err) 
bt_dev_dbg(hdev, "err %d", err); if (status) { - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, cmd_status_rsp, - &status); + mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp, + &status); return; } - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match); new_settings(hdev, match.sk); @@ -2138,7 +2131,7 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) struct sock *sk = cmd->sk; if (status) { - mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, + mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, cmd_status_rsp, &status); return; } @@ -2638,7 +2631,7 @@ static void mgmt_class_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), hdev->dev_class, 3); mgmt_pending_free(cmd); @@ -3427,7 +3420,7 @@ static int pairing_complete(struct mgmt_pending_cmd *cmd, u8 status) bacpy(&rp.addr.bdaddr, &conn->dst); rp.addr.type = link_to_bdaddr(conn->type, conn->dst_type); - err = mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_PAIR_DEVICE, + err = mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_PAIR_DEVICE, status, &rp, sizeof(rp)); /* So we don't get further callbacks for this connection */ @@ -5186,7 +5179,7 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, hci_update_passive_scan(hdev); } - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(status), &rp, sizeof(rp)); mgmt_pending_remove(cmd); @@ -5401,7 +5394,7 @@ static void mgmt_remove_adv_monitor_complete(struct hci_dev *hdev, hci_update_passive_scan(hdev); } - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(status), &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -5777,7 
+5770,7 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) return; - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); mgmt_pending_remove(cmd); @@ -5998,7 +5991,7 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, mgmt_status(err), + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); mgmt_pending_remove(cmd); @@ -6223,7 +6216,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) u8 status = mgmt_status(err); if (status) { - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, + mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, cmd_status_rsp, &status); return; } @@ -6233,7 +6226,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) else hci_dev_clear_flag(hdev, HCI_ADVERTISING); - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, settings_rsp, + mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp, &match); new_settings(hdev, match.sk); @@ -6577,7 +6570,7 @@ static void set_bredr_complete(struct hci_dev *hdev, void *data, int err) */ hci_dev_clear_flag(hdev, HCI_BREDR_ENABLED); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); } else { send_settings_rsp(cmd->sk, MGMT_OP_SET_BREDR, hdev); new_settings(hdev, cmd->sk); @@ -6714,7 +6707,7 @@ static void set_secure_conn_complete(struct hci_dev *hdev, void *data, int err) if (err) { u8 mgmt_err = mgmt_status(err); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); goto done; } @@ -7161,7 +7154,7 @@ static void 
get_conn_info_complete(struct hci_dev *hdev, void *data, int err) rp.max_tx_power = HCI_TX_POWER_INVALID; } - mgmt_cmd_complete(cmd->sk, cmd->index, MGMT_OP_GET_CONN_INFO, status, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, MGMT_OP_GET_CONN_INFO, status, &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -7321,7 +7314,7 @@ static void get_clock_info_complete(struct hci_dev *hdev, void *data, int err) } complete: - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, status, &rp, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, status, &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -8571,10 +8564,10 @@ static void add_advertising_complete(struct hci_dev *hdev, void *data, int err) rp.instance = cp->instance; if (err) - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); else - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), &rp, sizeof(rp)); add_adv_complete(hdev, cmd->sk, cp->instance, err); @@ -8762,10 +8755,10 @@ static void add_ext_adv_params_complete(struct hci_dev *hdev, void *data, hci_remove_adv_instance(hdev, cp->instance); - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); } else { - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), &rp, sizeof(rp)); } @@ -8912,10 +8905,10 @@ static void add_ext_adv_data_complete(struct hci_dev *hdev, void *data, int err) rp.instance = cp->instance; if (err) - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); else - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -9074,10 +9067,10 @@ static void remove_advertising_complete(struct 
hci_dev *hdev, void *data, rp.instance = cp->instance; if (err) - mgmt_cmd_status(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err)); else - mgmt_cmd_complete(cmd->sk, cmd->index, cmd->opcode, + mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, MGMT_STATUS_SUCCESS, &rp, sizeof(rp)); mgmt_pending_free(cmd); @@ -9349,7 +9342,7 @@ void mgmt_index_removed(struct hci_dev *hdev) if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) return; - mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match); + mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match); if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { mgmt_index_event(MGMT_EV_UNCONF_INDEX_REMOVED, hdev, NULL, 0, @@ -9387,7 +9380,8 @@ void mgmt_power_on(struct hci_dev *hdev, int err) hci_update_passive_scan(hdev); } - mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp, + &match); new_settings(hdev, match.sk); @@ -9402,7 +9396,8 @@ void __mgmt_power_off(struct hci_dev *hdev) struct cmd_lookup match = { NULL, hdev }; u8 zero_cod[] = { 0, 0, 0 }; - mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match); + mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, true, settings_rsp, + &match); /* If the power off is because of hdev unregistration let * use the appropriate INVALID_INDEX status. 
Otherwise use @@ -9416,7 +9411,7 @@ void __mgmt_power_off(struct hci_dev *hdev) else match.mgmt_status = MGMT_STATUS_NOT_POWERED; - mgmt_pending_foreach(0, hdev, cmd_complete_rsp, &match); + mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match); if (memcmp(hdev->dev_class, zero_cod, sizeof(zero_cod)) != 0) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, @@ -9657,7 +9652,6 @@ static void unpair_device_rsp(struct mgmt_pending_cmd *cmd, void *data) device_unpaired(hdev, &cp->addr.bdaddr, cp->addr.type, cmd->sk); cmd->cmd_complete(cmd, 0); - mgmt_pending_remove(cmd); } bool mgmt_powering_down(struct hci_dev *hdev) @@ -9713,8 +9707,8 @@ void mgmt_disconnect_failed(struct hci_dev *hdev, bdaddr_t *bdaddr, struct mgmt_cp_disconnect *cp; struct mgmt_pending_cmd *cmd; - mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, unpair_device_rsp, - hdev); + mgmt_pending_foreach(MGMT_OP_UNPAIR_DEVICE, hdev, true, + unpair_device_rsp, hdev); cmd = pending_find(MGMT_OP_DISCONNECT, hdev); if (!cmd) @@ -9907,7 +9901,7 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) if (status) { u8 mgmt_err = mgmt_status(status); - mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, + mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true, cmd_status_rsp, &mgmt_err); return; } @@ -9917,8 +9911,8 @@ void mgmt_auth_enable_complete(struct hci_dev *hdev, u8 status) else changed = hci_dev_test_and_clear_flag(hdev, HCI_LINK_SECURITY); - mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, settings_rsp, - &match); + mgmt_pending_foreach(MGMT_OP_SET_LINK_SECURITY, hdev, true, + settings_rsp, &match); if (changed) new_settings(hdev, match.sk); @@ -9942,9 +9936,12 @@ void mgmt_set_class_of_dev_complete(struct hci_dev *hdev, u8 *dev_class, { struct cmd_lookup match = { NULL, hdev, mgmt_status(status) }; - mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, sk_lookup, &match); - mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, sk_lookup, &match); - 
mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, sk_lookup, &match); + mgmt_pending_foreach(MGMT_OP_SET_DEV_CLASS, hdev, false, sk_lookup, + &match); + mgmt_pending_foreach(MGMT_OP_ADD_UUID, hdev, false, sk_lookup, + &match); + mgmt_pending_foreach(MGMT_OP_REMOVE_UUID, hdev, false, sk_lookup, + &match); if (!status) { mgmt_limited_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev, dev_class, diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index 3713ff490c65..a88a07da3947 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -217,30 +217,47 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev) { - struct mgmt_pending_cmd *cmd; + struct mgmt_pending_cmd *cmd, *tmp; + + mutex_lock(&hdev->mgmt_pending_lock); - list_for_each_entry(cmd, &hdev->mgmt_pending, list) { + list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (hci_sock_get_channel(cmd->sk) != channel) continue; - if (cmd->opcode == opcode) + + if (cmd->opcode == opcode) { + mutex_unlock(&hdev->mgmt_pending_lock); return cmd; + } } + mutex_unlock(&hdev->mgmt_pending_lock); + return NULL; } -void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, +void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data) { struct mgmt_pending_cmd *cmd, *tmp; + mutex_lock(&hdev->mgmt_pending_lock); + list_for_each_entry_safe(cmd, tmp, &hdev->mgmt_pending, list) { if (opcode > 0 && cmd->opcode != opcode) continue; + if (remove) + list_del(&cmd->list); + cb(cmd, data); + + if (remove) + mgmt_pending_free(cmd); } + + mutex_unlock(&hdev->mgmt_pending_lock); } struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, @@ -254,7 +271,7 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, return NULL; cmd->opcode = opcode; - cmd->index = hdev->id; + cmd->hdev = 
hdev; cmd->param = kmemdup(data, len, GFP_KERNEL); if (!cmd->param) { @@ -280,7 +297,9 @@ struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, if (!cmd) return NULL; + mutex_lock(&hdev->mgmt_pending_lock); list_add_tail(&cmd->list, &hdev->mgmt_pending); + mutex_unlock(&hdev->mgmt_pending_lock); return cmd; } @@ -294,7 +313,10 @@ void mgmt_pending_free(struct mgmt_pending_cmd *cmd) void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) { + mutex_lock(&cmd->hdev->mgmt_pending_lock); list_del(&cmd->list); + mutex_unlock(&cmd->hdev->mgmt_pending_lock); + mgmt_pending_free(cmd); } diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index f2ba994ab1d8..024e51dd6937 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -33,7 +33,7 @@ struct mgmt_mesh_tx { struct mgmt_pending_cmd { struct list_head list; u16 opcode; - int index; + struct hci_dev *hdev; void *param; size_t param_len; struct sock *sk; @@ -54,7 +54,7 @@ int mgmt_cmd_complete(struct sock *sk, u16 index, u16 cmd, u8 status, struct mgmt_pending_cmd *mgmt_pending_find(unsigned short channel, u16 opcode, struct hci_dev *hdev); -void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, +void mgmt_pending_foreach(u16 opcode, struct hci_dev *hdev, bool remove, void (*cb)(struct mgmt_pending_cmd *cmd, void *data), void *data); struct mgmt_pending_cmd *mgmt_pending_add(struct sock *sk, u16 opcode, -- cgit v1.2.3 From 127c49624a0980ee7b8a5ba9094d6942332a48da Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Wed, 4 Jun 2025 18:06:04 +0200 Subject: can: add drop reasons in the receive path of AF_CAN Besides the existing pr_warn_once(), use skb drop reasons in case AF_CAN layer drops non-conformant CAN{,FD,XL} frames, or conformant frames received by "wrong" devices, so that it's possible to debug (and count) such events using existing tracepoints: | # perf record -e skb:kfree_skb -aR -- ./drv/canfdtest -v -g -l 1 vcan0 | # perf script | [...] 
| canfdtest 1123 [000] 3893.271264: skb:kfree_skb: skbaddr=0xffff975703c9f700 rx_sk=(nil) protocol=12 location=can_rcv+0x4b reason: CAN_RX_INVALID_FRAME Signed-off-by: Davide Caratti Link: https://patch.msgid.link/20250604160605.1005704-2-dcaratti@redhat.com Signed-off-by: Marc Kleine-Budde --- include/net/dropreason-core.h | 18 ++++++++++++++++++ net/can/af_can.c | 6 +++--- 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index bcf9d7467e1a..b9e78290269e 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -121,6 +121,9 @@ FN(ARP_PVLAN_DISABLE) \ FN(MAC_IEEE_MAC_CONTROL) \ FN(BRIDGE_INGRESS_STP_STATE) \ + FN(CAN_RX_INVALID_FRAME) \ + FN(CANFD_RX_INVALID_FRAME) \ + FN(CANXL_RX_INVALID_FRAME) \ FNe(MAX) /** @@ -573,6 +576,21 @@ enum skb_drop_reason { * ingress bridge port does not allow frames to be forwarded. */ SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE, + /** + * @SKB_DROP_REASON_CAN_RX_INVALID_FRAME: received + * non conform CAN frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CAN_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANFD_RX_INVALID_FRAME: received + * non conform CAN-FD frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANFD_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_CANXL_RX_INVALID_FRAME: received + * non conform CAN-XL frame (or device is unable to receive CAN frames) + */ + SKB_DROP_REASON_CANXL_RX_INVALID_FRAME, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/can/af_can.c b/net/can/af_can.c index 4aab7033c933..b2387a46794a 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -683,7 +683,7 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); - 
kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_CAN_RX_INVALID_FRAME); return NET_RX_DROP; } @@ -698,7 +698,7 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_CANFD_RX_INVALID_FRAME); return NET_RX_DROP; } @@ -713,7 +713,7 @@ static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_CANXL_RX_INVALID_FRAME); return NET_RX_DROP; } -- cgit v1.2.3 From c09ef59e17c6921c577d54bc8da4331b955d01a7 Mon Sep 17 00:00:00 2001 From: Dipayaan Roy Date: Mon, 9 Jun 2025 03:01:03 -0700 Subject: net: mana: Expose additional hardware counters for drop and TC via ethtool. Add support for reporting additional hardware counters for drop and TC using the ethtool -S interface. These counters include: - Aggregate Rx/Tx drop counters - Per-TC Rx/Tx packet counters - Per-TC Rx/Tx byte counters - Per-TC Rx/Tx pause frame counters The counters are exposed using ethtool_ops->get_ethtool_stats and ethtool_ops->get_strings. These counters are not available on all versions of the hardware. 
Signed-off-by: Dipayaan Roy Reviewed-by: Subbaraya Sundeep Reviewed-by: Haiyang Zhang Link: https://patch.msgid.link/20250609100103.GA7102@linuxonhyperv3.guj3yctzbm1etfxqx2vob5hsef.xx.internal.cloudapp.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/hw_channel.c | 6 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 87 +++++++++++++- drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 76 +++++++++++- include/net/mana/mana.h | 131 +++++++++++++++++++++ 4 files changed, 292 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index a8c4d8db75a5..3d3677c0d014 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021, Microsoft Corporation. */ #include +#include #include #include @@ -890,8 +891,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, } if (ctx->status_code && ctx->status_code != GDMA_STATUS_MORE_ENTRIES) { - dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n", - ctx->status_code); + if (req_msg->req.msg_type != MANA_QUERY_PHY_STAT) + dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n", + ctx->status_code); err = -EPROTO; goto out; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index ccd2885c939e..e68b8190bb7a 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -774,8 +774,9 @@ static int mana_send_request(struct mana_context *ac, void *in_buf, err = mana_gd_send_request(gc, in_len, in_buf, out_len, out_buf); if (err || resp->status) { - dev_err(dev, "Failed to send mana message: %d, 0x%x\n", - err, resp->status); + if (req->req.msg_type != MANA_QUERY_PHY_STAT) + dev_err(dev, "Failed to send mana message: %d, 0x%x\n", + err, resp->status); return err ? 
err : -EPROTO; } @@ -2611,6 +2612,88 @@ void mana_query_gf_stats(struct mana_port_context *apc) apc->eth_stats.hc_tx_err_gdma = resp.tx_err_gdma; } +void mana_query_phy_stats(struct mana_port_context *apc) +{ + struct mana_query_phy_stat_resp resp = {}; + struct mana_query_phy_stat_req req = {}; + struct net_device *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_PHY_STAT, + sizeof(req), sizeof(resp)); + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + if (err) + return; + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_PHY_STAT, + sizeof(resp)); + if (err || resp.hdr.status) { + netdev_err(ndev, + "Failed to query PHY stats: %d, resp:0x%x\n", + err, resp.hdr.status); + return; + } + + /* Aggregate drop counters */ + apc->phy_stats.rx_pkt_drop_phy = resp.rx_pkt_drop_phy; + apc->phy_stats.tx_pkt_drop_phy = resp.tx_pkt_drop_phy; + + /* Per TC traffic Counters */ + apc->phy_stats.rx_pkt_tc0_phy = resp.rx_pkt_tc0_phy; + apc->phy_stats.tx_pkt_tc0_phy = resp.tx_pkt_tc0_phy; + apc->phy_stats.rx_pkt_tc1_phy = resp.rx_pkt_tc1_phy; + apc->phy_stats.tx_pkt_tc1_phy = resp.tx_pkt_tc1_phy; + apc->phy_stats.rx_pkt_tc2_phy = resp.rx_pkt_tc2_phy; + apc->phy_stats.tx_pkt_tc2_phy = resp.tx_pkt_tc2_phy; + apc->phy_stats.rx_pkt_tc3_phy = resp.rx_pkt_tc3_phy; + apc->phy_stats.tx_pkt_tc3_phy = resp.tx_pkt_tc3_phy; + apc->phy_stats.rx_pkt_tc4_phy = resp.rx_pkt_tc4_phy; + apc->phy_stats.tx_pkt_tc4_phy = resp.tx_pkt_tc4_phy; + apc->phy_stats.rx_pkt_tc5_phy = resp.rx_pkt_tc5_phy; + apc->phy_stats.tx_pkt_tc5_phy = resp.tx_pkt_tc5_phy; + apc->phy_stats.rx_pkt_tc6_phy = resp.rx_pkt_tc6_phy; + apc->phy_stats.tx_pkt_tc6_phy = resp.tx_pkt_tc6_phy; + apc->phy_stats.rx_pkt_tc7_phy = resp.rx_pkt_tc7_phy; + apc->phy_stats.tx_pkt_tc7_phy = resp.tx_pkt_tc7_phy; + + /* Per TC byte Counters */ + apc->phy_stats.rx_byte_tc0_phy = resp.rx_byte_tc0_phy; + apc->phy_stats.tx_byte_tc0_phy = resp.tx_byte_tc0_phy; + apc->phy_stats.rx_byte_tc1_phy = 
resp.rx_byte_tc1_phy; + apc->phy_stats.tx_byte_tc1_phy = resp.tx_byte_tc1_phy; + apc->phy_stats.rx_byte_tc2_phy = resp.rx_byte_tc2_phy; + apc->phy_stats.tx_byte_tc2_phy = resp.tx_byte_tc2_phy; + apc->phy_stats.rx_byte_tc3_phy = resp.rx_byte_tc3_phy; + apc->phy_stats.tx_byte_tc3_phy = resp.tx_byte_tc3_phy; + apc->phy_stats.rx_byte_tc4_phy = resp.rx_byte_tc4_phy; + apc->phy_stats.tx_byte_tc4_phy = resp.tx_byte_tc4_phy; + apc->phy_stats.rx_byte_tc5_phy = resp.rx_byte_tc5_phy; + apc->phy_stats.tx_byte_tc5_phy = resp.tx_byte_tc5_phy; + apc->phy_stats.rx_byte_tc6_phy = resp.rx_byte_tc6_phy; + apc->phy_stats.tx_byte_tc6_phy = resp.tx_byte_tc6_phy; + apc->phy_stats.rx_byte_tc7_phy = resp.rx_byte_tc7_phy; + apc->phy_stats.tx_byte_tc7_phy = resp.tx_byte_tc7_phy; + + /* Per TC pause Counters */ + apc->phy_stats.rx_pause_tc0_phy = resp.rx_pause_tc0_phy; + apc->phy_stats.tx_pause_tc0_phy = resp.tx_pause_tc0_phy; + apc->phy_stats.rx_pause_tc1_phy = resp.rx_pause_tc1_phy; + apc->phy_stats.tx_pause_tc1_phy = resp.tx_pause_tc1_phy; + apc->phy_stats.rx_pause_tc2_phy = resp.rx_pause_tc2_phy; + apc->phy_stats.tx_pause_tc2_phy = resp.tx_pause_tc2_phy; + apc->phy_stats.rx_pause_tc3_phy = resp.rx_pause_tc3_phy; + apc->phy_stats.tx_pause_tc3_phy = resp.tx_pause_tc3_phy; + apc->phy_stats.rx_pause_tc4_phy = resp.rx_pause_tc4_phy; + apc->phy_stats.tx_pause_tc4_phy = resp.tx_pause_tc4_phy; + apc->phy_stats.rx_pause_tc5_phy = resp.rx_pause_tc5_phy; + apc->phy_stats.tx_pause_tc5_phy = resp.tx_pause_tc5_phy; + apc->phy_stats.rx_pause_tc6_phy = resp.rx_pause_tc6_phy; + apc->phy_stats.tx_pause_tc6_phy = resp.tx_pause_tc6_phy; + apc->phy_stats.rx_pause_tc7_phy = resp.rx_pause_tc7_phy; + apc->phy_stats.tx_pause_tc7_phy = resp.tx_pause_tc7_phy; +} + static int mana_init_port(struct net_device *ndev) { struct mana_port_context *apc = netdev_priv(ndev); diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index c419626073f5..4fb3a04994a2 
100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -7,10 +7,12 @@ #include -static const struct { +struct mana_stats_desc { char name[ETH_GSTRING_LEN]; u16 offset; -} mana_eth_stats[] = { +}; + +static const struct mana_stats_desc mana_eth_stats[] = { {"stop_queue", offsetof(struct mana_ethtool_stats, stop_queue)}, {"wake_queue", offsetof(struct mana_ethtool_stats, wake_queue)}, {"hc_rx_discards_no_wqe", offsetof(struct mana_ethtool_stats, @@ -75,6 +77,59 @@ static const struct { rx_cqe_unknown_type)}, }; +static const struct mana_stats_desc mana_phy_stats[] = { + { "hc_rx_pkt_drop_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_drop_phy) }, + { "hc_tx_pkt_drop_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_drop_phy) }, + { "hc_tc0_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc0_phy) }, + { "hc_tc0_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc0_phy) }, + { "hc_tc0_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc0_phy) }, + { "hc_tc0_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc0_phy) }, + { "hc_tc1_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc1_phy) }, + { "hc_tc1_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc1_phy) }, + { "hc_tc1_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc1_phy) }, + { "hc_tc1_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc1_phy) }, + { "hc_tc2_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc2_phy) }, + { "hc_tc2_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc2_phy) }, + { "hc_tc2_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc2_phy) }, + { "hc_tc2_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc2_phy) }, + { "hc_tc3_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc3_phy) }, + { "hc_tc3_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, 
rx_byte_tc3_phy) }, + { "hc_tc3_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc3_phy) }, + { "hc_tc3_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc3_phy) }, + { "hc_tc4_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc4_phy) }, + { "hc_tc4_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc4_phy) }, + { "hc_tc4_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc4_phy) }, + { "hc_tc4_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc4_phy) }, + { "hc_tc5_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc5_phy) }, + { "hc_tc5_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc5_phy) }, + { "hc_tc5_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc5_phy) }, + { "hc_tc5_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc5_phy) }, + { "hc_tc6_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc6_phy) }, + { "hc_tc6_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc6_phy) }, + { "hc_tc6_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc6_phy) }, + { "hc_tc6_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc6_phy) }, + { "hc_tc7_rx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, rx_pkt_tc7_phy) }, + { "hc_tc7_rx_byte_phy", offsetof(struct mana_ethtool_phy_stats, rx_byte_tc7_phy) }, + { "hc_tc7_tx_pkt_phy", offsetof(struct mana_ethtool_phy_stats, tx_pkt_tc7_phy) }, + { "hc_tc7_tx_byte_phy", offsetof(struct mana_ethtool_phy_stats, tx_byte_tc7_phy) }, + { "hc_tc0_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc0_phy) }, + { "hc_tc0_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc0_phy) }, + { "hc_tc1_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc1_phy) }, + { "hc_tc1_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc1_phy) }, + { "hc_tc2_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc2_phy) }, + { 
"hc_tc2_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc2_phy) }, + { "hc_tc3_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc3_phy) }, + { "hc_tc3_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc3_phy) }, + { "hc_tc4_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc4_phy) }, + { "hc_tc4_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc4_phy) }, + { "hc_tc5_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc5_phy) }, + { "hc_tc5_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc5_phy) }, + { "hc_tc6_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc6_phy) }, + { "hc_tc6_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc6_phy) }, + { "hc_tc7_rx_pause_phy", offsetof(struct mana_ethtool_phy_stats, rx_pause_tc7_phy) }, + { "hc_tc7_tx_pause_phy", offsetof(struct mana_ethtool_phy_stats, tx_pause_tc7_phy) }, +}; + static int mana_get_sset_count(struct net_device *ndev, int stringset) { struct mana_port_context *apc = netdev_priv(ndev); @@ -83,8 +138,8 @@ static int mana_get_sset_count(struct net_device *ndev, int stringset) if (stringset != ETH_SS_STATS) return -EINVAL; - return ARRAY_SIZE(mana_eth_stats) + num_queues * - (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT); + return ARRAY_SIZE(mana_eth_stats) + ARRAY_SIZE(mana_phy_stats) + + num_queues * (MANA_STATS_RX_COUNT + MANA_STATS_TX_COUNT); } static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) @@ -99,6 +154,9 @@ static void mana_get_strings(struct net_device *ndev, u32 stringset, u8 *data) for (i = 0; i < ARRAY_SIZE(mana_eth_stats); i++) ethtool_puts(&data, mana_eth_stats[i].name); + for (i = 0; i < ARRAY_SIZE(mana_phy_stats); i++) + ethtool_puts(&data, mana_phy_stats[i].name); + for (i = 0; i < num_queues; i++) { ethtool_sprintf(&data, "rx_%d_packets", i); ethtool_sprintf(&data, "rx_%d_bytes", i); @@ -128,6 +186,7 @@ static void 
mana_get_ethtool_stats(struct net_device *ndev, struct mana_port_context *apc = netdev_priv(ndev); unsigned int num_queues = apc->num_queues; void *eth_stats = &apc->eth_stats; + void *phy_stats = &apc->phy_stats; struct mana_stats_rx *rx_stats; struct mana_stats_tx *tx_stats; unsigned int start; @@ -151,9 +210,18 @@ static void mana_get_ethtool_stats(struct net_device *ndev, /* we call mana function to update stats from GDMA */ mana_query_gf_stats(apc); + /* We call this mana function to get the phy stats from GDMA and includes + * aggregate tx/rx drop counters, Per-TC(Traffic Channel) tx/rx and pause + * counters. + */ + mana_query_phy_stats(apc); + for (q = 0; q < ARRAY_SIZE(mana_eth_stats); q++) data[i++] = *(u64 *)(eth_stats + mana_eth_stats[q].offset); + for (q = 0; q < ARRAY_SIZE(mana_phy_stats); q++) + data[i++] = *(u64 *)(phy_stats + mana_phy_stats[q].offset); + for (q = 0; q < num_queues; q++) { rx_stats = &apc->rxqs[q]->stats; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 9abb66461211..4176edf1be71 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -404,6 +404,65 @@ struct mana_ethtool_stats { u64 rx_cqe_unknown_type; }; +struct mana_ethtool_phy_stats { + /* Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 
tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; +}; + struct mana_context { struct gdma_dev *gdma_dev; @@ -474,6 +533,8 @@ struct mana_port_context { struct mana_ethtool_stats eth_stats; + struct mana_ethtool_phy_stats phy_stats; + /* Debugfs */ struct dentry *mana_port_debugfs; }; @@ -501,6 +562,7 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); void mana_query_gf_stats(struct mana_port_context *apc); +void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); @@ -527,6 +589,7 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ MANA_REGISTER_FILTER = 0x28000, @@ -689,6 +752,74 @@ struct mana_query_gf_stat_resp { u64 tx_err_gdma; }; /* HW DATA */ +/* Query phy stats */ +struct mana_query_phy_stat_req { + struct gdma_req_hdr hdr; + u64 req_stats; +}; /* HW DATA */ + +struct mana_query_phy_stat_resp { + struct gdma_resp_hdr hdr; + u64 reported_stats; + + /* Aggregate Drop Counters */ + u64 rx_pkt_drop_phy; + u64 tx_pkt_drop_phy; + + /* Per TC(Traffic class) traffic Counters */ + u64 rx_pkt_tc0_phy; + u64 tx_pkt_tc0_phy; + u64 rx_pkt_tc1_phy; + u64 tx_pkt_tc1_phy; + u64 rx_pkt_tc2_phy; + u64 tx_pkt_tc2_phy; + u64 
rx_pkt_tc3_phy; + u64 tx_pkt_tc3_phy; + u64 rx_pkt_tc4_phy; + u64 tx_pkt_tc4_phy; + u64 rx_pkt_tc5_phy; + u64 tx_pkt_tc5_phy; + u64 rx_pkt_tc6_phy; + u64 tx_pkt_tc6_phy; + u64 rx_pkt_tc7_phy; + u64 tx_pkt_tc7_phy; + + u64 rx_byte_tc0_phy; + u64 tx_byte_tc0_phy; + u64 rx_byte_tc1_phy; + u64 tx_byte_tc1_phy; + u64 rx_byte_tc2_phy; + u64 tx_byte_tc2_phy; + u64 rx_byte_tc3_phy; + u64 tx_byte_tc3_phy; + u64 rx_byte_tc4_phy; + u64 tx_byte_tc4_phy; + u64 rx_byte_tc5_phy; + u64 tx_byte_tc5_phy; + u64 rx_byte_tc6_phy; + u64 tx_byte_tc6_phy; + u64 rx_byte_tc7_phy; + u64 tx_byte_tc7_phy; + + /* Per TC(Traffic Class) pause Counters */ + u64 rx_pause_tc0_phy; + u64 tx_pause_tc0_phy; + u64 rx_pause_tc1_phy; + u64 tx_pause_tc1_phy; + u64 rx_pause_tc2_phy; + u64 tx_pause_tc2_phy; + u64 rx_pause_tc3_phy; + u64 tx_pause_tc3_phy; + u64 rx_pause_tc4_phy; + u64 tx_pause_tc4_phy; + u64 rx_pause_tc5_phy; + u64 tx_pause_tc5_phy; + u64 rx_pause_tc6_phy; + u64 tx_pause_tc6_phy; + u64 rx_pause_tc7_phy; + u64 tx_pause_tc7_phy; +}; /* HW DATA */ + /* Configure vPort Rx Steering */ struct mana_cfg_rx_steer_req_v2 { struct gdma_req_hdr hdr; -- cgit v1.2.3 From 561939ed44932da639ba703ffcd4d4d5ff2c7569 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 9 Jun 2025 11:32:35 -0400 Subject: net: remove unused sock_enable_timestamps This function was introduced in commit 783da70e8396 ("net: add sock_enable_timestamps"), with one caller in rxrpc. That only caller was removed in commit 7903d4438b3f ("rxrpc: Don't use received skbuff timestamps"). 
Signed-off-by: Willem de Bruijn Reviewed-by: Jason Xing Link: https://patch.msgid.link/20250609153254.3504909-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 1 - net/core/sock.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 92e7c1aae3cc..85e17da5c9db 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2982,7 +2982,6 @@ void sock_set_timestamp(struct sock *sk, int optname, bool valbool); int sock_set_timestamping(struct sock *sk, int optname, struct so_timestamping timestamping); -void sock_enable_timestamps(struct sock *sk); #if defined(CONFIG_CGROUP_BPF) void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op); #else diff --git a/net/core/sock.c b/net/core/sock.c index 3b409bc8ef6d..502042a0d3b5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -837,14 +837,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) } } -void sock_enable_timestamps(struct sock *sk) -{ - lock_sock(sk); - __sock_set_timestamps(sk, true, false, true); - release_sock(sk); -} -EXPORT_SYMBOL(sock_enable_timestamps); - void sock_set_timestamp(struct sock *sk, int optname, bool valbool) { switch (optname) { -- cgit v1.2.3 From 2bc64b89c4c4073ee8f9543373c64da9b6bbe5e0 Mon Sep 17 00:00:00 2001 From: Gur Stavi Date: Mon, 9 Jun 2025 18:07:52 +0300 Subject: queue_api: add subqueue variant netif_subqueue_sent Add a new function, netif_subqueue_sent, which is a wrapper for netdev_tx_sent_queue. Drivers that use the subqueue variant macros, netif_subqueue_xxx, identify queue by index and are not required to obtain struct netdev_queue explicitly. Such drivers still need to call netdev_tx_sent_queue which is a counterpart of netif_subqueue_completed_wake. Allowing drivers to use a subqueue variant for this purpose improves their code consistency by always referring to queue by its index. 
Signed-off-by: Gur Stavi Link: https://patch.msgid.link/909a5c92db49cad39f0954d6cb86775e6480ef4c.1749038081.git.gur.stavi@huawei.com Signed-off-by: Jakub Kicinski --- include/net/netdev_queues.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/net') diff --git a/include/net/netdev_queues.h b/include/net/netdev_queues.h index ba2eaf39089b..6e835972abd1 100644 --- a/include/net/netdev_queues.h +++ b/include/net/netdev_queues.h @@ -294,6 +294,15 @@ netdev_txq_completed_mb(struct netdev_queue *dev_queue, netif_txq_try_stop(_txq, get_desc, start_thrs); \ }) +static inline void netif_subqueue_sent(const struct net_device *dev, + unsigned int idx, unsigned int bytes) +{ + struct netdev_queue *txq; + + txq = netdev_get_tx_queue(dev, idx); + netdev_tx_sent_queue(txq, bytes); +} + #define netif_subqueue_maybe_stop(dev, idx, get_desc, stop_thrs, start_thrs) \ ({ \ struct netdev_queue *_txq; \ -- cgit v1.2.3 From 2660a544fdc0940bba15f70508a46cf9a6491230 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 9 Jun 2025 19:08:03 +0200 Subject: net: Fix TOCTOU issue in sk_is_readable() sk->sk_prot->sock_is_readable is a valid function pointer when sk resides in a sockmap. After the last sk_psock_put() (which usually happens when socket is removed from sockmap), sk->sk_prot gets restored and sk->sk_prot->sock_is_readable becomes NULL. This makes sk_is_readable() racy, if the value of sk->sk_prot is reloaded after the initial check. Which in turn may lead to a null pointer dereference. Ensure the function pointer does not turn NULL after the check. 
Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Suggested-by: Jakub Sitnicki Signed-off-by: Michal Luczaj Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250609-skisreadable-toctou-v1-1-d0dfb2d62c37@rbox.co Signed-off-by: Jakub Kicinski --- include/net/sock.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 92e7c1aae3cc..4c37015b7cf7 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -3010,8 +3010,11 @@ int sock_ioctl_inout(struct sock *sk, unsigned int cmd, int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); static inline bool sk_is_readable(struct sock *sk) { - if (sk->sk_prot->sock_is_readable) - return sk->sk_prot->sock_is_readable(sk); + const struct proto *prot = READ_ONCE(sk->sk_prot); + + if (prot->sock_is_readable) + return prot->sock_is_readable(sk); + return false; } #endif /* _SOCK_H */ -- cgit v1.2.3 From 5842c01a9ed1d515c8ba2d6d3733eac78ace89c1 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 6 Jun 2025 14:32:49 -0400 Subject: Bluetooth: ISO: Fix not using bc_sid as advertisement SID Currently bc_sid is being ignored when acting in the Broadcast Source role, so fix this by passing the bc_sid and then using it when programming the PA: < HCI Command: LE Set Exte.. 
(0x08|0x0036) plen 25 Handle: 0x01 Properties: 0x0000 Min advertising interval: 140.000 msec (0x00e0) Max advertising interval: 140.000 msec (0x00e0) Channel map: 37, 38, 39 (0x07) Own address type: Random (0x01) Peer address type: Public (0x00) Peer address: 00:00:00:00:00:00 (OUI 00-00-00) Filter policy: Allow Scan Request from Any, Allow Connect Request from Any (0x00) TX power: Host has no preference (0x7f) Primary PHY: LE 1M (0x01) Secondary max skip: 0x00 Secondary PHY: LE 2M (0x02) SID: 0x01 Scan request notifications: Disabled (0x00) Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 9 ++++++--- include/net/bluetooth/hci_sync.h | 4 ++-- net/bluetooth/hci_conn.c | 31 ++++++++++++++++++++++++------- net/bluetooth/hci_core.c | 16 +++++++++++++++- net/bluetooth/hci_sync.c | 20 +++++++++++++++++--- net/bluetooth/iso.c | 12 ++++++++---- 6 files changed, 72 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f7b1a9eb9543..a760f05fa3fb 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -242,6 +242,7 @@ struct adv_info { __u8 mesh; __u8 instance; __u8 handle; + __u8 sid; __u32 flags; __u16 timeout; __u16 remaining_time; @@ -1551,13 +1552,14 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, u16 timeout); struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); -struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base); struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, struct bt_iso_qos *qos); struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos, __u8 
data_len, __u8 *data); struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos); @@ -1832,6 +1834,7 @@ int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, void hci_adv_instances_clear(struct hci_dev *hdev); struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance); +struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid); struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance); struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, @@ -1839,7 +1842,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle); -struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, +struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval); int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, diff --git a/include/net/bluetooth/hci_sync.h b/include/net/bluetooth/hci_sync.h index 72558c826aa1..5224f57f6af2 100644 --- a/include/net/bluetooth/hci_sync.h +++ b/include/net/bluetooth/hci_sync.h @@ -115,8 +115,8 @@ int hci_enable_ext_advertising_sync(struct hci_dev *hdev, u8 instance); int hci_enable_advertising_sync(struct hci_dev *hdev); int hci_enable_advertising(struct hci_dev *hdev); -int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, - u8 *data, u32 flags, u16 min_interval, +int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, + u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval); int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 99efeed6a766..4f379184df5b 100644 --- a/net/bluetooth/hci_conn.c +++ 
b/net/bluetooth/hci_conn.c @@ -1501,8 +1501,8 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, - struct bt_iso_qos *qos, __u8 base_len, - __u8 *base) + __u8 sid, struct bt_iso_qos *qos, + __u8 base_len, __u8 *base) { struct hci_conn *conn; int err; @@ -1543,6 +1543,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, return conn; conn->state = BT_CONNECT; + conn->sid = sid; hci_conn_hold(conn); return conn; @@ -2062,7 +2063,8 @@ static int create_big_sync(struct hci_dev *hdev, void *data) if (qos->bcast.bis) sync_interval = interval * 4; - err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->le_per_adv_data_len, + err = hci_start_per_adv_sync(hdev, qos->bcast.bis, conn->sid, + conn->le_per_adv_data_len, conn->le_per_adv_data, flags, interval, interval, sync_interval); if (err) @@ -2134,7 +2136,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) } } -struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, +struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, __u8 base_len, __u8 *base) { @@ -2156,7 +2158,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, qos, base_len, eir); + conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir); if (IS_ERR(conn)) return conn; @@ -2207,20 +2209,35 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data) } struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos, + __u8 dst_type, __u8 sid, + struct bt_iso_qos *qos, __u8 base_len, __u8 *base) { struct hci_conn *conn; int err; struct iso_list_data data; - conn = hci_bind_bis(hdev, dst, qos, base_len, base); + conn = 
hci_bind_bis(hdev, dst, sid, qos, base_len, base); if (IS_ERR(conn)) return conn; if (conn->state == BT_CONNECTED) return conn; + /* Check if SID needs to be allocated then search for the first + * available. + */ + if (conn->sid == HCI_SID_INVALID) { + u8 sid; + + for (sid = 0; sid <= 0x0f; sid++) { + if (!hci_find_adv_sid(hdev, sid)) { + conn->sid = sid; + break; + } + } + } + data.big = qos->bcast.big; data.bis = qos->bcast.bis; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 487c045a7ba8..07a8b4281a39 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1584,6 +1584,19 @@ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) return NULL; } +/* This function requires the caller holds hdev->lock */ +struct adv_info *hci_find_adv_sid(struct hci_dev *hdev, u8 sid) +{ + struct adv_info *adv; + + list_for_each_entry(adv, &hdev->adv_instances, list) { + if (adv->sid == sid) + return adv; + } + + return NULL; +} + /* This function requires the caller holds hdev->lock */ struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { @@ -1736,7 +1749,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, } /* This function requires the caller holds hdev->lock */ -struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, +struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u8 sid, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval) { @@ -1748,6 +1761,7 @@ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, if (IS_ERR(adv)) return adv; + adv->sid = sid; adv->periodic = true; adv->per_adv_data_len = data_len; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 83de3847c8ea..6687f2a4d1eb 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1261,10 +1261,12 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) hci_cpu_to_le24(adv->min_interval, 
cp.min_interval); hci_cpu_to_le24(adv->max_interval, cp.max_interval); cp.tx_power = adv->tx_power; + cp.sid = adv->sid; } else { hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; + cp.sid = 0x00; } secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); @@ -1594,8 +1596,8 @@ static int hci_adv_bcast_annoucement(struct hci_dev *hdev, struct adv_info *adv) return hci_update_adv_data_sync(hdev, adv->instance); } -int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, - u8 *data, u32 flags, u16 min_interval, +int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 sid, + u8 data_len, u8 *data, u32 flags, u16 min_interval, u16 max_interval, u16 sync_interval) { struct adv_info *adv = NULL; @@ -1607,6 +1609,18 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, if (instance) { adv = hci_find_adv_instance(hdev, instance); if (adv) { + if (sid != HCI_SID_INVALID && adv->sid != sid) { + /* If the SID don't match attempt to find by + * SID. 
+ */ + adv = hci_find_adv_sid(hdev, sid); + if (!adv) { + bt_dev_err(hdev, + "Unable to find adv_info"); + return -EINVAL; + } + } + /* Turn it into periodic advertising */ adv->periodic = true; adv->per_adv_data_len = data_len; @@ -1615,7 +1629,7 @@ int hci_start_per_adv_sync(struct hci_dev *hdev, u8 instance, u8 data_len, adv->flags = flags; } else if (!adv) { /* Create an instance if that could not be found */ - adv = hci_add_per_instance(hdev, instance, flags, + adv = hci_add_per_instance(hdev, instance, sid, flags, data_len, data, sync_interval, sync_interval); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index affa2077e3a2..3c2c98eecc62 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -336,7 +336,7 @@ static int iso_connect_bis(struct sock *sk) struct hci_dev *hdev; int err; - BT_DBG("%pMR", &iso_pi(sk)->src); + BT_DBG("%pMR (SID 0x%2.2x)", &iso_pi(sk)->src, iso_pi(sk)->bc_sid); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); @@ -365,7 +365,7 @@ static int iso_connect_bis(struct sock *sk) /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { - hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, + hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { @@ -375,12 +375,16 @@ static int iso_connect_bis(struct sock *sk) } else { hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); + iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, + iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } + + /* Update SID if it was not set */ + if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) + iso_pi(sk)->bc_sid = hcon->sid; } conn = iso_conn_add(hcon); -- cgit v1.2.3 From c0f21029f123d1b15f8eddc8e3976bf0c8781c43 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 8 Jun 2025 10:42:53 
+0300 Subject: xfrm: always initialize offload path Offload path is used for GRO with SW IPsec, and not just for HW offload. So initialize it anyway. Fixes: 585b64f5a620 ("xfrm: delay initialization of offload path till its actually requested") Reported-by: Sabrina Dubroca Closes: https://lore.kernel.org/all/aEGW_5HfPqU1rFjl@krikkit Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 2 +- net/xfrm/xfrm_device.c | 1 - net/xfrm/xfrm_state.c | 6 ++---- net/xfrm/xfrm_user.c | 1 + 4 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index a21e276dbe44..e45a275fca26 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -474,7 +474,7 @@ struct xfrm_type_offload { int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); -void xfrm_set_type_offload(struct xfrm_state *x); +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load); static inline void xfrm_unset_type_offload(struct xfrm_state *x) { if (!x->type_offload) diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 81fd486b5e56..d2819baea414 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -305,7 +305,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, return -EINVAL; } - xfrm_set_type_offload(x); if (!x->type_offload) { NL_SET_ERR_MSG(extack, "Type doesn't support offload"); dev_put(dev); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7e34fc94f668..c7e6472c623d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -424,11 +424,10 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, } EXPORT_SYMBOL(xfrm_unregister_type_offload); -void xfrm_set_type_offload(struct xfrm_state *x) +void xfrm_set_type_offload(struct xfrm_state *x, bool try_load) { const struct 
xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; - bool try_load = true; retry: afinfo = xfrm_state_get_afinfo(x->props.family); @@ -607,6 +606,7 @@ static void ___xfrm_state_destroy(struct xfrm_state *x) kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); + xfrm_unset_type_offload(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -780,8 +780,6 @@ void xfrm_dev_state_free(struct xfrm_state *x) struct xfrm_dev_offload *xso = &x->xso; struct net_device *dev = READ_ONCE(xso->dev); - xfrm_unset_type_offload(x); - if (dev && dev->xfrmdev_ops) { spin_lock_bh(&xfrm_state_dev_gc_lock); if (!hlist_unhashed(&x->dev_gclist)) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 59f258daf830..1db18f470f42 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -977,6 +977,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* override default values from above */ xfrm_update_ae_params(x, attrs, 0); + xfrm_set_type_offload(x, attrs[XFRMA_OFFLOAD_DEV]); /* configure the hardware if offload is requested */ if (attrs[XFRMA_OFFLOAD_DEV]) { err = xfrm_dev_state_add(net, x, -- cgit v1.2.3 From adcaa890c7a4a91a422168d8fb629183fff07b2f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 11 Jun 2025 11:15:15 +0000 Subject: net_sched: remove qdisc_tree_flush_backlog() This function is no longer used after the four prior fixes. Given all prior uses were wrong, it seems better to remove it. 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250611111515.1983366-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sch_generic.h | 8 -------- 1 file changed, 8 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 629368ab2787..638948be4c50 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -973,14 +973,6 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, *backlog = qstats.backlog; } -static inline void qdisc_tree_flush_backlog(struct Qdisc *sch) -{ - __u32 qlen, backlog; - - qdisc_qstats_qlen_backlog(sch, &qlen, &backlog); - qdisc_tree_reduce_backlog(sch, qlen, backlog); -} - static inline void qdisc_purge_queue(struct Qdisc *sch) { __u32 qlen, backlog; -- cgit v1.2.3 From b776999bf25ddca9880bc3c9c30b8f84a748504b Mon Sep 17 00:00:00 2001 From: RubenKelevra Date: Thu, 12 Jun 2025 16:50:12 +0200 Subject: net: pfcp: fix typo in message_priority field name The field is spelled "message_priprity" in the big-endian bit-field definition. Nothing in-tree currently references the member, so the typo does not break kernel builds, but it is clearly incorrect. 
Signed-off-by: RubenKelevra Link: https://patch.msgid.link/20250612145012.185321-1-rubenkelevra@gmail.com Signed-off-by: Jakub Kicinski --- include/net/pfcp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/pfcp.h b/include/net/pfcp.h index af14f970b80e..639553797d3e 100644 --- a/include/net/pfcp.h +++ b/include/net/pfcp.h @@ -45,7 +45,7 @@ struct pfcphdr_session { reserved:4; #elif defined(__BIG_ENDIAN_BITFIELD) u8 reserved:4, - message_priprity:4; + message_priority:4; #else #error "Please fix " #endif -- cgit v1.2.3 From 6ad5ff6e7282d1252364cc08af88260ef0ec4cda Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:19 +0200 Subject: libeth: convert to netmem Back when the libeth Rx core was initially written, devmem was a draft and netmem_ref didn't exist in the mainline. Now that it's here, make libeth MP-agnostic before introducing any new code or any new library users. When it's known that the created PP/FQ is for header buffers, use faster "unsafe" underscored netmem <--> virt accessors as netmem_is_net_iov() is always false in that case, but consumes some cycles (bit test + true branch). 
Reviewed-by: Mina Almasry Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 14 +++++---- .../net/ethernet/intel/idpf/idpf_singleq_txrx.c | 2 +- drivers/net/ethernet/intel/idpf/idpf_txrx.c | 36 +++++++++++++--------- drivers/net/ethernet/intel/libeth/rx.c | 8 ++--- include/net/libeth/rx.h | 22 +++++++------ 5 files changed, 46 insertions(+), 36 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 23e786b9793d..aaf70c625655 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -723,7 +723,7 @@ static void iavf_clean_rx_ring(struct iavf_ring *rx_ring) for (u32 i = rx_ring->next_to_clean; i != rx_ring->next_to_use; ) { const struct libeth_fqe *rx_fqes = &rx_ring->rx_fqes[i]; - page_pool_put_full_page(rx_ring->pp, rx_fqes->page, false); + libeth_rx_recycle_slow(rx_fqes->netmem); if (unlikely(++i == rx_ring->count)) i = 0; @@ -1197,10 +1197,11 @@ static void iavf_add_rx_frag(struct sk_buff *skb, const struct libeth_fqe *rx_buffer, unsigned int size) { - u32 hr = rx_buffer->page->pp->p.offset; + u32 hr = netmem_get_pp(rx_buffer->netmem)->p.offset; - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, - rx_buffer->offset + hr, size, rx_buffer->truesize); + skb_add_rx_frag_netmem(skb, skb_shinfo(skb)->nr_frags, + rx_buffer->netmem, rx_buffer->offset + hr, + size, rx_buffer->truesize); } /** @@ -1214,12 +1215,13 @@ static void iavf_add_rx_frag(struct sk_buff *skb, static struct sk_buff *iavf_build_skb(const struct libeth_fqe *rx_buffer, unsigned int size) { - u32 hr = rx_buffer->page->pp->p.offset; + struct page *buf_page = __netmem_to_page(rx_buffer->netmem); + u32 hr = buf_page->pp->p.offset; struct sk_buff *skb; void *va; /* prefetch first cache line of first page */ - va = page_address(rx_buffer->page) + rx_buffer->offset; + va = 
page_address(buf_page) + rx_buffer->offset; net_prefetch(va + hr); /* build an skb around the page buffer */ diff --git a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c index 993c354aa27a..555879b1248d 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_singleq_txrx.c @@ -1006,7 +1006,7 @@ static int idpf_rx_singleq_clean(struct idpf_rx_queue *rx_q, int budget) break; skip_data: - rx_buf->page = NULL; + rx_buf->netmem = 0; IDPF_SINGLEQ_BUMP_RING_IDX(rx_q, ntc); cleaned_count++; diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c index 5cf440e09d0a..cef9dfb877e8 100644 --- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c +++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c @@ -383,12 +383,12 @@ err_out: */ static void idpf_rx_page_rel(struct libeth_fqe *rx_buf) { - if (unlikely(!rx_buf->page)) + if (unlikely(!rx_buf->netmem)) return; - page_pool_put_full_page(rx_buf->page->pp, rx_buf->page, false); + libeth_rx_recycle_slow(rx_buf->netmem); - rx_buf->page = NULL; + rx_buf->netmem = 0; rx_buf->offset = 0; } @@ -3240,10 +3240,10 @@ idpf_rx_process_skb_fields(struct idpf_rx_queue *rxq, struct sk_buff *skb, void idpf_rx_add_frag(struct idpf_rx_buf *rx_buf, struct sk_buff *skb, unsigned int size) { - u32 hr = rx_buf->page->pp->p.offset; + u32 hr = netmem_get_pp(rx_buf->netmem)->p.offset; - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page, - rx_buf->offset + hr, size, rx_buf->truesize); + skb_add_rx_frag_netmem(skb, skb_shinfo(skb)->nr_frags, rx_buf->netmem, + rx_buf->offset + hr, size, rx_buf->truesize); } /** @@ -3266,16 +3266,20 @@ static u32 idpf_rx_hsplit_wa(const struct libeth_fqe *hdr, struct libeth_fqe *buf, u32 data_len) { u32 copy = data_len <= L1_CACHE_BYTES ? 
data_len : ETH_HLEN; + struct page *hdr_page, *buf_page; const void *src; void *dst; - if (!libeth_rx_sync_for_cpu(buf, copy)) + if (unlikely(netmem_is_net_iov(buf->netmem)) || + !libeth_rx_sync_for_cpu(buf, copy)) return 0; - dst = page_address(hdr->page) + hdr->offset + hdr->page->pp->p.offset; - src = page_address(buf->page) + buf->offset + buf->page->pp->p.offset; - memcpy(dst, src, LARGEST_ALIGN(copy)); + hdr_page = __netmem_to_page(hdr->netmem); + buf_page = __netmem_to_page(buf->netmem); + dst = page_address(hdr_page) + hdr->offset + hdr_page->pp->p.offset; + src = page_address(buf_page) + buf->offset + buf_page->pp->p.offset; + memcpy(dst, src, LARGEST_ALIGN(copy)); buf->offset += copy; return copy; @@ -3291,11 +3295,12 @@ static u32 idpf_rx_hsplit_wa(const struct libeth_fqe *hdr, */ struct sk_buff *idpf_rx_build_skb(const struct libeth_fqe *buf, u32 size) { - u32 hr = buf->page->pp->p.offset; + struct page *buf_page = __netmem_to_page(buf->netmem); + u32 hr = buf_page->pp->p.offset; struct sk_buff *skb; void *va; - va = page_address(buf->page) + buf->offset; + va = page_address(buf_page) + buf->offset; prefetch(va + hr); skb = napi_build_skb(va, buf->truesize); @@ -3429,7 +3434,8 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget) if (unlikely(!hdr_len && !skb)) { hdr_len = idpf_rx_hsplit_wa(hdr, rx_buf, pkt_len); - pkt_len -= hdr_len; + /* If failed, drop both buffers by setting len to 0 */ + pkt_len -= hdr_len ? 
: pkt_len; u64_stats_update_begin(&rxq->stats_sync); u64_stats_inc(&rxq->q_stats.hsplit_buf_ovf); @@ -3446,7 +3452,7 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget) u64_stats_update_end(&rxq->stats_sync); } - hdr->page = NULL; + hdr->netmem = 0; payload: if (!libeth_rx_sync_for_cpu(rx_buf, pkt_len)) @@ -3462,7 +3468,7 @@ payload: break; skip_data: - rx_buf->page = NULL; + rx_buf->netmem = 0; idpf_rx_post_buf_refill(refillq, buf_id); IDPF_RX_BUMP_NTC(rxq, ntc); diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c index c2c53552c440..2afa6e33f160 100644 --- a/drivers/net/ethernet/intel/libeth/rx.c +++ b/drivers/net/ethernet/intel/libeth/rx.c @@ -204,14 +204,14 @@ void libeth_rx_fq_destroy(struct libeth_fq *fq) EXPORT_SYMBOL_GPL(libeth_rx_fq_destroy); /** - * libeth_rx_recycle_slow - recycle a libeth page from the NAPI context - * @page: page to recycle + * libeth_rx_recycle_slow - recycle libeth netmem + * @netmem: network memory to recycle * * To be used on exceptions or rare cases not requiring fast inline recycling. 
*/ -void libeth_rx_recycle_slow(struct page *page) +void __cold libeth_rx_recycle_slow(netmem_ref netmem) { - page_pool_recycle_direct(page->pp, page); + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false); } EXPORT_SYMBOL_GPL(libeth_rx_recycle_slow); diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index ab05024be518..7d5dc58984b1 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_RX_H #define __LIBETH_RX_H @@ -31,7 +31,7 @@ /** * struct libeth_fqe - structure representing an Rx buffer (fill queue element) - * @page: page holding the buffer + * @netmem: network memory reference holding the buffer * @offset: offset from the page start (to the headroom) * @truesize: total space occupied by the buffer (w/ headroom and tailroom) * @@ -40,7 +40,7 @@ * former, @offset is always 0 and @truesize is always ```PAGE_SIZE```. 
*/ struct libeth_fqe { - struct page *page; + netmem_ref netmem; u32 offset; u32 truesize; } __aligned_largest; @@ -102,15 +102,16 @@ static inline dma_addr_t libeth_rx_alloc(const struct libeth_fq_fp *fq, u32 i) struct libeth_fqe *buf = &fq->fqes[i]; buf->truesize = fq->truesize; - buf->page = page_pool_dev_alloc(fq->pp, &buf->offset, &buf->truesize); - if (unlikely(!buf->page)) + buf->netmem = page_pool_dev_alloc_netmem(fq->pp, &buf->offset, + &buf->truesize); + if (unlikely(!buf->netmem)) return DMA_MAPPING_ERROR; - return page_pool_get_dma_addr(buf->page) + buf->offset + + return page_pool_get_dma_addr_netmem(buf->netmem) + buf->offset + fq->pp->p.offset; } -void libeth_rx_recycle_slow(struct page *page); +void libeth_rx_recycle_slow(netmem_ref netmem); /** * libeth_rx_sync_for_cpu - synchronize or recycle buffer post DMA @@ -126,18 +127,19 @@ void libeth_rx_recycle_slow(struct page *page); static inline bool libeth_rx_sync_for_cpu(const struct libeth_fqe *fqe, u32 len) { - struct page *page = fqe->page; + netmem_ref netmem = fqe->netmem; /* Very rare, but possible case. The most common reason: * the last fragment contained FCS only, which was then * stripped by the HW. */ if (unlikely(!len)) { - libeth_rx_recycle_slow(page); + libeth_rx_recycle_slow(netmem); return false; } - page_pool_dma_sync_for_cpu(page->pp, page, fqe->offset, len); + page_pool_dma_sync_netmem_for_cpu(netmem_get_pp(netmem), netmem, + fqe->offset, len); return true; } -- cgit v1.2.3 From 35c64b6500ef7308155bf0dc556c646e4d7b0fd3 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:20 +0200 Subject: libeth: support native XDP and register memory model Expand libeth's Page Pool functionality by adding native XDP support. This means picking the appropriate headroom and DMA direction. Also, register all the created &page_pools as XDP memory models. A driver then can call xdp_rxq_info_attach_page_pool() when registering its RxQ info. 
Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/rx.c | 20 +++++++++++++++----- include/net/libeth/rx.h | 6 +++++- 2 files changed, 20 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/rx.c b/drivers/net/ethernet/intel/libeth/rx.c index 2afa6e33f160..62521a1f4ec9 100644 --- a/drivers/net/ethernet/intel/libeth/rx.c +++ b/drivers/net/ethernet/intel/libeth/rx.c @@ -72,7 +72,7 @@ static u32 libeth_rx_hw_len_truesize(const struct page_pool_params *pp, static bool libeth_rx_page_pool_params(struct libeth_fq *fq, struct page_pool_params *pp) { - pp->offset = LIBETH_SKB_HEADROOM; + pp->offset = fq->xdp ? LIBETH_XDP_HEADROOM : LIBETH_SKB_HEADROOM; /* HW-writeable / syncable length per one page */ pp->max_len = LIBETH_RX_PAGE_LEN(pp->offset); @@ -159,11 +159,12 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi) .dev = napi->dev->dev.parent, .netdev = napi->dev, .napi = napi, - .dma_dir = DMA_FROM_DEVICE, }; struct libeth_fqe *fqes; struct page_pool *pool; - bool ret; + int ret; + + pp.dma_dir = fq->xdp ? 
DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; if (!fq->hsplit) ret = libeth_rx_page_pool_params(fq, &pp); @@ -177,18 +178,26 @@ int libeth_rx_fq_create(struct libeth_fq *fq, struct napi_struct *napi) return PTR_ERR(pool); fqes = kvcalloc_node(fq->count, sizeof(*fqes), GFP_KERNEL, fq->nid); - if (!fqes) + if (!fqes) { + ret = -ENOMEM; goto err_buf; + } + + ret = xdp_reg_page_pool(pool); + if (ret) + goto err_mem; fq->fqes = fqes; fq->pp = pool; return 0; +err_mem: + kvfree(fqes); err_buf: page_pool_destroy(pool); - return -ENOMEM; + return ret; } EXPORT_SYMBOL_GPL(libeth_rx_fq_create); @@ -198,6 +207,7 @@ EXPORT_SYMBOL_GPL(libeth_rx_fq_create); */ void libeth_rx_fq_destroy(struct libeth_fq *fq) { + xdp_unreg_page_pool(fq->pp); kvfree(fq->fqes); page_pool_destroy(fq->pp); } diff --git a/include/net/libeth/rx.h b/include/net/libeth/rx.h index 7d5dc58984b1..5d991404845e 100644 --- a/include/net/libeth/rx.h +++ b/include/net/libeth/rx.h @@ -13,8 +13,10 @@ /* Space reserved in front of each frame */ #define LIBETH_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +#define LIBETH_XDP_HEADROOM (ALIGN(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ + NET_IP_ALIGN) /* Maximum headroom for worst-case calculations */ -#define LIBETH_MAX_HEADROOM LIBETH_SKB_HEADROOM +#define LIBETH_MAX_HEADROOM LIBETH_XDP_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBETH_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) /* Maximum supported L2-L4 header length */ @@ -66,6 +68,7 @@ enum libeth_fqe_type { * @count: number of descriptors/buffers the queue has * @type: type of the buffers this queue has * @hsplit: flag whether header split is enabled + * @xdp: flag indicating whether XDP is enabled * @buf_len: HW-writeable length per each buffer * @nid: ID of the closest NUMA node with memory */ @@ -81,6 +84,7 @@ struct libeth_fq { /* Cold fields */ enum libeth_fqe_type type:2; bool hsplit:1; + bool xdp:1; u32 buf_len; int nid; -- cgit v1.2.3 From 
8591c3afe8882a00d9070daf78c384b003b596f3 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:21 +0200 Subject: libeth: xdp: add XDP_TX buffers sending Start adding XDP-specific code to libeth, namely handling XDP_TX buffers (only sending). The idea is that we accumulate up to 16 buffers on the stack, then, if either the limit is reached or the polling is finished, flush them at once with only one XDPSQ cleaning (if needed). The main sending function will be aware of the sending budget and already have all the info to send the buffers, so it can't fail. Drivers need to provide 2 inline callbacks to the main sending function: for cleaning an XDPSQ and for filling descriptors; the library code takes care of the rest. Note that unlike the generic code, multi-buffer support is not wrapped here with unlikely() to not hurt header split setups. &libeth_xdp_buff is a simple extension over &xdp_buff which has a direct pointer to the corresponding Rx descriptor (and, luckily, precisely 1 CL size and 16-byte alignment on x86_64). 
Suggested-by: Maciej Fijalkowski # xmit logic Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/Kconfig | 10 +- drivers/net/ethernet/intel/libeth/Makefile | 6 +- drivers/net/ethernet/intel/libeth/xdp.c | 89 +++++ include/net/libeth/tx.h | 11 +- include/net/libeth/xdp.h | 541 +++++++++++++++++++++++++++++ 5 files changed, 652 insertions(+), 5 deletions(-) create mode 100644 drivers/net/ethernet/intel/libeth/xdp.c create mode 100644 include/net/libeth/xdp.h (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig index 480293b71dbc..d8c4926574fb 100644 --- a/drivers/net/ethernet/intel/libeth/Kconfig +++ b/drivers/net/ethernet/intel/libeth/Kconfig @@ -1,9 +1,15 @@ # SPDX-License-Identifier: GPL-2.0-only -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation config LIBETH - tristate + tristate "Common Ethernet library (libeth)" if COMPILE_TEST select PAGE_POOL help libeth is a common library containing routines shared between several drivers, but not yet promoted to the generic kernel API. + +config LIBETH_XDP + tristate "Common XDP library (libeth_xdp)" if COMPILE_TEST + select LIBETH + help + XDP helpers based on libeth hotpath management. 
diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile index 52492b081132..9ba78f463f2e 100644 --- a/drivers/net/ethernet/intel/libeth/Makefile +++ b/drivers/net/ethernet/intel/libeth/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0-only -# Copyright (C) 2024 Intel Corporation +# Copyright (C) 2024-2025 Intel Corporation obj-$(CONFIG_LIBETH) += libeth.o libeth-y := rx.o + +obj-$(CONFIG_LIBETH_XDP) += libeth_xdp.o + +libeth_xdp-y += xdp.o diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c new file mode 100644 index 000000000000..444449c72221 --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -0,0 +1,89 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2025 Intel Corporation */ + +#define DEFAULT_SYMBOL_NAMESPACE "LIBETH_XDP" + +#include + +#include + +/* ``XDP_TX`` bulking */ + +static void __cold +libeth_xdp_tx_return_one(const struct libeth_xdp_tx_frame *frm) +{ + if (frm->len_fl & LIBETH_XDP_TX_MULTI) + libeth_xdp_return_frags(frm->data + frm->soff, true); + + libeth_xdp_return_va(frm->data, true); +} + +static void __cold +libeth_xdp_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, u32 count) +{ + for (u32 i = 0; i < count; i++) { + const struct libeth_xdp_tx_frame *frm = &bq[i]; + + if (!(frm->len_fl & LIBETH_XDP_TX_FIRST)) + continue; + + libeth_xdp_tx_return_one(frm); + } +} + +static void __cold libeth_trace_xdp_exception(const struct net_device *dev, + const struct bpf_prog *prog, + u32 act) +{ + trace_xdp_exception(dev, prog, act); +} + +/** + * libeth_xdp_tx_exception - handle Tx exceptions of XDP frames + * @bq: XDP Tx frame bulk + * @sent: number of frames sent successfully (from this bulk) + * @flags: internal libeth_xdp flags + * + * Cold helper used by __libeth_xdp_tx_flush_bulk(), do not call directly. + * Reports XDP Tx exceptions, frees the frames that won't be sent or adjust + * the Tx bulk to try again later. 
+ */ +void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, + u32 flags) +{ + const struct libeth_xdp_tx_frame *pos = &bq->bulk[sent]; + u32 left = bq->count - sent; + + libeth_trace_xdp_exception(bq->dev, bq->prog, XDP_TX); + + if (!(flags & LIBETH_XDP_TX_DROP)) { + memmove(bq->bulk, pos, left * sizeof(*bq->bulk)); + bq->count = left; + + return; + } + + libeth_xdp_tx_return_bulk(pos, left); + + bq->count = 0; +} +EXPORT_SYMBOL_GPL(libeth_xdp_tx_exception); + +/* Rx polling path */ + +/** + * libeth_xdp_return_buff_slow - free &libeth_xdp_buff + * @xdp: buffer to free/return + * + * Slowpath version of libeth_xdp_return_buff() to be called on exceptions, + * queue clean-ups etc., without unwanted inlining. + */ +void __cold libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp) +{ + __libeth_xdp_return_buff(xdp, false); +} +EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_slow); + +MODULE_DESCRIPTION("Common Ethernet library - XDP infra"); +MODULE_IMPORT_NS("LIBETH"); +MODULE_LICENSE("GPL"); diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index 35614f9523f6..3e68d11914f7 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_TX_H #define __LIBETH_TX_H @@ -12,11 +12,13 @@ /** * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion - * @LIBETH_SQE_EMPTY: unused/empty, no action required + * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX frag, no action required * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats + * @__LIBETH_SQE_XDP_START: separator between skb and XDP types + * @LIBETH_SQE_XDP_TX: &skb_shared_info, 
libeth_xdp_return_buff_bulk(), stats */ enum libeth_sqe_type { LIBETH_SQE_EMPTY = 0U, @@ -24,6 +26,9 @@ enum libeth_sqe_type { LIBETH_SQE_SLAB, LIBETH_SQE_FRAG, LIBETH_SQE_SKB, + + __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, }; /** @@ -32,6 +37,7 @@ enum libeth_sqe_type { * @rs_idx: index of the last buffer from the batch this one was sent in * @raw: slab buffer to free via kfree() * @skb: &sk_buff to consume + * @sinfo: skb shared info of an XDP_TX frame * @dma: DMA address to unmap * @len: length of the mapped region to unmap * @nr_frags: number of frags in the frame this buffer belongs to @@ -46,6 +52,7 @@ struct libeth_sqe { union { void *raw; struct sk_buff *skb; + struct skb_shared_info *sinfo; }; DEFINE_DMA_UNMAP_ADDR(dma); diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h new file mode 100644 index 000000000000..4988453a3d70 --- /dev/null +++ b/include/net/libeth/xdp.h @@ -0,0 +1,541 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XDP_H +#define __LIBETH_XDP_H + +#include +#include + +#include +#include +#include + +/* + * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to, + * pick maximum pointer-compatible alignment. + */ +#define __LIBETH_XDP_BUFF_ALIGN \ + (IS_ALIGNED(sizeof(struct xdp_buff_xsk), 16) ? 16 : \ + IS_ALIGNED(sizeof(struct xdp_buff_xsk), 8) ? 8 : \ + sizeof(long)) + +/** + * struct libeth_xdp_buff - libeth extension over &xdp_buff + * @base: main &xdp_buff + * @data: shortcut for @base.data + * @desc: RQ descriptor containing metadata for this buffer + * @priv: driver-private scratchspace + * + * The main reason for this is to have a pointer to the descriptor to be able + * to quickly get frame metadata from xdpmo and driver buff-to-xdp callbacks + * (as well as bigger alignment). + * Pointer/layout-compatible with &xdp_buff and &xdp_buff_xsk. 
+ */ +struct libeth_xdp_buff { + union { + struct xdp_buff base; + void *data; + }; + + const void *desc; + unsigned long priv[] + __aligned(__LIBETH_XDP_BUFF_ALIGN); +} __aligned(__LIBETH_XDP_BUFF_ALIGN); +static_assert(offsetof(struct libeth_xdp_buff, data) == + offsetof(struct xdp_buff_xsk, xdp.data)); +static_assert(offsetof(struct libeth_xdp_buff, desc) == + offsetof(struct xdp_buff_xsk, cb)); +static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), + __alignof(struct libeth_xdp_buff))); + +/* Common Tx bits */ + +/** + * enum - libeth_xdp internal Tx flags + * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue + * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled + * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent + */ +enum { + LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE, + LIBETH_XDP_TX_BATCH = 8, + + LIBETH_XDP_TX_DROP = BIT(0), +}; + +/** + * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags + * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length + * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame + * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame + * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags + * @LIBETH_XDP_TX_FLAGS: only for ``XDP_TX``, [31:16] of ::len_fl is flags + */ +enum { + LIBETH_XDP_TX_LEN = GENMASK(15, 0), + + LIBETH_XDP_TX_FIRST = BIT(16), + LIBETH_XDP_TX_LAST = BIT(17), + LIBETH_XDP_TX_MULTI = BIT(18), + + LIBETH_XDP_TX_FLAGS = GENMASK(31, 16), +}; + +/** + * struct libeth_xdp_tx_frame - represents one XDP Tx element + * @data: frame start pointer for ``XDP_TX`` + * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed + * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info + * @frag: one (non-head) frag for ``XDP_TX`` + */ +struct libeth_xdp_tx_frame { + union { + /* ``XDP_TX`` */ + struct { + void *data; + u32 len_fl; + u32 soff; + }; + + 
/* ``XDP_TX`` frag */ + skb_frag_t frag; + }; +} __aligned_largest; +static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == + offsetof(struct libeth_xdp_tx_frame, len_fl)); + +/** + * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending + * @prog: corresponding active XDP program + * @dev: &net_device which the frames are transmitted on + * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure + * @count: current number of frames in @bulk + * @bulk: array of queued frames for bulk Tx + * + * All XDP Tx operations queue each frame to the bulk first and flush it + * when @count reaches the array end. Bulk is always placed on the stack + * for performance. One bulk element contains all the data necessary + * for sending a frame and then freeing it on completion. + */ +struct libeth_xdp_tx_bulk { + const struct bpf_prog *prog; + struct net_device *dev; + void *xdpsq; + + u32 count; + struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK]; +} __aligned(sizeof(struct libeth_xdp_tx_frame)); + +/** + * LIBETH_XDP_ONSTACK_BULK - declare &libeth_xdp_tx_bulk on the stack + * @bq: name of the variable to declare + * + * Helper to declare a bulk on the stack with a compiler hint that it should + * not be initialized automatically (with `CONFIG_INIT_STACK_ALL_*`) for + * performance reasons. + */ +#define LIBETH_XDP_ONSTACK_BULK(bq) \ + struct libeth_xdp_tx_bulk bq __uninitialized + +/** + * struct libeth_xdpsq - abstraction for an XDPSQ + * @sqes: array of Tx buffers from the actual queue struct + * @descs: opaque pointer to the HW descriptor array + * @ntu: pointer to the next free descriptor index + * @count: number of descriptors on that queue + * @pending: pointer to the number of sent-not-completed descs on that queue + * @xdp_tx: pointer to the above + * + * Abstraction for driver-independent implementation of Tx. 
Placed on the stack + * and filled by the driver before the transmission, so that the generic + * functions can access and modify driver-specific resources. + */ +struct libeth_xdpsq { + struct libeth_sqe *sqes; + void *descs; + + u32 *ntu; + u32 count; + + u32 *pending; + u32 *xdp_tx; +}; + +/** + * struct libeth_xdp_tx_desc - abstraction for an XDP Tx descriptor + * @addr: DMA address of the frame + * @len: length of the frame + * @flags: XDP Tx flags + * @opts: combined @len + @flags for speed + * + * Filled by the generic functions and then passed to driver-specific functions + * to fill a HW Tx descriptor, always placed on the [function] stack. + */ +struct libeth_xdp_tx_desc { + dma_addr_t addr; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; +} __aligned_largest; + +/** + * libeth_xdp_tx_xmit_bulk - main XDP Tx function + * @bulk: array of frames to send + * @xdpsq: pointer to the driver-specific XDPSQ struct + * @n: number of frames to send + * @unroll: whether to unroll the queue filling loop for speed + * @priv: driver-specific private data + * @prep: callback for cleaning the queue and filling abstract &libeth_xdpsq + * @fill: internal callback for filling &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: callback for filling a HW descriptor with the frame info + * + * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for + * all types of frames. + * @unroll greatly increases the object code size, but also greatly increases + * performance. + * The compilers inline all those onstack abstractions to direct data accesses. + * + * Return: number of frames actually placed on the queue, <= @n. The function + * can't fail, but can send less frames if there's no enough free descriptors + * available. The actual free space is returned by @prep from the driver. 
+ */ +static __always_inline u32 +libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, + u32 n, bool unroll, u64 priv, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv)) +{ + struct libeth_xdpsq sq __uninitialized; + u32 this, batched, off = 0; + u32 ntu, i = 0; + + n = min(n, prep(xdpsq, &sq)); + if (unlikely(!n)) + return 0; + + ntu = *sq.ntu; + + this = sq.count - ntu; + if (likely(this > n)) + this = n; + +again: + if (!unroll) + goto linear; + + batched = ALIGN_DOWN(this, LIBETH_XDP_TX_BATCH); + + for ( ; i < off + batched; i += LIBETH_XDP_TX_BATCH) { + u32 base = ntu + i - off; + + unrolled_count(LIBETH_XDP_TX_BATCH) + for (u32 j = 0; j < LIBETH_XDP_TX_BATCH; j++) + xmit(fill(bulk[i + j], base + j, &sq, priv), + base + j, &sq, priv); + } + + if (batched < this) { +linear: + for ( ; i < off + this; i++) + xmit(fill(bulk[i], ntu + i - off, &sq, priv), + ntu + i - off, &sq, priv); + } + + ntu += this; + if (likely(ntu < sq.count)) + goto out; + + ntu = 0; + + if (i < n) { + this = n - i; + off = i; + + goto again; + } + +out: + *sq.ntu = ntu; + *sq.pending += n; + if (sq.xdp_tx) + *sq.xdp_tx += n; + + return n; +} + +/* ``XDP_TX`` bulking */ + +void libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp); + +/** + * libeth_xdp_tx_queue_head - internal helper for queueing one ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XDP buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. 
+ */ +static inline bool libeth_xdp_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + const struct libeth_xdp_buff *xdp) +{ + const struct xdp_buff *base = &xdp->base; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .data = xdp->data, + .len_fl = (base->data_end - xdp->data) | LIBETH_XDP_TX_FIRST, + .soff = xdp_data_hard_end(base) - xdp->data, + }; + + if (!xdp_buff_has_frags(base)) + return false; + + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xdp_tx_queue_frag - internal helper for queueing one ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + */ +static inline void libeth_xdp_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag) +{ + bq->bulk[bq->count++].frag = *frag; +} + +/** + * libeth_xdp_tx_queue_bulk - internal helper for queueing one ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XDP buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. 
+ */ +static __always_inline bool +libeth_xdp_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + const struct skb_shared_info *sinfo; + bool ret = true; + u32 nr_frags; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + libeth_xdp_return_buff_slow(xdp); + return false; + } + + if (!libeth_xdp_tx_queue_head(bq, xdp)) + goto out; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + nr_frags = sinfo->nr_frags; + + for (u32 i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, 0))) { + ret = false; + break; + } + + libeth_xdp_tx_queue_frag(bq, &sinfo->frags[i]); + } + +out: + bq->bulk[bq->count - 1].len_fl |= LIBETH_XDP_TX_LAST; + xdp->data = NULL; + + return ret; +} + +/** + * libeth_xdp_tx_fill_stats - fill &libeth_sqe with ``XDP_TX`` frame stats + * @sqe: SQ element to fill + * @desc: libeth_xdp Tx descriptor + * @sinfo: &skb_shared_info for this frame + * + * Internal helper for filling an SQE with the frame stats, do not use in + * drivers. Fills the number of frags and bytes for this frame. 
+ */ +#define libeth_xdp_tx_fill_stats(sqe, desc, sinfo) \ + __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, __UNIQUE_ID(sqe_), \ + __UNIQUE_ID(desc_), __UNIQUE_ID(sinfo_)) + +#define __libeth_xdp_tx_fill_stats(sqe, desc, sinfo, ue, ud, us) do { \ + const struct libeth_xdp_tx_desc *ud = (desc); \ + const struct skb_shared_info *us; \ + struct libeth_sqe *ue = (sqe); \ + \ + ue->nr_frags = 1; \ + ue->bytes = ud->len; \ + \ + if (ud->flags & LIBETH_XDP_TX_MULTI) { \ + us = (sinfo); \ + ue->nr_frags += us->nr_frags; \ + ue->bytes += us->xdp_frags_size; \ + } \ +} while (0) + +/** + * libeth_xdp_tx_fill_buf - internal helper to fill one ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct skb_shared_info *sinfo; + skb_frag_t *frag = &frm.frag; + struct libeth_sqe *sqe; + netmem_ref netmem; + + if (frm.len_fl & LIBETH_XDP_TX_FIRST) { + sinfo = frm.data + frm.soff; + skb_frag_fill_netmem_desc(frag, virt_to_netmem(frm.data), + offset_in_page(frm.data), + frm.len_fl); + } else { + sinfo = NULL; + } + + netmem = skb_frag_netmem(frag); + desc = (typeof(desc)){ + .addr = page_pool_get_dma_addr_netmem(netmem) + + skb_frag_off(frag), + .len = skb_frag_size(frag) & LIBETH_XDP_TX_LEN, + .flags = skb_frag_size(frag) & LIBETH_XDP_TX_FLAGS, + }; + + dma_sync_single_for_device(__netmem_get_pp(netmem)->p.dev, desc.addr, + desc.len, DMA_BIDIRECTIONAL); + + if (!sinfo) + return desc; + + sqe = &sq->sqes[i]; + sqe->type = LIBETH_SQE_XDP_TX; + sqe->sinfo = sinfo; + libeth_xdp_tx_fill_stats(sqe, &desc, sinfo); + + return desc; +} + +void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 
sent, + u32 flags); + +/** + * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk + * @bq: bulk to flush + * @flags: XDP TX flags + * @prep: driver-specific callback to prepare the queue for sending + * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc + * @xmit: driver callback to fill a HW descriptor + * + * Internal abstraction to create bulk flush functions for drivers. + * + * Return: true if anything was sent, false otherwise. + */ +static __always_inline bool +__libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + struct libeth_xdp_tx_desc + (*fill)(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, + u64 priv)) +{ + u32 sent, drops; + int err = 0; + + sent = libeth_xdp_tx_xmit_bulk(bq->bulk, bq->xdpsq, + min(bq->count, LIBETH_XDP_TX_BULK), + false, 0, prep, fill, xmit); + drops = bq->count - sent; + + if (unlikely(drops)) { + libeth_xdp_tx_exception(bq, sent, flags); + err = -ENXIO; + } else { + bq->count = 0; + } + + trace_xdp_bulk_tx(bq->dev, sent, drops, err); + + return likely(sent); +} + +/** + * libeth_xdp_tx_flush_bulk - wrapper to define flush of one ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see above + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + */ +#define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \ + xmit) + +/* Rx polling path */ + +static inline void libeth_xdp_return_va(const void *data, bool napi) +{ + netmem_ref netmem = virt_to_netmem(data); + + page_pool_put_full_netmem(__netmem_get_pp(netmem), netmem, napi); +} + +static inline void libeth_xdp_return_frags(const struct skb_shared_info *sinfo, + bool napi) +{ + for (u32 i = 0; i < sinfo->nr_frags; i++) { + netmem_ref 
netmem = skb_frag_netmem(&sinfo->frags[i]); + + page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, napi); + } +} + +/** + * libeth_xdp_return_buff - free/recycle &libeth_xdp_buff + * @xdp: buffer to free + * + * Hotpath helper to free &libeth_xdp_buff. Compared to xdp_return_buff(), + * it's faster as it gets inlined and always assumes order-0 pages and safe + * direct recycling. Zeroes @xdp->data to avoid UAFs. + */ +#define libeth_xdp_return_buff(xdp) __libeth_xdp_return_buff(xdp, true) + +static inline void __libeth_xdp_return_buff(struct libeth_xdp_buff *xdp, + bool napi) +{ + if (!xdp_buff_has_frags(&xdp->base)) + goto out; + + libeth_xdp_return_frags(xdp_get_shared_info_from_buff(&xdp->base), + napi); + +out: + libeth_xdp_return_va(xdp->data, napi); + xdp->data = NULL; +} + +#endif /* __LIBETH_XDP_H */ -- cgit v1.2.3 From 084ceda7decdbeff2bafbe2d28f57aed50b3bc46 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:22 +0200 Subject: libeth: xdp: add .ndo_xdp_xmit() helpers Add helpers for implementing .ndo_xdp_xmit(). Same as for XDP_TX, accumulate up to 16 DMA-mapped frames on the stack, then flush. If DMA mapping fails for some reason, don't try mapping further frames, but still flush what was already prepared. DMA address of a head frame is stored in its headroom, assuming it has enough of it for an 8 (or 4) byte value. In addition to @prep and @xmit driver callbacks in XDP_TX, xmit also needs @finalize to kick the XDPSQ after filling. 
Signed-off-by: Alexander Lobakin Reviewed-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/xdp.c | 37 +++- include/net/libeth/tx.h | 6 + include/net/libeth/xdp.h | 290 +++++++++++++++++++++++++++++++- 3 files changed, 328 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index 444449c72221..c65ea5d2746a 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -42,7 +42,7 @@ static void __cold libeth_trace_xdp_exception(const struct net_device *dev, * libeth_xdp_tx_exception - handle Tx exceptions of XDP frames * @bq: XDP Tx frame bulk * @sent: number of frames sent successfully (from this bulk) - * @flags: internal libeth_xdp flags + * @flags: internal libeth_xdp flags (.ndo_xdp_xmit etc.) * * Cold helper used by __libeth_xdp_tx_flush_bulk(), do not call directly. * Reports XDP Tx exceptions, frees the frames that won't be sent or adjust @@ -54,7 +54,8 @@ void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, const struct libeth_xdp_tx_frame *pos = &bq->bulk[sent]; u32 left = bq->count - sent; - libeth_trace_xdp_exception(bq->dev, bq->prog, XDP_TX); + if (!(flags & LIBETH_XDP_TX_NDO)) + libeth_trace_xdp_exception(bq->dev, bq->prog, XDP_TX); if (!(flags & LIBETH_XDP_TX_DROP)) { memmove(bq->bulk, pos, left * sizeof(*bq->bulk)); @@ -63,12 +64,42 @@ void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, return; } - libeth_xdp_tx_return_bulk(pos, left); + if (!(flags & LIBETH_XDP_TX_NDO)) + libeth_xdp_tx_return_bulk(pos, left); + else + libeth_xdp_xmit_return_bulk(pos, left, bq->dev); bq->count = 0; } EXPORT_SYMBOL_GPL(libeth_xdp_tx_exception); +/* .ndo_xdp_xmit() implementation */ + +u32 __cold libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count, const struct net_device *dev) +{ + u32 n = 0; + + for (u32 i = 0; i < 
count; i++) { + const struct libeth_xdp_tx_frame *frm = &bq[i]; + dma_addr_t dma; + + if (frm->flags & LIBETH_XDP_TX_FIRST) + dma = *libeth_xdp_xmit_frame_dma(frm->xdpf); + else + dma = dma_unmap_addr(frm, dma); + + dma_unmap_page(dev->dev.parent, dma, dma_unmap_len(frm, len), + DMA_TO_DEVICE); + + /* Actual xdp_frames are freed by the core */ + n += !!(frm->flags & LIBETH_XDP_TX_FIRST); + } + + return n; +} +EXPORT_SYMBOL_GPL(libeth_xdp_xmit_return_bulk); + /* Rx polling path */ /** diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index 3e68d11914f7..e2b62a8b4c57 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -19,6 +19,8 @@ * @LIBETH_SQE_SKB: &sk_buff, unmap and napi_consume_skb(), update stats * @__LIBETH_SQE_XDP_START: separator between skb and XDP types * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats + * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats + * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA */ enum libeth_sqe_type { LIBETH_SQE_EMPTY = 0U, @@ -29,6 +31,8 @@ enum libeth_sqe_type { __LIBETH_SQE_XDP_START, LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, + LIBETH_SQE_XDP_XMIT, + LIBETH_SQE_XDP_XMIT_FRAG, }; /** @@ -38,6 +42,7 @@ enum libeth_sqe_type { * @raw: slab buffer to free via kfree() * @skb: &sk_buff to consume * @sinfo: skb shared info of an XDP_TX frame + * @xdpf: XDP frame from ::ndo_xdp_xmit() * @dma: DMA address to unmap * @len: length of the mapped region to unmap * @nr_frags: number of frags in the frame this buffer belongs to @@ -53,6 +58,7 @@ struct libeth_sqe { void *raw; struct sk_buff *skb; struct skb_shared_info *sinfo; + struct xdp_frame *xdpf; }; DEFINE_DMA_UNMAP_ADDR(dma); diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 4988453a3d70..839001d901b2 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -11,6 +11,17 @@ #include #include +/* + * Defined as bits to be able to use them as a mask on 
Rx. + * Also used as internal return values on Tx. + */ +enum { + LIBETH_XDP_PASS = 0U, + LIBETH_XDP_DROP = BIT(0), + LIBETH_XDP_ABORTED = BIT(1), + LIBETH_XDP_TX = BIT(2), +}; + /* * &xdp_buff_xsk is the largest structure &libeth_xdp_buff gets casted to, * pick maximum pointer-compatible alignment. @@ -56,12 +67,14 @@ static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), * @LIBETH_XDP_TX_BULK: one bulk size at which it will be flushed to the queue * @LIBETH_XDP_TX_BATCH: batch size for which the queue fill loop is unrolled * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent + * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit() */ enum { LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE, LIBETH_XDP_TX_BATCH = 8, LIBETH_XDP_TX_DROP = BIT(0), + LIBETH_XDP_TX_NDO = BIT(1), }; /** @@ -88,6 +101,11 @@ enum { * @len_fl: ``XDP_TX``, combined flags [31:16] and len [15:0] field for speed * @soff: ``XDP_TX``, offset from @data to the start of &skb_shared_info * @frag: one (non-head) frag for ``XDP_TX`` + * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit() + * @dma: DMA address of the non-head frag for .ndo_xdp_xmit() + * @len: frag length for .ndo_xdp_xmit() + * @flags: Tx flags for the above + * @opts: combined @len + @flags for the above for speed */ struct libeth_xdp_tx_frame { union { @@ -100,6 +118,21 @@ struct libeth_xdp_tx_frame { /* ``XDP_TX`` frag */ skb_frag_t frag; + + /* .ndo_xdp_xmit() */ + struct { + union { + struct xdp_frame *xdpf; + dma_addr_t dma; + }; + union { + struct { + u32 len; + u32 flags; + }; + aligned_u64 opts; + }; + }; }; } __aligned_largest; static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == @@ -107,7 +140,7 @@ static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == /** * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending - * @prog: corresponding active XDP program + * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit() * @dev: &net_device which the 
frames are transmitted on * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure * @count: current number of frames in @bulk @@ -445,7 +478,7 @@ void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, /** * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk * @bq: bulk to flush - * @flags: XDP TX flags + * @flags: XDP TX flags (.ndo_xdp_xmit() etc.) * @prep: driver-specific callback to prepare the queue for sending * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc * @xmit: driver callback to fill a HW descriptor @@ -495,6 +528,259 @@ __libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \ xmit) +/* .ndo_xdp_xmit() implementation */ + +/** + * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame + * @xf: pointer to the XDP frame + * + * There's no place in &libeth_xdp_tx_frame to store DMA address for an + * &xdp_frame head. The headroom is used then, the address is placed right + * after the frame struct, naturally aligned. + * + * Return: pointer to the DMA address to use. 
+ */ +#define libeth_xdp_xmit_frame_dma(xf) \ + _Generic((xf), \ + const struct xdp_frame *: \ + (const dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf), \ + struct xdp_frame *: \ + (dma_addr_t *)__libeth_xdp_xmit_frame_dma(xf) \ + ) + +static inline void *__libeth_xdp_xmit_frame_dma(const struct xdp_frame *xdpf) +{ + void *addr = (void *)(xdpf + 1); + + if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && + __alignof(*xdpf) < sizeof(dma_addr_t)) + addr = PTR_ALIGN(addr, sizeof(dma_addr_t)); + + return addr; +} + +/** + * libeth_xdp_xmit_queue_head - internal helper for queueing one XDP xmit head + * @bq: XDP Tx bulk to queue the head frag to + * @xdpf: XDP frame with the head to queue + * @dev: device to perform DMA mapping + * + * Return: ``LIBETH_XDP_DROP`` on DMA mapping error, + * ``LIBETH_XDP_PASS`` if it's the only frag in the frame, + * ``LIBETH_XDP_TX`` if it's an S/G frame. + */ +static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + struct device *dev) +{ + dma_addr_t dma; + + dma = dma_map_single(dev, xdpf->data, xdpf->len, DMA_TO_DEVICE); + if (dma_mapping_error(dev, dma)) + return LIBETH_XDP_DROP; + + *libeth_xdp_xmit_frame_dma(xdpf) = dma; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xdpf = xdpf, + .len = xdpf->len, + .flags = LIBETH_XDP_TX_FIRST, + }; + + if (!xdp_frame_has_frags(xdpf)) + return LIBETH_XDP_PASS; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return LIBETH_XDP_TX; +} + +/** + * libeth_xdp_xmit_queue_frag - internal helper for queueing one XDP xmit frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: frag to queue + * @dev: device to perform DMA mapping + * + * Return: true on success, false on DMA mapping error. 
+ */ +static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, + const skb_frag_t *frag, + struct device *dev) +{ + dma_addr_t dma; + + dma = skb_frag_dma_map(dev, frag); + if (dma_mapping_error(dev, dma)) + return false; + + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .dma = dma, + .len = skb_frag_size(frag), + }; + + return true; +} + +/** + * libeth_xdp_xmit_queue_bulk - internal helper for queueing one XDP xmit frame + * @bq: XDP Tx bulk to queue the frame to + * @xdpf: XDP frame to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: ``LIBETH_XDP_TX`` on success, + * ``LIBETH_XDP_DROP`` if the frame should be dropped by the stack, + * ``LIBETH_XDP_ABORTED`` if the frame will be dropped by libeth_xdp. + */ +static __always_inline u32 +libeth_xdp_xmit_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame *xdpf, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 head, nr_frags, i, ret = LIBETH_XDP_TX; + struct device *dev = bq->dev->dev.parent; + const struct skb_shared_info *sinfo; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + return LIBETH_XDP_DROP; + + head = libeth_xdp_xmit_queue_head(bq, xdpf, dev); + if (head == LIBETH_XDP_PASS) + goto out; + else if (head == LIBETH_XDP_DROP) + return LIBETH_XDP_DROP; + + sinfo = xdp_get_shared_info_from_frame(xdpf); + nr_frags = sinfo->nr_frags; + + for (i = 0; i < nr_frags; i++) { + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_NDO))) + break; + + if (!libeth_xdp_xmit_queue_frag(bq, &sinfo->frags[i], dev)) + break; + } + + if (unlikely(i < nr_frags)) + ret = LIBETH_XDP_ABORTED; + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xdp_xmit_fill_buf - internal helper to fill one XDP xmit &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ 
abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the mapped DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + struct libeth_sqe *sqe; + struct xdp_frame *xdpf; + + if (frm.flags & LIBETH_XDP_TX_FIRST) { + xdpf = frm.xdpf; + desc.addr = *libeth_xdp_xmit_frame_dma(xdpf); + } else { + xdpf = NULL; + desc.addr = frm.dma; + } + desc.opts = frm.opts; + + sqe = &sq->sqes[i]; + dma_unmap_addr_set(sqe, dma, desc.addr); + dma_unmap_len_set(sqe, len, desc.len); + + if (!xdpf) { + sqe->type = LIBETH_SQE_XDP_XMIT_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XDP_XMIT; + sqe->xdpf = xdpf; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_frame(xdpf)); + + return desc; +} + +/** + * libeth_xdp_xmit_flush_bulk - wrapper to define flush of one XDP xmit bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + */ +#define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \ + libeth_xdp_xmit_fill_buf, xmit) + +u32 libeth_xdp_xmit_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count, const struct net_device *dev); + +/** + * __libeth_xdp_xmit_do_bulk - internal function to implement .ndo_xdp_xmit() + * @bq: XDP Tx bulk to queue frames to + * @frames: XDP frames passed by the stack + * @n: number of frames + * @flags: flags passed by the stack + * @flush_bulk: driver callback to flush an XDP xmit bulk + * @finalize: driver callback to finalize sending XDP Tx frames on the queue + * + * Perform common checks, map the frags and queue them to the bulk, then flush + * the bulk to the XDPSQ. 
If requested by the stack, finalize the queue. + * + * Return: number of frames sent or -errno on error. + */ +static __always_inline int +__libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, + struct xdp_frame **frames, u32 n, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + u32 nxmit = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + for (u32 i = 0; likely(i < n); i++) { + u32 ret; + + ret = libeth_xdp_xmit_queue_bulk(bq, frames[i], flush_bulk); + if (unlikely(ret != LIBETH_XDP_TX)) { + nxmit += ret == LIBETH_XDP_ABORTED; + break; + } + + nxmit++; + } + + if (bq->count) { + flush_bulk(bq, LIBETH_XDP_TX_NDO); + if (unlikely(bq->count)) + nxmit -= libeth_xdp_xmit_return_bulk(bq->bulk, + bq->count, + bq->dev); + } + + finalize(bq->xdpsq, nxmit, flags & XDP_XMIT_FLUSH); + + return nxmit; +} + /* Rx polling path */ static inline void libeth_xdp_return_va(const void *data, bool napi) -- cgit v1.2.3 From 26ce8eb0bb7d47c5fb36f7c12f34e4a320f14cac Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:23 +0200 Subject: libeth: xdp: add XDPSQE completion helpers Similarly to libeth_tx_complete(), add libeth_xdp_complete_tx() to handle XDP_TX and xmit buffers. Both use bulk return under the hood. Also add out of line libeth_tx_complete_any() which handles both regular and XDP frames (if libeth_xdp is loaded), for example, to call on queue destroy, where we don't need inlining but convenience. 
Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/Makefile | 1 + drivers/net/ethernet/intel/libeth/priv.h | 26 ++++++++++++ drivers/net/ethernet/intel/libeth/tx.c | 38 +++++++++++++++++ drivers/net/ethernet/intel/libeth/xdp.c | 58 ++++++++++++++++++++++++++ include/net/libeth/tx.h | 13 +++++- include/net/libeth/types.h | 21 +++++++++- include/net/libeth/xdp.h | 66 ++++++++++++++++++++++++++++++ 7 files changed, 221 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/intel/libeth/priv.h create mode 100644 drivers/net/ethernet/intel/libeth/tx.c (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile index 9ba78f463f2e..51669840ee06 100644 --- a/drivers/net/ethernet/intel/libeth/Makefile +++ b/drivers/net/ethernet/intel/libeth/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_LIBETH) += libeth.o libeth-y := rx.o +libeth-y += tx.o obj-$(CONFIG_LIBETH_XDP) += libeth_xdp.o diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h new file mode 100644 index 000000000000..1bd6e2d7a3e7 --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/priv.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_PRIV_H +#define __LIBETH_PRIV_H + +#include + +/* XDP */ + +struct skb_shared_info; +struct xdp_frame_bulk; + +struct libeth_xdp_ops { + void (*bulk)(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags); +}; + +void libeth_attach_xdp(const struct libeth_xdp_ops *ops); + +static inline void libeth_detach_xdp(void) +{ + libeth_attach_xdp(NULL); +} + +#endif /* __LIBETH_PRIV_H */ diff --git a/drivers/net/ethernet/intel/libeth/tx.c b/drivers/net/ethernet/intel/libeth/tx.c new file mode 100644 index 000000000000..227c841ab16a --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/tx.c @@ -0,0 +1,38 @@ +// 
SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2025 Intel Corporation */ + +#define DEFAULT_SYMBOL_NAMESPACE "LIBETH" + +#include + +#include "priv.h" + +/* Tx buffer completion */ + +DEFINE_STATIC_CALL_NULL(bulk, libeth_xdp_return_buff_bulk); + +/** + * libeth_tx_complete_any - perform Tx completion for one SQE of any type + * @sqe: Tx buffer to complete + * @cp: polling params + * + * Can be used to complete both regular and XDP SQEs, for example when + * destroying queues. + * When libeth_xdp is not loaded, XDPSQEs won't be handled. + */ +void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp) +{ + if (sqe->type >= __LIBETH_SQE_XDP_START) + __libeth_xdp_complete_tx(sqe, cp, static_call(bulk)); + else + libeth_tx_complete(sqe, cp); +} +EXPORT_SYMBOL_GPL(libeth_tx_complete_any); + +/* Module */ + +void libeth_attach_xdp(const struct libeth_xdp_ops *ops) +{ + static_call_update(bulk, ops ? ops->bulk : NULL); +} +EXPORT_SYMBOL_GPL(libeth_attach_xdp); diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index c65ea5d2746a..c29a1a0dfc57 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -7,6 +7,8 @@ #include +#include "priv.h" + /* ``XDP_TX`` bulking */ static void __cold @@ -115,6 +117,62 @@ void __cold libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp) } EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_slow); +/* Tx buffer completion */ + +static void libeth_xdp_put_netmem_bulk(netmem_ref netmem, + struct xdp_frame_bulk *bq) +{ + if (unlikely(bq->count == XDP_BULK_QUEUE_SIZE)) + xdp_flush_frame_bulk(bq); + + bq->q[bq->count++] = netmem; +} + +/** + * libeth_xdp_return_buff_bulk - free &xdp_buff as part of a bulk + * @sinfo: shared info corresponding to the buffer + * @bq: XDP frame bulk to store the buffer + * @frags: whether the buffer has frags + * + * Same as xdp_return_frame_bulk(), but for &libeth_xdp_buff, speeds up Tx + * completion of 
``XDP_TX`` buffers and allows to free them in same bulks + * with &xdp_frame buffers. + */ +void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags) +{ + if (!frags) + goto head; + + for (u32 i = 0; i < sinfo->nr_frags; i++) + libeth_xdp_put_netmem_bulk(skb_frag_netmem(&sinfo->frags[i]), + bq); + +head: + libeth_xdp_put_netmem_bulk(virt_to_netmem(sinfo), bq); +} +EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_bulk); + +/* Module */ + +static const struct libeth_xdp_ops xdp_ops __initconst = { + .bulk = libeth_xdp_return_buff_bulk, +}; + +static int __init libeth_xdp_module_init(void) +{ + libeth_attach_xdp(&xdp_ops); + + return 0; +} +module_init(libeth_xdp_module_init); + +static void __exit libeth_xdp_module_exit(void) +{ + libeth_detach_xdp(); +} +module_exit(libeth_xdp_module_exit); + MODULE_DESCRIPTION("Common Ethernet library - XDP infra"); MODULE_IMPORT_NS("LIBETH"); MODULE_LICENSE("GPL"); diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index e2b62a8b4c57..33b9bb22f6ac 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -84,7 +84,10 @@ struct libeth_sqe { /** * struct libeth_cq_pp - completion queue poll params * @dev: &device to perform DMA unmapping + * @bq: XDP frame bulk to combine return operations * @ss: onstack NAPI stats to fill + * @xss: onstack XDPSQ NAPI stats to fill + * @xdp_tx: number of XDP frames processed * @napi: whether it's called from the NAPI context * * libeth uses this structure to access objects needed for performing full @@ -93,7 +96,13 @@ struct libeth_sqe { */ struct libeth_cq_pp { struct device *dev; - struct libeth_sq_napi_stats *ss; + struct xdp_frame_bulk *bq; + + union { + struct libeth_sq_napi_stats *ss; + struct libeth_xdpsq_napi_stats *xss; + }; + u32 xdp_tx; bool napi; }; @@ -139,4 +148,6 @@ static inline void libeth_tx_complete(struct libeth_sqe *sqe, sqe->type = LIBETH_SQE_EMPTY; } +void libeth_tx_complete_any(struct libeth_sqe *sqe, struct 
libeth_cq_pp *cp); + #endif /* __LIBETH_TX_H */ diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index 603825e45133..ad7a5c1f119f 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (C) 2024 Intel Corporation */ +/* Copyright (C) 2024-2025 Intel Corporation */ #ifndef __LIBETH_TYPES_H #define __LIBETH_TYPES_H @@ -22,4 +22,23 @@ struct libeth_sq_napi_stats { }; }; +/** + * struct libeth_xdpsq_napi_stats - "hot" counters to update in XDP Tx + * completion loop + * @packets: completed frames counter + * @bytes: sum of bytes of completed frames above + * @fragments: sum of fragments of completed S/G frames + * @raw: alias to access all the fields as an array + */ +struct libeth_xdpsq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + #endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 839001d901b2..c47ecba56020 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -824,4 +824,70 @@ out: xdp->data = NULL; } +/* Tx buffer completion */ + +void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, + struct xdp_frame_bulk *bq, bool frags); + +/** + * __libeth_xdp_complete_tx - complete sent XDPSQE + * @sqe: SQ element / Tx buffer to complete + * @cp: Tx polling/completion params + * @bulk: internal callback to bulk-free ``XDP_TX`` buffers + * + * Use the non-underscored version in drivers instead. This one is shared + * internally with libeth_tx_complete_any(). + * Complete an XDPSQE of any type of XDP frame. This includes DMA unmapping + * when needed, buffer freeing, stats update, and SQE invalidation. 
+ */ +static __always_inline void +__libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, + typeof(libeth_xdp_return_buff_bulk) bulk) +{ + enum libeth_sqe_type type = sqe->type; + + switch (type) { + case LIBETH_SQE_EMPTY: + return; + case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XDP_XMIT_FRAG: + dma_unmap_page(cp->dev, dma_unmap_addr(sqe, dma), + dma_unmap_len(sqe, len), DMA_TO_DEVICE); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + bulk(sqe->sinfo, cp->bq, sqe->nr_frags != 1); + break; + case LIBETH_SQE_XDP_XMIT: + xdp_return_frame_bulk(sqe->xdpf, cp->bq); + break; + default: + break; + } + + switch (type) { + case LIBETH_SQE_XDP_TX: + case LIBETH_SQE_XDP_XMIT: + cp->xdp_tx -= sqe->nr_frags; + + cp->xss->packets++; + cp->xss->bytes += sqe->bytes; + break; + default: + break; + } + + sqe->type = LIBETH_SQE_EMPTY; +} + +static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, + struct libeth_cq_pp *cp) +{ + __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk); +} + #endif /* __LIBETH_XDP_H */ -- cgit v1.2.3 From c4ba6a9b9d460c6fd742e118022f2808ec3c4223 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:24 +0200 Subject: libeth: xdp: add XDPSQ locking helpers Unfortunately, it's not always possible to allocate max(num_rxqs, nr_cpu_ids) even on hi-end NICs. To mitigate this, add simple locking helpers to libeth_xdp. As long as XDPSQs are not shared, the whole functionality is gated behind a static lock. Otherwise, each bulk flush locks the queue for the time of cleaning and filling the descriptors. As long as this particular queue is not used by more than 1 CPU, the impact is minimal (runtime check for boolean twice per 16+ descriptors). 
Suggested-by: Maciej Fijalkowski # static key Signed-off-by: Alexander Lobakin Reviewed-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/xdp.c | 47 ++++++++++++ include/net/libeth/types.h | 21 +++++- include/net/libeth/xdp.h | 127 +++++++++++++++++++++++++++++++- 3 files changed, 192 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index c29a1a0dfc57..0f08dd405190 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -9,6 +9,53 @@ #include "priv.h" +/* XDPSQ sharing */ + +DEFINE_STATIC_KEY_FALSE(libeth_xdpsq_share); +EXPORT_SYMBOL_GPL(libeth_xdpsq_share); + +void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + bool warn; + + spin_lock_init(&lock->lock); + lock->share = true; + + warn = !static_key_enabled(&libeth_xdpsq_share); + static_branch_inc(&libeth_xdpsq_share); + + if (warn && net_ratelimit()) + netdev_warn(dev, "XDPSQ sharing enabled, possible XDP Tx slowdown\n"); +} +EXPORT_SYMBOL_GPL(__libeth_xdpsq_get); + +void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + static_branch_dec(&libeth_xdpsq_share); + + if (!static_key_enabled(&libeth_xdpsq_share) && net_ratelimit()) + netdev_notice(dev, "XDPSQ sharing disabled\n"); + + lock->share = false; +} +EXPORT_SYMBOL_GPL(__libeth_xdpsq_put); + +void __acquires(&lock->lock) +__libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock) +{ + spin_lock(&lock->lock); +} +EXPORT_SYMBOL_GPL(__libeth_xdpsq_lock); + +void __releases(&lock->lock) +__libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) +{ + spin_unlock(&lock->lock); +} +EXPORT_SYMBOL_GPL(__libeth_xdpsq_unlock); + /* ``XDP_TX`` bulking */ static void __cold diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index ad7a5c1f119f..abfccae1a346 100644 --- a/include/net/libeth/types.h +++ 
b/include/net/libeth/types.h @@ -4,7 +4,7 @@ #ifndef __LIBETH_TYPES_H #define __LIBETH_TYPES_H -#include +#include /** * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop @@ -41,4 +41,23 @@ struct libeth_xdpsq_napi_stats { }; }; +/* XDP */ + +/* + * The following structures should be embedded into driver's queue structure + * and passed to the libeth_xdp helpers, never used directly. + */ + +/* XDPSQ sharing */ + +/** + * struct libeth_xdpsq_lock - locking primitive for sharing XDPSQs + * @lock: spinlock for locking the queue + * @share: whether this particular queue is shared + */ +struct libeth_xdpsq_lock { + spinlock_t lock; + bool share; +}; + #endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index c47ecba56020..20977fdfd6c9 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -60,6 +60,123 @@ static_assert(offsetof(struct libeth_xdp_buff, desc) == static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), __alignof(struct libeth_xdp_buff))); +/* XDPSQ sharing */ + +DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share); + +/** + * libeth_xdpsq_num - calculate optimal number of XDPSQs for this device + sys + * @rxq: current number of active Rx queues + * @txq: current number of active Tx queues + * @max: maximum number of Tx queues + * + * Each RQ must have its own XDPSQ for XSk pairs, each CPU must have own XDPSQ + * for lockless sending (``XDP_TX``, .ndo_xdp_xmit()). Cap the maximum of these + * two with the number of SQs the device can have (minus used ones). + * + * Return: number of XDP Tx queues the device needs to use. + */ +static inline u32 libeth_xdpsq_num(u32 rxq, u32 txq, u32 max) +{ + return min(max(nr_cpu_ids, rxq), max - txq); +} + +/** + * libeth_xdpsq_shared - whether XDPSQs can be shared between several CPUs + * @num: number of active XDPSQs + * + * Return: true if there's no 1:1 XDPSQ/CPU association, false otherwise. 
+ */ +static inline bool libeth_xdpsq_shared(u32 num) +{ + return num < nr_cpu_ids; +} + +/** + * libeth_xdpsq_id - get XDPSQ index corresponding to this CPU + * @num: number of active XDPSQs + * + * Helper for libeth_xdp routines, do not use in drivers directly. + * + * Return: XDPSQ index needs to be used on this CPU. + */ +static inline u32 libeth_xdpsq_id(u32 num) +{ + u32 ret = raw_smp_processor_id(); + + if (static_branch_unlikely(&libeth_xdpsq_share) && + libeth_xdpsq_shared(num)) + ret %= num; + + return ret; +} + +void __libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); +void __libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev); + +/** + * libeth_xdpsq_get - initialize &libeth_xdpsq_lock + * @lock: lock to initialize + * @dev: netdev which this lock belongs to + * @share: whether XDPSQs can be shared + * + * Tracks the current XDPSQ association and enables the static lock + * if needed. + */ +static inline void libeth_xdpsq_get(struct libeth_xdpsq_lock *lock, + const struct net_device *dev, + bool share) +{ + if (unlikely(share)) + __libeth_xdpsq_get(lock, dev); +} + +/** + * libeth_xdpsq_put - deinitialize &libeth_xdpsq_lock + * @lock: lock to deinitialize + * @dev: netdev which this lock belongs to + * + * Tracks the current XDPSQ association and disables the static lock + * if needed. + */ +static inline void libeth_xdpsq_put(struct libeth_xdpsq_lock *lock, + const struct net_device *dev) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_put(lock, dev); +} + +void __libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock); +void __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock); + +/** + * libeth_xdpsq_lock - grab &libeth_xdpsq_lock if needed + * @lock: lock to take + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. 
+ */ +static inline void libeth_xdpsq_lock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_lock(lock); +} + +/** + * libeth_xdpsq_unlock - free &libeth_xdpsq_lock if needed + * @lock: lock to free + * + * Touches the underlying spinlock only if the static key is enabled + * and the queue itself is marked as shareable. + */ +static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) +{ + if (static_branch_unlikely(&libeth_xdpsq_share) && lock->share) + __libeth_xdpsq_unlock(lock); +} + /* Common Tx bits */ /** @@ -179,6 +296,7 @@ struct libeth_xdp_tx_bulk { * @count: number of descriptors on that queue * @pending: pointer to the number of sent-not-completed descs on that queue * @xdp_tx: pointer to the above + * @lock: corresponding XDPSQ lock * * Abstraction for driver-independent implementation of Tx. Placed on the stack * and filled by the driver before the transmission, so that the generic @@ -193,6 +311,7 @@ struct libeth_xdpsq { u32 *pending; u32 *xdp_tx; + struct libeth_xdpsq_lock *lock; }; /** @@ -229,7 +348,8 @@ struct libeth_xdp_tx_desc { * * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for * all types of frames. - * @unroll greatly increases the object code size, but also greatly increases + * @prep must lock the queue as this function releases it at the end. @unroll + * greatly increases the object code size, but also greatly increases * performance. * The compilers inline all those onstack abstractions to direct data accesses. 
* @@ -253,7 +373,7 @@ libeth_xdp_tx_xmit_bulk(const struct libeth_xdp_tx_frame *bulk, void *xdpsq, n = min(n, prep(xdpsq, &sq)); if (unlikely(!n)) - return 0; + goto unlock; ntu = *sq.ntu; @@ -302,6 +422,9 @@ out: if (sq.xdp_tx) *sq.xdp_tx += n; +unlock: + libeth_xdpsq_unlock(sq.lock); + return n; } -- cgit v1.2.3 From 819bbaefeded93df36d71d58d9963d706e6e99e1 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:25 +0200 Subject: libeth: xdp: add XDPSQ cleanup timers When XDP Tx queues are not interrupt-driven but use lazy cleaning, i.e. only when there are fewer than `threshold` free descriptors left, we also need cleanup timers to avoid &xdp_buff and &xdp_frame stalling for too long, especially with Page Pool (it warns about inflight pages every 60 seconds). Let's say we sent 256 frames and don't need to send more, but we clean only when the number of pending items >= 384. In that case, those 256 will stall until 128 more are sent. For this, add simple helpers to run a timer which will clean the queue regardless, 1 second after the last send. The timer is triggered when finalizing the queue. As long as there is regular active traffic, the timer doesn't fire. 
Signed-off-by: Alexander Lobakin Reviewed-by: Maciej Fijalkowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/xdp.c | 23 +++++++++++++ include/net/libeth/types.h | 21 +++++++++++- include/net/libeth/xdp.h | 57 +++++++++++++++++++++++++++++++++ 3 files changed, 100 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index 0f08dd405190..6f62603cf568 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -56,6 +56,29 @@ __libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) } EXPORT_SYMBOL_GPL(__libeth_xdpsq_unlock); +/* XDPSQ clean-up timers */ + +/** + * libeth_xdpsq_init_timer - initialize an XDPSQ clean-up timer + * @timer: timer to initialize + * @xdpsq: queue this timer belongs to + * @lock: corresponding XDPSQ lock + * @poll: queue polling/completion function + * + * XDPSQ clean-up timers must be set up before using at the queue configuration + * time. Set the required pointers and the cleaning callback. 
+ */ +void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq, + struct libeth_xdpsq_lock *lock, + void (*poll)(struct work_struct *work)) +{ + timer->xdpsq = xdpsq; + timer->lock = lock; + + INIT_DELAYED_WORK(&timer->dwork, poll); +} +EXPORT_SYMBOL_GPL(libeth_xdpsq_init_timer); + /* ``XDP_TX`` bulking */ static void __cold diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index abfccae1a346..4df703a9eb59 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -4,7 +4,7 @@ #ifndef __LIBETH_TYPES_H #define __LIBETH_TYPES_H -#include +#include /** * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop @@ -60,4 +60,23 @@ struct libeth_xdpsq_lock { bool share; }; +/* XDPSQ clean-up timers */ + +/** + * struct libeth_xdpsq_timer - timer for cleaning up XDPSQs w/o interrupts + * @xdpsq: queue this timer belongs to + * @lock: lock for the queue + * @dwork: work performing cleanups + * + * XDPSQs not using interrupts but lazy cleaning, i.e. only when there's no + * space for sending the current queued frame/bulk, must fire up timers to + * make sure there are no stale buffers to free. + */ +struct libeth_xdpsq_timer { + void *xdpsq; + struct libeth_xdpsq_lock *lock; + + struct delayed_work dwork; +}; + #endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 20977fdfd6c9..22bd038decb6 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -177,6 +177,63 @@ static inline void libeth_xdpsq_unlock(struct libeth_xdpsq_lock *lock) __libeth_xdpsq_unlock(lock); } +/* XDPSQ clean-up timers */ + +void libeth_xdpsq_init_timer(struct libeth_xdpsq_timer *timer, void *xdpsq, + struct libeth_xdpsq_lock *lock, + void (*poll)(struct work_struct *work)); + +/** + * libeth_xdpsq_deinit_timer - deinitialize &libeth_xdpsq_timer + * @timer: timer to deinitialize + * + * Flush and disable the underlying workqueue. 
+ */ +static inline void libeth_xdpsq_deinit_timer(struct libeth_xdpsq_timer *timer) +{ + cancel_delayed_work_sync(&timer->dwork); +} + +/** + * libeth_xdpsq_queue_timer - run &libeth_xdpsq_timer + * @timer: timer to queue + * + * Should be called after the queue was filled and the transmission was run + * to complete the pending buffers if no further sending will be done in a + * second (-> lazy cleaning won't happen). + * If the timer was already run, it will be requeued back to one second + * timeout again. + */ +static inline void libeth_xdpsq_queue_timer(struct libeth_xdpsq_timer *timer) +{ + mod_delayed_work_on(raw_smp_processor_id(), system_bh_highpri_wq, + &timer->dwork, HZ); +} + +/** + * libeth_xdpsq_run_timer - wrapper to run a queue clean-up on a timer event + * @work: workqueue belonging to the corresponding timer + * @poll: driver-specific completion queue poll function + * + * Run the polling function on the locked queue and requeue the timer if + * there's more work to do. + * Designed to be used via LIBETH_XDP_DEFINE_TIMER() below. + */ +static __always_inline void +libeth_xdpsq_run_timer(struct work_struct *work, + u32 (*poll)(void *xdpsq, u32 budget)) +{ + struct libeth_xdpsq_timer *timer = container_of(work, typeof(*timer), + dwork.work); + + libeth_xdpsq_lock(timer->lock); + + if (poll(timer->xdpsq, U32_MAX)) + libeth_xdpsq_queue_timer(timer); + + libeth_xdpsq_unlock(timer->lock); +} + /* Common Tx bits */ /** -- cgit v1.2.3 From 3ef2b0192e8ba133f597919632bd9cf196076f0b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:26 +0200 Subject: libeth: xdp: add helpers for preparing/processing &libeth_xdp_buff Add convenience helpers to build an &xdp_buff. This means: general initialization before the NAPI loop, adding head, adding frags etc. 
libeth_xdp_process_buff() is the same as what everybody has in their drivers: dma_sync_for_cpu(); if (!frag) { add_head(); prefetch(); } else { add_frag(); } Note that I don't use net_prefetch(), sticking to the original prefetch(). In none of my tests prefetching 128 bytes yielded better perf than 64 bytes. That might differ if the headers are huge enough, but then additional tunneling etc. overhead takes place, so you won't win a lot either way. &libeth_xdp_buff_stash is for cases when you exit the polling loop without finishing building the buff. If that happens, you need to store the buffer in the queue structure until the next loop and then restore it. It makes no sense to place a whole &xdp_buff there. Define a minimal structure, which would store only the fields essential to restore it. I was able to pack it into 16 bytes, which is only 8 bytes bigger than `struct sk_buff *skb` on x64. Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/xdp.c | 90 +++++++++++++++++++ include/net/libeth/types.h | 23 +++++ include/net/libeth/xdp.h | 151 ++++++++++++++++++++++++++++++++ 3 files changed, 264 insertions(+) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index 6f62603cf568..d0669f1f02f3 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -174,6 +174,64 @@ EXPORT_SYMBOL_GPL(libeth_xdp_xmit_return_bulk); /* Rx polling path */ +/** + * libeth_xdp_load_stash - recreate an &xdp_buff from libeth_xdp buffer stash + * @dst: target &libeth_xdp_buff to initialize + * @src: source stash + * + * External helper used by libeth_xdp_init_buff(), do not call directly. + * Recreate an onstack &libeth_xdp_buff using the stash saved earlier. + * The only field untouched (rxq) is initialized later in the + * abovementioned function. 
+ */ +void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src) +{ + dst->data = src->data; + dst->base.data_end = src->data + src->len; + dst->base.data_meta = src->data; + dst->base.data_hard_start = src->data - src->headroom; + + dst->base.frame_sz = src->frame_sz; + dst->base.flags = src->flags; +} +EXPORT_SYMBOL_GPL(libeth_xdp_load_stash); + +/** + * libeth_xdp_save_stash - convert &xdp_buff to a libeth_xdp buffer stash + * @dst: target &libeth_xdp_buff_stash to initialize + * @src: source XDP buffer + * + * External helper used by libeth_xdp_save_buff(), do not call directly. + * Use the fields from the passed XDP buffer to initialize the stash on the + * queue, so that a partially received frame can be finished later during + * the next NAPI poll. + */ +void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src) +{ + dst->data = src->data; + dst->headroom = src->data - src->base.data_hard_start; + dst->len = src->base.data_end - src->data; + + dst->frame_sz = src->base.frame_sz; + dst->flags = src->base.flags; + + WARN_ON_ONCE(dst->flags != src->base.flags); +} +EXPORT_SYMBOL_GPL(libeth_xdp_save_stash); + +void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash) +{ + LIBETH_XDP_ONSTACK_BUFF(xdp); + + libeth_xdp_load_stash(xdp, stash); + libeth_xdp_return_buff_slow(xdp); + + stash->data = NULL; +} +EXPORT_SYMBOL_GPL(__libeth_xdp_return_stash); + /** * libeth_xdp_return_buff_slow - free &libeth_xdp_buff * @xdp: buffer to free/return @@ -187,6 +245,38 @@ void __cold libeth_xdp_return_buff_slow(struct libeth_xdp_buff *xdp) } EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_slow); +/** + * libeth_xdp_buff_add_frag - add frag to XDP buffer + * @xdp: head XDP buffer + * @fqe: Rx buffer containing the frag + * @len: frag length reported by HW + * + * External helper used by libeth_xdp_process_buff(), do not call directly. + * Frees both head and frag buffers on error. 
+ * + * Return: true on success, false on error (no space for a new frag). + */ +bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + netmem_ref netmem = fqe->netmem; + + if (!xdp_buff_add_frag(&xdp->base, netmem, + fqe->offset + netmem_get_pp(netmem)->p.offset, + len, fqe->truesize)) + goto recycle; + + return true; + +recycle: + libeth_rx_recycle_slow(netmem); + libeth_xdp_return_buff_slow(xdp); + + return false; +} +EXPORT_SYMBOL_GPL(libeth_xdp_buff_add_frag); + /* Tx buffer completion */ static void libeth_xdp_put_netmem_bulk(netmem_ref netmem, diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index 4df703a9eb59..7b27c1966d45 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -79,4 +79,27 @@ struct libeth_xdpsq_timer { struct delayed_work dwork; }; +/* Rx polling path */ + +/** + * struct libeth_xdp_buff_stash - struct for stashing &xdp_buff onto a queue + * @data: pointer to the start of the frame, xdp_buff.data + * @headroom: frame headroom, xdp_buff.data - xdp_buff.data_hard_start + * @len: frame linear space length, xdp_buff.data_end - xdp_buff.data + * @frame_sz: truesize occupied by the frame, xdp_buff.frame_sz + * @flags: xdp_buff.flags + * + * &xdp_buff is 56 bytes long on x64, &libeth_xdp_buff is 64 bytes. This + * structure carries only the fields necessary to save/restore a partially built + * frame on the queue structure to finish it during the next NAPI poll. 
+ */ +struct libeth_xdp_buff_stash { + void *data; + u16 headroom; + u16 len; + + u32 frame_sz:24; + u32 flags:8; +} __aligned_largest; + #endif /* __LIBETH_TYPES_H */ diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 22bd038decb6..780447cdabc1 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -60,6 +60,42 @@ static_assert(offsetof(struct libeth_xdp_buff, desc) == static_assert(IS_ALIGNED(sizeof(struct xdp_buff_xsk), __alignof(struct libeth_xdp_buff))); +/** + * __LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: sizeof() of the driver-private data + */ +#define __LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + ___LIBETH_XDP_ONSTACK_BUFF(name, ##__VA_ARGS__) +/** + * LIBETH_XDP_ONSTACK_BUFF - declare a &libeth_xdp_buff on the stack + * @name: name of the variable to declare + * @...: type or variable name of the driver-private data + */ +#define LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __LIBETH_XDP_ONSTACK_BUFF(name, __libeth_xdp_priv_sz(__VA_ARGS__)) + +#define ___LIBETH_XDP_ONSTACK_BUFF(name, ...) \ + __DEFINE_FLEX(struct libeth_xdp_buff, name, priv, \ + LIBETH_XDP_PRIV_SZ(__VA_ARGS__ + 0), \ + __uninitialized); \ + LIBETH_XDP_ASSERT_PRIV_SZ(__VA_ARGS__ + 0) + +#define __libeth_xdp_priv_sz(...) \ + CONCATENATE(__libeth_xdp_psz, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#define __libeth_xdp_psz0(...) +#define __libeth_xdp_psz1(...) 
sizeof(__VA_ARGS__) + +#define LIBETH_XDP_PRIV_SZ(sz) \ + (ALIGN(sz, __alignof(struct libeth_xdp_buff)) / sizeof(long)) + +/* Performs XSK_CHECK_PRIV_TYPE() */ +#define LIBETH_XDP_ASSERT_PRIV_SZ(sz) \ + static_assert(offsetofend(struct xdp_buff_xsk, cb) >= \ + struct_size_t(struct libeth_xdp_buff, priv, \ + LIBETH_XDP_PRIV_SZ(sz))) + /* XDPSQ sharing */ DECLARE_STATIC_KEY_FALSE(libeth_xdpsq_share); @@ -963,6 +999,65 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, /* Rx polling path */ +void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src); +void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src); +void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash); + +/** + * libeth_xdp_init_buff - initialize a &libeth_xdp_buff for Rx NAPI poll + * @dst: onstack buffer to initialize + * @src: XDP buffer stash placed on the queue + * @rxq: registered &xdp_rxq_info corresponding to this queue + * + * Should be called before the main NAPI polling loop. Loads the content of + * the previously saved stash or initializes the buffer from scratch. + */ +static inline void +libeth_xdp_init_buff(struct libeth_xdp_buff *dst, + const struct libeth_xdp_buff_stash *src, + struct xdp_rxq_info *rxq) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_load_stash(dst, src); + + dst->base.rxq = rxq; +} + +/** + * libeth_xdp_save_buff - save a partially built buffer on a queue + * @dst: XDP buffer stash placed on the queue + * @src: onstack buffer to save + * + * Should be called after the main NAPI polling loop. If the loop exited before + * the buffer was finished, saves its content on the queue, so that it can be + * completed during the next poll. Otherwise, clears the stash. 
+ */ +static inline void libeth_xdp_save_buff(struct libeth_xdp_buff_stash *dst, + const struct libeth_xdp_buff *src) +{ + if (likely(!src->data)) + dst->data = NULL; + else + libeth_xdp_save_stash(dst, src); +} + +/** + * libeth_xdp_return_stash - free an XDP buffer stash from a queue + * @stash: stash to free + * + * If the queue is about to be destroyed, but it still has an incompleted + * buffer stash, this helper should be called to free it. + */ +static inline void libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash) +{ + if (stash->data) + __libeth_xdp_return_stash(stash); +} + static inline void libeth_xdp_return_va(const void *data, bool napi) { netmem_ref netmem = virt_to_netmem(data); @@ -1004,6 +1099,62 @@ out: xdp->data = NULL; } +bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len); + +/** + * libeth_xdp_prepare_buff - fill &libeth_xdp_buff with head FQE data + * @xdp: XDP buffer to attach the head to + * @fqe: FQE containing the head buffer + * @len: buffer len passed from HW + * + * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer + * head with the Rx buffer data: data pointer, length, headroom, and + * truesize/tailroom. Zeroes the flags. + */ +static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + const struct page *page = __netmem_to_page(fqe->netmem); + + xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); + xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, + page->pp->p.offset, len, true); +} + +/** + * libeth_xdp_process_buff - attach Rx buffer to &libeth_xdp_buff + * @xdp: XDP buffer to attach the Rx buffer to + * @fqe: Rx buffer to process + * @len: received data length from the descriptor + * + * If the XDP buffer is empty, attaches the Rx buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. 
+ * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: true on success, false if the descriptor must be skipped (empty or + * no space for a new frag). + */ +static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp, + const struct libeth_fqe *fqe, + u32 len) +{ + if (!libeth_rx_sync_for_cpu(fqe, len)) + return false; + + if (xdp->data) + return libeth_xdp_buff_add_frag(xdp, fqe, len); + + libeth_xdp_prepare_buff(xdp, fqe, len); + + prefetch(xdp->data); + + return true; +} + /* Tx buffer completion */ void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, -- cgit v1.2.3 From 4c805f7ae1ce61a90121378a5ee1f47b3b870c73 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:27 +0200 Subject: libeth: xdp: add XDP prog run and verdict result handling Running a prog and handling the verdicts, up to napi_gro_receive(), is also pretty generic code not really differing between vendors (except for Tx descriptor filling and Rx descriptor parsing). Define a couple of inlines to do that. The inline callbacks a driver needs to pass are the ones mentioned above: Tx descriptor filling for XDP_TX, populating skb with the descriptor data for XDP_PASS, finalizing XDPSQs after the polling loop for XDP_TX (kicking the HW to start sending). The populate callback passes only &libeth_xdp_buff assuming buff::desc pointer is enough, plus you can always get the corresponding Rx queue structure via container_of(buff::rxq). If not, a driver can extend the buff with more fields directly on the stack without touching libeth_xdp definitions. 
Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/xdp.c | 27 +++ include/net/libeth/types.h | 22 +++ include/net/libeth/xdp.h | 281 ++++++++++++++++++++++++++++++++ 3 files changed, 330 insertions(+) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index d0669f1f02f3..1607579d65bb 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -277,6 +277,33 @@ recycle: } EXPORT_SYMBOL_GPL(libeth_xdp_buff_add_frag); +/** + * libeth_xdp_prog_exception - handle XDP prog exceptions + * @bq: XDP Tx bulk + * @xdp: buffer to process + * @act: original XDP prog verdict + * @ret: error code if redirect failed + * + * External helper used by __libeth_xdp_run_prog(), do not call directly. + * Reports invalid @act, XDP exception trace event and frees the buffer. + * + * Return: libeth_xdp XDP prog verdict. + */ +u32 __cold libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret) +{ + if (act > XDP_REDIRECT) + bpf_warn_invalid_xdp_action(bq->dev, bq->prog, act); + + libeth_trace_xdp_exception(bq->dev, bq->prog, act); + + libeth_xdp_return_buff_slow(xdp); + + return LIBETH_XDP_DROP; +} +EXPORT_SYMBOL_GPL(libeth_xdp_prog_exception); + /* Tx buffer completion */ static void libeth_xdp_put_netmem_bulk(netmem_ref netmem, diff --git a/include/net/libeth/types.h b/include/net/libeth/types.h index 7b27c1966d45..cf1d78a9dc38 100644 --- a/include/net/libeth/types.h +++ b/include/net/libeth/types.h @@ -6,6 +6,28 @@ #include +/* Stats */ + +/** + * struct libeth_rq_napi_stats - "hot" counters to update in Rx polling loop + * @packets: received frames counter + * @bytes: sum of bytes of received frames above + * @fragments: sum of fragments of received S/G frames + * @hsplit: number of frames the device performed the header split for + * @raw: alias to access all 
the fields as an array + */ +struct libeth_rq_napi_stats { + union { + struct { + u32 packets; + u32 bytes; + u32 fragments; + u32 hsplit; + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + /** * struct libeth_sq_napi_stats - "hot" counters to update in Tx completion loop * @packets: completed frames counter diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 780447cdabc1..db99bc690eb6 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -20,6 +20,7 @@ enum { LIBETH_XDP_DROP = BIT(0), LIBETH_XDP_ABORTED = BIT(1), LIBETH_XDP_TX = BIT(2), + LIBETH_XDP_REDIRECT = BIT(3), }; /* @@ -353,6 +354,7 @@ static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == * @prog: corresponding active XDP program, %NULL for .ndo_xdp_xmit() * @dev: &net_device which the frames are transmitted on * @xdpsq: shortcut to the corresponding driver-specific XDPSQ structure + * @act_mask: Rx only, mask of all the XDP prog verdicts for that NAPI session * @count: current number of frames in @bulk * @bulk: array of queued frames for bulk Tx * @@ -366,6 +368,7 @@ struct libeth_xdp_tx_bulk { struct net_device *dev; void *xdpsq; + u32 act_mask; u32 count; struct libeth_xdp_tx_frame bulk[LIBETH_XDP_TX_BULK]; } __aligned(sizeof(struct libeth_xdp_tx_frame)); @@ -999,6 +1002,40 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, /* Rx polling path */ +/** + * libeth_xdp_tx_init_bulk - initialize an XDP Tx bulk for Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (can be %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. If @num == 0, + * assumes XDP is not enabled. 
+ */ +#define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, ub, un) do { \ + typeof(bq) ub = (bq); \ + u32 un = (num); \ + \ + rcu_read_lock(); \ + \ + if (un) { \ + ub->prog = rcu_dereference(pr); \ + ub->dev = (d); \ + ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \ + } else { \ + ub->prog = NULL; \ + } \ + \ + ub->act_mask = 0; \ + ub->count = 0; \ +} while (0) + void libeth_xdp_load_stash(struct libeth_xdp_buff *dst, const struct libeth_xdp_buff_stash *src); void libeth_xdp_save_stash(struct libeth_xdp_buff_stash *dst, @@ -1155,6 +1192,250 @@ static inline bool libeth_xdp_process_buff(struct libeth_xdp_buff *xdp, return true; } +/** + * libeth_xdp_buff_stats_frags - update onstack RQ stats with XDP frags info + * @ss: onstack stats to update + * @xdp: buffer to account + * + * Internal helper used by __libeth_xdp_run_pass(), do not call directly. + * Adds buffer's frags count and total len to the onstack stats. + */ +static inline void +libeth_xdp_buff_stats_frags(struct libeth_rq_napi_stats *ss, + const struct libeth_xdp_buff *xdp) +{ + const struct skb_shared_info *sinfo; + + sinfo = xdp_get_shared_info_from_buff(&xdp->base); + ss->bytes += sinfo->xdp_frags_size; + ss->fragments += sinfo->nr_frags + 1; +} + +u32 libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret); + +/** + * __libeth_xdp_run_prog - run XDP program on an XDP buffer + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program. Handles ``XDP_DROP`` + * and ``XDP_REDIRECT`` only, the rest is processed levels up. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. 
+ */ +static __always_inline u32 +__libeth_xdp_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act < XDP_DROP || act > XDP_REDIRECT)) + goto out; + + switch (act) { + case XDP_PASS: + return LIBETH_XDP_PASS; + case XDP_DROP: + libeth_xdp_return_buff(xdp); + + return LIBETH_XDP_DROP; + case XDP_TX: + return LIBETH_XDP_TX; + case XDP_REDIRECT: + if (unlikely(xdp_do_redirect(bq->dev, &xdp->base, bq->prog))) + break; + + xdp->data = NULL; + + return LIBETH_XDP_REDIRECT; + default: + break; + } + +out: + return libeth_xdp_prog_exception(bq, xdp, act, 0); +} + +/** + * __libeth_xdp_run_flush - run XDP program and handle ``XDP_TX`` verdict + * @xdp: XDP buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * @run: internal callback for running XDP program + * @queue: internal callback for queuing ``XDP_TX`` frame + * @flush_bulk: driver callback for flushing a bulk + * + * Internal inline abstraction to run XDP program and additionally handle + * ``XDP_TX`` verdict. + * Do not use directly. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. 
+ */ +static __always_inline u32 +__libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, + u32 (*run)(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq), + bool (*queue)(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk) + (struct libeth_xdp_tx_bulk *bq, + u32 flags)), + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + u32 act; + + act = run(xdp, bq); + if (act == LIBETH_XDP_TX && unlikely(!queue(bq, xdp, flush_bulk))) + act = LIBETH_XDP_DROP; + + bq->act_mask |= act; + + return act; +} + +/** + * libeth_xdp_run_prog - run XDP program and handle all verdicts + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. + * + * Return: true if the buffer should be passed up the stack, false if the poll + * should go to the next buffer. + */ +#define libeth_xdp_run_prog(xdp, bq, fl) \ + (__libeth_xdp_run_flush(xdp, bq, __libeth_xdp_run_prog, \ + libeth_xdp_tx_queue_bulk, \ + fl) == LIBETH_XDP_PASS) + +/** + * __libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XDP buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction that does the following: + * 1) adds frame size and frag number (if needed) to the onstack stats; + * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff; + * 3) runs XDP program if present; + * 4) handles all possible verdicts; + * 5) on ``XDP_PASS``, builds an skb from the buffer; + * 6) populates it with the descriptor metadata; + * 
7) passes it up the stack. + * + * In most cases, number 2 means just writing the pointer to the HW descriptor + * to the XDP buffer. If so, please use LIBETH_XDP_DEFINE_RUN{,_PASS}() + * wrappers to build a driver function. + */ +static __always_inline void +__libeth_xdp_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + bool (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (xdp_buff_has_frags(&xdp->base)) + libeth_xdp_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + if (!bq || !run || !bq->prog) + goto build; + + if (!run(xdp, bq)) + return; + +build: + skb = xdp_build_skb_from_buff(&xdp->base); + if (unlikely(!skb)) { + libeth_xdp_return_buff_slow(xdp); + return; + } + + xdp->data = NULL; + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return; + } + + napi_gro_receive(napi, skb); +} + +static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp, + const void *desc) +{ + xdp->desc = desc; +} + +/** + * libeth_xdp_run_pass - helper to run XDP program and handle the result + * @xdp: XDP buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @ss: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. 
+ */ +#define libeth_xdp_run_pass(xdp, bq, napi, ss, desc, run, populate) \ + __libeth_xdp_run_pass(xdp, bq, napi, ss, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xdp_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, 0, flush, finalize) + +static __always_inline void +__libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + if (bq->act_mask & LIBETH_XDP_TX) { + if (bq->count) + flush_bulk(bq, flags | LIBETH_XDP_TX_DROP); + finalize(bq->xdpsq, true, true); + } + if (bq->act_mask & LIBETH_XDP_REDIRECT) + xdp_do_flush(); + + rcu_read_unlock(); +} + /* Tx buffer completion */ void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, -- cgit v1.2.3 From 1bb635d3748b7158c6a19e6fca4fb85e6f96fd9a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:28 +0200 Subject: libeth: xdp: add templates for building driver-side callbacks Defining driver-specific functions to pass to libeth_xdp functions can induce boilerplates and/or look a bit cryptic with all those layers of indirection. On the other hand, this indirection is needed to allow compilers to uninline big functions even when passed to __always_inline helpers (too much inlining also hurts performance in some cases), plus to reuse some XDP helpers in XSk code. Add macros to quickly build them, with the detailed kdoc. They take names of the actual callbacks for filling a Tx descriptor and other purely HW-specific things and wrap them appropriately. 
LIBETH_XDP_DEFINE_{BEGIN,END}() is needed for GCC 8+ unfortunately to let the drivers control which functions will be static and which global without hitting `-Wold-style-declaration`. Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- include/net/libeth/xdp.h | 195 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) (limited to 'include/net') diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index db99bc690eb6..46a2ec3c3037 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -742,6 +742,9 @@ __libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, * @flags: Tx flags, see above * @prep: driver callback to prepare the queue * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_TX() to define an ``XDP_TX`` driver + * callback. */ #define libeth_xdp_tx_flush_bulk(bq, flags, prep, xmit) \ __libeth_xdp_tx_flush_bulk(bq, flags, prep, libeth_xdp_tx_fill_buf, \ @@ -749,6 +752,25 @@ __libeth_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags, /* .ndo_xdp_xmit() implementation */ +/** + * libeth_xdp_xmit_init_bulk - internal helper to initialize bulk for XDP xmit + * @bq: bulk to initialize + * @dev: target &net_device + * @xdpsqs: array of driver-specific XDPSQ structs + * @num: number of active XDPSQs (the above array length) + */ +#define libeth_xdp_xmit_init_bulk(bq, dev, xdpsqs, num) \ + __libeth_xdp_xmit_init_bulk(bq, dev, (xdpsqs)[libeth_xdpsq_id(num)]) + +static inline void __libeth_xdp_xmit_init_bulk(struct libeth_xdp_tx_bulk *bq, + struct net_device *dev, + void *xdpsq) +{ + bq->dev = dev; + bq->xdpsq = xdpsq; + bq->count = 0; +} + /** * libeth_xdp_xmit_frame_dma - internal helper to access DMA of an &xdp_frame * @xf: pointer to the XDP frame @@ -941,6 +963,9 @@ libeth_xdp_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() * @prep: driver callback to prepare the queue * @xmit: 
driver callback to fill a HW descriptor + * + * Use via LIBETH_XDP_DEFINE_FLUSH_XMIT() to define an XDP xmit driver + * callback. */ #define libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit) \ __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_NDO, prep, \ @@ -1000,6 +1025,44 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, return nxmit; } +/** + * libeth_xdp_xmit_do_bulk - implement full .ndo_xdp_xmit() in driver + * @dev: target &net_device + * @n: number of frames to send + * @fr: XDP frames to send + * @f: flags passed by the stack + * @xqs: array of XDPSQs driver structs + * @nqs: number of active XDPSQs, the above array length + * @fl: driver callback to flush an XDP xmit bulk + * @fin: driver callback to finalize the queue + * + * If the driver has active XDPSQs, perform common checks and send the frames. + * Finalize the queue, if requested. + * + * Return: number of frames sent or -errno on error. + */ +#define libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin) \ + _libeth_xdp_xmit_do_bulk(dev, n, fr, f, xqs, nqs, fl, fin, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(ret_), \ + __UNIQUE_ID(nqs_)) + +#define _libeth_xdp_xmit_do_bulk(d, n, fr, f, xqs, nqs, fl, fin, ub, ur, un) \ +({ \ + u32 un = (nqs); \ + int ur; \ + \ + if (likely(un)) { \ + LIBETH_XDP_ONSTACK_BULK(ub); \ + \ + libeth_xdp_xmit_init_bulk(&ub, d, xqs, un); \ + ur = __libeth_xdp_xmit_do_bulk(&ub, fr, n, f, fl, fin); \ + } else { \ + ur = -ENXIO; \ + } \ + \ + ur; \ +}) + /* Rx polling path */ /** @@ -1305,6 +1368,7 @@ __libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, * @fl: driver ``XDP_TX`` bulk flush callback * * Run the attached XDP program and handle all possible verdicts. + * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}(). * * Return: true if the buffer should be passed up the stack, false if the poll * should go to the next buffer.
@@ -1436,6 +1500,137 @@ __libeth_xdp_finalize_rx(struct libeth_xdp_tx_bulk *bq, u32 flags, rcu_read_unlock(); } +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XDP_DEFINE_FLUSH_TX(static driver_xdp_flush_tx, driver_xdp_tx_prep, + * driver_xdp_xmit); + * LIBETH_XDP_DEFINE_RUN(static driver_xdp_run, driver_xdp_run_prog, + * driver_xdp_flush_tx, driver_populate_skb); + * LIBETH_XDP_DEFINE_FINALIZE(static driver_xdp_finalize_rx, + * driver_xdp_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... + * driver_xdp_run(xdp, &bq, napi, &rs, desc); + * } + * driver_xdp_finalize_rx(&bq); + */ + +#define LIBETH_XDP_DEFINE_START() \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wold-style-declaration", \ + "Allow specifying \'static\' after the return type") + +/** + * LIBETH_XDP_DEFINE_TIMER - define a driver XDPSQ cleanup timer callback + * @name: name of the function to define + * @poll: Tx polling/completion function + */ +#define LIBETH_XDP_DEFINE_TIMER(name, poll) \ +void name(struct work_struct *work) \ +{ \ + libeth_xdpsq_run_timer(work, poll); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_TX - define a driver ``XDP_TX`` bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xdp) + +#define __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, pfx) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_##pfx##_tx_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_FLUSH_XMIT - define a driver XDP 
xmit bulk flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XDP_DEFINE_FLUSH_XMIT(name, prep, xmit) \ +bool name(struct libeth_xdp_tx_bulk *bq, u32 flags) \ +{ \ + return libeth_xdp_xmit_flush_bulk(bq, flags, prep, xmit); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + */ +#define LIBETH_XDP_DEFINE_RUN_PROG(name, flush) \ + bool __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq) \ +{ \ + return libeth_##pfx##_run_prog(xdp, bq, flush); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate) \ + void __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xdp) + +#define __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, pfx) \ +name(struct libeth_xdp_buff *xdp, struct libeth_xdp_tx_bulk *bq, \ + struct napi_struct *napi, struct libeth_rq_napi_stats *ss, \ + const void *desc) \ +{ \ + return libeth_##pfx##_run_pass(xdp, bq, napi, ss, desc, run, \ + populate); \ +} + +/** + * LIBETH_XDP_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XDP_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XDP) + +#define __LIBETH_XDP_DEFINE_RUN(name, run, 
flush, populate, pfx) \ + LIBETH_##pfx##_DEFINE_RUN_PROG(static run, flush); \ + LIBETH_##pfx##_DEFINE_RUN_PASS(name, run, populate) + +/** + * LIBETH_XDP_DEFINE_FINALIZE - define a driver Rx NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xdp) + +#define __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, pfx) \ +void name(struct libeth_xdp_tx_bulk *bq) \ +{ \ + libeth_##pfx##_finalize_rx(bq, flush, finalize); \ +} + +#define LIBETH_XDP_DEFINE_END() __diag_pop() + /* Tx buffer completion */ void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, -- cgit v1.2.3 From 576cc5c13d9ba53a1a24d9b34af2f939a87b7ce8 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:29 +0200 Subject: libeth: xdp: add RSS hash hint and XDP features setup helpers End the XDP section by adding helpers to setup XDP features, flipping .ndo_xdp_xmit() support at runtime (in case when it's not always on), and calculating the queue clean/refill threshold. 
Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/xdp.c | 69 +++++++++++++++++++++++++ include/net/libeth/xdp.h | 90 +++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index 1607579d65bb..4eb0f3c6cdab 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -340,6 +340,75 @@ head: } EXPORT_SYMBOL_GPL(libeth_xdp_return_buff_bulk); +/* Misc */ + +/** + * libeth_xdp_queue_threshold - calculate XDP queue clean/refill threshold + * @count: number of descriptors in the queue + * + * The threshold is the limit at which RQs start to refill (when the number of + * empty buffers exceeds it) and SQs get cleaned up (when the number of free + * descriptors goes below it). To speed up hotpath processing, threshold is + * always pow-2, closest to 1/4 of the queue length. + * Don't call it on hotpath, calculate and cache the threshold during the + * queue initialization. + * + * Return: the calculated threshold. + */ +u32 libeth_xdp_queue_threshold(u32 count) +{ + u32 quarter, low, high; + + if (likely(is_power_of_2(count))) + return count >> 2; + + quarter = DIV_ROUND_CLOSEST(count, 4); + low = rounddown_pow_of_two(quarter); + high = roundup_pow_of_two(quarter); + + return high - quarter <= quarter - low ? high : low; +} +EXPORT_SYMBOL_GPL(libeth_xdp_queue_threshold); + +/** + * __libeth_xdp_set_features - set XDP features for netdev + * @dev: &net_device to configure + * @xmo: XDP metadata ops (Rx hints) + * + * Set all the features libeth_xdp supports. Only the first argument is + * necessary. + * Use the non-underscored versions in drivers instead. 
+ */ +void __libeth_xdp_set_features(struct net_device *dev, + const struct xdp_metadata_ops *xmo) +{ + xdp_set_features_flag(dev, + NETDEV_XDP_ACT_BASIC | + NETDEV_XDP_ACT_REDIRECT | + NETDEV_XDP_ACT_NDO_XMIT | + NETDEV_XDP_ACT_RX_SG | + NETDEV_XDP_ACT_NDO_XMIT_SG); + dev->xdp_metadata_ops = xmo; +} +EXPORT_SYMBOL_GPL(__libeth_xdp_set_features); + +/** + * libeth_xdp_set_redirect - toggle the XDP redirect feature + * @dev: &net_device to configure + * @enable: whether XDP is enabled + * + * Use this when XDPSQs are not always available to dynamically enable + * and disable redirect feature. + */ +void libeth_xdp_set_redirect(struct net_device *dev, bool enable) +{ + if (enable) + xdp_features_set_redirect_target(dev, true); + else + xdp_features_clear_redirect_target(dev); +} +EXPORT_SYMBOL_GPL(libeth_xdp_set_redirect); + /* Module */ static const struct libeth_xdp_ops xdp_ops __initconst = { diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 46a2ec3c3037..c36b2ca0d04c 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -1631,6 +1631,51 @@ void name(struct libeth_xdp_tx_bulk *bq) \ #define LIBETH_XDP_DEFINE_END() __diag_pop() +/* XMO */ + +/** + * libeth_xdp_buff_to_rq - get RQ pointer from an XDP buffer pointer + * @xdp: &libeth_xdp_buff corresponding to the queue + * @type: typeof() of the driver Rx queue structure + * @member: name of &xdp_rxq_info inside @type + * + * Often times, pointer to the RQ is needed when reading/filling metadata from + * HW descriptors. The helper can be used to quickly jump from an XDP buffer + * to the queue corresponding to its &xdp_rxq_info without introducing + * additional fields (&libeth_xdp_buff is precisely 1 cacheline long on x64). 
+ */ +#define libeth_xdp_buff_to_rq(xdp, type, member) \ + container_of_const((xdp)->base.rxq, type, member) + +/** + * libeth_xdpmo_rx_hash - convert &libeth_rx_pt to an XDP RSS hash metadata + * @hash: pointer to the variable to write the hash to + * @rss_type: pointer to the variable to write the hash type to + * @val: hash value from the HW descriptor + * @pt: libeth parsed packet type + * + * Handle zeroed/non-available hash and convert libeth parsed packet type to + * the corresponding XDP RSS hash type. To be called at the end of + * xdp_metadata_ops idpf_xdpmo::xmo_rx_hash() implementation. + * Note that if the driver doesn't use a constant packet type lookup table but + * generates it at runtime, it must call libeth_rx_pt_gen_hash_type(pt) to + * generate XDP RSS hash type for each packet type. + * + * Return: 0 on success, -ENODATA when the hash is not available. + */ +static inline int libeth_xdpmo_rx_hash(u32 *hash, + enum xdp_rss_hash_type *rss_type, + u32 val, struct libeth_rx_pt pt) +{ + if (unlikely(!val)) + return -ENODATA; + + *hash = val; + *rss_type = pt.hash_type; + + return 0; +} + /* Tx buffer completion */ void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, @@ -1697,4 +1742,49 @@ static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk); } +/* Misc */ + +u32 libeth_xdp_queue_threshold(u32 count); + +void __libeth_xdp_set_features(struct net_device *dev, + const struct xdp_metadata_ops *xmo); +void libeth_xdp_set_redirect(struct net_device *dev, bool enable); + +/** + * libeth_xdp_set_features - set XDP features for netdev + * @dev: &net_device to configure + * @...: optional params, see __libeth_xdp_set_features() + * + * Set all the features libeth_xdp supports, including .ndo_xdp_xmit(). That + * said, it should be used only when XDPSQs are always available regardless + * of whether an XDP prog is attached to @dev. 
+ */ +#define libeth_xdp_set_features(dev, ...) \ + CONCATENATE(__libeth_xdp_feat, \ + COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__) + +#define __libeth_xdp_feat0(dev) \ + __libeth_xdp_set_features(dev, NULL) +#define __libeth_xdp_feat1(dev, xmo) \ + __libeth_xdp_set_features(dev, xmo) + +/** + * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir + * @dev: target &net_device + * @...: optional params, see __libeth_xdp_set_features() + * + * Enable everything except the .ndo_xdp_xmit() feature, use when XDPSQs are + * not available right after netdev registration. + */ +#define libeth_xdp_set_features_noredir(dev, ...) \ + __libeth_xdp_set_features_noredir(dev, __UNIQUE_ID(dev_), \ + ##__VA_ARGS__) + +#define __libeth_xdp_set_features_noredir(dev, ud, ...) do { \ + struct net_device *ud = (dev); \ + \ + libeth_xdp_set_features(ud, ##__VA_ARGS__); \ + libeth_xdp_set_redirect(ud, false); \ +} while (0) + #endif /* __LIBETH_XDP_H */ -- cgit v1.2.3 From b3ad8450b4dc46c4ab0641f665068fd2a4d1adba Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:30 +0200 Subject: libeth: xsk: add XSk XDP_TX sending helpers Add Xsk counterparts for XDP_TX buffer sending and completion. The same base structures and functions used from the libeth_xdp core, with adjustments to that XSk Rx always operates on &xdp_buff_xsk for both head and frags. And unlike regular Rx, here unlikely() are used for frags, as the header split gives no benefits for XSk Rx, at least for now. 
Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/Kconfig | 2 +- drivers/net/ethernet/intel/libeth/Makefile | 1 + drivers/net/ethernet/intel/libeth/priv.h | 6 ++ drivers/net/ethernet/intel/libeth/tx.c | 5 +- drivers/net/ethernet/intel/libeth/xdp.c | 7 +- drivers/net/ethernet/intel/libeth/xsk.c | 34 +++++++ include/net/libeth/tx.h | 6 ++ include/net/libeth/xdp.h | 26 ++++- include/net/libeth/xsk.h | 148 +++++++++++++++++++++++++++++ 9 files changed, 226 insertions(+), 9 deletions(-) create mode 100644 drivers/net/ethernet/intel/libeth/xsk.c create mode 100644 include/net/libeth/xsk.h (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/Kconfig b/drivers/net/ethernet/intel/libeth/Kconfig index d8c4926574fb..2445b979c499 100644 --- a/drivers/net/ethernet/intel/libeth/Kconfig +++ b/drivers/net/ethernet/intel/libeth/Kconfig @@ -12,4 +12,4 @@ config LIBETH_XDP tristate "Common XDP library (libeth_xdp)" if COMPILE_TEST select LIBETH help - XDP helpers based on libeth hotpath management. + XDP and XSk helpers based on libeth hotpath management. 
diff --git a/drivers/net/ethernet/intel/libeth/Makefile b/drivers/net/ethernet/intel/libeth/Makefile index 51669840ee06..350bc0b38bad 100644 --- a/drivers/net/ethernet/intel/libeth/Makefile +++ b/drivers/net/ethernet/intel/libeth/Makefile @@ -9,3 +9,4 @@ libeth-y += tx.o obj-$(CONFIG_LIBETH_XDP) += libeth_xdp.o libeth_xdp-y += xdp.o +libeth_xdp-y += xsk.o diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h index 1bd6e2d7a3e7..ebcb26f24401 100644 --- a/drivers/net/ethernet/intel/libeth/priv.h +++ b/drivers/net/ethernet/intel/libeth/priv.h @@ -8,12 +8,18 @@ /* XDP */ +struct libeth_xdp_buff; +struct libeth_xdp_tx_frame; struct skb_shared_info; struct xdp_frame_bulk; +void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count); + struct libeth_xdp_ops { void (*bulk)(const struct skb_shared_info *sinfo, struct xdp_frame_bulk *bq, bool frags); + void (*xsk)(struct libeth_xdp_buff *xdp); }; void libeth_attach_xdp(const struct libeth_xdp_ops *ops); diff --git a/drivers/net/ethernet/intel/libeth/tx.c b/drivers/net/ethernet/intel/libeth/tx.c index 227c841ab16a..e0167f43d2a8 100644 --- a/drivers/net/ethernet/intel/libeth/tx.c +++ b/drivers/net/ethernet/intel/libeth/tx.c @@ -10,6 +10,7 @@ /* Tx buffer completion */ DEFINE_STATIC_CALL_NULL(bulk, libeth_xdp_return_buff_bulk); +DEFINE_STATIC_CALL_NULL(xsk, libeth_xsk_buff_free_slow); /** * libeth_tx_complete_any - perform Tx completion for one SQE of any type @@ -23,7 +24,8 @@ DEFINE_STATIC_CALL_NULL(bulk, libeth_xdp_return_buff_bulk); void libeth_tx_complete_any(struct libeth_sqe *sqe, struct libeth_cq_pp *cp) { if (sqe->type >= __LIBETH_SQE_XDP_START) - __libeth_xdp_complete_tx(sqe, cp, static_call(bulk)); + __libeth_xdp_complete_tx(sqe, cp, static_call(bulk), + static_call(xsk)); else libeth_tx_complete(sqe, cp); } @@ -34,5 +36,6 @@ EXPORT_SYMBOL_GPL(libeth_tx_complete_any); void libeth_attach_xdp(const struct libeth_xdp_ops *ops) { static_call_update(bulk, 
ops ? ops->bulk : NULL); + static_call_update(xsk, ops ? ops->xsk : NULL); } EXPORT_SYMBOL_GPL(libeth_attach_xdp); diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index 4eb0f3c6cdab..bd334d314a1d 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -114,7 +114,7 @@ static void __cold libeth_trace_xdp_exception(const struct net_device *dev, * libeth_xdp_tx_exception - handle Tx exceptions of XDP frames * @bq: XDP Tx frame bulk * @sent: number of frames sent successfully (from this bulk) - * @flags: internal libeth_xdp flags (.ndo_xdp_xmit etc.) + * @flags: internal libeth_xdp flags (XSk, .ndo_xdp_xmit etc.) * * Cold helper used by __libeth_xdp_tx_flush_bulk(), do not call directly. * Reports XDP Tx exceptions, frees the frames that won't be sent or adjust @@ -136,7 +136,9 @@ void __cold libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, return; } - if (!(flags & LIBETH_XDP_TX_NDO)) + if (flags & LIBETH_XDP_TX_XSK) + libeth_xsk_tx_return_bulk(pos, left); + else if (!(flags & LIBETH_XDP_TX_NDO)) libeth_xdp_tx_return_bulk(pos, left); else libeth_xdp_xmit_return_bulk(pos, left, bq->dev); @@ -413,6 +415,7 @@ EXPORT_SYMBOL_GPL(libeth_xdp_set_redirect); static const struct libeth_xdp_ops xdp_ops __initconst = { .bulk = libeth_xdp_return_buff_bulk, + .xsk = libeth_xsk_buff_free_slow, }; static int __init libeth_xdp_module_init(void) diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c new file mode 100644 index 000000000000..fba6d7a025b0 --- /dev/null +++ b/drivers/net/ethernet/intel/libeth/xsk.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2025 Intel Corporation */ + +#define DEFAULT_SYMBOL_NAMESPACE "LIBETH_XDP" + +#include + +#include + +#include "priv.h" + +/* ``XDP_TX`` bulking */ + +void __cold libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, + u32 count) +{ + for (u32 i = 
0; i < count; i++) + libeth_xsk_buff_free_slow(bq[i].xsk); +} + +/* Rx polling path */ + +/** + * libeth_xsk_buff_free_slow - free an XSk Rx buffer + * @xdp: buffer to free + * + * Slowpath version of xsk_buff_free() to be used on exceptions, cleanups etc. + * to avoid unwanted inlining. + */ +void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp) +{ + xsk_buff_free(&xdp->base); +} +EXPORT_SYMBOL_GPL(libeth_xsk_buff_free_slow); diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index 33b9bb22f6ac..44192bec86d7 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -21,6 +21,8 @@ * @LIBETH_SQE_XDP_TX: &skb_shared_info, libeth_xdp_return_buff_bulk(), stats * @LIBETH_SQE_XDP_XMIT: &xdp_frame, unmap and xdp_return_frame_bulk(), stats * @LIBETH_SQE_XDP_XMIT_FRAG: &xdp_frame frag, only unmap DMA + * @LIBETH_SQE_XSK_TX: &libeth_xdp_buff on XSk queue, xsk_buff_free(), stats + * @LIBETH_SQE_XSK_TX_FRAG: &libeth_xdp_buff frag on XSk queue, xsk_buff_free() */ enum libeth_sqe_type { LIBETH_SQE_EMPTY = 0U, @@ -33,6 +35,8 @@ enum libeth_sqe_type { LIBETH_SQE_XDP_TX = __LIBETH_SQE_XDP_START, LIBETH_SQE_XDP_XMIT, LIBETH_SQE_XDP_XMIT_FRAG, + LIBETH_SQE_XSK_TX, + LIBETH_SQE_XSK_TX_FRAG, }; /** @@ -43,6 +47,7 @@ enum libeth_sqe_type { * @skb: &sk_buff to consume * @sinfo: skb shared info of an XDP_TX frame * @xdpf: XDP frame from ::ndo_xdp_xmit() + * @xsk: XSk Rx frame from XDP_TX action * @dma: DMA address to unmap * @len: length of the mapped region to unmap * @nr_frags: number of frags in the frame this buffer belongs to @@ -59,6 +64,7 @@ struct libeth_sqe { struct sk_buff *skb; struct skb_shared_info *sinfo; struct xdp_frame *xdpf; + struct libeth_xdp_buff *xsk; }; DEFINE_DMA_UNMAP_ADDR(dma); diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index c36b2ca0d04c..ab907f36a35b 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -279,6 +279,7 @@ libeth_xdpsq_run_timer(struct work_struct *work, * @LIBETH_XDP_TX_BATCH: 
batch size for which the queue fill loop is unrolled * @LIBETH_XDP_TX_DROP: indicates the send function must drop frames not sent * @LIBETH_XDP_TX_NDO: whether the send function is called from .ndo_xdp_xmit() + * @LIBETH_XDP_TX_XSK: whether the function is called for ``XDP_TX`` for XSk */ enum { LIBETH_XDP_TX_BULK = DEV_MAP_BULK_SIZE, @@ -286,6 +287,7 @@ enum { LIBETH_XDP_TX_DROP = BIT(0), LIBETH_XDP_TX_NDO = BIT(1), + LIBETH_XDP_TX_XSK = BIT(2), }; /** @@ -314,7 +316,8 @@ enum { * @frag: one (non-head) frag for ``XDP_TX`` * @xdpf: &xdp_frame for the head frag for .ndo_xdp_xmit() * @dma: DMA address of the non-head frag for .ndo_xdp_xmit() - * @len: frag length for .ndo_xdp_xmit() + * @xsk: ``XDP_TX`` for XSk, XDP buffer for any frag + * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit() * @flags: Tx flags for the above * @opts: combined @len + @flags for the above for speed */ @@ -330,11 +333,13 @@ struct libeth_xdp_tx_frame { /* ``XDP_TX`` frag */ skb_frag_t frag; - /* .ndo_xdp_xmit() */ + /* .ndo_xdp_xmit(), XSk ``XDP_TX`` */ struct { union { struct xdp_frame *xdpf; dma_addr_t dma; + + struct libeth_xdp_buff *xsk; }; union { struct { @@ -386,6 +391,7 @@ struct libeth_xdp_tx_bulk { /** * struct libeth_xdpsq - abstraction for an XDPSQ + * @pool: XSk buffer pool for XSk ``XDP_TX`` * @sqes: array of Tx buffers from the actual queue struct * @descs: opaque pointer to the HW descriptor array * @ntu: pointer to the next free descriptor index @@ -399,6 +405,7 @@ struct libeth_xdp_tx_bulk { * functions can access and modify driver-specific resources. */ struct libeth_xdpsq { + struct xsk_buff_pool *pool; struct libeth_sqe *sqes; void *descs; @@ -697,7 +704,7 @@ void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, /** * __libeth_xdp_tx_flush_bulk - internal helper to flush one XDP Tx bulk * @bq: bulk to flush - * @flags: XDP TX flags (.ndo_xdp_xmit() etc.) + * @flags: XDP TX flags (.ndo_xdp_xmit(), XSk etc.) 
* @prep: driver-specific callback to prepare the queue for sending * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc * @xmit: driver callback to fill a HW descriptor @@ -1680,12 +1687,14 @@ static inline int libeth_xdpmo_rx_hash(u32 *hash, void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, struct xdp_frame_bulk *bq, bool frags); +void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp); /** * __libeth_xdp_complete_tx - complete sent XDPSQE * @sqe: SQ element / Tx buffer to complete * @cp: Tx polling/completion params * @bulk: internal callback to bulk-free ``XDP_TX`` buffers + * @xsk: internal callback to free XSk ``XDP_TX`` buffers * * Use the non-underscored version in drivers instead. This one is shared * internally with libeth_tx_complete_any(). @@ -1694,7 +1703,8 @@ void libeth_xdp_return_buff_bulk(const struct skb_shared_info *sinfo, */ static __always_inline void __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, - typeof(libeth_xdp_return_buff_bulk) bulk) + typeof(libeth_xdp_return_buff_bulk) bulk, + typeof(libeth_xsk_buff_free_slow) xsk) { enum libeth_sqe_type type = sqe->type; @@ -1717,6 +1727,10 @@ __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, case LIBETH_SQE_XDP_XMIT: xdp_return_frame_bulk(sqe->xdpf, cp->bq); break; + case LIBETH_SQE_XSK_TX: + case LIBETH_SQE_XSK_TX_FRAG: + xsk(sqe->xsk); + break; default: break; } @@ -1724,6 +1738,7 @@ __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, switch (type) { case LIBETH_SQE_XDP_TX: case LIBETH_SQE_XDP_XMIT: + case LIBETH_SQE_XSK_TX: cp->xdp_tx -= sqe->nr_frags; cp->xss->packets++; @@ -1739,7 +1754,8 @@ __libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp, static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, struct libeth_cq_pp *cp) { - __libeth_xdp_complete_tx(sqe, cp, libeth_xdp_return_buff_bulk); + __libeth_xdp_complete_tx(sqe, cp, 
libeth_xdp_return_buff_bulk, + libeth_xsk_buff_free_slow); } /* Misc */ diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h new file mode 100644 index 000000000000..af69b46fa7e4 --- /dev/null +++ b/include/net/libeth/xsk.h @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (C) 2025 Intel Corporation */ + +#ifndef __LIBETH_XSK_H +#define __LIBETH_XSK_H + +#include +#include + +/* ``XDP_TX`` bulking */ + +/** + * libeth_xsk_tx_queue_head - internal helper for queueing XSk ``XDP_TX`` head + * @bq: XDP Tx bulk to queue the head frag to + * @xdp: XSk buffer with the head to queue + * + * Return: false if it's the only frag of the frame, true if it's an S/G frame. + */ +static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = xdp, + .len = xdp->base.data_end - xdp->data, + .flags = LIBETH_XDP_TX_FIRST, + }; + + if (likely(!xdp_buff_has_frags(&xdp->base))) + return false; + + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_MULTI; + + return true; +} + +/** + * libeth_xsk_tx_queue_frag - internal helper for queueing XSk ``XDP_TX`` frag + * @bq: XDP Tx bulk to queue the frag to + * @frag: XSk frag to queue + */ +static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *frag) +{ + bq->bulk[bq->count++] = (typeof(*bq->bulk)){ + .xsk = frag, + .len = frag->base.data_end - frag->data, + }; +} + +/** + * libeth_xsk_tx_queue_bulk - internal helper for queueing XSk ``XDP_TX`` frame + * @bq: XDP Tx bulk to queue the frame to + * @xdp: XSk buffer to queue + * @flush_bulk: driver callback to flush the bulk to the HW queue + * + * Return: true on success, false on flush error. 
+ */ +static __always_inline bool +libeth_xsk_tx_queue_bulk(struct libeth_xdp_tx_bulk *bq, + struct libeth_xdp_buff *xdp, + bool (*flush_bulk)(struct libeth_xdp_tx_bulk *bq, + u32 flags)) +{ + bool ret = true; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + libeth_xsk_buff_free_slow(xdp); + return false; + } + + if (!libeth_xsk_tx_queue_head(bq, xdp)) + goto out; + + for (const struct libeth_xdp_buff *head = xdp; ; ) { + xdp = container_of(xsk_buff_get_frag(&head->base), + typeof(*xdp), base); + if (!xdp) + break; + + if (unlikely(bq->count == LIBETH_XDP_TX_BULK) && + unlikely(!flush_bulk(bq, LIBETH_XDP_TX_XSK))) { + ret = false; + break; + } + + libeth_xsk_tx_queue_frag(bq, xdp); + } + +out: + bq->bulk[bq->count - 1].flags |= LIBETH_XDP_TX_LAST; + + return ret; +} + +/** + * libeth_xsk_tx_fill_buf - internal helper to fill XSk ``XDP_TX`` &libeth_sqe + * @frm: XDP Tx frame from the bulk + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: private data + * + * Return: XDP Tx descriptor with the synced DMA and other info to pass to + * the driver callback. 
+ */ +static inline struct libeth_xdp_tx_desc +libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_buff *xdp = frm.xsk; + struct libeth_xdp_tx_desc desc = { + .addr = xsk_buff_xdp_get_dma(&xdp->base), + .opts = frm.opts, + }; + struct libeth_sqe *sqe; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + sqe = &sq->sqes[i]; + sqe->xsk = xdp; + + if (!(desc.flags & LIBETH_XDP_TX_FIRST)) { + sqe->type = LIBETH_SQE_XSK_TX_FRAG; + return desc; + } + + sqe->type = LIBETH_SQE_XSK_TX; + libeth_xdp_tx_fill_stats(sqe, &desc, + xdp_get_shared_info_from_buff(&xdp->base)); + + return desc; +} + +/** + * libeth_xsk_tx_flush_bulk - wrapper to define flush of XSk ``XDP_TX`` bulk + * @bq: bulk to flush + * @flags: Tx flags, see __libeth_xdp_tx_flush_bulk() + * @prep: driver callback to prepare the queue + * @xmit: driver callback to fill a HW descriptor + * + * Use via LIBETH_XSK_DEFINE_FLUSH_TX() to define an XSk ``XDP_TX`` driver + * callback. + */ +#define libeth_xsk_tx_flush_bulk(bq, flags, prep, xmit) \ + __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \ + libeth_xsk_tx_fill_buf, xmit) + +#endif /* __LIBETH_XSK_H */ -- cgit v1.2.3 From 40e846d122df9b299e700ec86d01ef647fc0b09f Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:31 +0200 Subject: libeth: xsk: add XSk xmit functions Reuse core sending functions to send XSk xmit frames. Both metadata and no metadata pools/driver are supported. libeth_xdp also provides generic XSk metadata ops, currently with the checksum offload only and for cases when HW doesn't require supplying L3/L4 checksum offsets. Drivers are free to pass their own ops. &libeth_xdp_tx_bulk is not used here as it would be redundant; pool->tx_descs are accessed directly. 
Fake "libeth_xsktmo" is needed to hide implementation details from the drivers when they want to use the generic ops: the original struct is defined in the same file where dev->xsk_tx_metadata_ops gets set to avoid duplication of slowpath; at the same time; XSk xmit functions use local "fast" copy to inline XMO callbacks. Tx descriptor filling loop is unrolled by 8. Suggested-by: Maciej Fijalkowski # optimizations Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/priv.h | 2 + drivers/net/ethernet/intel/libeth/xdp.c | 14 ++- drivers/net/ethernet/intel/libeth/xsk.c | 6 ++ include/net/libeth/tx.h | 4 +- include/net/libeth/xdp.h | 73 +++++++++++--- include/net/libeth/xsk.h | 166 +++++++++++++++++++++++++++++++ 6 files changed, 248 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h index ebcb26f24401..03e74382b2cb 100644 --- a/drivers/net/ethernet/intel/libeth/priv.h +++ b/drivers/net/ethernet/intel/libeth/priv.h @@ -13,6 +13,8 @@ struct libeth_xdp_tx_frame; struct skb_shared_info; struct xdp_frame_bulk; +extern const struct xsk_tx_metadata_ops libeth_xsktmo_slow; + void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, u32 count); diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index bd334d314a1d..b5fb2ce92da8 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -376,21 +376,31 @@ EXPORT_SYMBOL_GPL(libeth_xdp_queue_threshold); * __libeth_xdp_set_features - set XDP features for netdev * @dev: &net_device to configure * @xmo: XDP metadata ops (Rx hints) + * @zc_segs: maximum number of S/G frags the HW can transmit + * @tmo: XSk Tx metadata ops (Tx hints) * * Set all the features libeth_xdp supports. Only the first argument is - * necessary. 
+ * necessary; without the third one (zero), XSk support won't be advertised. * Use the non-underscored versions in drivers instead. */ void __libeth_xdp_set_features(struct net_device *dev, - const struct xdp_metadata_ops *xmo) + const struct xdp_metadata_ops *xmo, + u32 zc_segs, + const struct xsk_tx_metadata_ops *tmo) { xdp_set_features_flag(dev, NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT | NETDEV_XDP_ACT_NDO_XMIT | + (zc_segs ? NETDEV_XDP_ACT_XSK_ZEROCOPY : 0) | NETDEV_XDP_ACT_RX_SG | NETDEV_XDP_ACT_NDO_XMIT_SG); dev->xdp_metadata_ops = xmo; + + tmo = tmo == libeth_xsktmo ? &libeth_xsktmo_slow : tmo; + + dev->xdp_zc_max_segs = zc_segs ? : 1; + dev->xsk_tx_metadata_ops = zc_segs ? tmo : NULL; } EXPORT_SYMBOL_GPL(__libeth_xdp_set_features); diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c index fba6d7a025b0..f09e1940183b 100644 --- a/drivers/net/ethernet/intel/libeth/xsk.c +++ b/drivers/net/ethernet/intel/libeth/xsk.c @@ -18,6 +18,12 @@ void __cold libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, libeth_xsk_buff_free_slow(bq[i].xsk); } +/* XSk TMO */ + +const struct xsk_tx_metadata_ops libeth_xsktmo_slow = { + .tmo_request_checksum = libeth_xsktmo_req_csum, +}; + /* Rx polling path */ /** diff --git a/include/net/libeth/tx.h b/include/net/libeth/tx.h index 44192bec86d7..c3db5c6f1641 100644 --- a/include/net/libeth/tx.h +++ b/include/net/libeth/tx.h @@ -12,7 +12,7 @@ /** * enum libeth_sqe_type - type of &libeth_sqe to act on Tx completion - * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX frag, no action required + * @LIBETH_SQE_EMPTY: unused/empty OR XDP_TX/XSk frame, no action required * @LIBETH_SQE_CTX: context descriptor with empty SQE, no action required * @LIBETH_SQE_SLAB: kmalloc-allocated buffer, unmap and kfree() * @LIBETH_SQE_FRAG: mapped skb frag, only unmap DMA @@ -93,7 +93,7 @@ struct libeth_sqe { * @bq: XDP frame bulk to combine return operations * @ss: onstack NAPI stats to fill * @xss: 
onstack XDPSQ NAPI stats to fill - * @xdp_tx: number of XDP frames processed + * @xdp_tx: number of XDP-not-XSk frames processed * @napi: whether it's called from the NAPI context * * libeth uses this structure to access objects needed for performing full diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index ab907f36a35b..c3655458047d 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -293,6 +293,8 @@ enum { /** * enum - &libeth_xdp_tx_frame and &libeth_xdp_tx_desc flags * @LIBETH_XDP_TX_LEN: only for ``XDP_TX``, [15:0] of ::len_fl is actual length + * @LIBETH_XDP_TX_CSUM: for XSk xmit, enable checksum offload + * @LIBETH_XDP_TX_XSKMD: for XSk xmit, mask of the metadata bits * @LIBETH_XDP_TX_FIRST: indicates the frag is the first one of the frame * @LIBETH_XDP_TX_LAST: whether the frag is the last one of the frame * @LIBETH_XDP_TX_MULTI: whether the frame contains several frags @@ -301,6 +303,9 @@ enum { enum { LIBETH_XDP_TX_LEN = GENMASK(15, 0), + LIBETH_XDP_TX_CSUM = XDP_TXMD_FLAGS_CHECKSUM, + LIBETH_XDP_TX_XSKMD = LIBETH_XDP_TX_LEN, + LIBETH_XDP_TX_FIRST = BIT(16), LIBETH_XDP_TX_LAST = BIT(17), LIBETH_XDP_TX_MULTI = BIT(18), @@ -320,6 +325,7 @@ enum { * @len: frag length for XSk ``XDP_TX`` and .ndo_xdp_xmit() * @flags: Tx flags for the above * @opts: combined @len + @flags for the above for speed + * @desc: XSk xmit descriptor for direct casting */ struct libeth_xdp_tx_frame { union { @@ -349,10 +355,14 @@ struct libeth_xdp_tx_frame { aligned_u64 opts; }; }; + + /* XSk xmit */ + struct xdp_desc desc; }; -} __aligned_largest; +} __aligned(sizeof(struct xdp_desc)); static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == offsetof(struct libeth_xdp_tx_frame, len_fl)); +static_assert(sizeof(struct libeth_xdp_tx_frame) == sizeof(struct xdp_desc)); /** * struct libeth_xdp_tx_bulk - XDP Tx frame bulk for bulk sending @@ -363,10 +373,13 @@ static_assert(offsetof(struct libeth_xdp_tx_frame, frag.len) == * @count: current 
number of frames in @bulk * @bulk: array of queued frames for bulk Tx * - * All XDP Tx operations queue each frame to the bulk first and flush it - * when @count reaches the array end. Bulk is always placed on the stack - * for performance. One bulk element contains all the data necessary + * All XDP Tx operations except XSk xmit queue each frame to the bulk first + * and flush it when @count reaches the array end. Bulk is always placed on + * the stack for performance. One bulk element contains all the data necessary * for sending a frame and then freeing it on completion. + * For XSk xmit, Tx descriptor array from &xsk_buff_pool is casted directly + * to &libeth_xdp_tx_frame as they are compatible and the bulk structure is + * not used. */ struct libeth_xdp_tx_bulk { const struct bpf_prog *prog; @@ -391,13 +404,13 @@ struct libeth_xdp_tx_bulk { /** * struct libeth_xdpsq - abstraction for an XDPSQ - * @pool: XSk buffer pool for XSk ``XDP_TX`` + * @pool: XSk buffer pool for XSk ``XDP_TX`` and xmit * @sqes: array of Tx buffers from the actual queue struct * @descs: opaque pointer to the HW descriptor array * @ntu: pointer to the next free descriptor index * @count: number of descriptors on that queue * @pending: pointer to the number of sent-not-completed descs on that queue - * @xdp_tx: pointer to the above + * @xdp_tx: pointer to the above, but only for non-XSk-xmit frames * @lock: corresponding XDPSQ lock * * Abstraction for driver-independent implementation of Tx. Placed on the stack @@ -438,6 +451,30 @@ struct libeth_xdp_tx_desc { }; } __aligned_largest; +/** + * libeth_xdp_ptr_to_priv - convert pointer to a libeth_xdp u64 priv + * @ptr: pointer to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when you want to pass a pointer there. 
+ */ +#define libeth_xdp_ptr_to_priv(ptr) ({ \ + typecheck_pointer(ptr); \ + ((u64)(uintptr_t)(ptr)); \ +}) +/** + * libeth_xdp_priv_to_ptr - convert libeth_xdp u64 priv to a pointer + * @priv: private data to convert + * + * The main sending function passes private data as the largest scalar, u64. + * Use this helper when your callback takes this u64 and you want to convert + * it back to a pointer. + */ +#define libeth_xdp_priv_to_ptr(priv) ({ \ + static_assert(__same_type(priv, u64)); \ + ((const void *)(uintptr_t)(priv)); \ +}) + /** * libeth_xdp_tx_xmit_bulk - main XDP Tx function * @bulk: array of frames to send @@ -450,10 +487,11 @@ struct libeth_xdp_tx_desc { * @xmit: callback for filling a HW descriptor with the frame info * * Internal abstraction for placing @n XDP Tx frames on the HW XDPSQ. Used for - * all types of frames. + * all types of frames: ``XDP_TX``, .ndo_xdp_xmit(), XSk ``XDP_TX``, and XSk + * xmit. * @prep must lock the queue as this function releases it at the end. @unroll - * greatly increases the object code size, but also greatly increases - * performance. + * greatly increases the object code size, but also greatly increases XSk xmit + * performance; for other types of frames, it's not enabled. * The compilers inline all those onstack abstractions to direct data accesses. * * Return: number of frames actually placed on the queue, <= @n. The function @@ -709,7 +747,8 @@ void libeth_xdp_tx_exception(struct libeth_xdp_tx_bulk *bq, u32 sent, * @fill: libeth_xdp callback to fill &libeth_sqe and &libeth_xdp_tx_desc * @xmit: driver callback to fill a HW descriptor * - * Internal abstraction to create bulk flush functions for drivers. + * Internal abstraction to create bulk flush functions for drivers. Used for + * everything except XSk xmit. * * Return: true if anything was sent, false otherwise. 
*/ @@ -1763,7 +1802,9 @@ static inline void libeth_xdp_complete_tx(struct libeth_sqe *sqe, u32 libeth_xdp_queue_threshold(u32 count); void __libeth_xdp_set_features(struct net_device *dev, - const struct xdp_metadata_ops *xmo); + const struct xdp_metadata_ops *xmo, + u32 zc_segs, + const struct xsk_tx_metadata_ops *tmo); void libeth_xdp_set_redirect(struct net_device *dev, bool enable); /** @@ -1780,9 +1821,13 @@ void libeth_xdp_set_redirect(struct net_device *dev, bool enable); COUNT_ARGS(__VA_ARGS__))(dev, ##__VA_ARGS__) #define __libeth_xdp_feat0(dev) \ - __libeth_xdp_set_features(dev, NULL) + __libeth_xdp_set_features(dev, NULL, 0, NULL) #define __libeth_xdp_feat1(dev, xmo) \ - __libeth_xdp_set_features(dev, xmo) + __libeth_xdp_set_features(dev, xmo, 0, NULL) +#define __libeth_xdp_feat2(dev, xmo, zc_segs) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, NULL) +#define __libeth_xdp_feat3(dev, xmo, zc_segs, tmo) \ + __libeth_xdp_set_features(dev, xmo, zc_segs, tmo) /** * libeth_xdp_set_features_noredir - enable all libeth_xdp features w/o redir @@ -1803,4 +1848,6 @@ void libeth_xdp_set_redirect(struct net_device *dev, bool enable); libeth_xdp_set_redirect(ud, false); \ } while (0) +#define libeth_xsktmo ((const void *)GOLDEN_RATIO_PRIME) + #endif /* __LIBETH_XDP_H */ diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index af69b46fa7e4..16ca195981fe 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -7,6 +7,11 @@ #include #include +/* ``XDP_TXMD_FLAGS_VALID`` is defined only under ``CONFIG_XDP_SOCKETS`` */ +#ifdef XDP_TXMD_FLAGS_VALID +static_assert(XDP_TXMD_FLAGS_VALID <= LIBETH_XDP_TX_XSKMD); +#endif + /* ``XDP_TX`` bulking */ /** @@ -145,4 +150,165 @@ libeth_xsk_tx_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, __libeth_xdp_tx_flush_bulk(bq, (flags) | LIBETH_XDP_TX_XSK, prep, \ libeth_xsk_tx_fill_buf, xmit) +/* XSk TMO */ + +/** + * libeth_xsktmo_req_csum - XSk Tx metadata op to request checksum offload + * @csum_start: 
unused + * @csum_offset: unused + * @priv: &libeth_xdp_tx_desc from the filling helper + * + * Generic implementation of ::tmo_request_checksum. Works only when HW doesn't + * require filling checksum offsets and other parameters beside the checksum + * request bit. + * Consider using within @libeth_xsktmo unless the driver requires HW-specific + * callbacks. + */ +static inline void libeth_xsktmo_req_csum(u16 csum_start, u16 csum_offset, + void *priv) +{ + ((struct libeth_xdp_tx_desc *)priv)->flags |= LIBETH_XDP_TX_CSUM; +} + +/* Only to inline the callbacks below, use @libeth_xsktmo in drivers instead */ +static const struct xsk_tx_metadata_ops __libeth_xsktmo = { + .tmo_request_checksum = libeth_xsktmo_req_csum, +}; + +/** + * __libeth_xsk_xmit_fill_buf_md - internal helper to prepare XSk xmit w/meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Same as __libeth_xsk_xmit_fill_buf(), but requests metadata pointer and + * fills additional fields in &libeth_xdp_tx_desc to ask for metadata offload. + * + * Return: XDP Tx descriptor with the DMA, metadata request bits, and other + * info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq, + u64 priv) +{ + const struct xsk_tx_metadata_ops *tmo = libeth_xdp_priv_to_ptr(priv); + struct libeth_xdp_tx_desc desc; + struct xdp_desc_ctx ctx; + + ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); + desc = (typeof(desc)){ + .addr = ctx.dma, + .len = xdesc->len, + }; + + BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); + tmo = tmo == libeth_xsktmo ? 
&__libeth_xsktmo : tmo; + + xsk_tx_metadata_request(ctx.meta, tmo, &desc); + + return desc; +} + +/* XSk xmit implementation */ + +/** + * __libeth_xsk_xmit_fill_buf - internal helper to prepare XSk xmit w/o meta + * @xdesc: &xdp_desc from the XSk buffer pool + * @sq: XDPSQ abstraction for the queue + * + * Return: XDP Tx descriptor with the DMA and other info to pass to + * the driver callback. + */ +static inline struct libeth_xdp_tx_desc +__libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, + const struct libeth_xdpsq *sq) +{ + return (struct libeth_xdp_tx_desc){ + .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), + .len = xdesc->len, + }; +} + +/** + * libeth_xsk_xmit_fill_buf - internal helper to prepare an XSk xmit + * @frm: &xdp_desc from the XSk buffer pool + * @i: index on the HW queue + * @sq: XDPSQ abstraction for the queue + * @priv: XSk Tx metadata ops + * + * Depending on the metadata ops presence (determined at compile time), calls + * the quickest helper to build a libeth XDP Tx descriptor. + * + * Return: XDP Tx descriptor with the synced DMA, metadata request bits, + * and other info to pass to the driver callback. + */ +static __always_inline struct libeth_xdp_tx_desc +libeth_xsk_xmit_fill_buf(struct libeth_xdp_tx_frame frm, u32 i, + const struct libeth_xdpsq *sq, u64 priv) +{ + struct libeth_xdp_tx_desc desc; + + if (priv) + desc = __libeth_xsk_xmit_fill_buf_md(&frm.desc, sq, priv); + else + desc = __libeth_xsk_xmit_fill_buf(&frm.desc, sq); + + desc.flags |= xsk_is_eop_desc(&frm.desc) ? 
LIBETH_XDP_TX_LAST : 0; + + xsk_buff_raw_dma_sync_for_device(sq->pool, desc.addr, desc.len); + + return desc; +} + +/** + * libeth_xsk_xmit_do_bulk - send XSk xmit frames + * @pool: XSk buffer pool containing the frames to send + * @xdpsq: opaque pointer to driver's XDPSQ struct + * @budget: maximum number of frames that can be sent + * @tmo: optional XSk Tx metadata ops + * @prep: driver callback to build a &libeth_xdpsq + * @xmit: driver callback to put frames to a HW queue + * @finalize: driver callback to start a transmission + * + * Implements generic XSk xmit. Always turns on XSk Tx wakeup as it's assumed + * lazy cleaning is used and interrupts are disabled for the queue. + * HW descriptor filling is unrolled by ``LIBETH_XDP_TX_BATCH`` to optimize + * writes. + * Note that unlike other XDP Tx ops, the queue must be locked and cleaned + * prior to calling this function to already know available @budget. + * @prep must only build a &libeth_xdpsq and return ``U32_MAX``. + * + * Return: false if @budget was exhausted, true otherwise.
+ */ +static __always_inline bool +libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget, + const struct xsk_tx_metadata_ops *tmo, + u32 (*prep)(void *xdpsq, struct libeth_xdpsq *sq), + void (*xmit)(struct libeth_xdp_tx_desc desc, u32 i, + const struct libeth_xdpsq *sq, u64 priv), + void (*finalize)(void *xdpsq, bool sent, bool flush)) +{ + const struct libeth_xdp_tx_frame *bulk; + bool wake; + u32 n; + + wake = xsk_uses_need_wakeup(pool); + if (wake) + xsk_clear_tx_need_wakeup(pool); + + n = xsk_tx_peek_release_desc_batch(pool, budget); + bulk = container_of(&pool->tx_descs[0], typeof(*bulk), desc); + + libeth_xdp_tx_xmit_bulk(bulk, xdpsq, n, true, + libeth_xdp_ptr_to_priv(tmo), prep, + libeth_xsk_xmit_fill_buf, xmit); + finalize(xdpsq, n, true); + + if (wake) + xsk_set_tx_need_wakeup(pool); + + return n < budget; +} + #endif /* __LIBETH_XSK_H */ -- cgit v1.2.3 From 5495c58c65aa3d650cccaa19dc59115b9a0069a5 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:32 +0200 Subject: libeth: xsk: add XSk Rx processing support Add XSk counterparts for preparing XSk &libeth_xdp_buff (adding head and frags), running the program, and handling the verdict, inc. XDP_PASS. Shortcuts in comparison with regular Rx: frags and all verdicts except XDP_REDIRECT are under unlikely() and out of line; no checks for XDP program presence as it's always true for XSk. 
Suggested-by: Maciej Fijalkowski # optimizations Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/priv.h | 3 + drivers/net/ethernet/intel/libeth/xdp.c | 6 +- drivers/net/ethernet/intel/libeth/xsk.c | 107 ++++++++++++ include/net/libeth/xdp.h | 17 +- include/net/libeth/xsk.h | 273 +++++++++++++++++++++++++++++++ 5 files changed, 398 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/priv.h b/drivers/net/ethernet/intel/libeth/priv.h index 03e74382b2cb..9b811d31015c 100644 --- a/drivers/net/ethernet/intel/libeth/priv.h +++ b/drivers/net/ethernet/intel/libeth/priv.h @@ -8,6 +8,7 @@ /* XDP */ +enum xdp_action; struct libeth_xdp_buff; struct libeth_xdp_tx_frame; struct skb_shared_info; @@ -17,6 +18,8 @@ extern const struct xsk_tx_metadata_ops libeth_xsktmo_slow; void libeth_xsk_tx_return_bulk(const struct libeth_xdp_tx_frame *bq, u32 count); +u32 libeth_xsk_prog_exception(struct libeth_xdp_buff *xdp, enum xdp_action act, + int ret); struct libeth_xdp_ops { void (*bulk)(const struct skb_shared_info *sinfo, diff --git a/drivers/net/ethernet/intel/libeth/xdp.c b/drivers/net/ethernet/intel/libeth/xdp.c index b5fb2ce92da8..d4ac027d9584 100644 --- a/drivers/net/ethernet/intel/libeth/xdp.c +++ b/drivers/net/ethernet/intel/libeth/xdp.c @@ -286,7 +286,8 @@ EXPORT_SYMBOL_GPL(libeth_xdp_buff_add_frag); * @act: original XDP prog verdict * @ret: error code if redirect failed * - * External helper used by __libeth_xdp_run_prog(), do not call directly. + * External helper used by __libeth_xdp_run_prog() and + * __libeth_xsk_run_prog_slow(), do not call directly. * Reports invalid @act, XDP exception trace event and frees the buffer. * * Return: libeth_xdp XDP prog verdict. 
@@ -300,6 +301,9 @@ u32 __cold libeth_xdp_prog_exception(const struct libeth_xdp_tx_bulk *bq, libeth_trace_xdp_exception(bq->dev, bq->prog, act); + if (xdp->base.rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + return libeth_xsk_prog_exception(xdp, act, ret); + libeth_xdp_return_buff_slow(xdp); return LIBETH_XDP_DROP; diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c index f09e1940183b..f8f4016d1b25 100644 --- a/drivers/net/ethernet/intel/libeth/xsk.c +++ b/drivers/net/ethernet/intel/libeth/xsk.c @@ -38,3 +38,110 @@ void libeth_xsk_buff_free_slow(struct libeth_xdp_buff *xdp) xsk_buff_free(&xdp->base); } EXPORT_SYMBOL_GPL(libeth_xsk_buff_free_slow); + +/** + * libeth_xsk_buff_add_frag - add frag to XSk Rx buffer + * @head: head buffer + * @xdp: frag buffer + * + * External helper used by libeth_xsk_process_buff(), do not call directly. + * Frees both main and frag buffers on error. + * + * Return: main buffer with attached frag on success, %NULL on error (no space + * for a new frag). + */ +struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp) +{ + if (!xsk_buff_add_frag(&head->base, &xdp->base)) + goto free; + + return head; + +free: + libeth_xsk_buff_free_slow(xdp); + libeth_xsk_buff_free_slow(head); + + return NULL; +} +EXPORT_SYMBOL_GPL(libeth_xsk_buff_add_frag); + +/** + * libeth_xsk_buff_stats_frags - update onstack RQ stats with XSk frags info + * @rs: onstack stats to update + * @xdp: buffer to account + * + * External helper used by __libeth_xsk_run_pass(), do not call directly. + * Adds buffer's frags count and total len to the onstack stats. 
+ */ +void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs, + const struct libeth_xdp_buff *xdp) +{ + libeth_xdp_buff_stats_frags(rs, xdp); +} +EXPORT_SYMBOL_GPL(libeth_xsk_buff_stats_frags); + +/** + * __libeth_xsk_run_prog_slow - process the non-``XDP_REDIRECT`` verdicts + * @xdp: buffer to process + * @bq: Tx bulk for queueing on ``XDP_TX`` + * @act: verdict to process + * @ret: error code if ``XDP_REDIRECT`` failed + * + * External helper used by __libeth_xsk_run_prog(), do not call directly. + * ``XDP_REDIRECT`` is the most common and hottest verdict on XSk, thus + * it is processed inline. The rest goes here for out-of-line processing, + * together with redirect errors. + * + * Return: libeth_xdp XDP prog verdict. + */ +u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq, + enum xdp_action act, int ret) +{ + switch (act) { + case XDP_DROP: + xsk_buff_free(&xdp->base); + + return LIBETH_XDP_DROP; + case XDP_TX: + return LIBETH_XDP_TX; + case XDP_PASS: + return LIBETH_XDP_PASS; + default: + break; + } + + return libeth_xdp_prog_exception(bq, xdp, act, ret); +} +EXPORT_SYMBOL_GPL(__libeth_xsk_run_prog_slow); + +/** + * libeth_xsk_prog_exception - handle XDP prog exceptions on XSk + * @xdp: buffer to process + * @act: verdict returned by the prog + * @ret: error code if ``XDP_REDIRECT`` failed + * + * Internal. Frees the buffer and, if the queue uses XSk wakeups, stops the + * current NAPI poll when there are no free buffers left. + * + * Return: libeth_xdp's XDP prog verdict.
+ */ +u32 __cold libeth_xsk_prog_exception(struct libeth_xdp_buff *xdp, + enum xdp_action act, int ret) +{ + const struct xdp_buff_xsk *xsk; + u32 __ret = LIBETH_XDP_DROP; + + if (act != XDP_REDIRECT) + goto drop; + + xsk = container_of(&xdp->base, typeof(*xsk), xdp); + if (xsk_uses_need_wakeup(xsk->pool) && ret == -ENOBUFS) + __ret = LIBETH_XDP_ABORTED; + +drop: + libeth_xsk_buff_free_slow(xdp); + + return __ret; +} diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index c3655458047d..dba09a9168f1 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -1122,18 +1122,19 @@ __libeth_xdp_xmit_do_bulk(struct libeth_xdp_tx_bulk *bq, * Should be called on an onstack XDP Tx bulk before the NAPI polling loop. * Initializes all the needed fields to run libeth_xdp functions. If @num == 0, * assumes XDP is not enabled. + * Do not use for XSk, it has its own optimized helper. */ #define libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, false, \ __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) -#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, ub, un) do { \ +#define __libeth_xdp_tx_init_bulk(bq, pr, d, xdpsqs, num, xsk, ub, un) do { \ typeof(bq) ub = (bq); \ u32 un = (num); \ \ rcu_read_lock(); \ \ - if (un) { \ + if (un || (xsk)) { \ ub->prog = rcu_dereference(pr); \ ub->dev = (d); \ ub->xdpsq = (xdpsqs)[libeth_xdpsq_id(un)]; \ @@ -1159,6 +1160,7 @@ void __libeth_xdp_return_stash(struct libeth_xdp_buff_stash *stash); * * Should be called before the main NAPI polling loop. Loads the content of * the previously saved stash or initializes the buffer from scratch. + * Do not use for XSk. */ static inline void libeth_xdp_init_buff(struct libeth_xdp_buff *dst, @@ -1378,7 +1380,7 @@ out: * @flush_bulk: driver callback for flushing a bulk * * Internal inline abstraction to run XDP program and additionally handle - * ``XDP_TX`` verdict. + * ``XDP_TX`` verdict. 
Used by both XDP and XSk, hence @run and @queue. * Do not use directly. * * Return: libeth_xdp prog verdict depending on the prog's verdict. @@ -1408,12 +1410,13 @@ __libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, } /** - * libeth_xdp_run_prog - run XDP program and handle all verdicts + * libeth_xdp_run_prog - run XDP program (non-XSk path) and handle all verdicts * @xdp: XDP buffer to process * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers * @fl: driver ``XDP_TX`` bulk flush callback * - * Run the attached XDP program and handle all possible verdicts. + * Run the attached XDP program and handle all possible verdicts. XSk has its + * own version. * Prefer using it via LIBETH_XDP_DEFINE_RUN{,_PASS,_PROG}(). * * Return: true if the buffer should be passed up the stack, false if the poll @@ -1435,7 +1438,7 @@ __libeth_xdp_run_flush(struct libeth_xdp_buff *xdp, * @run: driver wrapper to run XDP program * @populate: driver callback to populate an skb with the HW descriptor data * - * Inline abstraction that does the following: + * Inline abstraction that does the following (non-XSk path): * 1) adds frame size and frag number (if needed) to the onstack stats; * 2) fills the descriptor metadata to the onstack &libeth_xdp_buff * 3) runs XDP program if present; @@ -1518,7 +1521,7 @@ static inline void libeth_xdp_prep_desc(struct libeth_xdp_buff *xdp, run, populate) /** - * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop + * libeth_xdp_finalize_rx - finalize XDPSQ after a NAPI polling loop (non-XSk) * @bq: ``XDP_TX`` frame bulk * @flush: driver callback to flush the bulk * @finalize: driver callback to start sending the frames and run the timer diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index 16ca195981fe..f3f338e566fc 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -311,4 +311,277 @@ libeth_xsk_xmit_do_bulk(struct xsk_buff_pool *pool, void *xdpsq, u32 budget, return n < budget; } +/* Rx polling path */ + +/** 
+ * libeth_xsk_tx_init_bulk - initialize XDP Tx bulk for an XSk Rx NAPI poll + * @bq: bulk to initialize + * @prog: RCU pointer to the XDP program (never %NULL) + * @dev: target &net_device + * @xdpsqs: array of driver XDPSQ structs + * @num: number of active XDPSQs, the above array length + * + * Should be called on an onstack XDP Tx bulk before the XSk NAPI polling loop. + * Initializes all the needed fields to run libeth_xdp functions. + * Never checks if @prog is %NULL or @num == 0 as XDP must always be enabled + * when hitting this path. + */ +#define libeth_xsk_tx_init_bulk(bq, prog, dev, xdpsqs, num) \ + __libeth_xdp_tx_init_bulk(bq, prog, dev, xdpsqs, num, true, \ + __UNIQUE_ID(bq_), __UNIQUE_ID(nqs_)) + +struct libeth_xdp_buff *libeth_xsk_buff_add_frag(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp); + +/** + * libeth_xsk_process_buff - attach XSk Rx buffer to &libeth_xdp_buff + * @head: head XSk buffer to attach the XSk buffer to (or %NULL) + * @xdp: XSk buffer to process + * @len: received data length from the descriptor + * + * If @head == %NULL, treats the XSk buffer as head and initializes + * the required fields. Otherwise, attaches the buffer as a frag. + * Already performs DMA sync-for-CPU and frame start prefetch + * (for head buffers only). + * + * Return: head XSk buffer on success or if the descriptor must be skipped + * (empty), %NULL if there is no space for a new frag. 
+ */ +static inline struct libeth_xdp_buff * +libeth_xsk_process_buff(struct libeth_xdp_buff *head, + struct libeth_xdp_buff *xdp, u32 len) +{ + if (unlikely(!len)) { + libeth_xsk_buff_free_slow(xdp); + return head; + } + + xsk_buff_set_size(&xdp->base, len); + xsk_buff_dma_sync_for_cpu(&xdp->base); + + if (head) + return libeth_xsk_buff_add_frag(head, xdp); + + prefetch(xdp->data); + + return xdp; +} + +void libeth_xsk_buff_stats_frags(struct libeth_rq_napi_stats *rs, + const struct libeth_xdp_buff *xdp); + +u32 __libeth_xsk_run_prog_slow(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq, + enum xdp_action act, int ret); + +/** + * __libeth_xsk_run_prog - run XDP program on XSk buffer + * @xdp: XSk buffer to run the prog on + * @bq: buffer bulk for ``XDP_TX`` queueing + * + * Internal inline abstraction to run XDP program on XSk Rx path. Handles + * only the most common ``XDP_REDIRECT`` inline, the rest is processed + * externally. + * Reports an XDP prog exception on errors. + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. + */ +static __always_inline u32 +__libeth_xsk_run_prog(struct libeth_xdp_buff *xdp, + const struct libeth_xdp_tx_bulk *bq) +{ + enum xdp_action act; + int ret = 0; + + act = bpf_prog_run_xdp(bq->prog, &xdp->base); + if (unlikely(act != XDP_REDIRECT)) +rest: + return __libeth_xsk_run_prog_slow(xdp, bq, act, ret); + + ret = xdp_do_redirect(bq->dev, &xdp->base, bq->prog); + if (unlikely(ret)) + goto rest; + + return LIBETH_XDP_REDIRECT; +} + +/** + * libeth_xsk_run_prog - run XDP program on XSk path and handle all verdicts + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` buffers + * @fl: driver ``XDP_TX`` bulk flush callback + * + * Run the attached XDP program and handle all possible verdicts. + * Prefer using it via LIBETH_XSK_DEFINE_RUN{,_PASS,_PROG}(). + * + * Return: libeth_xdp prog verdict depending on the prog's verdict. 
+ */ +#define libeth_xsk_run_prog(xdp, bq, fl) \ + __libeth_xdp_run_flush(xdp, bq, __libeth_xsk_run_prog, \ + libeth_xsk_tx_queue_bulk, fl) + +/** + * __libeth_xsk_run_pass - helper to run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @md: metadata that should be filled to the XSk buffer + * @prep: callback for filling the metadata + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Inline abstraction, XSk's counterpart of __libeth_xdp_run_pass(), see its + * doc for details. + * + * Return: false if the polling loop must be exited due to lack of free + * buffers, true otherwise. + */ +static __always_inline bool +__libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq, struct napi_struct *napi, + struct libeth_rq_napi_stats *rs, const void *md, + void (*prep)(struct libeth_xdp_buff *xdp, + const void *md), + u32 (*run)(struct libeth_xdp_buff *xdp, + struct libeth_xdp_tx_bulk *bq), + bool (*populate)(struct sk_buff *skb, + const struct libeth_xdp_buff *xdp, + struct libeth_rq_napi_stats *rs)) +{ + struct sk_buff *skb; + u32 act; + + rs->bytes += xdp->base.data_end - xdp->data; + rs->packets++; + + if (unlikely(xdp_buff_has_frags(&xdp->base))) + libeth_xsk_buff_stats_frags(rs, xdp); + + if (prep && (!__builtin_constant_p(!!md) || md)) + prep(xdp, md); + + act = run(xdp, bq); + if (likely(act == LIBETH_XDP_REDIRECT)) + return true; + + if (act != LIBETH_XDP_PASS) + return act != LIBETH_XDP_ABORTED; + + skb = xdp_build_skb_from_zc(&xdp->base); + if (unlikely(!skb)) { + libeth_xsk_buff_free_slow(xdp); + return true; + } + + if (unlikely(!populate(skb, xdp, rs))) { + napi_consume_skb(skb, true); + return true; + } + + napi_gro_receive(napi, skb); + + return true; +} + +/** + * libeth_xsk_run_pass - helper to 
run XDP program and handle the result + * @xdp: XSk buffer to process + * @bq: XDP Tx bulk to queue ``XDP_TX`` frames + * @napi: NAPI to build an skb and pass it up the stack + * @rs: onstack libeth RQ stats + * @desc: pointer to the HW descriptor for that frame + * @run: driver wrapper to run XDP program + * @populate: driver callback to populate an skb with the HW descriptor data + * + * Wrapper around the underscored version when "fill the descriptor metadata" + * means just writing the pointer to the HW descriptor as @xdp->desc. + */ +#define libeth_xsk_run_pass(xdp, bq, napi, rs, desc, run, populate) \ + __libeth_xsk_run_pass(xdp, bq, napi, rs, desc, libeth_xdp_prep_desc, \ + run, populate) + +/** + * libeth_xsk_finalize_rx - finalize XDPSQ after an XSk NAPI polling loop + * @bq: ``XDP_TX`` frame bulk + * @flush: driver callback to flush the bulk + * @finalize: driver callback to start sending the frames and run the timer + * + * Flush the bulk if there are frames left to send, kick the queue and flush + * the XDP maps. + */ +#define libeth_xsk_finalize_rx(bq, flush, finalize) \ + __libeth_xdp_finalize_rx(bq, LIBETH_XDP_TX_XSK, flush, finalize) + +/* + * Helpers to reduce boilerplate code in drivers. + * + * Typical driver XSk Rx flow would be (excl. bulk and buff init, frag attach): + * + * LIBETH_XDP_DEFINE_START(); + * LIBETH_XSK_DEFINE_FLUSH_TX(static driver_xsk_flush_tx, driver_xsk_tx_prep, + * driver_xdp_xmit); + * LIBETH_XSK_DEFINE_RUN(static driver_xsk_run, driver_xsk_run_prog, + * driver_xsk_flush_tx, driver_populate_skb); + * LIBETH_XSK_DEFINE_FINALIZE(static driver_xsk_finalize_rx, + * driver_xsk_flush_tx, driver_xdp_finalize_sq); + * LIBETH_XDP_DEFINE_END(); + * + * This will build a set of 4 static functions. The compiler is free to decide + * whether to inline them. + * Then, in the NAPI polling function: + * + * while (packets < budget) { + * // ... 
+ * if (!driver_xsk_run(xdp, &bq, napi, &rs, desc)) + * break; + * } + * driver_xsk_finalize_rx(&bq); + */ + +/** + * LIBETH_XSK_DEFINE_FLUSH_TX - define a driver XSk ``XDP_TX`` flush function + * @name: name of the function to define + * @prep: driver callback to clean an XDPSQ + * @xmit: driver callback to write a HW Tx descriptor + */ +#define LIBETH_XSK_DEFINE_FLUSH_TX(name, prep, xmit) \ + __LIBETH_XDP_DEFINE_FLUSH_TX(name, prep, xmit, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PROG - define a driver XDP program run function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + */ +#define LIBETH_XSK_DEFINE_RUN_PROG(name, flush) \ + u32 __LIBETH_XDP_DEFINE_RUN_PROG(name, flush, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN_PASS - define a driver buffer process + pass function + * @name: name of the function to define + * @run: driver callback to run XDP program (above) + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN_PASS(name, run, populate) \ + bool __LIBETH_XDP_DEFINE_RUN_PASS(name, run, populate, xsk) + +/** + * LIBETH_XSK_DEFINE_RUN - define a driver buffer process, run + pass function + * @name: name of the function to define + * @run: name of the XDP prog run function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @populate: driver callback to fill an skb with HW descriptor info + */ +#define LIBETH_XSK_DEFINE_RUN(name, run, flush, populate) \ + __LIBETH_XDP_DEFINE_RUN(name, run, flush, populate, XSK) + +/** + * LIBETH_XSK_DEFINE_FINALIZE - define a driver XSk NAPI poll finalize function + * @name: name of the function to define + * @flush: driver callback to flush an XSk ``XDP_TX`` bulk + * @finalize: driver callback to finalize an XDPSQ and run the timer + */ +#define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \ + __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk) + #endif /* __LIBETH_XSK_H */ -- cgit v1.2.3 From 
3ced71a8b39e84f91a4fa9d42e85815515f9b1bc Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:33 +0200 Subject: libeth: xsk: add XSkFQ refill and XSk wakeup helpers XSkFQ refill is pretty generic across the drivers minus FQ descriptor filling and can easily be unified with one inline callback. XSk wakeup is usually not, but here, instead of commonly used "SW interrupts", I picked firing an IPI. In most tests, it showed better performance; it also provides better control for userspace on which CPU will handle the xmit, as SW interrupts honor IRQ affinity no matter which core produces XSk xmit descs (while XDPSQs are associated 1:1 with cores having the same ID). Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/libeth/xsk.c | 124 ++++++++++++++++++++++++++++++++ include/net/libeth/xsk.h | 98 +++++++++++++++++++++++++ 2 files changed, 222 insertions(+) (limited to 'include/net') diff --git a/drivers/net/ethernet/intel/libeth/xsk.c b/drivers/net/ethernet/intel/libeth/xsk.c index f8f4016d1b25..846e902e31b6 100644 --- a/drivers/net/ethernet/intel/libeth/xsk.c +++ b/drivers/net/ethernet/intel/libeth/xsk.c @@ -145,3 +145,127 @@ drop: return __ret; } + +/* Refill */ + +/** + * libeth_xskfq_create - create an XSkFQ + * @fq: fill queue to initialize + * + * Allocates the FQEs and initializes the fields used by libeth_xdp: number + * of buffers to refill, refill threshold and buffer len. + * + * Return: %0 on success, -errno otherwise. 
+ */ +int libeth_xskfq_create(struct libeth_xskfq *fq) +{ + fq->fqes = kvcalloc_node(fq->count, sizeof(*fq->fqes), GFP_KERNEL, + fq->nid); + if (!fq->fqes) + return -ENOMEM; + + fq->pending = fq->count; + fq->thresh = libeth_xdp_queue_threshold(fq->count); + fq->buf_len = xsk_pool_get_rx_frame_size(fq->pool); + + return 0; +} +EXPORT_SYMBOL_GPL(libeth_xskfq_create); + +/** + * libeth_xskfq_destroy - destroy an XSkFQ + * @fq: fill queue to destroy + * + * Zeroes the used fields and frees the FQEs array. + */ +void libeth_xskfq_destroy(struct libeth_xskfq *fq) +{ + fq->buf_len = 0; + fq->thresh = 0; + fq->pending = 0; + + kvfree(fq->fqes); +} +EXPORT_SYMBOL_GPL(libeth_xskfq_destroy); + +/* .ndo_xsk_wakeup */ + +static void libeth_xsk_napi_sched(void *info) +{ + __napi_schedule_irqoff(info); +} + +/** + * libeth_xsk_init_wakeup - initialize libeth XSk wakeup structure + * @csd: struct to initialize + * @napi: NAPI corresponding to this queue + * + * libeth_xdp uses inter-processor interrupts to perform XSk wakeups. In order + * to do that, the corresponding CSDs must be initialized when creating the + * queues. + */ +void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi) +{ + INIT_CSD(csd, libeth_xsk_napi_sched, napi); +} +EXPORT_SYMBOL_GPL(libeth_xsk_init_wakeup); + +/** + * libeth_xsk_wakeup - perform an XSk wakeup + * @csd: CSD corresponding to the queue + * @qid: the stack queue index + * + * Try to mark the NAPI as missed first, so that it could be rescheduled. + * If it's not, schedule it on the corresponding CPU using IPIs (or directly + * if already running on it). 
+ */ +void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid) +{ + struct napi_struct *napi = csd->info; + + if (napi_if_scheduled_mark_missed(napi) || + unlikely(!napi_schedule_prep(napi))) + return; + + if (unlikely(qid >= nr_cpu_ids)) + qid %= nr_cpu_ids; + + if (qid != raw_smp_processor_id() && cpu_online(qid)) + smp_call_function_single_async(qid, csd); + else + __napi_schedule(napi); +} +EXPORT_SYMBOL_GPL(libeth_xsk_wakeup); + +/* Pool setup */ + +#define LIBETH_XSK_DMA_ATTR \ + (DMA_ATTR_WEAK_ORDERING | DMA_ATTR_SKIP_CPU_SYNC) + +/** + * libeth_xsk_setup_pool - setup or destroy an XSk pool for a queue + * @dev: target &net_device + * @qid: stack queue index to configure + * @enable: whether to enable or disable the pool + * + * Check that @qid is valid and then map or unmap the pool. + * + * Return: %0 on success, -errno otherwise. + */ +int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable) +{ + struct xsk_buff_pool *pool; + + pool = xsk_get_pool_from_qid(dev, qid); + if (!pool) + return -EINVAL; + + if (enable) + return xsk_pool_dma_map(pool, dev->dev.parent, + LIBETH_XSK_DMA_ATTR); + else + xsk_pool_dma_unmap(pool, LIBETH_XSK_DMA_ATTR); + + return 0; +} +EXPORT_SYMBOL_GPL(libeth_xsk_setup_pool); diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index f3f338e566fc..213778a68476 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -584,4 +584,102 @@ __libeth_xsk_run_pass(struct libeth_xdp_buff *xdp, #define LIBETH_XSK_DEFINE_FINALIZE(name, flush, finalize) \ __LIBETH_XDP_DEFINE_FINALIZE(name, flush, finalize, xsk) +/* Refilling */ + +/** + * struct libeth_xskfq - structure representing an XSk buffer (fill) queue + * @fp: hotpath part of the structure + * @pool: &xsk_buff_pool for buffer management + * @fqes: array of XSk buffer pointers + * @descs: opaque pointer to the HW descriptor array + * @ntu: index of the next buffer to poll + * @count: number of descriptors/buffers the queue has + * @pending: 
current number of XSkFQEs to refill + * @thresh: threshold below which the queue is refilled + * @buf_len: HW-writeable length per each buffer + * @nid: ID of the closest NUMA node with memory + */ +struct libeth_xskfq { + struct_group_tagged(libeth_xskfq_fp, fp, + struct xsk_buff_pool *pool; + struct libeth_xdp_buff **fqes; + void *descs; + + u32 ntu; + u32 count; + ); + + /* Cold fields */ + u32 pending; + u32 thresh; + + u32 buf_len; + int nid; +}; + +int libeth_xskfq_create(struct libeth_xskfq *fq); +void libeth_xskfq_destroy(struct libeth_xskfq *fq); + +/** + * libeth_xsk_buff_xdp_get_dma - get DMA address of XSk &libeth_xdp_buff + * @xdp: buffer to get the DMA addr for + */ +#define libeth_xsk_buff_xdp_get_dma(xdp) \ + xsk_buff_xdp_get_dma(&(xdp)->base) + +/** + * libeth_xskfqe_alloc - allocate @n XSk Rx buffers + * @fq: hotpath part of the XSkFQ, usually onstack + * @n: number of buffers to allocate + * @fill: driver callback to write DMA addresses to HW descriptors + * + * Note that @fq->ntu gets updated, but ::pending must be recalculated + * by the caller. + * + * Return: number of buffers refilled. 
+ */ +static __always_inline u32 +libeth_xskfqe_alloc(struct libeth_xskfq_fp *fq, u32 n, + void (*fill)(const struct libeth_xskfq_fp *fq, u32 i)) +{ + u32 this, ret, done = 0; + struct xdp_buff **xskb; + + this = fq->count - fq->ntu; + if (likely(this > n)) + this = n; + +again: + xskb = (typeof(xskb))&fq->fqes[fq->ntu]; + ret = xsk_buff_alloc_batch(fq->pool, xskb, this); + + for (u32 i = 0, ntu = fq->ntu; likely(i < ret); i++) + fill(fq, ntu + i); + + done += ret; + fq->ntu += ret; + + if (likely(fq->ntu < fq->count) || unlikely(ret < this)) + goto out; + + fq->ntu = 0; + + if (this < n) { + this = n - this; + goto again; + } + +out: + return done; +} + +/* .ndo_xsk_wakeup */ + +void libeth_xsk_init_wakeup(call_single_data_t *csd, struct napi_struct *napi); +void libeth_xsk_wakeup(call_single_data_t *csd, u32 qid); + +/* Pool setup */ + +int libeth_xsk_setup_pool(struct net_device *dev, u32 qid, bool enable); + #endif /* __LIBETH_XSK_H */ -- cgit v1.2.3 From 80bae9df2108cb72a060ee5235614d7c072af1de Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 12 Jun 2025 18:02:34 +0200 Subject: libeth: xdp, xsk: access adjacent u32s as u64 where applicable On 64-bit systems, writing/reading one u64 is faster than two u32s even when they are adjacent in a struct. The compilers won't guarantee they will combine those; I observed both successful and unsuccessful attempts with both GCC and Clang, and it's not easy to say what it depends on. There are a few places in libeth_xdp winning up to several percent from combined access (both performance and object code size, especially when unrolling). Add __LIBETH_WORD_ACCESS and use it there on LE. Drivers are free to optimize HW-specific callbacks under the same definition.
Signed-off-by: Alexander Lobakin Signed-off-by: Tony Nguyen --- include/net/libeth/xdp.h | 29 ++++++++++++++++++++++++++--- include/net/libeth/xsk.h | 10 +++++----- 2 files changed, 31 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index dba09a9168f1..6ce6aec6884c 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -475,6 +475,21 @@ struct libeth_xdp_tx_desc { ((const void *)(uintptr_t)(priv)); \ }) +/* + * On 64-bit systems, assigning one u64 is faster than two u32s. When ::len + * occupies lowest 32 bits (LE), whole ::opts can be assigned directly instead. + */ +#ifdef __LITTLE_ENDIAN +#define __LIBETH_WORD_ACCESS 1 +#endif +#ifdef __LIBETH_WORD_ACCESS +#define __libeth_xdp_tx_len(flen, ...) \ + .opts = ((flen) | FIELD_PREP(GENMASK_ULL(63, 32), (__VA_ARGS__ + 0))) +#else +#define __libeth_xdp_tx_len(flen, ...) \ + .len = (flen), .flags = (__VA_ARGS__ + 0) +#endif + /** * libeth_xdp_tx_xmit_bulk - main XDP Tx function * @bulk: array of frames to send @@ -870,8 +885,7 @@ static inline u32 libeth_xdp_xmit_queue_head(struct libeth_xdp_tx_bulk *bq, bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xdpf = xdpf, - .len = xdpf->len, - .flags = LIBETH_XDP_TX_FIRST, + __libeth_xdp_tx_len(xdpf->len, LIBETH_XDP_TX_FIRST), }; if (!xdp_frame_has_frags(xdpf)) @@ -902,7 +916,7 @@ static inline bool libeth_xdp_xmit_queue_frag(struct libeth_xdp_tx_bulk *bq, bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .dma = dma, - .len = skb_frag_size(frag), + __libeth_xdp_tx_len(skb_frag_size(frag)), }; return true; @@ -1260,6 +1274,7 @@ bool libeth_xdp_buff_add_frag(struct libeth_xdp_buff *xdp, * Internal, use libeth_xdp_process_buff() instead. Initializes XDP buffer * head with the Rx buffer data: data pointer, length, headroom, and * truesize/tailroom. Zeroes the flags. + * Uses faster single u64 write instead of per-field access. 
*/ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, const struct libeth_fqe *fqe, @@ -1267,7 +1282,15 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, { const struct page *page = __netmem_to_page(fqe->netmem); +#ifdef __LIBETH_WORD_ACCESS + static_assert(offsetofend(typeof(xdp->base), flags) - + offsetof(typeof(xdp->base), frame_sz) == + sizeof(u64)); + + *(u64 *)&xdp->base.frame_sz = fqe->truesize; +#else xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); +#endif xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, page->pp->p.offset, len, true); } diff --git a/include/net/libeth/xsk.h b/include/net/libeth/xsk.h index 213778a68476..481a7b28e6f2 100644 --- a/include/net/libeth/xsk.h +++ b/include/net/libeth/xsk.h @@ -26,8 +26,8 @@ static inline bool libeth_xsk_tx_queue_head(struct libeth_xdp_tx_bulk *bq, { bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xsk = xdp, - .len = xdp->base.data_end - xdp->data, - .flags = LIBETH_XDP_TX_FIRST, + __libeth_xdp_tx_len(xdp->base.data_end - xdp->data, + LIBETH_XDP_TX_FIRST), }; if (likely(!xdp_buff_has_frags(&xdp->base))) @@ -48,7 +48,7 @@ static inline void libeth_xsk_tx_queue_frag(struct libeth_xdp_tx_bulk *bq, { bq->bulk[bq->count++] = (typeof(*bq->bulk)){ .xsk = frag, - .len = frag->base.data_end - frag->data, + __libeth_xdp_tx_len(frag->base.data_end - frag->data), }; } @@ -199,7 +199,7 @@ __libeth_xsk_xmit_fill_buf_md(const struct xdp_desc *xdesc, ctx = xsk_buff_raw_get_ctx(sq->pool, xdesc->addr); desc = (typeof(desc)){ .addr = ctx.dma, - .len = xdesc->len, + __libeth_xdp_tx_len(xdesc->len), }; BUILD_BUG_ON(!__builtin_constant_p(tmo == libeth_xsktmo)); @@ -226,7 +226,7 @@ __libeth_xsk_xmit_fill_buf(const struct xdp_desc *xdesc, { return (struct libeth_xdp_tx_desc){ .addr = xsk_buff_raw_get_dma(sq->pool, xdesc->addr), - .len = xdesc->len, + __libeth_xdp_tx_len(xdesc->len), }; } -- cgit v1.2.3 From 7768c5f417336fa58dbfef9bb7ecd7eeec6d8886 Mon Sep 17 00:00:00 
2001 From: Haiyang Zhang Date: Fri, 13 Jun 2025 10:00:34 -0700 Subject: net: mana: Add handler for hardware servicing events To collaborate with hardware servicing events, upon receiving the special EQE notification from the HW channel, remove the devices on this bus. Then, after a waiting period based on the device specs, rescan the parent bus to recover the devices. Signed-off-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Simon Horman Link: https://patch.msgid.link/1749834034-18498-1-git-send-email-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 75 +++++++++++++++++++++++++ include/net/mana/gdma.h | 10 +++- 2 files changed, 83 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 3504507477c6..069b7a871b78 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -352,11 +352,59 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) } EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA"); +#define MANA_SERVICE_PERIOD 10 + +struct mana_serv_work { + struct work_struct serv_work; + struct pci_dev *pdev; +}; + +static void mana_serv_func(struct work_struct *w) +{ + struct mana_serv_work *mns_wk; + struct pci_bus *bus, *parent; + struct pci_dev *pdev; + + mns_wk = container_of(w, struct mana_serv_work, serv_work); + pdev = mns_wk->pdev; + + pci_lock_rescan_remove(); + + if (!pdev) + goto out; + + bus = pdev->bus; + if (!bus) { + dev_err(&pdev->dev, "MANA service: no bus\n"); + goto out; + } + + parent = bus->parent; + if (!parent) { + dev_err(&pdev->dev, "MANA service: no parent bus\n"); + goto out; + } + + pci_stop_and_remove_bus_device(bus->self); + + msleep(MANA_SERVICE_PERIOD * 1000); + + pci_rescan_bus(parent); + +out: + pci_unlock_rescan_remove(); + + pci_dev_put(pdev); + kfree(mns_wk); + module_put(THIS_MODULE); 
+} + static void mana_gd_process_eqe(struct gdma_queue *eq) { u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE); struct gdma_context *gc = eq->gdma_dev->gdma_context; struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr; + struct mana_serv_work *mns_wk; union gdma_eqe_info eqe_info; enum gdma_eqe_type type; struct gdma_event event; @@ -401,6 +449,33 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) eq->eq.callback(eq->eq.context, eq, &event); break; + case GDMA_EQE_HWC_FPGA_RECONFIG: + dev_info(gc->dev, "Recv MANA service type:%d\n", type); + + if (gc->in_service) { + dev_info(gc->dev, "Already in service\n"); + break; + } + + if (!try_module_get(THIS_MODULE)) { + dev_info(gc->dev, "Module is unloading\n"); + break; + } + + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC); + if (!mns_wk) { + module_put(THIS_MODULE); + break; + } + + dev_info(gc->dev, "Start MANA service type:%d\n", type); + gc->in_service = true; + mns_wk->pdev = to_pci_dev(gc->dev); + pci_dev_get(mns_wk->pdev); + INIT_WORK(&mns_wk->serv_work, mana_serv_func); + schedule_work(&mns_wk->serv_work); + break; + default: break; } diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 3ce56a816425..bfae59202669 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -58,7 +58,7 @@ enum gdma_eqe_type { GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, GDMA_EQE_HWC_INIT_DATA = 130, GDMA_EQE_HWC_INIT_DONE = 131, - GDMA_EQE_HWC_SOC_RECONFIG = 132, + GDMA_EQE_HWC_FPGA_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, GDMA_EQE_HWC_SOC_SERVICE = 134, GDMA_EQE_RNIC_QP_FATAL = 176, @@ -403,6 +403,8 @@ struct gdma_context { u32 test_event_eq_id; bool is_pf; + bool in_service; + phys_addr_t bar0_pa; void __iomem *bar0_va; void __iomem *shm_base; @@ -578,12 +580,16 @@ enum { /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver can self reset on FPGA Reconfig EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE 
BIT(17) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ - GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) #define GDMA_DRV_CAP_FLAGS2 0 -- cgit v1.2.3 From 755391121038c06cb653241aa94dcabd87179f62 Mon Sep 17 00:00:00 2001 From: Shradha Gupta Date: Wed, 11 Jun 2025 07:11:13 -0700 Subject: net: mana: Allocate MSI-X vectors dynamically Currently, the MANA driver allocates MSI-X vectors statically based on MANA_MAX_NUM_QUEUES and num_online_cpus() values and in some cases ends up allocating more vectors than it needs. This is because, by this time we do not have a HW channel and do not know how many IRQs should be allocated. To avoid this, we allocate 1 MSI-X vector during the creation of HWC and after getting the value supported by hardware, dynamically add the remaining MSI-X vectors. Signed-off-by: Shradha Gupta Reviewed-by: Haiyang Zhang --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 311 +++++++++++++++++------- include/net/mana/gdma.h | 8 +- 2 files changed, 235 insertions(+), 84 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 6e468c0f2c40..d0040c12b8a2 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include @@ -80,8 +82,15 @@ static int mana_gd_query_max_resources(struct pci_dev *pdev) return err ? 
err : -EPROTO; } - if (gc->num_msix_usable > resp.max_msix) - gc->num_msix_usable = resp.max_msix; + if (!pci_msix_can_alloc_dyn(pdev)) { + if (gc->num_msix_usable > resp.max_msix) + gc->num_msix_usable = resp.max_msix; + } else { + /* If dynamic allocation is enabled we have already allocated + * hwc msi + */ + gc->num_msix_usable = min(resp.max_msix, num_online_cpus() + 1); + } if (gc->num_msix_usable <= 1) return -ENOSPC; @@ -483,7 +492,9 @@ static int mana_gd_register_irq(struct gdma_queue *queue, } queue->eq.msix_index = msi_index; - gic = &gc->irq_contexts[msi_index]; + gic = xa_load(&gc->irq_contexts, msi_index); + if (WARN_ON(!gic)) + return -EINVAL; spin_lock_irqsave(&gic->lock, flags); list_add_rcu(&queue->entry, &gic->eq_list); @@ -508,7 +519,10 @@ static void mana_gd_deregiser_irq(struct gdma_queue *queue) if (WARN_ON(msix_index >= gc->num_msix_usable)) return; - gic = &gc->irq_contexts[msix_index]; + gic = xa_load(&gc->irq_contexts, msix_index); + if (WARN_ON(!gic)) + return; + spin_lock_irqsave(&gic->lock, flags); list_for_each_entry_rcu(eq, &gic->eq_list, entry) { if (queue == eq) { @@ -1366,47 +1380,108 @@ done: return 0; } -static int mana_gd_setup_irqs(struct pci_dev *pdev) +static int mana_gd_setup_dyn_irqs(struct pci_dev *pdev, int nvec) { struct gdma_context *gc = pci_get_drvdata(pdev); - unsigned int max_queues_per_port; struct gdma_irq_context *gic; - unsigned int max_irqs, cpu; - int start_irq_index = 1; - int nvec, *irqs, irq; - int err, i = 0, j; + bool skip_first_cpu = false; + int *irqs, irq, err, i; - cpus_read_lock(); - max_queues_per_port = num_online_cpus(); - if (max_queues_per_port > MANA_MAX_NUM_QUEUES) - max_queues_per_port = MANA_MAX_NUM_QUEUES; + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + /* + * While processing the next pci irq vector, we start with index 1, + * as IRQ vector at index 0 is already processed for HWC. 
+ * However, the population of irqs array starts with index 0, to be + * further used in irq_setup() + */ + for (i = 1; i <= nvec; i++) { + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + gic->handler = mana_gd_process_eq_events; + INIT_LIST_HEAD(&gic->eq_list); + spin_lock_init(&gic->lock); - /* Need 1 interrupt for the Hardware communication Channel (HWC) */ - max_irqs = max_queues_per_port + 1; + snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", + i - 1, pci_name(pdev)); - nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX); - if (nvec < 0) { - cpus_read_unlock(); - return nvec; + /* one pci vector is already allocated for HWC */ + irqs[i - 1] = pci_irq_vector(pdev, i); + if (irqs[i - 1] < 0) { + err = irqs[i - 1]; + goto free_current_gic; + } + + err = request_irq(irqs[i - 1], mana_gd_intr, 0, gic->name, gic); + if (err) + goto free_current_gic; + + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - if (nvec <= num_online_cpus()) - start_irq_index = 0; - irqs = kmalloc_array((nvec - start_irq_index), sizeof(int), GFP_KERNEL); - if (!irqs) { - err = -ENOMEM; - goto free_irq_vector; + /* + * When calling irq_setup() for dynamically added IRQs, if number of + * CPUs is more than or equal to allocated MSI-X, we need to skip the + * first CPU sibling group since they are already affinitized to HWC IRQ + */ + cpus_read_lock(); + if (gc->num_msix_usable <= num_online_cpus()) + skip_first_cpu = true; + + err = irq_setup(irqs, nvec, gc->numa_node, skip_first_cpu); + if (err) { + cpus_read_unlock(); + goto free_irq; } - gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context), - GFP_KERNEL); - if (!gc->irq_contexts) { - err = -ENOMEM; - goto free_irq_array; + cpus_read_unlock(); + kfree(irqs); + return 0; + +free_current_gic: + kfree(gic); +free_irq: + for (i -= 1; i > 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; + + 
irq_update_affinity_hint(irq, NULL); + free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } + kfree(irqs); + return err; +} + +static int mana_gd_setup_irqs(struct pci_dev *pdev, int nvec) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct gdma_irq_context *gic; + int *irqs, *start_irqs, irq; + unsigned int cpu; + int err, i; + + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL); + if (!irqs) + return -ENOMEM; + + start_irqs = irqs; for (i = 0; i < nvec; i++) { - gic = &gc->irq_contexts[i]; + gic = kzalloc(sizeof(*gic), GFP_KERNEL); + if (!gic) { + err = -ENOMEM; + goto free_irq; + } + gic->handler = mana_gd_process_eq_events; INIT_LIST_HEAD(&gic->eq_list); spin_lock_init(&gic->lock); @@ -1418,69 +1493,128 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev) snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s", i - 1, pci_name(pdev)); - irq = pci_irq_vector(pdev, i); - if (irq < 0) { - err = irq; - goto free_irq; + irqs[i] = pci_irq_vector(pdev, i); + if (irqs[i] < 0) { + err = irqs[i]; + goto free_current_gic; } - if (!i) { - err = request_irq(irq, mana_gd_intr, 0, gic->name, gic); - if (err) - goto free_irq; - - /* If number of IRQ is one extra than number of online CPUs, - * then we need to assign IRQ0 (hwc irq) and IRQ1 to - * same CPU. - * Else we will use different CPUs for IRQ0 and IRQ1. - * Also we are using cpumask_local_spread instead of - * cpumask_first for the node, because the node can be - * mem only. 
- */ - if (start_irq_index) { - cpu = cpumask_local_spread(i, gc->numa_node); - irq_set_affinity_and_hint(irq, cpumask_of(cpu)); - } else { - irqs[start_irq_index] = irq; - } - } else { - irqs[i - start_irq_index] = irq; - err = request_irq(irqs[i - start_irq_index], mana_gd_intr, 0, - gic->name, gic); - if (err) - goto free_irq; - } + err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic); + if (err) + goto free_current_gic; + + xa_store(&gc->irq_contexts, i, gic, GFP_KERNEL); } - err = irq_setup(irqs, nvec - start_irq_index, gc->numa_node, false); - if (err) + /* If number of IRQ is one extra than number of online CPUs, + * then we need to assign IRQ0 (hwc irq) and IRQ1 to + * same CPU. + * Else we will use different CPUs for IRQ0 and IRQ1. + * Also we are using cpumask_local_spread instead of + * cpumask_first for the node, because the node can be + * mem only. + */ + cpus_read_lock(); + if (nvec > num_online_cpus()) { + cpu = cpumask_local_spread(0, gc->numa_node); + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu)); + irqs++; + nvec -= 1; + } + + err = irq_setup(irqs, nvec, gc->numa_node, false); + if (err) { + cpus_read_unlock(); goto free_irq; + } - gc->max_num_msix = nvec; - gc->num_msix_usable = nvec; cpus_read_unlock(); - kfree(irqs); + kfree(start_irqs); return 0; +free_current_gic: + kfree(gic); free_irq: - for (j = i - 1; j >= 0; j--) { - irq = pci_irq_vector(pdev, j); - gic = &gc->irq_contexts[j]; + for (i -= 1; i >= 0; i--) { + irq = pci_irq_vector(pdev, i); + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; -free_irq_array: - kfree(irqs); -free_irq_vector: - cpus_read_unlock(); - pci_free_irq_vectors(pdev); + kfree(start_irqs); return err; } +static int mana_gd_setup_hwc_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + unsigned 
int max_irqs, min_irqs; + int nvec, err; + + if (pci_msix_can_alloc_dyn(pdev)) { + max_irqs = 1; + min_irqs = 1; + } else { + /* Need 1 interrupt for HWC */ + max_irqs = min(num_online_cpus(), MANA_MAX_NUM_QUEUES) + 1; + min_irqs = 2; + } + + nvec = pci_alloc_irq_vectors(pdev, min_irqs, max_irqs, PCI_IRQ_MSIX); + if (nvec < 0) + return nvec; + + err = mana_gd_setup_irqs(pdev, nvec); + if (err) { + pci_free_irq_vectors(pdev); + return err; + } + + gc->num_msix_usable = nvec; + gc->max_num_msix = nvec; + + return 0; +} + +static int mana_gd_setup_remaining_irqs(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct msi_map irq_map; + int max_irqs, i, err; + + if (!pci_msix_can_alloc_dyn(pdev)) + /* remain irqs are already allocated with HWC IRQ */ + return 0; + + /* allocate only remaining IRQs*/ + max_irqs = gc->num_msix_usable - 1; + + for (i = 1; i <= max_irqs; i++) { + irq_map = pci_msix_alloc_irq_at(pdev, i, NULL); + if (!irq_map.virq) { + err = irq_map.index; + /* caller will handle cleaning up all allocated + * irqs, after HWC is destroyed + */ + return err; + } + } + + err = mana_gd_setup_dyn_irqs(pdev, max_irqs); + if (err) + return err; + + gc->max_num_msix = gc->max_num_msix + max_irqs; + + return 0; +} + static void mana_gd_remove_irqs(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -1495,19 +1629,21 @@ static void mana_gd_remove_irqs(struct pci_dev *pdev) if (irq < 0) continue; - gic = &gc->irq_contexts[i]; + gic = xa_load(&gc->irq_contexts, i); + if (WARN_ON(!gic)) + continue; /* Need to clear the hint before free_irq */ irq_update_affinity_hint(irq, NULL); free_irq(irq, gic); + xa_erase(&gc->irq_contexts, i); + kfree(gic); } pci_free_irq_vectors(pdev); gc->max_num_msix = 0; gc->num_msix_usable = 0; - kfree(gc->irq_contexts); - gc->irq_contexts = NULL; } static int mana_gd_setup(struct pci_dev *pdev) @@ -1522,9 +1658,10 @@ static int mana_gd_setup(struct pci_dev *pdev) if (!gc->service_wq) return 
-ENOMEM; - err = mana_gd_setup_irqs(pdev); + err = mana_gd_setup_hwc_irqs(pdev); if (err) { - dev_err(gc->dev, "Failed to setup IRQs: %d\n", err); + dev_err(gc->dev, "Failed to setup IRQs for HWC creation: %d\n", + err); goto free_workqueue; } @@ -1540,6 +1677,12 @@ static int mana_gd_setup(struct pci_dev *pdev) if (err) goto destroy_hwc; + err = mana_gd_setup_remaining_irqs(pdev); + if (err) { + dev_err(gc->dev, "Failed to setup remaining IRQs: %d", err); + goto destroy_hwc; + } + err = mana_gd_detect_devices(pdev); if (err) goto destroy_hwc; @@ -1620,6 +1763,7 @@ static int mana_gd_probe(struct pci_dev *pdev, const struct pci_device_id *ent) gc->is_pf = mana_is_pf(pdev->device); gc->bar0_va = bar0_va; gc->dev = &pdev->dev; + xa_init(&gc->irq_contexts); if (gc->is_pf) gc->mana_pci_debugfs = debugfs_create_dir("0", mana_debugfs_root); @@ -1654,6 +1798,7 @@ unmap_bar: */ debugfs_remove_recursive(gc->mana_pci_debugfs); gc->mana_pci_debugfs = NULL; + xa_destroy(&gc->irq_contexts); pci_iounmap(pdev, bar0_va); free_gc: pci_set_drvdata(pdev, NULL); @@ -1679,6 +1824,8 @@ static void mana_gd_remove(struct pci_dev *pdev) gc->mana_pci_debugfs = NULL; + xa_destroy(&gc->irq_contexts); + pci_iounmap(pdev, gc->bar0_va); vfree(gc); diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 3ce56a816425..87162ba96d91 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -388,7 +388,7 @@ struct gdma_context { unsigned int max_num_queues; unsigned int max_num_msix; unsigned int num_msix_usable; - struct gdma_irq_context *irq_contexts; + struct xarray irq_contexts; /* L2 MTU */ u16 adapter_mtu; @@ -578,12 +578,16 @@ enum { /* Driver can handle holes (zeros) in the device list */ #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11) +/* Driver supports dynamic MSI-X vector allocation */ +#define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) + #define GDMA_DRV_CAP_FLAGS1 \ (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \ 
GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \ GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \ GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ - GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP) + GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ + GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT) #define GDMA_DRV_CAP_FLAGS2 0 -- cgit v1.2.3 From 2410251cde0bac9f660f276307d6c967466eef0c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 16 Jun 2025 10:46:25 +0100 Subject: net: timestamp: add helper returning skb's tx tstamp Add a helper function skb_get_tx_timestamp() that returns a tx timestamp associated with an error queue skb. Signed-off-by: Pavel Begunkov Acked-by: Willem de Bruijn Link: https://patch.msgid.link/702357dd8936ef4c0d3864441e853bfe3224a677.1750065793.git.asml.silence@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 4 ++++ net/socket.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 92e7c1aae3cc..f5f5a9ad290b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2677,6 +2677,10 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk); +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts); + static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { diff --git a/net/socket.c b/net/socket.c index 9a0e720f0859..2cab805943c0 100644 --- a/net/socket.c +++ b/net/socket.c @@ -843,6 +843,52 @@ static void put_ts_pktinfo(struct msghdr *msg, struct sk_buff *skb, sizeof(ts_pktinfo), &ts_pktinfo); } +bool skb_has_tx_timestamp(struct sk_buff *skb, const struct sock *sk) +{ + const struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + u32 tsflags = READ_ONCE(sk->sk_tsflags); + + if 
(serr->ee.ee_errno != ENOMSG || + serr->ee.ee_origin != SO_EE_ORIGIN_TIMESTAMPING) + return false; + + /* software time stamp available and wanted */ + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && skb->tstamp) + return true; + /* hardware time stamps available and wanted */ + return (tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && + skb_hwtstamps(skb)->hwtstamp; +} + +int skb_get_tx_timestamp(struct sk_buff *skb, struct sock *sk, + struct timespec64 *ts) +{ + u32 tsflags = READ_ONCE(sk->sk_tsflags); + ktime_t hwtstamp; + int if_index = 0; + + if ((tsflags & SOF_TIMESTAMPING_SOFTWARE) && + ktime_to_timespec64_cond(skb->tstamp, ts)) + return SOF_TIMESTAMPING_TX_SOFTWARE; + + if (!(tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) || + skb_is_swtx_tstamp(skb, false)) + return -ENOENT; + + if (skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP_NETDEV) + hwtstamp = get_timestamp(sk, skb, &if_index); + else + hwtstamp = skb_hwtstamps(skb)->hwtstamp; + + if (tsflags & SOF_TIMESTAMPING_BIND_PHC) + hwtstamp = ptp_convert_timestamp(&hwtstamp, + READ_ONCE(sk->sk_bind_phc)); + if (!ktime_to_timespec64_cond(hwtstamp, ts)) + return -ENOENT; + + return SOF_TIMESTAMPING_TX_HARDWARE; +} + /* * called from sock_recv_timestamp() if sock_flag(sk, SOCK_RCVTSTAMP) */ -- cgit v1.2.3 From ba4618885b23372c45bb1566ed8e3f1c191ff22d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 14 Jun 2025 20:14:34 -0400 Subject: tcp: remove RFC3517/RFC6675 hint state: lost_skb_hint, lost_cnt_hint Now that obsolete RFC3517/RFC6675 TCP loss detection has been removed, we can remove the somewhat complex and intrusive code to maintain its hint state: lost_skb_hint and lost_cnt_hint. This commit makes tcp_clear_retrans_hints_partial() empty. We will remove tcp_clear_retrans_hints_partial() and its call sites in the next commit. 
Suggested-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250615001435.2390793-3-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/tcp_sock.rst | 2 -- include/linux/tcp.h | 3 --- include/net/tcp.h | 1 - net/ipv4/tcp.c | 3 +-- net/ipv4/tcp_input.c | 19 ------------------- net/ipv4/tcp_output.c | 5 ----- 6 files changed, 1 insertion(+), 32 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst index bc9b2131bf7a..7bbda5944ee2 100644 --- a/Documentation/networking/net_cachelines/tcp_sock.rst +++ b/Documentation/networking/net_cachelines/tcp_sock.rst @@ -115,7 +115,6 @@ u32 lost_out read_mostly read_m u32 sacked_out read_mostly read_mostly tcp_left_out(tx);tcp_packets_in_flight(tx/rx);tcp_clean_rtx_queue(rx) struct hrtimer pacing_timer struct hrtimer compressed_ack_timer -struct sk_buff* lost_skb_hint read_mostly tcp_clean_rtx_queue struct sk_buff* retransmit_skb_hint read_mostly tcp_clean_rtx_queue struct rb_root out_of_order_queue read_mostly tcp_data_queue,tcp_fast_path_check struct sk_buff* ooo_last_skb @@ -123,7 +122,6 @@ struct tcp_sack_block[1] duplicate_sack struct tcp_sack_block[4] selective_acks struct tcp_sack_block[4] recv_sack_cache struct sk_buff* highest_sack read_write tcp_event_new_data_sent -int lost_cnt_hint u32 prior_ssthresh u32 high_seq u32 retrans_stamp diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 29f59d50dc73..1a5737b3753d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -208,7 +208,6 @@ struct tcp_sock { u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u16 gso_segs; /* Max number of segs per GSO packet */ /* from STCP, retrans queue hinting */ - struct sk_buff *lost_skb_hint; struct sk_buff *retransmit_skb_hint; __cacheline_group_end(tcp_sock_read_tx); @@ -419,8 +418,6 @@ struct tcp_sock 
{ struct tcp_sack_block recv_sack_cache[4]; - int lost_cnt_hint; - u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 5078ad868fee..f57d12183794 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1813,7 +1813,6 @@ static inline void tcp_mib_init(struct net *net) /* from STCP */ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { - tp->lost_skb_hint = NULL; } static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f64f8276a73c..27d3ef83ce7b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5053,9 +5053,8 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); + CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32); /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dc234d3854aa..e8e130e946f1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1451,11 +1451,6 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; - - /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ - if (tp->lost_skb_hint && - before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) - tp->lost_cnt_hint += pcount; } /* D-SACK. 
We can detect redundant retransmission in S|R and plain R @@ -1496,9 +1491,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); - if (skb == tp->lost_skb_hint) - tp->lost_cnt_hint += pcount; - TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; @@ -1531,10 +1523,6 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, if (skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = prev; - if (skb == tp->lost_skb_hint) { - tp->lost_skb_hint = prev; - tp->lost_cnt_hint -= tcp_skb_pcount(prev); - } TCP_SKB_CB(prev)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags; TCP_SKB_CB(prev)->eor = TCP_SKB_CB(skb)->eor; @@ -3318,8 +3306,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, next = skb_rb_next(skb); if (unlikely(skb == tp->retransmit_skb_hint)) tp->retransmit_skb_hint = NULL; - if (unlikely(skb == tp->lost_skb_hint)) - tp->lost_skb_hint = NULL; tcp_highest_sack_replace(sk, skb, next); tcp_rtx_queue_unlink_and_free(skb, sk); } @@ -3377,14 +3363,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (flag & FLAG_RETRANS_DATA_ACKED) flag &= ~FLAG_ORIG_SACK_ACKED; } else { - int delta; - /* Non-retransmitted hole got filled? 
That's reordering */ if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); - - delta = prior_sacked - tp->sacked_out; - tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta); } } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3ac8d2d17e1f..b0ffefe604b4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1554,11 +1554,6 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de if (tcp_is_reno(tp) && decr > 0) tp->sacked_out -= min_t(u32, tp->sacked_out, decr); - if (tp->lost_skb_hint && - before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) && - (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - tp->lost_cnt_hint -= decr; - tcp_verify_left_out(tp); } -- cgit v1.2.3 From db16319efcc717a31dcb9c8f038acb6e4111c12e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 14 Jun 2025 20:14:35 -0400 Subject: tcp: remove RFC3517/RFC6675 tcp_clear_retrans_hints_partial() Now that we have removed the RFC3517/RFC6675 hints, tcp_clear_retrans_hints_partial() is empty, and can be removed. 
Suggested-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250615001435.2390793-4-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 5 ----- net/ipv4/tcp_input.c | 2 -- net/ipv4/tcp_output.c | 1 - 3 files changed, 8 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index f57d12183794..9f852f5f8b95 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1811,13 +1811,8 @@ static inline void tcp_mib_init(struct net *net) } /* from STCP */ -static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) -{ -} - static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { - tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e8e130e946f1..05b9571c9c92 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2769,8 +2769,6 @@ void tcp_simple_retransmit(struct sock *sk) tcp_mark_skb_lost(sk, skb); } - tcp_clear_retrans_hints_partial(tp); - if (!tp->lost_out) return; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b0ffefe604b4..eb50746dc482 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3247,7 +3247,6 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->eor = TCP_SKB_CB(next_skb)->eor; /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); if (next_skb == tp->retransmit_skb_hint) tp->retransmit_skb_hint = skb; -- cgit v1.2.3 From ef07df397a621707903ef0d294a7df11f80cf206 Mon Sep 17 00:00:00 2001 From: Álvaro Fernández Rojas Date: Sat, 14 Jun 2025 09:59:48 +0200 Subject: net: dsa: tag_brcm: add support for legacy FCS tags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add support for legacy Broadcom FCS tags, which are similar to DSA_TAG_PROTO_BRCM_LEGACY. 
BCM5325 and BCM5365 switches require including the original FCS value and length, as opposed to BCM63xx switches. Adding the original FCS value and length to DSA_TAG_PROTO_BRCM_LEGACY would impact performance of BCM63xx switches, so it's better to create a new tag. Signed-off-by: Álvaro Fernández Rojas Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250614080000.1884236-3-noltari@gmail.com Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 ++ net/dsa/Kconfig | 16 ++++++++++-- net/dsa/tag_brcm.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 88 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 55e2d97f247e..d73ea0880066 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -54,11 +54,13 @@ struct tc_action; #define DSA_TAG_PROTO_RZN1_A5PSW_VALUE 26 #define DSA_TAG_PROTO_LAN937X_VALUE 27 #define DSA_TAG_PROTO_VSC73XX_8021Q_VALUE 28 +#define DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE 29 enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = DSA_TAG_PROTO_NONE_VALUE, DSA_TAG_PROTO_BRCM = DSA_TAG_PROTO_BRCM_VALUE, DSA_TAG_PROTO_BRCM_LEGACY = DSA_TAG_PROTO_BRCM_LEGACY_VALUE, + DSA_TAG_PROTO_BRCM_LEGACY_FCS = DSA_TAG_PROTO_BRCM_LEGACY_FCS_VALUE, DSA_TAG_PROTO_BRCM_PREPEND = DSA_TAG_PROTO_BRCM_PREPEND_VALUE, DSA_TAG_PROTO_DSA = DSA_TAG_PROTO_DSA_VALUE, DSA_TAG_PROTO_EDSA = DSA_TAG_PROTO_EDSA_VALUE, diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 2dfe9063613f..869cbe57162f 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -42,12 +42,24 @@ config NET_DSA_TAG_BRCM Broadcom switches which place the tag after the MAC source address. 
config NET_DSA_TAG_BRCM_LEGACY - tristate "Tag driver for Broadcom legacy switches using in-frame headers" + tristate "Tag driver for BCM63xx legacy switches using in-frame headers" select NET_DSA_TAG_BRCM_COMMON help Say Y if you want to enable support for tagging frames for the - Broadcom legacy switches which place the tag after the MAC source + BCM63xx legacy switches which place the tag after the MAC source address. + This tag is used in BCM63xx legacy switches which work without the + original FCS and length before the tag insertion. + +config NET_DSA_TAG_BRCM_LEGACY_FCS + tristate "Tag driver for BCM53xx legacy switches using in-frame headers" + select NET_DSA_TAG_BRCM_COMMON + help + Say Y if you want to enable support for tagging frames for the + BCM53xx legacy switches which place the tag after the MAC source + address. + This tag is used in BCM53xx legacy switches which expect original + FCS and length before the tag insertion to be present. config NET_DSA_TAG_BRCM_PREPEND tristate "Tag driver for Broadcom switches using prepended headers" diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 9f4b0bcd95cd..26bb657ceac3 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -15,6 +15,7 @@ #define BRCM_NAME "brcm" #define BRCM_LEGACY_NAME "brcm-legacy" +#define BRCM_LEGACY_FCS_NAME "brcm-legacy-fcs" #define BRCM_PREPEND_NAME "brcm-prepend" /* Legacy Broadcom tag (6 bytes) */ @@ -32,6 +33,10 @@ #define BRCM_LEG_MULTICAST (1 << 5) #define BRCM_LEG_EGRESS (2 << 5) #define BRCM_LEG_INGRESS (3 << 5) +#define BRCM_LEG_LEN_HI(x) (((x) >> 8) & 0x7) + +/* 4th byte in the tag */ +#define BRCM_LEG_LEN_LO(x) ((x) & 0xff) /* 6th byte in the tag */ #define BRCM_LEG_PORT_ID (0xf) @@ -212,7 +217,8 @@ DSA_TAG_DRIVER(brcm_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM, BRCM_NAME); #endif -#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) || \ + IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS) static struct 
sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, struct net_device *dev) { @@ -244,7 +250,9 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, return skb; } +#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY || CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */ +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) static struct sk_buff *brcm_leg_tag_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -294,6 +302,66 @@ DSA_TAG_DRIVER(brcm_legacy_netdev_ops); MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY, BRCM_LEGACY_NAME); #endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY */ +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS) +static struct sk_buff *brcm_leg_fcs_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + unsigned int fcs_len; + __le32 fcs_val; + u8 *brcm_tag; + + /* The Ethernet switch we are interfaced with needs packets to be at + * least 64 bytes (including FCS) otherwise they will be discarded when + * they enter the switch port logic. When Broadcom tags are enabled, we + * need to make sure that packets are at least 70 bytes (including FCS + * and tag) because the length verification is done after the Broadcom + * tag is stripped off the ingress packet. + * + * Let dsa_user_xmit() free the SKB. 
+ */ + if (__skb_put_padto(skb, ETH_ZLEN + BRCM_LEG_TAG_LEN, false)) + return NULL; + + fcs_len = skb->len; + fcs_val = cpu_to_le32(crc32_le(~0, skb->data, fcs_len) ^ ~0); + + skb_push(skb, BRCM_LEG_TAG_LEN); + + dsa_alloc_etype_header(skb, BRCM_LEG_TAG_LEN); + + brcm_tag = skb->data + 2 * ETH_ALEN; + + /* Broadcom tag type */ + brcm_tag[0] = BRCM_LEG_TYPE_HI; + brcm_tag[1] = BRCM_LEG_TYPE_LO; + + /* Broadcom tag value */ + brcm_tag[2] = BRCM_LEG_EGRESS | BRCM_LEG_LEN_HI(fcs_len); + brcm_tag[3] = BRCM_LEG_LEN_LO(fcs_len); + brcm_tag[4] = 0; + brcm_tag[5] = dp->index & BRCM_LEG_PORT_ID; + + /* Original FCS value */ + if (__skb_pad(skb, ETH_FCS_LEN, false)) + return NULL; + skb_put_data(skb, &fcs_val, ETH_FCS_LEN); + + return skb; +} + +static const struct dsa_device_ops brcm_legacy_fcs_netdev_ops = { + .name = BRCM_LEGACY_FCS_NAME, + .proto = DSA_TAG_PROTO_BRCM_LEGACY_FCS, + .xmit = brcm_leg_fcs_tag_xmit, + .rcv = brcm_leg_tag_rcv, + .needed_headroom = BRCM_LEG_TAG_LEN, +}; + +DSA_TAG_DRIVER(brcm_legacy_fcs_netdev_ops); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_BRCM_LEGACY_FCS, BRCM_LEGACY_FCS_NAME); +#endif /* CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS */ + #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) static struct sk_buff *brcm_tag_xmit_prepend(struct sk_buff *skb, struct net_device *dev) @@ -328,6 +396,9 @@ static struct dsa_tag_driver *dsa_tag_driver_array[] = { #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY) &DSA_TAG_DRIVER_NAME(brcm_legacy_netdev_ops), #endif +#if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_LEGACY_FCS) + &DSA_TAG_DRIVER_NAME(brcm_legacy_fcs_netdev_ops), +#endif #if IS_ENABLED(CONFIG_NET_DSA_TAG_BRCM_PREPEND) &DSA_TAG_DRIVER_NAME(brcm_prepend_netdev_ops), #endif -- cgit v1.2.3 From 0f66b616b87cb4a57d22f6f0e0e1698a70d8ad21 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Sun, 15 Jun 2025 20:35:09 +0000 Subject: netmem: fix netmem comments Trivial fix to a couple of outdated netmem comments. No code changes, just more accurately describing current code. 
Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20250615203511.591438-1-almasrymina@google.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netmem.h b/include/net/netmem.h index 386164fb9c18..850869b45b45 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -89,8 +89,7 @@ static inline unsigned int net_iov_idx(const struct net_iov *niov) * typedef netmem_ref - a nonexistent type marking a reference to generic * network memory. * - * A netmem_ref currently is always a reference to a struct page. This - * abstraction is introduced so support for new memory types can be added. + * A netmem_ref can be a struct page* or a struct net_iov* underneath. * * Use the supplied helpers to obtain the underlying memory pointer and fields. */ @@ -117,9 +116,6 @@ static inline struct page *__netmem_to_page(netmem_ref netmem) return (__force struct page *)netmem; } -/* This conversion fails (returns NULL) if the netmem_ref is not struct page - * backed. - */ static inline struct page *netmem_to_page(netmem_ref netmem) { if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) @@ -178,6 +174,21 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) return page_to_pfn(netmem_to_page(netmem)); } +/* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to + * common fields. + * @netmem: netmem reference to extract as net_iov. + * + * All the sub types of netmem_ref (page, net_iov) have the same pp, pp_magic, + * dma_addr, and pp_ref_count fields at the same offsets. Thus, we can access + * these fields without a type check to make sure that the underlying mem is + * net_iov or page. + * + * The resulting value of this function can only be used to access the fields + * that are NET_IOV_ASSERT_OFFSET'd. Accessing any other fields will result in + * undefined behavior. 
+ * + * Return: the netmem_ref cast to net_iov* regardless of its underlying type. + */ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) { return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); -- cgit v1.2.3 From e3411e326fa48c9be09ba449330352ba698db698 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:09 +0200 Subject: net: ipv4: Add a flags argument to iptunnel_xmit(), udp_tunnel_xmit_skb() iptunnel_xmit() erases the contents of the SKB control block. In order to be able to set particular IPCB flags on the SKB, add a corresponding parameter, and propagate it to udp_tunnel_xmit_skb() as well. In one of the following patches, VXLAN driver will use this facility to mark packets as subject to IP multicast routing. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Acked-by: Antonio Quartulli Link: https://patch.msgid.link/89c9daf9f2dc088b6b92ccebcc929f51742de91f.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/amt.c | 9 ++++++--- drivers/net/bareudp.c | 4 ++-- drivers/net/geneve.c | 4 ++-- drivers/net/gtp.c | 10 ++++++---- drivers/net/ovpn/udp.c | 2 +- drivers/net/vxlan/vxlan_core.c | 2 +- drivers/net/wireguard/socket.c | 2 +- include/net/ip_tunnels.h | 2 +- include/net/udp_tunnel.h | 2 +- net/ipv4/ip_tunnel.c | 4 ++-- net/ipv4/ip_tunnel_core.c | 4 +++- net/ipv4/udp_tunnel_core.c | 5 +++-- net/ipv6/sit.c | 2 +- net/sctp/protocol.c | 3 ++- net/tipc/udp_media.c | 2 +- 15 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/drivers/net/amt.c b/drivers/net/amt.c index fb130fde68c0..ed86537b2f61 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -1046,7 +1046,8 @@ static bool amt_send_membership_update(struct amt_dev *amt, amt->gw_port, amt->relay_port, false, - false); + false, + 0); amt_update_gw_status(amt, AMT_STATUS_SENT_UPDATE, true); return false; } @@ -1103,7 +1104,8 @@ static void 
amt_send_multicast_data(struct amt_dev *amt, amt->relay_port, tunnel->source_port, false, - false); + false, + 0); } static bool amt_send_membership_query(struct amt_dev *amt, @@ -1161,7 +1163,8 @@ static bool amt_send_membership_query(struct amt_dev *amt, amt->relay_port, tunnel->source_port, false, - false); + false, + 0); amt_update_relay_status(tunnel, AMT_STATUS_SENT_QUERY, true); return false; } diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index a9dffdcac805..5e613080d3f8 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -362,8 +362,8 @@ static int bareudp_xmit_skb(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, bareudp->port, !net_eq(bareudp->net, dev_net(bareudp->dev)), - !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), + 0); return 0; free_dst: diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index ffc15a432689..c668e8b00ed2 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -921,8 +921,8 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, saddr, info->key.u.ipv4.dst, tos, ttl, df, sport, geneve->cfg.info.key.tp_dst, !net_eq(geneve->net, dev_net(geneve->dev)), - !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + !test_bit(IP_TUNNEL_CSUM_BIT, info->key.tun_flags), + 0); return 0; } diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index d4dec741c7f4..14584793fe4e 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -446,7 +446,8 @@ static int gtp0_send_echo_resp_ip(struct gtp_dev *gtp, struct sk_buff *skb) htons(GTP0_PORT), htons(GTP0_PORT), !net_eq(sock_net(gtp->sk1u), dev_net(gtp->dev)), - false); + false, + 0); return 0; } @@ -704,7 +705,8 @@ static int gtp1u_send_echo_resp(struct gtp_dev *gtp, struct sk_buff *skb) htons(GTP1U_PORT), htons(GTP1U_PORT), !net_eq(sock_net(gtp->sk1u), 
dev_net(gtp->dev)), - false); + false, + 0); return 0; } @@ -1304,7 +1306,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) pktinfo.gtph_port, pktinfo.gtph_port, !net_eq(sock_net(pktinfo.pctx->sk), dev_net(dev)), - false); + false, 0); break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) @@ -2405,7 +2407,7 @@ static int gtp_genl_send_echo_req(struct sk_buff *skb, struct genl_info *info) port, port, !net_eq(sock_net(sk), dev_net(gtp->dev)), - false); + false, 0); return 0; } diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index bff00946eae2..d866e6bfda70 100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -199,7 +199,7 @@ static int ovpn_udp4_output(struct ovpn_peer *peer, struct ovpn_bind *bind, transmit: udp_tunnel_xmit_skb(rt, sk, skb, fl.saddr, fl.daddr, 0, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, - fl.fl4_dport, false, sk->sk_no_check_tx); + fl.fl4_dport, false, sk->sk_no_check_tx, 0); ret = 0; err: local_bh_enable(); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 97792de896b7..1cc18acd242d 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2522,7 +2522,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, - src_port, dst_port, xnet, !udp_sum); + src_port, dst_port, xnet, !udp_sum, 0); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 0414d7a6ce74..88e685667bc0 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -84,7 +84,7 @@ static int send4(struct wg_device *wg, struct sk_buff *skb, skb->ignore_df = 1; udp_tunnel_xmit_skb(rt, sock, skb, fl.saddr, fl.daddr, ds, ip4_dst_hoplimit(&rt->dst), 0, fl.fl4_sport, - fl.fl4_dport, false, false); + fl.fl4_dport, false, false, 0); 
goto out; err: diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 0c3d571a04a1..8cf1380f3656 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -603,7 +603,7 @@ static inline int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, u8 proto, - u8 tos, u8 ttl, __be16 df, bool xnet); + u8 tos, u8 ttl, __be16 df, bool xnet, u16 ipcb_flags); struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md, gfp_t flags); int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst, diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 2df3b8344eb5..28102c8fd8a8 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -150,7 +150,7 @@ static inline void udp_tunnel_drop_rx_info(struct net_device *dev) void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck); + bool xnet, bool nocheck, u16 ipcb_flags); int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 678b8f96e3e9..aaeb5d16f0c9 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -668,7 +668,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_adj_headroom(dev, headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return; tx_error: DEV_STATS_INC(dev, tx_errors); @@ -857,7 +857,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_adj_headroom(dev, max_headroom); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); 
return; #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index f65d2f727381..cc9915543637 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -49,7 +49,8 @@ EXPORT_SYMBOL(ip6tun_encaps); void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, __be32 src, __be32 dst, __u8 proto, - __u8 tos, __u8 ttl, __be16 df, bool xnet) + __u8 tos, __u8 ttl, __be16 df, bool xnet, + u16 ipcb_flags) { int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); @@ -62,6 +63,7 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_clear_hash_if_not_l4(skb); skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); + IPCB(skb)->flags = ipcb_flags; /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 2326548997d3..9efd62505916 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, - bool xnet, bool nocheck) + bool xnet, bool nocheck, u16 ipcb_flags) { struct udphdr *uh; @@ -185,7 +185,8 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb udp_set_csum(nocheck, skb, src, dst, skb->len); - iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); + iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet, + ipcb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index a72dbca9e8fc..12496ba1b7d4 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1035,7 +1035,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, skb_set_inner_ipproto(skb, IPPROTO_IPV6); 
iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, - df, !net_eq(tunnel->net, dev_net(dev))); + df, !net_eq(tunnel->net, dev_net(dev)), 0); return NETDEV_TX_OK; tx_error_icmp: diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f402f90eb6b6..a5ccada55f2b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1103,7 +1103,8 @@ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_set_inner_ipproto(skb, IPPROTO_SCTP); udp_tunnel_xmit_skb(dst_rtable(dst), sk, skb, fl4->saddr, fl4->daddr, dscp, ip4_dst_hoplimit(dst), df, - sctp_sk(sk)->udp_port, t->encap_port, false, false); + sctp_sk(sk)->udp_port, t->encap_port, false, false, + 0); return 0; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 108a4cc2e001..87e8c1e6d550 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -197,7 +197,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, ttl = ip4_dst_hoplimit(&rt->dst); udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr, dst->ipv4.s_addr, 0, ttl, 0, src->port, - dst->port, false, true); + dst->port, false, true, 0); #if IS_ENABLED(CONFIG_IPV6) } else { if (!ndst) { -- cgit v1.2.3 From 35bec72a24ace52a7f57642ff2813f22733b08fd Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:12 +0200 Subject: net: ipv4: Add ip_mr_output() Multicast routing is today handled in the input path. Locally generated MC packets don't hit the IPMR code today. Thus if a VXLAN remote address is multicast, the driver needs to set an OIF during route lookup. Thus MC routing configuration needs to be kept in sync with the VXLAN FDB and MDB. Ideally, the VXLAN packets would be routed by the MC routing code instead. To that end, this patch adds support to route locally generated multicast packets. 
The newly-added routines do largely what ip_mr_input() and ip_mr_forward() do: make an MR cache lookup to find where to send the packets, and use ip_mc_output() to send each of them. When no cache entry is found, the packet is punted to the daemon for resolution. However, an installation that uses a VXLAN underlay netdevice for which it also has matching MC routes, would get a different routing with this patch. Previously, the MC packets would be delivered directly to the underlay port, whereas now they would be MC-routed. In order to avoid this change in behavior, introduce an IPCB flag. Only if the flag is set will ip_mr_output() actually engage, otherwise it reverts to ip_mc_output(). This code is based on work by Roopa Prabhu and Nikolay Aleksandrov. Signed-off-by: Roopa Prabhu Signed-off-by: Nikolay Aleksandrov Signed-off-by: Benjamin Poirier Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/0aadbd49330471c0f758d54afb05eb3b6e3a6b65.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/ip.h | 2 + net/ipv4/ipmr.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/route.c | 2 +- 3 files changed, 120 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 47ed6d23853d..375304bb99f6 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -59,6 +59,7 @@ struct inet_skb_parm { #define IPSKB_L3SLAVE BIT(7) #define IPSKB_NOPOLICY BIT(8) #define IPSKB_MULTIPATH BIT(9) +#define IPSKB_MCROUTE BIT(10) u16 frag_max_size; }; @@ -167,6 +168,7 @@ void ip_list_rcv(struct list_head *head, struct packet_type *pt, int ip_local_deliver(struct sk_buff *skb); void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int proto); int ip_mr_input(struct sk_buff *skb); +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip_mc_output(struct net *net, 
struct sock *sk, struct sk_buff *skb); int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 74d45fd5d11e..f78c4e53dc8c 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1965,6 +1965,19 @@ out_free: kfree_skb(skb); } +static void ipmr_queue_output_xmit(struct net *net, struct mr_table *mrt, + struct sk_buff *skb, int vifi) +{ + if (ipmr_prepare_xmit(net, mrt, skb, vifi)) + goto out_free; + + ip_mc_output(net, NULL, skb); + return; + +out_free: + kfree_skb(skb); +} + /* Called with mrt_lock or rcu_read_lock() */ static int ipmr_find_vif(const struct mr_table *mrt, struct net_device *dev) { @@ -2224,6 +2237,110 @@ dont_forward: return 0; } +static void ip_mr_output_finish(struct net *net, struct mr_table *mrt, + struct net_device *dev, struct sk_buff *skb, + struct mfc_cache *c) +{ + int psend = -1; + int ct; + + atomic_long_inc(&c->_c.mfc_un.res.pkt); + atomic_long_add(skb->len, &c->_c.mfc_un.res.bytes); + WRITE_ONCE(c->_c.mfc_un.res.lastuse, jiffies); + + /* Forward the frame */ + if (c->mfc_origin == htonl(INADDR_ANY) && + c->mfc_mcastgrp == htonl(INADDR_ANY)) { + if (ip_hdr(skb)->ttl > + c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { + /* It's an (*,*) entry and the packet is not coming from + * the upstream: forward the packet to the upstream + * only. 
+ */ + psend = c->_c.mfc_parent; + goto last_xmit; + } + goto dont_xmit; + } + + for (ct = c->_c.mfc_un.res.maxvif - 1; + ct >= c->_c.mfc_un.res.minvif; ct--) { + if (ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_output_xmit(net, mrt, + skb2, psend); + } + psend = ct; + } + } + +last_xmit: + if (psend != -1) { + ipmr_queue_output_xmit(net, mrt, skb, psend); + return; + } + +dont_xmit: + kfree_skb(skb); +} + +/* Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); + */ +int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct mfc_cache *cache; + struct net_device *dev; + struct mr_table *mrt; + int vif; + + WARN_ON_ONCE(!rcu_read_lock_held()); + dev = rt->dst.dev; + + if (IPCB(skb)->flags & IPSKB_FORWARDED) + goto mc_output; + if (!(IPCB(skb)->flags & IPSKB_MCROUTE)) + goto mc_output; + + skb->dev = dev; + + mrt = ipmr_rt_fib_lookup(net, skb); + if (IS_ERR(mrt)) + goto mc_output; + + /* already under rcu_read_lock() */ + cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); + if (!cache) { + vif = ipmr_find_vif(mrt, dev); + if (vif >= 0) + cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr, + vif); + } + + /* No usable cache entry */ + if (!cache) { + vif = ipmr_find_vif(mrt, dev); + if (vif >= 0) + return ipmr_cache_unresolved(mrt, vif, skb, dev); + goto mc_output; + } + + vif = cache->_c.mfc_parent; + if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) + goto mc_output; + + ip_mr_output_finish(net, mrt, dev, skb, cache); + return 0; + +mc_output: + return ip_mc_output(net, sk, skb); +} + #ifdef CONFIG_IP_PIMSM_V1 /* Handle IGMP messages of PIMv1 */ int pim_rcv_v1(struct sk_buff *skb) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index fccb05fb3a79..3ddf6bf40357 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2660,7 +2660,7 @@ add: if 
(IN_DEV_MFORWARD(in_dev) && !ipv4_is_local_multicast(fl4->daddr)) { rth->dst.input = ip_mr_input; - rth->dst.output = ip_mc_output; + rth->dst.output = ip_mr_output; } } #endif -- cgit v1.2.3 From 6a7d88ca15f73c5c570c372238f71d63da1fda55 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:13 +0200 Subject: net: ipv6: Make udp_tunnel6_xmit_skb() void The function always returns zero, thus the return value does not carry any signal. Just make it void. Most callers already ignore the return value. However: - Refold arguments of the call from sctp_v6_xmit() so that they fit into the 80-column limit. - tipc_udp_xmit() initializes err from the return value, but that should already be always zero at that point. So there's no practical change, but elision of the assignment prompts a couple more tweaks to clean up the function. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/7facacf9d8ca3ca9391a4aee88160913671b868d.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 14 +++++++------- net/ipv6/ip6_udp_tunnel.c | 15 +++++++-------- net/sctp/ipv6.c | 7 ++++--- net/tipc/udp_media.c | 10 +++++----- 4 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 28102c8fd8a8..0b01f6ade20d 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -152,13 +152,13 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck, u16 ipcb_flags); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock 
*sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck); void udp_tunnel_sock_release(struct socket *sock); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index c99053189ea8..21681718b7bb 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -74,13 +74,13 @@ error: } EXPORT_SYMBOL_GPL(udp_sock_create6); -int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, - struct sk_buff *skb, - struct net_device *dev, - const struct in6_addr *saddr, - const struct in6_addr *daddr, - __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck) +void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, + struct sk_buff *skb, + struct net_device *dev, + const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u8 prio, __u8 ttl, __be32 label, + __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -109,7 +109,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ip6h->saddr = *saddr; ip6tunnel_xmit(sk, skb, dev); - return 0; } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index a9ed2ccab1bd..d1ecf7454827 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -261,9 +261,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) skb_set_inner_ipproto(skb, IPPROTO_SCTP); label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, fl6); - return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, - &fl6->daddr, tclass, ip6_dst_hoplimit(dst), - label, sctp_sk(sk)->udp_port, t->encap_port, false); + udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr, &fl6->daddr, + tclass, ip6_dst_hoplimit(dst), label, + sctp_sk(sk)->udp_port, t->encap_port, false); + return 0; } /* Returns the dst cache entry for the given source and 
destination ip diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 87e8c1e6d550..414713fcd8c5 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -172,7 +172,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, struct udp_media_addr *dst, struct dst_cache *cache) { struct dst_entry *ndst; - int ttl, err = 0; + int ttl, err; local_bh_disable(); ndst = dst_cache_get(cache); @@ -217,13 +217,13 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, dst_cache_set_ip6(cache, ndst, &fl6.saddr); } ttl = ip6_dst_hoplimit(ndst); - err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, - &src->ipv6, &dst->ipv6, 0, ttl, 0, - src->port, dst->port, false); + udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, + &src->ipv6, &dst->ipv6, 0, ttl, 0, + src->port, dst->port, false); #endif } local_bh_enable(); - return err; + return 0; tx_error: local_bh_enable(); -- cgit v1.2.3 From f78c75d84fe83898f0a00658f593d4f17b38cbc6 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:14 +0200 Subject: net: ipv6: Add a flags argument to ip6tunnel_xmit(), udp_tunnel6_xmit_skb() ip6tunnel_xmit() erases the contents of the SKB control block. In order to be able to set particular IP6CB flags on the SKB, add a corresponding parameter, and propagate it to udp_tunnel6_xmit_skb() as well. In one of the following patches, VXLAN driver will use this facility to mark packets as subject to IPv6 multicast routing. 
Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/acb4f9f3e40c3a931236c3af08a720b017fbfbfb.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/bareudp.c | 3 ++- drivers/net/geneve.c | 3 ++- drivers/net/gtp.c | 2 +- drivers/net/ovpn/udp.c | 2 +- drivers/net/vxlan/vxlan_core.c | 3 ++- drivers/net/wireguard/socket.c | 2 +- include/net/ip6_tunnel.h | 3 ++- include/net/udp_tunnel.h | 3 ++- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 5 +++-- net/sctp/ipv6.c | 2 +- net/tipc/udp_media.c | 2 +- 12 files changed, 19 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index 5e613080d3f8..0df3208783ad 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -431,7 +431,8 @@ static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, &saddr, &daddr, prio, ttl, info->key.label, sport, bareudp->port, !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + info->key.tun_flags), + 0); return 0; free_dst: diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index c668e8b00ed2..f6bd155aae7f 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1014,7 +1014,8 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, &saddr, &key->u.ipv6.dst, prio, ttl, info->key.label, sport, geneve->cfg.info.key.tp_dst, !test_bit(IP_TUNNEL_CSUM_BIT, - info->key.tun_flags)); + info->key.tun_flags), + 0); return 0; } #endif diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 14584793fe4e..4b668ebaa0f7 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1316,7 +1316,7 @@ static netdev_tx_t gtp_dev_xmit(struct sk_buff *skb, struct net_device *dev) ip6_dst_hoplimit(&pktinfo.rt->dst), 0, pktinfo.gtph_port, pktinfo.gtph_port, - false); + false, 0); #else goto tx_err; #endif diff --git a/drivers/net/ovpn/udp.c b/drivers/net/ovpn/udp.c index d866e6bfda70..254cc94c4617 
100644 --- a/drivers/net/ovpn/udp.c +++ b/drivers/net/ovpn/udp.c @@ -274,7 +274,7 @@ transmit: skb->ignore_df = 1; udp_tunnel6_xmit_skb(dst, sk, skb, skb->dev, &fl.saddr, &fl.daddr, 0, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, - fl.fl6_dport, udp_get_no_check6_tx(sk)); + fl.fl6_dport, udp_get_no_check6_tx(sk), 0); ret = 0; err: local_bh_enable(); diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 1cc18acd242d..b22f9866be8e 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2586,7 +2586,8 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, - pkey->label, src_port, dst_port, !udp_sum); + pkey->label, src_port, dst_port, !udp_sum, + 0); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); diff --git a/drivers/net/wireguard/socket.c b/drivers/net/wireguard/socket.c index 88e685667bc0..253488f8c00f 100644 --- a/drivers/net/wireguard/socket.c +++ b/drivers/net/wireguard/socket.c @@ -151,7 +151,7 @@ static int send6(struct wg_device *wg, struct sk_buff *skb, skb->ignore_df = 1; udp_tunnel6_xmit_skb(dst, sock, skb, skb->dev, &fl.saddr, &fl.daddr, ds, ip6_dst_hoplimit(dst), 0, fl.fl6_sport, - fl.fl6_dport, false); + fl.fl6_dport, false, 0); goto out; err: diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 399592405c72..dd163495f353 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -152,11 +152,12 @@ int ip6_tnl_get_iflink(const struct net_device *dev); int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu); static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, - struct net_device *dev) + struct net_device *dev, u16 ip6cb_flags) { int pkt_len, err; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); err = 
ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 0b01f6ade20d..e3c70b579095 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -158,7 +158,8 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck); + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags); void udp_tunnel_sock_release(struct socket *sock); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 894d3158a6f0..a885bb5c98ea 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1278,7 +1278,7 @@ route_lookup: ipv6h->nexthdr = proto; ipv6h->saddr = fl6->saddr; ipv6h->daddr = fl6->daddr; - ip6tunnel_xmit(NULL, skb, dev); + ip6tunnel_xmit(NULL, skb, dev, 0); return 0; tx_err_link_failure: DEV_STATS_INC(dev, tx_carrier_errors); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 21681718b7bb..8ebe17a6058a 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -80,7 +80,8 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, const struct in6_addr *saddr, const struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, - __be16 src_port, __be16 dst_port, bool nocheck) + __be16 src_port, __be16 dst_port, bool nocheck, + u16 ip6cb_flags) { struct udphdr *uh; struct ipv6hdr *ip6h; @@ -108,7 +109,7 @@ void udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, ip6h->daddr = *daddr; ip6h->saddr = *saddr; - ip6tunnel_xmit(sk, skb, dev); + ip6tunnel_xmit(sk, skb, dev, ip6cb_flags); } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index d1ecf7454827..3336dcfb4515 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -263,7 +263,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t) udp_tunnel6_xmit_skb(dst, sk, 
skb, NULL, &fl6->saddr, &fl6->daddr, tclass, ip6_dst_hoplimit(dst), label, - sctp_sk(sk)->udp_port, t->encap_port, false); + sctp_sk(sk)->udp_port, t->encap_port, false, 0); return 0; } diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 414713fcd8c5..a024fcc8c0cb 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -219,7 +219,7 @@ static int tipc_udp_xmit(struct net *net, struct sk_buff *skb, ttl = ip6_dst_hoplimit(ndst); udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL, &src->ipv6, &dst->ipv6, 0, ttl, 0, - src->port, dst->port, false); + src->port, dst->port, false, 0); #endif } local_bh_enable(); -- cgit v1.2.3 From f8337efa4ff5a27e6c1d4e384166413eecd21a65 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Tue, 17 Jun 2025 00:44:19 +0200 Subject: vxlan: Support MC routing in the underlay Locally-generated MC packets have so far not been subject to MC routing. Instead an MC-enabled installation would maintain the MC routing tables, and separately from that the list of interfaces to send packets to as part of the VXLAN FDB and MDB. In a previous patch, the ip_mr_output() and ip6_mr_output() routines were added for IPv4 and IPv6. All locally generated MC traffic is now passed through these functions. For reasons of backward compatibility, an SKB (IPCB / IP6CB) flag guards the actual MC routing. This patch adds logic to set the flag, and the UAPI to enable the behavior.
Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/d899655bb7e9b2521ee8c793e67056b9fd02ba12.1750113335.git.petrm@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_core.c | 22 ++++++++++++++++++++-- include/net/vxlan.h | 5 ++++- include/uapi/linux/if_link.h | 1 + 3 files changed, 25 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index b22f9866be8e..a6cc1de4d8b8 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -2451,6 +2451,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, rcu_read_lock(); if (addr_family == AF_INET) { struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock); + u16 ipcb_flags = 0; struct rtable *rt; __be16 df = 0; __be32 saddr; @@ -2467,6 +2468,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ipcb_flags |= IPSKB_MCROUTE; + if (!info) { /* Bypass encapsulation if the destination is local */ err = encap_bypass_if_local(skb, dev, vxlan, AF_INET, @@ -2522,11 +2526,13 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, saddr, pkey->u.ipv4.dst, tos, ttl, df, - src_port, dst_port, xnet, !udp_sum, 0); + src_port, dst_port, xnet, !udp_sum, + ipcb_flags); #if IS_ENABLED(CONFIG_IPV6) } else { struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock); struct in6_addr saddr; + u16 ip6cb_flags = 0; if (!ifindex) ifindex = sock6->sock->sk->sk_bound_dev_if; @@ -2542,6 +2548,9 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, goto tx_error; } + if (flags & VXLAN_F_MC_ROUTE) + ip6cb_flags |= IP6SKB_MCROUTE; + if (!info) { u32 rt6i_flags = dst_rt6_info(ndst)->rt6i_flags; @@ -2587,7 +2596,7 @@ void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev, udp_tunnel6_xmit_skb(ndst, 
sock6->sock->sk, skb, dev, &saddr, &pkey->u.ipv6.dst, tos, ttl, pkey->label, src_port, dst_port, !udp_sum, - 0); + ip6cb_flags); #endif } vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len); @@ -3402,6 +3411,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = { [IFLA_VXLAN_LOCALBYPASS] = NLA_POLICY_MAX(NLA_U8, 1), [IFLA_VXLAN_LABEL_POLICY] = NLA_POLICY_MAX(NLA_U32, VXLAN_LABEL_MAX), [IFLA_VXLAN_RESERVED_BITS] = NLA_POLICY_EXACT_LEN(sizeof(struct vxlanhdr)), + [IFLA_VXLAN_MC_ROUTE] = NLA_POLICY_MAX(NLA_U8, 1), }; static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[], @@ -4315,6 +4325,14 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], return err; } + if (data[IFLA_VXLAN_MC_ROUTE]) { + err = vxlan_nl2flag(conf, data, IFLA_VXLAN_MC_ROUTE, + VXLAN_F_MC_ROUTE, changelink, + true, extack); + if (err) + return err; + } + if (tb[IFLA_MTU]) { if (changelink) { NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU], diff --git a/include/net/vxlan.h b/include/net/vxlan.h index e2f7ca045d3e..0ee50785f4f1 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -332,6 +332,7 @@ struct vxlan_dev { #define VXLAN_F_VNIFILTER 0x20000 #define VXLAN_F_MDB 0x40000 #define VXLAN_F_LOCALBYPASS 0x80000 +#define VXLAN_F_MC_ROUTE 0x100000 /* Flags that are used in the receive path. 
These flags must match in * order for a socket to be shareable @@ -353,7 +354,9 @@ struct vxlan_dev { VXLAN_F_UDP_ZERO_CSUM6_RX | \ VXLAN_F_COLLECT_METADATA | \ VXLAN_F_VNIFILTER | \ - VXLAN_F_LOCALBYPASS) + VXLAN_F_LOCALBYPASS | \ + VXLAN_F_MC_ROUTE | \ + 0) struct net_device *vxlan_dev_create(struct net *net, const char *name, u8 name_assign_type, struct vxlan_config *conf); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 3ad2d5d98034..873c285996fe 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -1398,6 +1398,7 @@ enum { IFLA_VXLAN_LOCALBYPASS, IFLA_VXLAN_LABEL_POLICY, /* IPv6 flow label policy; ifla_vxlan_label_policy */ IFLA_VXLAN_RESERVED_BITS, + IFLA_VXLAN_MC_ROUTE, __IFLA_VXLAN_MAX }; #define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) -- cgit v1.2.3 From fd0406e5ca53b804353d4b1b60a980c13cbfbea3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 16 Jun 2025 08:10:47 -1000 Subject: net: tcp: tsq: Convert from tasklet to BH workqueue The only generic interface to execute asynchronously in the BH context is tasklet; however, it's marked deprecated and has some design flaws. To replace tasklets, BH workqueue support was recently added. A BH workqueue behaves similarly to regular workqueues except that the queued work items are executed in the BH context. This patch converts TCP Small Queues implementation from tasklet to BH workqueue. Semantically, this is an equivalent conversion and there shouldn't be any user-visible behavior changes. While workqueue's queueing and execution paths are a bit heavier than tasklet's, unless the work item is being queued every packet, the difference hopefully shouldn't matter. My experience with the networking stack is very limited and this patch definitely needs attention from someone who actually understands networking. 
Signed-off-by: Tejun Heo Reviewed-by: Jason Xing Reviewed-by: Eric Dumazet Cc: David Ahern Link: https://patch.msgid.link/aFBeJ38AS1ZF3Dq5@slm.duckdns.org Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_output.c | 36 ++++++++++++++++++------------------ 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 9f852f5f8b95..761c4a0ad386 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -321,7 +321,7 @@ extern struct proto tcp_prot; #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) -void tcp_tasklet_init(void); +void tcp_tsq_work_init(void); int tcp_v4_err(struct sk_buff *skb, u32); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 27d3ef83ce7b..8a3c99246d2e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5242,6 +5242,6 @@ void __init tcp_init(void) tcp_v4_init(); tcp_metrics_init(); BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0); - tcp_tasklet_init(); + tcp_tsq_work_init(); mptcp_init(); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index eb50746dc482..28f840724fe8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1066,15 +1066,15 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb * needs to be reallocated in a driver. * The invariant being skb->truesize subtracted from sk->sk_wmem_alloc * - * Since transmit from skb destructor is forbidden, we use a tasklet + * Since transmit from skb destructor is forbidden, we use a BH work item * to process all sockets that eventually need to send more skbs. - * We use one tasklet per cpu, with its own queue of sockets. + * We use one work item per cpu, with its own queue of sockets. 
*/ -struct tsq_tasklet { - struct tasklet_struct tasklet; +struct tsq_work { + struct work_struct work; struct list_head head; /* queue of tcp sockets */ }; -static DEFINE_PER_CPU(struct tsq_tasklet, tsq_tasklet); +static DEFINE_PER_CPU(struct tsq_work, tsq_work); static void tcp_tsq_write(struct sock *sk) { @@ -1104,14 +1104,14 @@ static void tcp_tsq_handler(struct sock *sk) bh_unlock_sock(sk); } /* - * One tasklet per cpu tries to send more skbs. - * We run in tasklet context but need to disable irqs when + * One work item per cpu tries to send more skbs. + * We run in BH context but need to disable irqs when * transferring tsq->head because tcp_wfree() might * interrupt us (non NAPI drivers) */ -static void tcp_tasklet_func(struct tasklet_struct *t) +static void tcp_tsq_workfn(struct work_struct *work) { - struct tsq_tasklet *tsq = from_tasklet(tsq, t, tasklet); + struct tsq_work *tsq = container_of(work, struct tsq_work, work); LIST_HEAD(list); unsigned long flags; struct list_head *q, *n; @@ -1181,15 +1181,15 @@ void tcp_release_cb(struct sock *sk) } EXPORT_IPV6_MOD(tcp_release_cb); -void __init tcp_tasklet_init(void) +void __init tcp_tsq_work_init(void) { int i; for_each_possible_cpu(i) { - struct tsq_tasklet *tsq = &per_cpu(tsq_tasklet, i); + struct tsq_work *tsq = &per_cpu(tsq_work, i); INIT_LIST_HEAD(&tsq->head); - tasklet_setup(&tsq->tasklet, tcp_tasklet_func); + INIT_WORK(&tsq->work, tcp_tsq_workfn); } } @@ -1203,11 +1203,11 @@ void tcp_wfree(struct sk_buff *skb) struct sock *sk = skb->sk; struct tcp_sock *tp = tcp_sk(sk); unsigned long flags, nval, oval; - struct tsq_tasklet *tsq; + struct tsq_work *tsq; bool empty; /* Keep one reference on sk_wmem_alloc. 
- * Will be released by sk_free() from here or tcp_tasklet_func() + * Will be released by sk_free() from here or tcp_tsq_workfn() */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc)); @@ -1229,13 +1229,13 @@ void tcp_wfree(struct sk_buff *skb) nval = (oval & ~TSQF_THROTTLED) | TSQF_QUEUED; } while (!try_cmpxchg(&sk->sk_tsq_flags, &oval, nval)); - /* queue this socket to tasklet queue */ + /* queue this socket to BH workqueue */ local_irq_save(flags); - tsq = this_cpu_ptr(&tsq_tasklet); + tsq = this_cpu_ptr(&tsq_work); empty = list_empty(&tsq->head); list_add(&tp->tsq_node, &tsq->head); if (empty) - tasklet_schedule(&tsq->tasklet); + queue_work(system_bh_wq, &tsq->work); local_irq_restore(flags); return; out: @@ -2634,7 +2634,7 @@ static bool tcp_small_queue_check(struct sock *sk, const struct sk_buff *skb, if (refcount_read(&sk->sk_wmem_alloc) > limit) { /* Always send skb if rtx queue is empty or has one skb. * No need to wait for TX completion to call us back, - * after softirq/tasklet schedule. + * after softirq schedule. * This helps when TX completions are delayed too much. */ if (tcp_rtx_queue_empty_or_single_skb(sk)) -- cgit v1.2.3 From c9e1225352d48b184991a4edc77b897cac66991e Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 16 Jun 2025 17:14:30 +0300 Subject: net: Allow const args for page_to_netmem() This allows calling page_to_netmem() with a const page * argument.
Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Reviewed-by: Cosmin Ratiu Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250616141441.1243044-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netmem.h b/include/net/netmem.h index 850869b45b45..7a1dafa3f080 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -139,7 +139,7 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) return (__force netmem_ref)((unsigned long)niov | NET_IOV); } -static inline netmem_ref page_to_netmem(struct page *page) +static inline netmem_ref page_to_netmem(const struct page *page) { return (__force netmem_ref)page; } -- cgit v1.2.3 From a202f24b08587021a39eade5aa5444d5714689fb Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Mon, 16 Jun 2025 17:14:32 +0300 Subject: page_pool: Add page_pool_dev_alloc_netmems helper This is the netmem counterpart of page_pool_dev_alloc_pages() which uses the default GFP flags for RX. 
Signed-off-by: Dragos Tatulea Reviewed-by: Mina Almasry Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250616141441.1243044-4-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/net') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 93f2c31baf9b..773fc65780b5 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -153,6 +153,13 @@ static inline netmem_ref page_pool_dev_alloc_netmem(struct page_pool *pool, return page_pool_alloc_netmem(pool, offset, size, gfp); } +static inline netmem_ref page_pool_dev_alloc_netmems(struct page_pool *pool) +{ + gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + + return page_pool_alloc_netmems(pool, gfp); +} + static inline struct page *page_pool_alloc(struct page_pool *pool, unsigned int *offset, unsigned int *size, gfp_t gfp) -- cgit v1.2.3 From a33556940b5727191613104bced53c93f4a7a3aa Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 17 Jun 2025 21:06:13 +0800 Subject: tcp: Remove inet_hashinfo2_free_mod() DCCP was removed, inet_hashinfo2_free_mod() is unused now. 
Signed-off-by: Yue Haibing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250617130613.498659-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 4564b5d348b1..ae09e91398a5 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -202,12 +202,6 @@ static inline spinlock_t *inet_ehash_lockp( int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo); -static inline void inet_hashinfo2_free_mod(struct inet_hashinfo *h) -{ - kfree(h->lhash2); - h->lhash2 = NULL; -} - static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) { kvfree(hashinfo->ehash_locks); -- cgit v1.2.3 From 1ead7501094c6a61461c0c98dde9ec5660fa1e24 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 16 Jun 2025 09:21:14 -0700 Subject: udp_tunnel: remove rtnl_lock dependency Drivers that are using ops lock and don't depend on RTNL lock still need to manage it because of udp_tunnel's RTNL dependency. Introduce new udp_tunnel_nic_lock and use it instead of rtnl_lock. Drop non-UDP_TUNNEL_NIC_INFO_MAY_SLEEP mode from udp_tunnel infra (udp_tunnel_nic_device_sync_work needs to grab udp_tunnel_nic_lock mutex and might sleep).
Cover more places in v4: - netlink - udp_tunnel_notify_add_rx_port (ndo_open) - triggers udp_tunnel_nic_device_sync_work - udp_tunnel_notify_del_rx_port (ndo_stop) - triggers udp_tunnel_nic_device_sync_work - udp_tunnel_get_rx_info (__netdev_update_features) - triggers NETDEV_UDP_TUNNEL_PUSH_INFO - udp_tunnel_drop_rx_info (__netdev_update_features) - triggers NETDEV_UDP_TUNNEL_DROP_INFO - udp_tunnel_nic_reset_ntf (ndo_open) - notifiers - udp_tunnel_nic_netdevice_event, depending on the event: - triggers NETDEV_UDP_TUNNEL_PUSH_INFO - triggers NETDEV_UDP_TUNNEL_DROP_INFO - ethnl_tunnel_info_reply_size - udp_tunnel_nic_set_port_priv (two intel drivers) Cc: Michael Chan Suggested-by: Jakub Kicinski Signed-off-by: Stanislav Fomichev Link: https://patch.msgid.link/20250616162117.287806-4-stfomichev@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c | 3 +- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 6 +- drivers/net/ethernet/emulex/benet/be_main.c | 3 +- drivers/net/ethernet/intel/i40e/i40e_main.c | 1 - drivers/net/ethernet/intel/ice/ice_main.c | 1 - drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 3 +- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 +- .../net/ethernet/netronome/nfp/nfp_net_common.c | 3 +- drivers/net/ethernet/qlogic/qede/qede_filter.c | 3 - drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c | 1 - drivers/net/ethernet/sfc/ef10.c | 1 - drivers/net/netdevsim/udp_tunnels.c | 4 - include/net/udp_tunnel.h | 87 ++++++++++++++++------ net/core/dev.c | 2 + net/ipv4/udp_tunnel_core.c | 16 ++-- net/ipv4/udp_tunnel_nic.c | 78 ++++++++++++++----- 16 files changed, 142 insertions(+), 73 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c index c9a1a1d504c0..3ee4b848ef53 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c @@ -10219,8 +10219,7 @@ static int 
bnx2x_udp_tunnel_sync(struct net_device *netdev, unsigned int table) static const struct udp_tunnel_nic_info bnx2x_udp_tunnels = { .sync_table = bnx2x_udp_tunnel_sync, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | - UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, }, diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 00a60b2b90c4..ededd292b9d3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -15573,8 +15573,7 @@ static int bnxt_udp_tunnel_unset_port(struct net_device *netdev, unsigned int ta static const struct udp_tunnel_nic_info bnxt_udp_tunnels = { .set_port = bnxt_udp_tunnel_set_port, .unset_port = bnxt_udp_tunnel_unset_port, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | - UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, }, @@ -15582,8 +15581,7 @@ static const struct udp_tunnel_nic_info bnxt_udp_tunnels = { }, bnxt_udp_tunnels_p7 = { .set_port = bnxt_udp_tunnel_set_port, .unset_port = bnxt_udp_tunnel_unset_port, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | - UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, }, diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 3d2e21592119..f49400ba9729 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4031,8 +4031,7 @@ static int be_vxlan_unset_port(struct net_device *netdev, unsigned int table, static const struct udp_tunnel_nic_info be_udp_tunnels = { 
.set_port = be_vxlan_set_port, .unset_port = be_vxlan_unset_port, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | - UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, }, diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1156a5b3055c..3b4f59d978a5 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -15895,7 +15895,6 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pf->udp_tunnel_nic.set_port = i40e_udp_tunnel_set_port; pf->udp_tunnel_nic.unset_port = i40e_udp_tunnel_unset_port; - pf->udp_tunnel_nic.flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP; pf->udp_tunnel_nic.shared = &pf->udp_tunnel_shared; pf->udp_tunnel_nic.tables[0].n_entries = I40E_MAX_PF_UDP_OFFLOAD_PORTS; pf->udp_tunnel_nic.tables[0].tunnel_types = UDP_TUNNEL_TYPE_VXLAN | diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 7959a65c0903..f8ef80069e3d 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4767,7 +4767,6 @@ int ice_init_dev(struct ice_pf *pf) pf->hw.udp_tunnel_nic.set_port = ice_udp_tunnel_set_port; pf->hw.udp_tunnel_nic.unset_port = ice_udp_tunnel_unset_port; - pf->hw.udp_tunnel_nic.flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP; pf->hw.udp_tunnel_nic.shared = &pf->hw.udp_tunnel_shared; if (pf->hw.tnl.valid_count[TNL_VXLAN]) { pf->hw.udp_tunnel_nic.tables[0].n_entries = diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 281b34af0bb4..d2071aff7b8f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2670,8 +2670,7 @@ static int mlx4_udp_tunnel_sync(struct net_device *dev, unsigned int table) static const struct udp_tunnel_nic_info mlx4_udp_tunnels 
= { .sync_table = mlx4_udp_tunnel_sync, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | - UDP_TUNNEL_NIC_INFO_IPV4_ONLY, + .flags = UDP_TUNNEL_NIC_INFO_IPV4_ONLY, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, }, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 24559cbcbfc2..dca5ca51a470 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5351,8 +5351,7 @@ void mlx5e_vxlan_set_netdev_info(struct mlx5e_priv *priv) priv->nic_info.set_port = mlx5e_vxlan_set_port; priv->nic_info.unset_port = mlx5e_vxlan_unset_port; - priv->nic_info.flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | - UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN; + priv->nic_info.flags = UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN; priv->nic_info.tables[0].tunnel_types = UDP_TUNNEL_TYPE_VXLAN; /* Don't count the space hard-coded to the IANA port */ priv->nic_info.tables[0].n_entries = diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 932f59d70f41..132626a3f9f7 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2394,8 +2394,7 @@ static int nfp_udp_tunnel_sync(struct net_device *netdev, unsigned int table) static const struct udp_tunnel_nic_info nfp_udp_tunnels = { .sync_table = nfp_udp_tunnel_sync, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP | - UDP_TUNNEL_NIC_INFO_OPEN_ONLY, + .flags = UDP_TUNNEL_NIC_INFO_OPEN_ONLY, .tables = { { .n_entries = NFP_NET_N_VXLAN_PORTS, diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index 985026dd816f..7e341e026489 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -987,20 +987,17 @@ static int qede_udp_tunnel_sync(struct net_device *dev, unsigned int table) static 
const struct udp_tunnel_nic_info qede_udp_tunnels_both = { .sync_table = qede_udp_tunnel_sync, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, }, }, }, qede_udp_tunnels_vxlan = { .sync_table = qede_udp_tunnel_sync, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, }, }, qede_udp_tunnels_geneve = { .sync_table = qede_udp_tunnel_sync, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_GENEVE, }, }, diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c index eb69121df726..53cdd36c4123 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_main.c @@ -486,7 +486,6 @@ static int qlcnic_udp_tunnel_sync(struct net_device *dev, unsigned int table) static const struct udp_tunnel_nic_info qlcnic_udp_tunnels = { .sync_table = qlcnic_udp_tunnel_sync, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP, .tables = { { .n_entries = 1, .tunnel_types = UDP_TUNNEL_TYPE_VXLAN, }, }, diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c index 47349c148c0c..fcec81f862ec 100644 --- a/drivers/net/ethernet/sfc/ef10.c +++ b/drivers/net/ethernet/sfc/ef10.c @@ -3985,7 +3985,6 @@ static int efx_ef10_udp_tnl_unset_port(struct net_device *dev, static const struct udp_tunnel_nic_info efx_ef10_udp_tunnels = { .set_port = efx_ef10_udp_tnl_set_port, .unset_port = efx_ef10_udp_tnl_unset_port, - .flags = UDP_TUNNEL_NIC_INFO_MAY_SLEEP, .tables = { { .n_entries = 16, diff --git a/drivers/net/netdevsim/udp_tunnels.c b/drivers/net/netdevsim/udp_tunnels.c index 640b4983a9a0..10cbbf1c584b 100644 --- a/drivers/net/netdevsim/udp_tunnels.c +++ b/drivers/net/netdevsim/udp_tunnels.c @@ -112,12 +112,10 @@ nsim_udp_tunnels_info_reset_write(struct file 
*file, const char __user *data, struct net_device *dev = file->private_data; struct netdevsim *ns = netdev_priv(dev); - rtnl_lock(); if (dev->reg_state == NETREG_REGISTERED) { memset(ns->udp_ports.ports, 0, sizeof(ns->udp_ports.__ports)); udp_tunnel_nic_reset_ntf(dev); } - rtnl_unlock(); return count; } @@ -181,8 +179,6 @@ int nsim_udp_tunnels_info_create(struct nsim_dev *nsim_dev, info->sync_table = NULL; } - if (ns->udp_ports.sleep) - info->flags |= UDP_TUNNEL_NIC_INFO_MAY_SLEEP; if (nsim_dev->udp_ports.open_only) info->flags |= UDP_TUNNEL_NIC_INFO_OPEN_ONLY; if (nsim_dev->udp_ports.ipv4_only) diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index e3c70b579095..cbd3a43074bd 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -130,22 +130,6 @@ void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type); void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type); -static inline void udp_tunnel_get_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); -} - -static inline void udp_tunnel_drop_rx_info(struct net_device *dev) -{ - ASSERT_RTNL(); - if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) - return; - call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); -} - /* Transmit the skb using UDP encapsulation. */ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, @@ -222,19 +206,17 @@ static inline void udp_tunnel_encap_enable(struct sock *sk) #define UDP_TUNNEL_NIC_MAX_TABLES 4 enum udp_tunnel_nic_info_flags { - /* Device callbacks may sleep */ - UDP_TUNNEL_NIC_INFO_MAY_SLEEP = BIT(0), /* Device only supports offloads when it's open, all ports * will be removed before close and re-added after open. 
*/ - UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(1), + UDP_TUNNEL_NIC_INFO_OPEN_ONLY = BIT(0), /* Device supports only IPv4 tunnels */ - UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(2), + UDP_TUNNEL_NIC_INFO_IPV4_ONLY = BIT(1), /* Device has hard-coded the IANA VXLAN port (4789) as VXLAN. * This port must not be counted towards n_entries of any table. * Driver will not receive any callback associated with port 4789. */ - UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(3), + UDP_TUNNEL_NIC_INFO_STATIC_IANA_VXLAN = BIT(2), }; struct udp_tunnel_nic; @@ -325,6 +307,9 @@ struct udp_tunnel_nic_ops { size_t (*dump_size)(struct net_device *dev, unsigned int table); int (*dump_write)(struct net_device *dev, unsigned int table, struct sk_buff *skb); + void (*assert_locked)(struct net_device *dev); + void (*lock)(struct net_device *dev); + void (*unlock)(struct net_device *dev); }; #ifdef CONFIG_INET @@ -353,8 +338,29 @@ static inline void udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { - if (udp_tunnel_nic_ops) + if (udp_tunnel_nic_ops) { + udp_tunnel_nic_ops->lock(dev); udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); + udp_tunnel_nic_ops->unlock(dev); + } +} + +static inline void udp_tunnel_nic_assert_locked(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->assert_locked(dev); +} + +static inline void udp_tunnel_nic_lock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->lock(dev); +} + +static inline void udp_tunnel_nic_unlock(struct net_device *dev) +{ + if (udp_tunnel_nic_ops) + udp_tunnel_nic_ops->unlock(dev); } static inline void @@ -396,17 +402,50 @@ static inline void udp_tunnel_nic_reset_ntf(struct net_device *dev) static inline size_t udp_tunnel_nic_dump_size(struct net_device *dev, unsigned int table) { + size_t ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_size(dev, table); + + udp_tunnel_nic_ops->lock(dev); + ret = 
udp_tunnel_nic_ops->dump_size(dev, table); + udp_tunnel_nic_ops->unlock(dev); + + return ret; } static inline int udp_tunnel_nic_dump_write(struct net_device *dev, unsigned int table, struct sk_buff *skb) { + int ret; + if (!udp_tunnel_nic_ops) return 0; - return udp_tunnel_nic_ops->dump_write(dev, table, skb); + + udp_tunnel_nic_ops->lock(dev); + ret = udp_tunnel_nic_ops->dump_write(dev, table, skb); + udp_tunnel_nic_ops->unlock(dev); + + return ret; +} + +static inline void udp_tunnel_get_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_PUSH_INFO, dev); } + +static inline void udp_tunnel_drop_rx_info(struct net_device *dev) +{ + ASSERT_RTNL(); + if (!(dev->features & NETIF_F_RX_UDP_TUNNEL_PORT)) + return; + udp_tunnel_nic_assert_locked(dev); + call_netdevice_notifiers(NETDEV_UDP_TUNNEL_DROP_INFO, dev); +} + #endif diff --git a/net/core/dev.c b/net/core/dev.c index 5baa4691074f..43f56b44f351 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10771,12 +10771,14 @@ sync_lower: * *before* calling udp_tunnel_get_rx_info, * but *after* calling udp_tunnel_drop_rx_info. 
*/ + udp_tunnel_nic_lock(dev); if (features & NETIF_F_RX_UDP_TUNNEL_PORT) { dev->features = features; udp_tunnel_get_rx_info(dev); } else { udp_tunnel_drop_rx_info(dev); } + udp_tunnel_nic_unlock(dev); } if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) { diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 9efd62505916..fce945f23069 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -134,15 +134,17 @@ void udp_tunnel_notify_add_rx_port(struct socket *sock, unsigned short type) struct udp_tunnel_info ti; struct net_device *dev; + ASSERT_RTNL(); + ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { + udp_tunnel_nic_lock(dev); udp_tunnel_nic_add_port(dev, &ti); + udp_tunnel_nic_unlock(dev); } - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); @@ -154,15 +156,17 @@ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) struct udp_tunnel_info ti; struct net_device *dev; + ASSERT_RTNL(); + ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; - rcu_read_lock(); - for_each_netdev_rcu(net, dev) { + for_each_netdev(net, dev) { + udp_tunnel_nic_lock(dev); udp_tunnel_nic_del_port(dev, &ti); + udp_tunnel_nic_unlock(dev); } - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index b6d2d16189c0..ff66db48453c 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -29,6 +29,7 @@ struct udp_tunnel_nic_table_entry { * struct udp_tunnel_nic - UDP tunnel port offload state * @work: async work for talking to hardware from process context * @dev: netdev pointer + * @lock: protects all fields * @need_sync: at least one port start changed * @need_replay: space was freed, we need a replay of all ports * @work_pending: @work is currently scheduled @@ -41,6 +42,8 @@ 
struct udp_tunnel_nic { struct net_device *dev; + struct mutex lock; + u8 need_sync:1; u8 need_replay:1; u8 work_pending:1; @@ -298,22 +301,11 @@ __udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) static void udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn) { - const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; - bool may_sleep; - if (!utn->need_sync) return; - /* Drivers which sleep in the callback need to update from - * the workqueue, if we come from the tunnel driver's notification. - */ - may_sleep = info->flags & UDP_TUNNEL_NIC_INFO_MAY_SLEEP; - if (!may_sleep) - __udp_tunnel_nic_device_sync(dev, utn); - if (may_sleep || utn->need_replay) { - queue_work(udp_tunnel_nic_workqueue, &utn->work); - utn->work_pending = 1; - } + queue_work(udp_tunnel_nic_workqueue, &utn->work); + utn->work_pending = 1; } static bool @@ -554,12 +546,12 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) struct udp_tunnel_nic *utn; unsigned int i, j; - ASSERT_RTNL(); - utn = dev->udp_tunnel_nic; if (!utn) return; + mutex_lock(&utn->lock); + utn->need_sync = false; for (i = 0; i < utn->n_tables; i++) for (j = 0; j < info->tables[i].n_entries; j++) { @@ -569,7 +561,7 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) entry->flags &= ~(UDP_TUNNEL_NIC_ENTRY_DEL | UDP_TUNNEL_NIC_ENTRY_OP_FAIL); - /* We don't release rtnl across ops */ + /* We don't release utn lock across ops */ WARN_ON(entry->flags & UDP_TUNNEL_NIC_ENTRY_FROZEN); if (!entry->use_cnt) continue; @@ -579,6 +571,8 @@ static void __udp_tunnel_nic_reset_ntf(struct net_device *dev) } __udp_tunnel_nic_device_sync(dev, utn); + + mutex_unlock(&utn->lock); } static size_t @@ -643,6 +637,33 @@ err_cancel: return -EMSGSIZE; } +static void __udp_tunnel_nic_assert_locked(struct net_device *dev) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (utn) + lockdep_assert_held(&utn->lock); +} + +static void 
__udp_tunnel_nic_lock(struct net_device *dev) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (utn) + mutex_lock(&utn->lock); +} + +static void __udp_tunnel_nic_unlock(struct net_device *dev) +{ + struct udp_tunnel_nic *utn; + + utn = dev->udp_tunnel_nic; + if (utn) + mutex_unlock(&utn->lock); +} + static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .get_port = __udp_tunnel_nic_get_port, .set_port_priv = __udp_tunnel_nic_set_port_priv, @@ -651,6 +672,9 @@ static const struct udp_tunnel_nic_ops __udp_tunnel_nic_ops = { .reset_ntf = __udp_tunnel_nic_reset_ntf, .dump_size = __udp_tunnel_nic_dump_size, .dump_write = __udp_tunnel_nic_dump_write, + .assert_locked = __udp_tunnel_nic_assert_locked, + .lock = __udp_tunnel_nic_lock, + .unlock = __udp_tunnel_nic_unlock, }; static void @@ -710,11 +734,15 @@ static void udp_tunnel_nic_device_sync_work(struct work_struct *work) container_of(work, struct udp_tunnel_nic, work); rtnl_lock(); + mutex_lock(&utn->lock); + utn->work_pending = 0; __udp_tunnel_nic_device_sync(utn->dev, utn); if (utn->need_replay) udp_tunnel_nic_replay(utn->dev, utn); + + mutex_unlock(&utn->lock); rtnl_unlock(); } @@ -730,6 +758,7 @@ udp_tunnel_nic_alloc(const struct udp_tunnel_nic_info *info, return NULL; utn->n_tables = n_tables; INIT_WORK(&utn->work, udp_tunnel_nic_device_sync_work); + mutex_init(&utn->lock); for (i = 0; i < n_tables; i++) { utn->entries[i] = kcalloc(info->tables[i].n_entries, @@ -821,8 +850,11 @@ static int udp_tunnel_nic_register(struct net_device *dev) dev_hold(dev); dev->udp_tunnel_nic = utn; - if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) + if (!(info->flags & UDP_TUNNEL_NIC_INFO_OPEN_ONLY)) { + udp_tunnel_nic_lock(dev); udp_tunnel_get_rx_info(dev); + udp_tunnel_nic_unlock(dev); + } return 0; } @@ -832,6 +864,8 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) { const struct udp_tunnel_nic_info *info = dev->udp_tunnel_nic_info; + udp_tunnel_nic_lock(dev); + /* 
For a shared table remove this dev from the list of sharing devices * and if there are other devices just detach. */ @@ -841,8 +875,10 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) list_for_each_entry(node, &info->shared->devices, list) if (node->dev == dev) break; - if (list_entry_is_head(node, &info->shared->devices, list)) + if (list_entry_is_head(node, &info->shared->devices, list)) { + udp_tunnel_nic_unlock(dev); return; + } list_del(&node->list); kfree(node); @@ -852,6 +888,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) if (first) { udp_tunnel_drop_rx_info(dev); utn->dev = first->dev; + udp_tunnel_nic_unlock(dev); goto release_dev; } @@ -862,6 +899,7 @@ udp_tunnel_nic_unregister(struct net_device *dev, struct udp_tunnel_nic *utn) * from the work which we will boot immediately. */ udp_tunnel_nic_flush(dev, utn); + udp_tunnel_nic_unlock(dev); /* Wait for the work to be done using the state, netdev core will * retry unregister until we give up our reference on this device. @@ -910,12 +948,16 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused, return NOTIFY_DONE; if (event == NETDEV_UP) { + udp_tunnel_nic_lock(dev); WARN_ON(!udp_tunnel_nic_is_empty(dev, utn)); udp_tunnel_get_rx_info(dev); + udp_tunnel_nic_unlock(dev); return NOTIFY_OK; } if (event == NETDEV_GOING_DOWN) { + udp_tunnel_nic_lock(dev); udp_tunnel_nic_flush(dev, utn); + udp_tunnel_nic_unlock(dev); return NOTIFY_OK; } -- cgit v1.2.3 From cea465a96a294e7bc2537f27a737cfa7c6234b3d Mon Sep 17 00:00:00 2001 From: David Arinzon Date: Tue, 17 Jun 2025 14:05:41 +0300 Subject: devlink: Add new "enable_phc" generic device param Add a new device generic parameter to enable/disable the PHC (PTP Hardware Clock) functionality in the device associated with the devlink instance. 
Signed-off-by: David Arinzon Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250617110545.5659-6-darinzon@amazon.com Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 3 +++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index 4e01dc32bc08..3da8f4ef2417 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -137,3 +137,6 @@ own name. * - ``event_eq_size`` - u32 - Control the size of asynchronous control events EQ. + * - ``enable_phc`` + - Boolean + - Enable PHC (PTP Hardware Clock) functionality in the device. diff --git a/include/net/devlink.h b/include/net/devlink.h index 0091f23a40f7..63517646a497 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -520,6 +520,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, + DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -578,6 +579,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME "event_eq_size" #define DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE DEVLINK_PARAM_TYPE_U32 +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc" +#define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index b29abf8d3ed4..396b8a7f6013 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -92,6 +92,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, .type = 
DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, + .name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME, + .type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From 75cabb46935b6de8e2bdfde563e460ac41cfff12 Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Tue, 17 Jun 2025 00:17:34 -0700 Subject: net: mana: Add support for net_shaper_ops Introduce support for net_shaper_ops in the MANA driver, enabling configuration of rate limiting on the MANA NIC. To apply rate limiting, the driver issues a HWC command via mana_set_bw_clamp() and updates the corresponding shaper object in the net_shaper cache. If an error occurs during this process, the driver restores the previous speed by querying the current link configuration using mana_query_link_cfg(). The minimum supported bandwidth is 100 Mbps, and only values that are exact multiples of 100 Mbps are allowed. Any other values are rejected. To remove a shaper, the driver resets the bandwidth to the maximum supported by the SKU using mana_set_bw_clamp() and clears the associated cache entry. If an error occurs during this process, the shaper details are retained. On the hardware that does not support these APIs, the net-shaper calls to set speed would fail. 
Set the speed: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/net_shaper.yaml \ --do set --json '{"ifindex":'$IFINDEX', "handle":{"scope": "netdev", "id":'$ID' }, "bw-max": 200000000 }' Get the shaper details: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/net_shaper.yaml \ --do get --json '{"ifindex":'$IFINDEX', "handle":{"scope": "netdev", "id":'$ID' }}' > {'bw-max': 200000000, > 'handle': {'scope': 'netdev'}, > 'ifindex': $IFINDEX, > 'metric': 'bps'} Delete the shaper object: ./tools/net/ynl/pyynl/cli.py \ --spec Documentation/netlink/specs/net_shaper.yaml \ --do delete --json '{"ifindex":'$IFINDEX', "handle":{"scope": "netdev","id":'$ID' }}' Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Saurabh Singh Sengar Reviewed-by: Long Li Link: https://patch.msgid.link/1750144656-2021-3-git-send-email-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 155 ++++++++++++++++++++++++++ include/net/mana/mana.h | 40 +++++++ 2 files changed, 195 insertions(+) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index bcc33ea7aca3..547dff450b6d 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -719,6 +719,78 @@ out: return err; } +static int mana_shaper_set(struct net_shaper_binding *binding, + const struct net_shaper *shaper, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(binding->netdev); + u32 old_speed, rate; + int err; + + if (shaper->handle.scope != NET_SHAPER_SCOPE_NETDEV) { + NL_SET_ERR_MSG_MOD(extack, "net shaper scope should be netdev"); + return -EINVAL; + } + + if (apc->handle.id && shaper->handle.id != apc->handle.id) { + NL_SET_ERR_MSG_MOD(extack, "Cannot create multiple shapers"); + return -EOPNOTSUPP; + } + + if (!shaper->bw_max || 
(shaper->bw_max % 100000000)) { + NL_SET_ERR_MSG_MOD(extack, "Please use multiples of 100Mbps for bandwidth"); + return -EINVAL; + } + + rate = div_u64(shaper->bw_max, 1000); /* Convert bps to Kbps */ + rate = div_u64(rate, 1000); /* Convert Kbps to Mbps */ + + /* Get current speed */ + err = mana_query_link_cfg(apc); + old_speed = (err) ? SPEED_UNKNOWN : apc->speed; + + if (!err) { + err = mana_set_bw_clamp(apc, rate, TRI_STATE_TRUE); + apc->speed = (err) ? old_speed : rate; + apc->handle = (err) ? apc->handle : shaper->handle; + } + + return err; +} + +static int mana_shaper_del(struct net_shaper_binding *binding, + const struct net_shaper_handle *handle, + struct netlink_ext_ack *extack) +{ + struct mana_port_context *apc = netdev_priv(binding->netdev); + int err; + + err = mana_set_bw_clamp(apc, 0, TRI_STATE_FALSE); + + if (!err) { + /* Reset mana port context parameters */ + apc->handle.id = 0; + apc->handle.scope = NET_SHAPER_SCOPE_UNSPEC; + apc->speed = 0; + } + + return err; +} + +static void mana_shaper_cap(struct net_shaper_binding *binding, + enum net_shaper_scope scope, + unsigned long *flags) +{ + *flags = BIT(NET_SHAPER_A_CAPS_SUPPORT_BW_MAX) | + BIT(NET_SHAPER_A_CAPS_SUPPORT_METRIC_BPS); +} + +static const struct net_shaper_ops mana_shaper_ops = { + .set = mana_shaper_set, + .delete = mana_shaper_del, + .capabilities = mana_shaper_cap, +}; + static const struct net_device_ops mana_devops = { .ndo_open = mana_open, .ndo_stop = mana_close, @@ -729,6 +801,7 @@ static const struct net_device_ops mana_devops = { .ndo_bpf = mana_bpf, .ndo_xdp_xmit = mana_xdp_xmit, .ndo_change_mtu = mana_change_mtu, + .net_shaper_ops = &mana_shaper_ops, }; static void mana_cleanup_port_context(struct mana_port_context *apc) @@ -1162,6 +1235,86 @@ out: return err; } +int mana_query_link_cfg(struct mana_port_context *apc) +{ + struct net_device *ndev = apc->ndev; + struct mana_query_link_config_resp resp = {}; + struct mana_query_link_config_req req = {}; + int err; + + 
mana_gd_init_req_hdr(&req.hdr, MANA_QUERY_LINK_CONFIG, + sizeof(req), sizeof(resp)); + + req.vport = apc->port_handle; + req.hdr.resp.msg_version = GDMA_MESSAGE_V2; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + + if (err) { + netdev_err(ndev, "Failed to query link config: %d\n", err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_QUERY_LINK_CONFIG, + sizeof(resp)); + + if (err || resp.hdr.status) { + netdev_err(ndev, "Failed to query link config: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EOPNOTSUPP; + return err; + } + + if (resp.qos_unconfigured) { + err = -EINVAL; + return err; + } + apc->speed = resp.link_speed_mbps; + return 0; +} + +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping) +{ + struct mana_set_bw_clamp_resp resp = {}; + struct mana_set_bw_clamp_req req = {}; + struct net_device *ndev = apc->ndev; + int err; + + mana_gd_init_req_hdr(&req.hdr, MANA_SET_BW_CLAMP, + sizeof(req), sizeof(resp)); + req.vport = apc->port_handle; + req.link_speed_mbps = speed; + req.enable_clamping = enable_clamping; + + err = mana_send_request(apc->ac, &req, sizeof(req), &resp, + sizeof(resp)); + + if (err) { + netdev_err(ndev, "Failed to set bandwidth clamp for speed %u, err = %d", + speed, err); + return err; + } + + err = mana_verify_resp_hdr(&resp.hdr, MANA_SET_BW_CLAMP, + sizeof(resp)); + + if (err || resp.hdr.status) { + netdev_err(ndev, "Failed to set bandwidth clamp: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EOPNOTSUPP; + return err; + } + + if (resp.qos_unconfigured) + netdev_info(ndev, "QoS is unconfigured\n"); + + return 0; +} + int mana_create_wq_obj(struct mana_port_context *apc, mana_handle_t vport, u32 wq_type, struct mana_obj_spec *wq_spec, @@ -3011,6 +3164,8 @@ static int mana_probe_port(struct mana_context *ac, int port_idx, goto free_indir; } + debugfs_create_u32("current_speed", 0400, apc->mana_port_debugfs, &apc->speed); + return 
0; free_indir: diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 4176edf1be71..038b18340e51 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -5,6 +5,7 @@ #define _MANA_H #include +#include #include "gdma.h" #include "hw_channel.h" @@ -526,7 +527,12 @@ struct mana_port_context { struct mutex vport_mutex; int vport_use_count; + /* Net shaper handle*/ + struct net_shaper_handle handle; + u16 port_idx; + /* Currently configured speed (mbps) */ + u32 speed; bool port_is_up; bool port_st_save; /* Saved port state */ @@ -562,6 +568,9 @@ struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); void mana_query_gf_stats(struct mana_port_context *apc); +int mana_query_link_cfg(struct mana_port_context *apc); +int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, + int enable_clamping); void mana_query_phy_stats(struct mana_port_context *apc); int mana_pre_alloc_rxbufs(struct mana_port_context *apc, int mtu, int num_queues); void mana_pre_dealloc_rxbufs(struct mana_port_context *apc); @@ -589,6 +598,8 @@ enum mana_command_code { MANA_FENCE_RQ = 0x20006, MANA_CONFIG_VPORT_RX = 0x20007, MANA_QUERY_VPORT_CONFIG = 0x20008, + MANA_QUERY_LINK_CONFIG = 0x2000A, + MANA_SET_BW_CLAMP = 0x2000B, MANA_QUERY_PHY_STAT = 0x2000c, /* Privileged commands for the PF mode */ @@ -598,6 +609,35 @@ enum mana_command_code { MANA_DEREGISTER_HW_PORT = 0x28004, }; +/* Query Link Configuration*/ +struct mana_query_link_config_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; +}; /* HW DATA */ + +struct mana_query_link_config_resp { + struct gdma_resp_hdr hdr; + u32 qos_speed_mbps; + u8 qos_unconfigured; + u8 reserved1[3]; + u32 link_speed_mbps; + u8 reserved2[4]; +}; /* HW DATA */ + +/* Set Bandwidth Clamp*/ +struct mana_set_bw_clamp_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + enum TRI_STATE 
enable_clamping; + u32 link_speed_mbps; +}; /* HW DATA */ + +struct mana_set_bw_clamp_resp { + struct gdma_resp_hdr hdr; + u8 qos_unconfigured; + u8 reserved[7]; +}; /* HW DATA */ + /* Query Device Configuration */ struct mana_query_device_cfg_req { struct gdma_req_hdr hdr; -- cgit v1.2.3 From a6d5edf11e0cf5a4650f1d353d20ec29de093813 Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Tue, 17 Jun 2025 00:17:35 -0700 Subject: net: mana: Add speed support in mana_get_link_ksettings Allow mana ethtool get_link_ksettings operation to report the maximum speed supported by the SKU in mbps. The driver retrieves this information by issuing a HWC command to the hardware via mana_query_link_cfg(), which retrieves the SKU's maximum supported speed. When these APIs are invoked on hardware that is older or does not support them, the speed is reported as UNKNOWN. Before: $ethtool enP30832s1 > Settings for enP30832s1: Supported ports: [ ] Supported link modes: Not reported Supported pause frame use: No Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: Unknown!
Duplex: Full Auto-negotiation: off Port: Other PHYAD: 0 Transceiver: internal Link detected: yes After: $ethtool enP30832s1 > Settings for enP30832s1: Supported ports: [ ] Supported link modes: Not reported Supported pause frame use: No Supports auto-negotiation: No Supported FEC modes: Not reported Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: Not reported Speed: 16000Mb/s Duplex: Full Auto-negotiation: off Port: Other PHYAD: 0 Transceiver: internal Link detected: yes Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Saurabh Singh Sengar Reviewed-by: Long Li Link: https://patch.msgid.link/1750144656-2021-4-git-send-email-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/mana_en.c | 1 + drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 6 ++++++ include/net/mana/mana.h | 2 ++ 3 files changed, 9 insertions(+) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 547dff450b6d..d7079e05dfb8 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1272,6 +1272,7 @@ int mana_query_link_cfg(struct mana_port_context *apc) return err; } apc->speed = resp.link_speed_mbps; + apc->max_speed = resp.qos_speed_mbps; return 0; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index 4fb3a04994a2..a1afa75a9463 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -495,6 +495,12 @@ out: static int mana_get_link_ksettings(struct net_device *ndev, struct ethtool_link_ksettings *cmd) { + struct mana_port_context *apc = netdev_priv(ndev); + int err; + + err = mana_query_link_cfg(apc); + cmd->base.speed = (err) ? 
SPEED_UNKNOWN : apc->max_speed; + cmd->base.duplex = DUPLEX_FULL; cmd->base.port = PORT_OTHER; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 038b18340e51..e1030a7d2daa 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -533,6 +533,8 @@ struct mana_port_context { u16 port_idx; /* Currently configured speed (mbps) */ u32 speed; + /* Maximum speed supported by the SKU (mbps) */ + u32 max_speed; bool port_is_up; bool port_st_save; /* Saved port state */ -- cgit v1.2.3 From ca8ac489ca33c986ff02ee14c3e1c10b86355428 Mon Sep 17 00:00:00 2001 From: Erni Sri Satya Vennela Date: Tue, 17 Jun 2025 00:17:36 -0700 Subject: net: mana: Handle unsupported HWC commands If any of the HWC commands are not recognized by the underlying hardware, the hardware returns the response header status of -1. Log the information using netdev_info_once to avoid multiple error logs in dmesg. Signed-off-by: Erni Sri Satya Vennela Reviewed-by: Haiyang Zhang Reviewed-by: Shradha Gupta Reviewed-by: Saurabh Singh Sengar Reviewed-by: Dipayaan Roy Link: https://patch.msgid.link/1750144656-2021-5-git-send-email-ernis@linux.microsoft.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/microsoft/mana/hw_channel.c | 4 ++++ drivers/net/ethernet/microsoft/mana/mana_en.c | 11 +++++++++++ include/net/mana/gdma.h | 1 + 3 files changed, 16 insertions(+) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 3d3677c0d014..650d22654d49 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -891,6 +891,10 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, } if (ctx->status_code && ctx->status_code != GDMA_STATUS_MORE_ENTRIES) { + if (ctx->status_code == GDMA_STATUS_CMD_UNSUPPORTED) { + err = -EOPNOTSUPP; + goto out; + } if (req_msg->req.msg_type != MANA_QUERY_PHY_STAT) dev_err(hwc->dev, "HWC: 
Failed hw_channel req: 0x%x\n", ctx->status_code); diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index d7079e05dfb8..5aee7bda1504 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -847,6 +847,9 @@ static int mana_send_request(struct mana_context *ac, void *in_buf, err = mana_gd_send_request(gc, in_len, in_buf, out_len, out_buf); if (err || resp->status) { + if (err == -EOPNOTSUPP) + return err; + if (req->req.msg_type != MANA_QUERY_PHY_STAT) dev_err(dev, "Failed to send mana message: %d, 0x%x\n", err, resp->status); @@ -1252,6 +1255,10 @@ int mana_query_link_cfg(struct mana_port_context *apc) sizeof(resp)); if (err) { + if (err == -EOPNOTSUPP) { + netdev_info_once(ndev, "MANA_QUERY_LINK_CONFIG not supported\n"); + return err; + } netdev_err(ndev, "Failed to query link config: %d\n", err); return err; } @@ -1294,6 +1301,10 @@ int mana_set_bw_clamp(struct mana_port_context *apc, u32 speed, sizeof(resp)); if (err) { + if (err == -EOPNOTSUPP) { + netdev_info_once(ndev, "MANA_SET_BW_CLAMP not supported\n"); + return err; + } netdev_err(ndev, "Failed to set bandwidth clamp for speed %u, err = %d", speed, err); return err; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 6fe6cbcd512d..92ab85061df0 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -10,6 +10,7 @@ #include "shm_channel.h" #define GDMA_STATUS_MORE_ENTRIES 0x00000105 +#define GDMA_STATUS_CMD_UNSUPPORTED 0xffffffff /* Structures labeled with "HW DATA" are exchanged with the hardware. All of * them are naturally aligned and hence don't need __packed. 
-- cgit v1.2.3 From c7d78566bbd30544a0618a6ffbc97bc0ddac7035 Mon Sep 17 00:00:00 2001 From: Nicolas Escande Date: Tue, 17 Jun 2025 16:13:34 +0200 Subject: neighbour: add support for NUD_PERMANENT proxy entries As discussed before in [0], proxy entries (which are more configuration than runtime data) should stay when the link (carrier) goes down. This is what happens for regular neighbour entries. So let's fix this by: - storing in proxy entries the fact that it was added as NUD_PERMANENT - not removing NUD_PERMANENT proxy entries when the carrier goes down (same as how it's done in neigh_flush_dev() for regular neigh entries) [0]: https://lore.kernel.org/netdev/c584ef7e-6897-01f3-5b80-12b53f7b4bf4@kernel.org/ Signed-off-by: Nicolas Escande Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250617141334.3724863-1-nico.escande@gmail.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 + net/core/neighbour.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 9a832cab5b1d..c7ce5ec7be23 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -182,6 +182,7 @@ struct pneigh_entry { netdevice_tracker dev_tracker; u32 flags; u8 protocol; + bool permanent; u32 key[]; }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 49dce9a82295..85a5535de8ba 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,7 +54,8 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev); + struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -423,7 +424,7 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, { write_lock_bh(&tbl->lock);
neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev); + pneigh_ifdown_and_unlock(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) @@ -803,7 +804,8 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, } static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev) + struct net_device *dev, + bool skip_perm) { struct pneigh_entry *n, **np, *freelist = NULL; u32 h; @@ -811,12 +813,15 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = *np) != NULL) { + if (skip_perm && n->permanent) + goto skip; if (!dev || n->dev == dev) { *np = n->next; n->next = freelist; freelist = n; continue; } +skip: np = &n->next; } } @@ -1983,6 +1988,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, pn = pneigh_lookup(tbl, net, dst, dev, 1); if (pn) { pn->flags = ndm_flags; + pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); if (protocol) pn->protocol = protocol; err = 0; -- cgit v1.2.3 From df42bfc96e0ad90d243c0ee6b783a33bdb72a184 Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Tue, 27 May 2025 14:11:43 +0530 Subject: wifi: cfg80211: Add utility API to get radio index from channel Add utility API cfg80211_get_radio_idx_by_chan() to retrieve the radio index corresponding to a given channel in a multi-radio wiphy. This utility function can be used when we want to check the radio-specific data for a channel in a multi-radio wiphy. For example, it can help determine the radio index required to handle a scan request. This index can then be used to decide whether the scan can proceed without interfering with ongoing DFS operations on another radio. 
Signed-off-by: Vasanthakumar Thiagarajan Co-developed-by: Raj Kumar Bhagat Signed-off-by: Raj Kumar Bhagat Link: https://patch.msgid.link/20250527-mlo-dfs-acs-v2-1-92c2f37c81d9@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 +++++++++++ net/wireless/util.c | 24 ++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d1848dc8ec99..7719a90ab4d7 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9372,6 +9372,17 @@ int cfg80211_iter_combinations(struct wiphy *wiphy, void (*iter)(const struct ieee80211_iface_combination *c, void *data), void *data); +/** + * cfg80211_get_radio_idx_by_chan - get the radio index by the channel + * + * @wiphy: the wiphy + * @chan: channel for which the supported radio index is required + * + * Return: radio index on success or a negative error code + */ +int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, + const struct ieee80211_channel *chan); + /** * cfg80211_stop_iface - trigger interface disconnection diff --git a/net/wireless/util.c b/net/wireless/util.c index ed868c0f7ca8..e438f883f085 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2516,6 +2516,30 @@ int cfg80211_check_combinations(struct wiphy *wiphy, } EXPORT_SYMBOL(cfg80211_check_combinations); +int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, + const struct ieee80211_channel *chan) +{ + const struct wiphy_radio *radio; + int i, j; + u32 freq; + + if (!chan) + return -EINVAL; + + freq = ieee80211_channel_to_khz(chan); + for (i = 0; i < wiphy->n_radio; i++) { + radio = &wiphy->radio[i]; + for (j = 0; j < radio->n_freq_range; j++) { + if (freq >= radio->freq_range[j].start_freq && + freq < radio->freq_range[j].end_freq) + return i; + } + } + + return -ENOENT; +} +EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); + int ieee80211_get_ratemask(struct ieee80211_supported_band *sband, const u8 *rates, unsigned int n_rates, 
u32 *mask) -- cgit v1.2.3 From 7c598c653ad465138ecc2fe64492633c541effef Mon Sep 17 00:00:00 2001 From: Kavita Kavita Date: Wed, 4 Jun 2025 16:27:57 +0530 Subject: wifi: cfg80211: Add support for link reconfiguration negotiation offload to driver In the case of SME-in-driver, the driver can internally choose to update the links based on the AP MLD recommendation and do link reconfiguration negotiation with AP MLD. (e.g., After the driver processing the BSS Transition Management request frame received from the AP MLD with Neighbor Report containing Multi-Link element with recommended links information chooses to do link reconfiguration negotiation with AP MLD). To support this, extend cfg80211_mlo_reconf_add_done() and NL80211_CMD_ASSOC_MLO_RECONF to indicate added links information for driver-initiated link reconfiguration requests. For removed links, the driver indicates links information using the NL80211_CMD_LINKS_REMOVED event for driver-initiated cases, the same as supplicant initiated cases. For the driver-initiated case, cfg80211 will receive link reconfiguration result asynchronously from driver so holding BSSes of the accepted add links is needed in the event path. Also, no need of unhold call for the rejected add link BSSes since there was no hold call happened previously. Once the supplicant receives the NL80211_CMD_ASSOC_MLO_RECONF event, it needs to process the information about newly added links and install per-link group keys (e.g., GTK/IGTK/BIGTK etc.). In case of the SME-in-driver, using a vendor interface etc. to notify the supplicant to initiate a link reconfiguration request and then supplicant sending command to the cfg80211 can lead to race conditions. The correct design to avoid this is that the driver indicates the cfg80211 directly with the results of the link reconfiguration negotiation. 
Signed-off-by: Kavita Kavita Link: https://patch.msgid.link/20250604105757.2542-3-quic_kkavita@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 ++++++ include/uapi/linux/nl80211.h | 6 +++++- net/wireless/mlme.c | 10 ++++++++-- net/wireless/trace.h | 10 ++++++---- 4 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7719a90ab4d7..47b4235eea59 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9747,6 +9747,11 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); * struct cfg80211_mlo_reconf_done_data - MLO reconfiguration data * @buf: MLO Reconfiguration Response frame (header + body) * @len: length of the frame data + * @driver_initiated: Indicates whether the add links request is initiated by + * driver. This is set to true when the link reconfiguration request + * initiated by driver due to AP link recommendation requests + * (Ex: BTM (BSS Transition Management) request) handling offloaded to + * driver. * @added_links: BIT mask of links successfully added to the association * @links: per-link information indexed by link ID * @links.bss: the BSS that MLO reconfiguration was requested for, ownership of @@ -9759,6 +9764,7 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask); struct cfg80211_mlo_reconf_done_data { const u8 *buf; size_t len; + bool driver_initiated; u16 added_links; struct { struct cfg80211_bss *bss; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index e53840d009d1..a289014abe37 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1334,7 +1334,11 @@ * reconfiguration request results from the driver, this command is also * used as an event to notify userspace about the added links information. * For notifying the removed links information, the existing - * %NL80211_CMD_LINKS_REMOVED command is used. 
+ * %NL80211_CMD_LINKS_REMOVED command is used. This command is also used to + * notify userspace about newly added links for the current connection in + * case of AP-initiated link recommendation requests, received via + * a BTM (BSS Transition Management) request or a link reconfig notify + * frame, where the driver handles the link recommendation offload. * * @NL80211_CMD_EPCS_CFG: EPCS configuration for a station. Used by userland to * control EPCS configuration. Used to notify userland on the current state diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 05d44a443518..29e1ce8aff42 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -1331,7 +1331,8 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, lockdep_assert_wiphy(wiphy); trace_cfg80211_mlo_reconf_add_done(dev, data->added_links, - data->buf, data->len); + data->buf, data->len, + data->driver_initiated); if (WARN_ON(!wdev->valid_links)) return; @@ -1361,11 +1362,16 @@ void cfg80211_mlo_reconf_add_done(struct net_device *dev, wdev->links[link_id].client.current_bss = bss_from_pub(bss); + if (data->driver_initiated) + cfg80211_hold_bss(bss_from_pub(bss)); + memcpy(wdev->links[link_id].addr, data->links[link_id].addr, ETH_ALEN); } else { - cfg80211_unhold_bss(bss_from_pub(bss)); + if (!data->driver_initiated) + cfg80211_unhold_bss(bss_from_pub(bss)); + cfg80211_put_bss(wiphy, bss); } } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 4ed9fada4ec0..61a5eca9c513 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -4126,20 +4126,22 @@ TRACE_EVENT(cfg80211_links_removed, TRACE_EVENT(cfg80211_mlo_reconf_add_done, TP_PROTO(struct net_device *netdev, u16 link_mask, - const u8 *buf, size_t len), - TP_ARGS(netdev, link_mask, buf, len), + const u8 *buf, size_t len, bool driver_initiated), + TP_ARGS(netdev, link_mask, buf, len, driver_initiated), TP_STRUCT__entry( NETDEV_ENTRY __field(u16, link_mask) __dynamic_array(u8, buf, len) + __field(bool, driver_initiated) 
), TP_fast_assign( NETDEV_ASSIGN; __entry->link_mask = link_mask; memcpy(__get_dynamic_array(buf), buf, len); + __entry->driver_initiated = driver_initiated; ), - TP_printk(NETDEV_PR_FMT ", link_mask:0x%x", - NETDEV_PR_ARG, __entry->link_mask) + TP_printk(NETDEV_PR_FMT ", link_mask:0x%x, driver_initiated:%d", + NETDEV_PR_ARG, __entry->link_mask, __entry->driver_initiated) ); TRACE_EVENT(rdev_assoc_ml_reconf, -- cgit v1.2.3 From 1d6123102e9fbedc8d25bf4731da6d513173e49e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 17 Jun 2025 09:58:13 -0700 Subject: Bluetooth: hci_core: Fix use-after-free in vhci_flush() syzbot reported use-after-free in vhci_flush() without repro. [0] From the splat, a thread close()d a vhci file descriptor while its device was being used by ioctl() on another thread. Once the last fd refcnt is released, vhci_release() calls hci_unregister_dev(), hci_free_dev(), and kfree() for struct vhci_data, which is set to hci_dev->dev->driver_data. The problem is that there is no synchronisation after unlinking hdev from hci_dev_list in hci_unregister_dev(). There might be another thread still accessing the hdev which was fetched before the unlink operation. We can use SRCU for such synchronisation. Let's run hci_dev_reset() under SRCU and wait for its completion in hci_unregister_dev(). Another option would be to restore hci_dev->destruct(), which was removed in commit 587ae086f6e4 ("Bluetooth: Remove unused hci-destruct cb"). However, this would not be a good solution, as we should not run hci_unregister_dev() while there are in-flight ioctl() requests, which could lead to another data-race KCSAN splat. Note that other drivers seem to have the same problem, for example, virtbt_remove().
[0]: BUG: KASAN: slab-use-after-free in skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] BUG: KASAN: slab-use-after-free in skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 Read of size 8 at addr ffff88807cb8d858 by task syz.1.219/6718 CPU: 1 UID: 0 PID: 6718 Comm: syz.1.219 Not tainted 6.16.0-rc1-syzkaller-00196-g08207f42d3ff #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/07/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:408 [inline] print_report+0xd2/0x2b0 mm/kasan/report.c:521 kasan_report+0x118/0x150 mm/kasan/report.c:634 skb_queue_empty_lockless include/linux/skbuff.h:1891 [inline] skb_queue_purge_reason+0x99/0x360 net/core/skbuff.c:3937 skb_queue_purge include/linux/skbuff.h:3368 [inline] vhci_flush+0x44/0x50 drivers/bluetooth/hci_vhci.c:69 hci_dev_do_reset net/bluetooth/hci_core.c:552 [inline] hci_dev_reset+0x420/0x5c0 net/bluetooth/hci_core.c:592 sock_do_ioctl+0xd9/0x300 net/socket.c:1190 sock_ioctl+0x576/0x790 net/socket.c:1311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:907 [inline] __se_sys_ioctl+0xf9/0x170 fs/ioctl.c:893 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fcf5b98e929 Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fcf5c7b9038 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007fcf5bbb6160 RCX: 00007fcf5b98e929 RDX: 0000000000000000 RSI: 00000000400448cb RDI: 0000000000000009 RBP: 00007fcf5ba10b39 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 00007fcf5bbb6160 R15: 00007ffd6353d528 Allocated by task 
6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 poison_kmalloc_redzone mm/kasan/common.c:377 [inline] __kasan_kmalloc+0x93/0xb0 mm/kasan/common.c:394 kasan_kmalloc include/linux/kasan.h:260 [inline] __kmalloc_cache_noprof+0x230/0x3d0 mm/slub.c:4359 kmalloc_noprof include/linux/slab.h:905 [inline] kzalloc_noprof include/linux/slab.h:1039 [inline] vhci_open+0x57/0x360 drivers/bluetooth/hci_vhci.c:635 misc_open+0x2bc/0x330 drivers/char/misc.c:161 chrdev_open+0x4c9/0x5e0 fs/char_dev.c:414 do_dentry_open+0xdf0/0x1970 fs/open.c:964 vfs_open+0x3b/0x340 fs/open.c:1094 do_open fs/namei.c:3887 [inline] path_openat+0x2ee5/0x3830 fs/namei.c:4046 do_filp_open+0x1fa/0x410 fs/namei.c:4073 do_sys_openat2+0x121/0x1c0 fs/open.c:1437 do_sys_open fs/open.c:1452 [inline] __do_sys_openat fs/open.c:1468 [inline] __se_sys_openat fs/open.c:1463 [inline] __x64_sys_openat+0x138/0x170 fs/open.c:1463 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 6535: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3e/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x46/0x50 mm/kasan/generic.c:576 poison_slab_object mm/kasan/common.c:247 [inline] __kasan_slab_free+0x62/0x70 mm/kasan/common.c:264 kasan_slab_free include/linux/kasan.h:233 [inline] slab_free_hook mm/slub.c:2381 [inline] slab_free mm/slub.c:4643 [inline] kfree+0x18e/0x440 mm/slub.c:4842 vhci_release+0xbc/0xd0 drivers/bluetooth/hci_vhci.c:671 __fput+0x44c/0xa70 fs/file_table.c:465 task_work_run+0x1d1/0x260 kernel/task_work.c:227 exit_task_work include/linux/task_work.h:40 [inline] do_exit+0x6ad/0x22e0 kernel/exit.c:955 do_group_exit+0x21c/0x2d0 kernel/exit.c:1104 __do_sys_exit_group kernel/exit.c:1115 [inline] __se_sys_exit_group kernel/exit.c:1113 [inline] __x64_sys_exit_group+0x3f/0x40 kernel/exit.c:1113 x64_sys_call+0x21ba/0x21c0 
arch/x86/include/generated/asm/syscalls_64.h:232 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The buggy address belongs to the object at ffff88807cb8d800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 88 bytes inside of freed 1024-byte region [ffff88807cb8d800, ffff88807cb8dc00) Fixes: bf18c7118cf8 ("Bluetooth: vhci: Free driver_data on file release") Reported-by: syzbot+2faa4825e556199361f9@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f62d64848fc4c7c30cd6 Signed-off-by: Kuniyuki Iwashima Acked-by: Paul Menzel Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 ++ net/bluetooth/hci_core.c | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index a760f05fa3fb..9fc8f544e20e 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -347,6 +348,7 @@ struct adv_monitor { struct hci_dev { struct list_head list; + struct srcu_struct srcu; struct mutex lock; struct ida unset_handle_ida; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 07a8b4281a39..14d7221b8ac0 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -64,7 +64,7 @@ static DEFINE_IDA(hci_index_ida); /* Get HCI device by index. * Device is held on return. 
*/ -struct hci_dev *hci_dev_get(int index) +static struct hci_dev *__hci_dev_get(int index, int *srcu_index) { struct hci_dev *hdev = NULL, *d; @@ -77,6 +77,8 @@ struct hci_dev *hci_dev_get(int index) list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); + if (srcu_index) + *srcu_index = srcu_read_lock(&d->srcu); break; } } @@ -84,6 +86,22 @@ struct hci_dev *hci_dev_get(int index) return hdev; } +struct hci_dev *hci_dev_get(int index) +{ + return __hci_dev_get(index, NULL); +} + +static struct hci_dev *hci_dev_get_srcu(int index, int *srcu_index) +{ + return __hci_dev_get(index, srcu_index); +} + +static void hci_dev_put_srcu(struct hci_dev *hdev, int srcu_index) +{ + srcu_read_unlock(&hdev->srcu, srcu_index); + hci_dev_put(hdev); +} + /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) @@ -568,9 +586,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev) int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; - int err; + int err, srcu_index; - hdev = hci_dev_get(dev); + hdev = hci_dev_get_srcu(dev, &srcu_index); if (!hdev) return -ENODEV; @@ -592,7 +610,7 @@ int hci_dev_reset(__u16 dev) err = hci_dev_do_reset(hdev); done: - hci_dev_put(hdev); + hci_dev_put_srcu(hdev, srcu_index); return err; } @@ -2433,6 +2451,11 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) if (!hdev) return NULL; + if (init_srcu_struct(&hdev->srcu)) { + kfree(hdev); + return NULL; + } + hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); @@ -2678,6 +2701,9 @@ void hci_unregister_dev(struct hci_dev *hdev) list_del(&hdev->list); write_unlock(&hci_dev_list_lock); + synchronize_srcu(&hdev->srcu); + cleanup_srcu_struct(&hdev->srcu); + disable_work_sync(&hdev->rx_work); disable_work_sync(&hdev->cmd_work); disable_work_sync(&hdev->tx_work); -- cgit v1.2.3 From e84a4927a404f369c842c19de93b216627fcc690 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 2025 
13:30:00 +0000 Subject: net: annotate races around sk->sk_uid MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sk->sk_uid can be read while another thread changes its value in sockfs_setattr(). Add sk_uid(const struct sock *sk) helper to factorize the needed READ_ONCE() annotations, and add corresponding WRITE_ONCE() where needed. Fixes: 86741ec25462 ("net: core: Add a UID field to struct sock.") Signed-off-by: Eric Dumazet Cc: Lorenzo Colitti Reviewed-by: Maciej Żenczykowski Link: https://patch.msgid.link/20250620133001.4090592-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/route.h | 4 ++-- include/net/sock.h | 12 ++++++++++-- net/ipv4/inet_connection_sock.c | 4 ++-- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/route.c | 3 ++- net/ipv4/syncookies.c | 3 ++- net/ipv4/udp.c | 3 ++- net/ipv6/af_inet6.c | 2 +- net/ipv6/datagram.c | 2 +- net/ipv6/inet6_connection_sock.c | 4 ++-- net/ipv6/ping.c | 2 +- net/ipv6/raw.c | 2 +- net/ipv6/route.c | 4 ++-- net/ipv6/syncookies.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/udp.c | 5 +++-- net/l2tp/l2tp_ip6.c | 2 +- net/mptcp/protocol.c | 2 +- net/socket.c | 8 +++++--- 20 files changed, 42 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/route.h b/include/net/route.h index 8e39aa822cf9..3d3d6048ffca 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -153,7 +153,7 @@ static inline void inet_sk_init_flowi4(const struct inet_sock *inet, ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), sk->sk_protocol, inet_sk_flowi_flags(sk), daddr, inet->inet_saddr, inet->inet_dport, - inet->inet_sport, sk->sk_uid); + inet->inet_sport, sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); } @@ -331,7 +331,7 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, flowi4_init_output(fl4, oif, READ_ONCE(sk->sk_mark), ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), protocol, flow_flags, dst, - src, dport, sport, 
sk->sk_uid); + src, dport, sport, sk_uid(sk)); } static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, diff --git a/include/net/sock.h b/include/net/sock.h index ca532227cbfd..fc5e6f66b00a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2076,6 +2076,7 @@ static inline void sock_orphan(struct sock *sk) sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; + /* Note: sk_uid is unchanged. */ write_unlock_bh(&sk->sk_callback_lock); } @@ -2086,18 +2087,25 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); - sk->sk_uid = SOCK_INODE(parent)->i_uid; + WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } kuid_t sock_i_uid(struct sock *sk); + +static inline kuid_t sk_uid(const struct sock *sk) +{ + /* Paired with WRITE_ONCE() in sockfs_setattr() */ + return READ_ONCE(sk->sk_uid); +} + unsigned long __sock_i_ino(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { - return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); + return sk ? sk_uid(sk) : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 6906bedad19a..46750c96d08e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -812,7 +812,7 @@ struct dst_entry *inet_csk_route_req(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? 
opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sk->sk_uid); + htons(ireq->ir_num), sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) @@ -849,7 +849,7 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, sk->sk_protocol, inet_sk_flowi_flags(sk), (opt && opt->opt.srr) ? opt->opt.faddr : ireq->ir_rmt_addr, ireq->ir_loc_addr, ireq->ir_rmt_port, - htons(ireq->ir_num), sk->sk_uid); + htons(ireq->ir_num), sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); if (IS_ERR(rt)) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c14baa6589c7..4eacaf00e2e9 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -781,7 +781,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(&fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, inet_sk_flowi_flags(sk), faddr, - saddr, 0, 0, sk->sk_uid); + saddr, 0, 0, sk_uid(sk)); fl4.fl4_icmp_type = user_icmph.type; fl4.fl4_icmp_code = user_icmph.code; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 6aace4d55733..32f942d0f944 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -610,7 +610,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) hdrincl ? ipc.protocol : sk->sk_protocol, inet_sk_flowi_flags(sk) | (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), - daddr, saddr, 0, 0, sk->sk_uid); + daddr, saddr, 0, 0, sk_uid(sk)); fl4.fl4_icmp_type = 0; fl4.fl4_icmp_code = 0; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3ddf6bf40357..3ff2bd56d050 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -556,7 +556,8 @@ static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk) inet_test_bit(HDRINCL, sk) ? 
IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk), - daddr, inet->inet_saddr, 0, 0, sk->sk_uid); + daddr, inet->inet_saddr, 0, 0, + sk_uid(sk)); rcu_read_unlock(); } diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 5459a78b9809..eb0819463fae 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -454,7 +454,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ip_sock_rt_tos(sk), ip_sock_rt_scope(sk), IPPROTO_TCP, inet_sk_flowi_flags(sk), opt->srr ? opt->faddr : ireq->ir_rmt_addr, - ireq->ir_loc_addr, th->source, th->dest, sk->sk_uid); + ireq->ir_loc_addr, th->source, th->dest, + sk_uid(sk)); security_req_classify_flow(req, flowi4_to_flowi_common(&fl4)); rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dde52b8050b8..f94bb222aa2d 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1445,7 +1445,8 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) flowi4_init_output(fl4, ipc.oif, ipc.sockc.mark, ipc.tos & INET_DSCP_MASK, scope, sk->sk_protocol, flow_flags, faddr, saddr, - dport, inet->inet_sport, sk->sk_uid); + dport, inet->inet_sport, + sk_uid(sk)); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); rt = ip_route_output_flow(net, fl4, sk); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index acaff1296783..1992621e3f3f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -842,7 +842,7 @@ int inet6_sk_rebuild_header(struct sock *sk) fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = inet->inet_dport; fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index fff78496803d..83f5aa5e133a 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -53,7 +53,7 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, fl6->fl6_dport = inet->inet_dport; 
fl6->fl6_sport = inet->inet_sport; fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); if (!oif) oif = np->sticky_pktinfo.ipi6_ifindex; diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 8f500eaf33cf..333e43434dd7 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -45,7 +45,7 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_mark = ireq->ir_mark; fl6->fl6_dport = ireq->ir_rmt_port; fl6->fl6_sport = htons(ireq->ir_num); - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); @@ -79,7 +79,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, fl6->flowi6_mark = sk->sk_mark; fl6->fl6_sport = inet->inet_sport; fl6->fl6_dport = inet->inet_dport; - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 84d90dd8b3f0..82b0492923d4 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -142,7 +142,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) fl6.saddr = np->saddr; fl6.daddr = *daddr; fl6.flowi6_mark = ipc6.sockc.mark; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); fl6.fl6_icmp_type = user_icmph.icmp6_type; fl6.fl6_icmp_code = user_icmph.icmp6_code; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index fda640ebd53f..4c3f8245c40f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -777,7 +777,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = ipc6.sockc.mark; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); if (sin6) { if (addr_len < 
SIN6_LEN_RFC2133) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index df0caffefb38..d7a9b5bf30c8 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3011,7 +3011,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) oif = l3mdev_master_ifindex(skb->dev); ip6_update_pmtu(skb, sock_net(sk), mtu, oif, READ_ONCE(sk->sk_mark), - sk->sk_uid); + sk_uid(sk)); dst = __sk_dst_get(sk); if (!dst || !dst->obsolete || @@ -3233,7 +3233,7 @@ void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif) void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk) { ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, - READ_ONCE(sk->sk_mark), sk->sk_uid); + READ_ONCE(sk->sk_mark), sk_uid(sk)); } EXPORT_SYMBOL_GPL(ip6_sk_redirect); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 9d83eadd308b..f0ee1a909771 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -236,7 +236,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) fl6.flowi6_mark = ireq->ir_mark; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = inet_sk(sk)->inet_sport; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e8e68a142649..f61b0396ef6b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -269,7 +269,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, fl6.fl6_sport = inet->inet_sport; if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 7317f8e053f1..ebb95d8bc681 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -750,7 +750,8 @@ int 
__udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (type == NDISC_REDIRECT) { if (tunnel) { ip6_redirect(skb, sock_net(sk), inet6_iif(skb), - READ_ONCE(sk->sk_mark), sk->sk_uid); + READ_ONCE(sk->sk_mark), + sk_uid(sk)); } else { ip6_sk_redirect(skb, sk); } @@ -1620,7 +1621,7 @@ do_udp_sendmsg: if (!fl6->flowi6_oif) fl6->flowi6_oif = np->sticky_pktinfo.ipi6_ifindex; - fl6->flowi6_uid = sk->sk_uid; + fl6->flowi6_uid = sk_uid(sk); if (msg->msg_controllen) { opt = &opt_space; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index b98d13584c81..ea232f338dcb 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -545,7 +545,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = READ_ONCE(sk->sk_mark); - fl6.flowi6_uid = sk->sk_uid; + fl6.flowi6_uid = sk_uid(sk); ipcm6_init_sk(&ipc6, sk); diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index edf14c2c2062..e7972e633236 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -3503,7 +3503,7 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); - sk->sk_uid = SOCK_INODE(parent)->i_uid; + WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); write_unlock_bh(&sk->sk_callback_lock); } diff --git a/net/socket.c b/net/socket.c index 2cab805943c0..682969deaed3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -592,10 +592,12 @@ static int sockfs_setattr(struct mnt_idmap *idmap, if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry)); - if (sock->sk) - sock->sk->sk_uid = iattr->ia_uid; - else + if (sock->sk) { + /* Paired with READ_ONCE() in sk_uid() */ + WRITE_ONCE(sock->sk->sk_uid, iattr->ia_uid); + } else { err = -ENOENT; + } } return err; -- cgit v1.2.3 From c51da3f7a161c6822232be832abdffe47eb55b4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 
2025 13:30:01 +0000 Subject: net: remove sock_i_uid() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Difference between sock_i_uid() and sk_uid() is that after sock_orphan(), sock_i_uid() returns GLOBAL_ROOT_UID while sk_uid() returns the last cached sk->sk_uid value. None of sock_i_uid() callers care about this. Use sk_uid() which is much faster and inlined. Note that diag/dump users are calling sock_i_ino() and can not see the full benefit yet. Signed-off-by: Eric Dumazet Cc: Lorenzo Colitti Reviewed-by: Maciej Żenczykowski Link: https://patch.msgid.link/20250620133001.4090592-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 -- net/appletalk/atalk_proc.c | 2 +- net/bluetooth/af_bluetooth.c | 2 +- net/core/sock.c | 11 ----------- net/ipv4/inet_connection_sock.c | 27 ++++++++++++--------------- net/ipv4/inet_diag.c | 2 +- net/ipv4/inet_hashtables.c | 4 ++-- net/ipv4/ping.c | 2 +- net/ipv4/raw.c | 2 +- net/ipv4/tcp_ipv4.c | 8 ++++---- net/ipv4/udp.c | 16 ++++++++-------- net/ipv6/datagram.c | 2 +- net/ipv6/tcp_ipv6.c | 4 ++-- net/key/af_key.c | 2 +- net/llc/llc_proc.c | 2 +- net/packet/af_packet.c | 2 +- net/packet/diag.c | 2 +- net/phonet/socket.c | 4 ++-- net/sctp/input.c | 2 +- net/sctp/proc.c | 4 ++-- net/sctp/socket.c | 4 ++-- net/smc/smc_diag.c | 2 +- net/tipc/socket.c | 2 +- net/unix/af_unix.c | 2 +- net/unix/diag.c | 2 +- net/xdp/xsk_diag.c | 2 +- 26 files changed, 50 insertions(+), 66 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index fc5e6f66b00a..bbd97fbc5935 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2092,8 +2092,6 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) write_unlock_bh(&sk->sk_callback_lock); } -kuid_t sock_i_uid(struct sock *sk); - static inline kuid_t sk_uid(const struct sock *sk) { /* Paired with WRITE_ONCE() in sockfs_setattr() */ diff --git a/net/appletalk/atalk_proc.c 
b/net/appletalk/atalk_proc.c index 9c1241292d1d..01787fb6a7bc 100644 --- a/net/appletalk/atalk_proc.c +++ b/net/appletalk/atalk_proc.c @@ -181,7 +181,7 @@ static int atalk_seq_socket_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), sk_rmem_alloc_get(s), s->sk_state, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(s))); + from_kuid_munged(seq_user_ns(seq), sk_uid(s))); out: return 0; } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 6ad2f72f53f4..ee9bf84c88a7 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -815,7 +815,7 @@ static int bt_seq_show(struct seq_file *seq, void *v) refcount_read(&sk->sk_refcnt), sk_rmem_alloc_get(sk), sk_wmem_alloc_get(sk), - from_kuid(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), bt->parent ? sock_i_ino(bt->parent) : 0LU); diff --git a/net/core/sock.c b/net/core/sock.c index 502042a0d3b5..ceb74ceecb6c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2780,17 +2780,6 @@ void sock_pfree(struct sk_buff *skb) EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ -kuid_t sock_i_uid(struct sock *sk) -{ - kuid_t uid; - - read_lock_bh(&sk->sk_callback_lock); - uid = sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; - read_unlock_bh(&sk->sk_callback_lock); - return uid; -} -EXPORT_SYMBOL(sock_i_uid); - unsigned long __sock_i_ino(struct sock *sk) { unsigned long ino; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 46750c96d08e..f4157d26ec9e 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -168,7 +168,7 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) } static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, - kuid_t sk_uid, bool relax, + kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { int bound_dev_if2; @@ -185,12 +185,12 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, if (!relax || (!reuseport_ok && sk->sk_reuseport && sk2->sk_reuseport && reuseport_cb_ok && (sk2->sk_state == TCP_TIME_WAIT || - uid_eq(sk_uid, sock_i_uid(sk2))))) + uid_eq(uid, sk_uid(sk2))))) return true; } else if (!reuseport_ok || !sk->sk_reuseport || !sk2->sk_reuseport || !reuseport_cb_ok || (sk2->sk_state != TCP_TIME_WAIT && - !uid_eq(sk_uid, sock_i_uid(sk2)))) { + !uid_eq(uid, sk_uid(sk2)))) { return true; } } @@ -198,7 +198,7 @@ static bool inet_bind_conflict(const struct sock *sk, struct sock *sk2, } static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, - kuid_t sk_uid, bool relax, + kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { if (ipv6_only_sock(sk2)) { @@ -211,20 +211,20 @@ static bool __inet_bhash2_conflict(const struct sock *sk, struct sock *sk2, #endif } - return inet_bind_conflict(sk, sk2, sk_uid, relax, + return inet_bind_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok); } static bool inet_bhash2_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, - kuid_t sk_uid, + kuid_t uid, bool relax, bool reuseport_cb_ok, bool reuseport_ok) { struct sock *sk2; sk_for_each_bound(sk2, &tb2->owners) { - if (__inet_bhash2_conflict(sk, sk2, 
sk_uid, relax, + if (__inet_bhash2_conflict(sk, sk2, uid, relax, reuseport_cb_ok, reuseport_ok)) return true; } @@ -242,8 +242,8 @@ static int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind2_bucket *tb2, /* may be null */ bool relax, bool reuseport_ok) { - kuid_t uid = sock_i_uid((struct sock *)sk); struct sock_reuseport *reuseport_cb; + kuid_t uid = sk_uid(sk); bool reuseport_cb_ok; struct sock *sk2; @@ -287,11 +287,11 @@ static int inet_csk_bind_conflict(const struct sock *sk, static bool inet_bhash2_addr_any_conflict(const struct sock *sk, int port, int l3mdev, bool relax, bool reuseport_ok) { - kuid_t uid = sock_i_uid((struct sock *)sk); const struct net *net = sock_net(sk); struct sock_reuseport *reuseport_cb; struct inet_bind_hashbucket *head2; struct inet_bind2_bucket *tb2; + kuid_t uid = sk_uid(sk); bool conflict = false; bool reuseport_cb_ok; @@ -425,15 +425,13 @@ success: static inline int sk_reuseport_match(struct inet_bind_bucket *tb, struct sock *sk) { - kuid_t uid = sock_i_uid(sk); - if (tb->fastreuseport <= 0) return 0; if (!sk->sk_reuseport) return 0; if (rcu_access_pointer(sk->sk_reuseport_cb)) return 0; - if (!uid_eq(tb->fastuid, uid)) + if (!uid_eq(tb->fastuid, sk_uid(sk))) return 0; /* We only need to check the rcv_saddr if this tb was once marked * without fastreuseport and then was reset, as we can only know that @@ -458,14 +456,13 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, struct sock *sk) { - kuid_t uid = sock_i_uid(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; if (hlist_empty(&tb->bhash2)) { tb->fastreuse = reuse; if (sk->sk_reuseport) { tb->fastreuseport = FASTREUSEPORT_ANY; - tb->fastuid = uid; + tb->fastuid = sk_uid(sk); tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; @@ -492,7 +489,7 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ 
if (!sk_reuseport_match(tb, sk)) { tb->fastreuseport = FASTREUSEPORT_STRICT; - tb->fastuid = uid; + tb->fastuid = sk_uid(sk); tb->fast_rcv_saddr = sk->sk_rcv_saddr; tb->fast_ipv6_only = ipv6_only_sock(sk); tb->fast_sk_family = sk->sk_family; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 1d1d6ad53f4c..2fa53b16fe77 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -181,7 +181,7 @@ int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, goto errout; #endif - r->idiag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->idiag_uid = from_kuid_munged(user_ns, sk_uid(sk)); r->idiag_inode = sock_i_ino(sk); memset(&inet_sockopt, 0, sizeof(inet_sockopt)); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 77a0b52b2eab..ceeeec9b7290 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -721,8 +721,8 @@ static int inet_reuseport_add_sock(struct sock *sk, { struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash; const struct hlist_nulls_node *node; + kuid_t uid = sk_uid(sk); struct sock *sk2; - kuid_t uid = sock_i_uid(sk); sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) { if (sk2 != sk && @@ -730,7 +730,7 @@ static int inet_reuseport_add_sock(struct sock *sk, ipv6_only_sock(sk2) == ipv6_only_sock(sk) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if && inet_csk(sk2)->icsk_bind_hash == tb && - sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 4eacaf00e2e9..031df4c19fcc 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1116,7 +1116,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, - from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), 
refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 32f942d0f944..1d2c89d63cc7 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -1043,7 +1043,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) sk_wmem_alloc_get(sp), sk_rmem_alloc_get(sp), 0, 0L, 0, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a14f9e6fef6..429fb34b075e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2896,7 +2896,7 @@ static void get_openreq4(const struct request_sock *req, jiffies_delta_to_clock_t(delta), req->num_timeout, from_kuid_munged(seq_user_ns(f), - sock_i_uid(req->rsk_listener)), + sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, @@ -2954,7 +2954,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(f), sk_uid(sk)), icsk->icsk_probes_out, sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, @@ -3246,9 +3246,9 @@ static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v) const struct request_sock *req = v; uid = from_kuid_munged(seq_user_ns(seq), - sock_i_uid(req->rsk_listener)); + sk_uid(req->rsk_listener)); } else { - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); } meta.seq = seq; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f94bb222aa2d..19573ee64a0f 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -145,8 +145,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, unsigned long *bitmap, struct sock *sk, unsigned int log) { + kuid_t uid = sk_uid(sk); 
struct sock *sk2; - kuid_t uid = sock_i_uid(sk); sk_for_each(sk2, &hslot->head) { if (net_eq(sock_net(sk2), net) && @@ -158,7 +158,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(uid, sock_i_uid(sk2))) { + uid_eq(uid, sk_uid(sk2))) { if (!bitmap) return 0; } else { @@ -180,8 +180,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, struct udp_hslot *hslot2, struct sock *sk) { + kuid_t uid = sk_uid(sk); struct sock *sk2; - kuid_t uid = sock_i_uid(sk); int res = 0; spin_lock(&hslot2->lock); @@ -195,7 +195,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, inet_rcv_saddr_equal(sk, sk2, true)) { if (sk2->sk_reuseport && sk->sk_reuseport && !rcu_access_pointer(sk->sk_reuseport_cb) && - uid_eq(uid, sock_i_uid(sk2))) { + uid_eq(uid, sk_uid(sk2))) { res = 0; } else { res = 1; @@ -210,7 +210,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num, static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) { struct net *net = sock_net(sk); - kuid_t uid = sock_i_uid(sk); + kuid_t uid = sk_uid(sk); struct sock *sk2; sk_for_each(sk2, &hslot->head) { @@ -220,7 +220,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) ipv6_only_sock(sk2) == ipv6_only_sock(sk) && (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) && (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) && + sk2->sk_reuseport && uid_eq(uid, sk_uid(sk2)) && inet_rcv_saddr_equal(sk, sk2, false)) { return reuseport_add_sock(sk, sk2, inet_rcv_saddr_any(sk)); @@ -3387,7 +3387,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, sk_wmem_alloc_get(sp), udp_rqueue_get(sp), 0, 0L, 0, - from_kuid_munged(seq_user_ns(f), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), 
refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); @@ -3630,7 +3630,7 @@ static int bpf_iter_udp_seq_show(struct seq_file *seq, void *v) goto unlock; } - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = udp_prog_seq_show(prog, &meta, v, uid, state->bucket); diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 83f5aa5e133a..281722817a65 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1064,7 +1064,7 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, sk_wmem_alloc_get(sp), rqueue, 0, 0L, 0, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f61b0396ef6b..f0ce62549d90 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2168,7 +2168,7 @@ static void get_openreq6(struct seq_file *seq, jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), - sock_i_uid(req->rsk_listener)), + sk_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); @@ -2234,7 +2234,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, diff --git a/net/key/af_key.c b/net/key/af_key.c index efc2a91f4c48..1f82f69acfde 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3788,7 +3788,7 @@ static int pfkey_seq_show(struct seq_file *f, void *v) refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), - from_kuid_munged(seq_user_ns(f), sock_i_uid(s)), + 
from_kuid_munged(seq_user_ns(f), sk_uid(s)), sock_i_ino(s) ); return 0; diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c index 07e9abb5978a..aa81c67b24a1 100644 --- a/net/llc/llc_proc.c +++ b/net/llc/llc_proc.c @@ -151,7 +151,7 @@ static int llc_seq_socket_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk) - llc->copied_seq, sk->sk_state, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), llc->link); out: return 0; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3d43f3eae759..f6b1ff883c93 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -4783,7 +4783,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) READ_ONCE(po->ifindex), packet_sock_flag(po, PACKET_SOCK_RUNNING), atomic_read(&s->sk_rmem_alloc), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)), + from_kuid_munged(seq_user_ns(seq), sk_uid(s)), sock_i_ino(s)); } diff --git a/net/packet/diag.c b/net/packet/diag.c index 47f69f3dbf73..6ce1dcc284d9 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -153,7 +153,7 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, if ((req->pdiag_show & PACKET_SHOW_INFO) && nla_put_u32(skb, PACKET_DIAG_UID, - from_kuid_munged(user_ns, sock_i_uid(sk)))) + from_kuid_munged(user_ns, sk_uid(sk)))) goto out_nlmsg_trim; if ((req->pdiag_show & PACKET_SHOW_MCLIST) && diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 5ce0b3ee5def..ea4d5e6533db 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -584,7 +584,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, atomic_read(&sk->sk_drops)); @@ -755,7 +755,7 @@ static int pn_res_seq_show(struct 
seq_file *seq, void *v) seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk)); } seq_pad(seq, '\n'); diff --git a/net/sctp/input.c b/net/sctp/input.c index 0c0d2757f6f8..2dc2666988fb 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -756,7 +756,7 @@ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) struct sock *sk2 = ep2->base.sk; if (!net_eq(sock_net(sk2), net) || sk2 == sk || - !uid_eq(sock_i_uid(sk2), sock_i_uid(sk)) || + !uid_eq(sk_uid(sk2), sk_uid(sk)) || !sk2->sk_reuseport) continue; diff --git a/net/sctp/proc.c b/net/sctp/proc.c index ec00ee75d59a..74bff317e205 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -177,7 +177,7 @@ static int sctp_eps_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%8pK %8pK %-3d %-3d %-4d %-5d %5u %5lu ", ep, sk, sctp_sk(sk)->type, sk->sk_state, hash, ep->base.bind_addr.port, - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk)); sctp_seq_dump_local_addrs(seq, &ep->base); @@ -267,7 +267,7 @@ static int sctp_assocs_seq_show(struct seq_file *seq, void *v) assoc->assoc_id, assoc->sndbuf_used, atomic_read(&assoc->rmem_alloc), - from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), + from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), epb->bind_addr.port, assoc->peer.port); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1e5739858c20..aa6400811018 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8345,8 +8345,8 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) bool reuse = (sk->sk_reuse || sp->reuse); struct sctp_bind_hashbucket *head; /* hash list */ struct net *net = sock_net(sk); - kuid_t uid = sock_i_uid(sk); struct sctp_bind_bucket *pp; + kuid_t uid = sk_uid(sk); unsigned short snum; int ret; @@ -8444,7 +8444,7 @@ pp_found: (reuse && (sk2->sk_reuse || sp2->reuse) && 
sk2->sk_state != SCTP_SS_LISTENING) || (sk->sk_reuseport && sk2->sk_reuseport && - uid_eq(uid, sock_i_uid(sk2)))) + uid_eq(uid, sk_uid(sk2)))) continue; if ((!sk->sk_bound_dev_if || !bound_dev_if2 || diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 6fdb2d96777a..8ed2f6689b01 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -64,7 +64,7 @@ static int smc_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, if (nla_put_u8(skb, SMC_DIAG_SHUTDOWN, sk->sk_shutdown)) return 1; - r->diag_uid = from_kuid_munged(user_ns, sock_i_uid(sk)); + r->diag_uid = from_kuid_munged(user_ns, sk_uid(sk)); r->diag_inode = sock_i_ino(sk); return 0; } diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7c61d47ea208..e028bf658499 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3642,7 +3642,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_INO, sock_i_ino(sk)) || nla_put_u32(skb, TIPC_NLA_SOCK_UID, from_kuid_munged(sk_user_ns(NETLINK_CB(cb->skb).sk), - sock_i_uid(sk))) || + sk_uid(sk))) || nla_put_u64_64bit(skb, TIPC_NLA_SOCK_COOKIE, tipc_diag_gen_cookie(sk), TIPC_NLA_SOCK_PAD)) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 22e170fb5dda..1e320f89168d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3682,7 +3682,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) goto unlock; } - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); diff --git a/net/unix/diag.c b/net/unix/diag.c index 79b182d0e62a..ca3473026151 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -106,7 +106,7 @@ static int sk_diag_show_rqlen(struct sock *sk, struct sk_buff *nlskb) static int sk_diag_dump_uid(struct sock *sk, struct sk_buff *nlskb, struct user_namespace *user_ns) { - uid_t uid = from_kuid_munged(user_ns, 
sock_i_uid(sk)); + uid_t uid = from_kuid_munged(user_ns, sk_uid(sk)); return nla_put(nlskb, UNIX_DIAG_UID, sizeof(uid_t), &uid); } diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 09dcea0cbbed..0e0bca031c03 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -119,7 +119,7 @@ static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, if ((req->xdiag_show & XDP_SHOW_INFO) && nla_put_u32(nlskb, XDP_DIAG_UID, - from_kuid_munged(user_ns, sock_i_uid(sk)))) + from_kuid_munged(user_ns, sk_uid(sk)))) goto out_nlmsg_trim; if ((req->xdiag_show & XDP_SHOW_RING_CFG) && -- cgit v1.2.3 From 3169e36ae14802b01abe4bfa7ec593b0a1af5cc7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 2025 15:55:35 +0000 Subject: net: make sk->sk_sndtimeo lockless Followup of commit 285975dd6742 ("net: annotate data-races around sk->sk_{rcv|snd}timeo"). Remove lock_sock()/release_sock() from sock_set_sndtimeo(), and add READ_ONCE()/WRITE_ONCE() where it is needed. Also SO_SNDTIMEO_OLD and SO_SNDTIMEO_NEW can call sock_set_timeout() without holding the socket lock. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250620155536.335520-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 +- net/bluetooth/iso.c | 4 ++-- net/bluetooth/l2cap_sock.c | 4 ++-- net/bluetooth/sco.c | 4 ++-- net/core/sock.c | 12 ++++-------- net/sctp/socket.c | 2 +- net/smc/af_smc.c | 4 ++-- 7 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index bbd97fbc5935..b08e36bf9669 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2601,7 +2601,7 @@ static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_sndtimeo; + return noblock ? 
0 : READ_ONCE(sk->sk_sndtimeo); } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 3c2c98eecc62..34e89bb5f384 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -413,7 +413,7 @@ static int iso_connect_bis(struct sock *sk) sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; - iso_sock_set_timer(sk, sk->sk_sndtimeo); + iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); @@ -503,7 +503,7 @@ static int iso_connect_cis(struct sock *sk) sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; - iso_sock_set_timer(sk, sk->sk_sndtimeo); + iso_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 5aa55fa69594..113656489db5 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -255,7 +255,7 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, err = l2cap_chan_connect(chan, la.l2_psm, __le16_to_cpu(la.l2_cid), &la.l2_bdaddr, la.l2_bdaddr_type, - sk->sk_sndtimeo); + READ_ONCE(sk->sk_sndtimeo)); if (err) return err; @@ -1725,7 +1725,7 @@ static long l2cap_sock_get_sndtimeo_cb(struct l2cap_chan *chan) { struct sock *sk = chan->data; - return sk->sk_sndtimeo; + return READ_ONCE(sk->sk_sndtimeo); } static struct pid *l2cap_sock_get_peer_pid_cb(struct l2cap_chan *chan) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 2945d27e75dc..d382d980fd9a 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -338,7 +338,7 @@ static int sco_connect(struct sock *sk) hcon = hci_connect_sco(hdev, type, &sco_pi(sk)->dst, sco_pi(sk)->setting, &sco_pi(sk)->codec, - sk->sk_sndtimeo); + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -367,7 +367,7 @@ static int sco_connect(struct sock *sk) sk->sk_state = BT_CONNECTED; } else { sk->sk_state = BT_CONNECT; - sco_sock_set_timer(sk, 
sk->sk_sndtimeo); + sco_sock_set_timer(sk, READ_ONCE(sk->sk_sndtimeo)); } release_sock(sk); diff --git a/net/core/sock.c b/net/core/sock.c index ceb74ceecb6c..b0b5a0a76045 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -818,12 +818,10 @@ EXPORT_SYMBOL(sock_set_priority); void sock_set_sndtimeo(struct sock *sk, s64 secs) { - lock_sock(sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); else WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); - release_sock(sk); } EXPORT_SYMBOL(sock_set_sndtimeo); @@ -1287,6 +1285,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_DEVMEM_DONTNEED: return sock_devmem_dontneed(sk, optval, optlen); #endif + case SO_SNDTIMEO_OLD: + case SO_SNDTIMEO_NEW: + return sock_set_timeout(&sk->sk_sndtimeo, optval, + optlen, optname == SO_SNDTIMEO_OLD); } sockopt_lock_sock(sk); @@ -1448,12 +1450,6 @@ set_sndbuf: optlen, optname == SO_RCVTIMEO_OLD); break; - case SO_SNDTIMEO_OLD: - case SO_SNDTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_sndtimeo, optval, - optlen, optname == SO_SNDTIMEO_OLD); - break; - case SO_ATTACH_FILTER: { struct sock_fprog fprog; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index aa6400811018..5b690a4d2969 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9493,7 +9493,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_rcvbuf = sk->sk_rcvbuf; newsk->sk_lingertime = sk->sk_lingertime; newsk->sk_rcvtimeo = sk->sk_rcvtimeo; - newsk->sk_sndtimeo = sk->sk_sndtimeo; + newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo); newsk->sk_rxhash = sk->sk_rxhash; newinet = inet_sk(newsk); diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 3760131f1484..6375a86fe2b5 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -486,7 +486,7 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, { /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; - nsk->sk_sndtimeo = osk->sk_sndtimeo; + 
nsk->sk_sndtimeo = READ_ONCE(osk->sk_sndtimeo); nsk->sk_rcvtimeo = osk->sk_rcvtimeo; nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_priority = READ_ONCE(osk->sk_priority); @@ -1585,7 +1585,7 @@ static void smc_connect_work(struct work_struct *work) { struct smc_sock *smc = container_of(work, struct smc_sock, connect_work); - long timeo = smc->sk.sk_sndtimeo; + long timeo = READ_ONCE(smc->sk.sk_sndtimeo); int rc = 0; if (!timeo) -- cgit v1.2.3 From 935b67675a9f233aa4ac4ae6452b2cc45418d839 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 2025 15:55:36 +0000 Subject: net: make sk->sk_rcvtimeo lockless Followup of commit 285975dd6742 ("net: annotate data-races around sk->sk_{rcv|snd}timeo"). Remove lock_sock()/release_sock() from ksmbd_tcp_rcv_timeout() and add READ_ONCE()/WRITE_ONCE() where it is needed. Also SO_RCVTIMEO_OLD and SO_RCVTIMEO_NEW can call sock_set_timeout() without holding the socket lock. Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250620155536.335520-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- fs/smb/server/transport_tcp.c | 6 ++---- include/net/sock.h | 2 +- net/core/sock.c | 10 ++++------ net/llc/af_llc.c | 6 +++--- net/sctp/socket.c | 2 +- net/smc/af_smc.c | 2 +- net/smc/smc_clc.c | 6 +++--- net/strparser/strparser.c | 2 +- net/x25/af_x25.c | 2 +- 9 files changed, 17 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 4e9f98db9ff4..f8c772a7cb43 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -58,12 +58,10 @@ static inline void ksmbd_tcp_reuseaddr(struct socket *sock) static inline void ksmbd_tcp_rcv_timeout(struct socket *sock, s64 secs) { - lock_sock(sock->sk); if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) - sock->sk->sk_rcvtimeo = secs * HZ; + WRITE_ONCE(sock->sk->sk_rcvtimeo, secs * HZ); else - sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; - release_sock(sock->sk); + 
WRITE_ONCE(sock->sk->sk_rcvtimeo, MAX_SCHEDULE_TIMEOUT); } static inline void ksmbd_tcp_snd_timeout(struct socket *sock, s64 secs) diff --git a/include/net/sock.h b/include/net/sock.h index b08e36bf9669..0f2443d4ec58 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2596,7 +2596,7 @@ static inline gfp_t gfp_memcg_charge(void) static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { - return noblock ? 0 : sk->sk_rcvtimeo; + return noblock ? 0 : READ_ONCE(sk->sk_rcvtimeo); } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) diff --git a/net/core/sock.c b/net/core/sock.c index b0b5a0a76045..3a71d6c4ccf0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1289,6 +1289,10 @@ int sk_setsockopt(struct sock *sk, int level, int optname, case SO_SNDTIMEO_NEW: return sock_set_timeout(&sk->sk_sndtimeo, optval, optlen, optname == SO_SNDTIMEO_OLD); + case SO_RCVTIMEO_OLD: + case SO_RCVTIMEO_NEW: + return sock_set_timeout(&sk->sk_rcvtimeo, optval, + optlen, optname == SO_RCVTIMEO_OLD); } sockopt_lock_sock(sk); @@ -1444,12 +1448,6 @@ set_sndbuf: WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); break; } - case SO_RCVTIMEO_OLD: - case SO_RCVTIMEO_NEW: - ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, - optlen, optname == SO_RCVTIMEO_OLD); - break; - case SO_ATTACH_FILTER: { struct sock_fprog fprog; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index cc77ec5769d8..5958a80fe14c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -210,7 +210,7 @@ static int llc_ui_release(struct socket *sock) dprintk("%s: closing local(%02X) remote(%02X)\n", __func__, llc->laddr.lsap, llc->daddr.lsap); if (!llc_send_disc(sk)) - llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo)); if (!sock_flag(sk, SOCK_ZAPPED)) { struct llc_sap *sap = llc->sap; @@ -455,7 +455,7 @@ static int llc_ui_shutdown(struct socket *sock, int how) goto out; rc = llc_send_disc(sk); if (!rc) - rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + rc = llc_ui_wait_for_disc(sk, READ_ONCE(sk->sk_rcvtimeo)); /* Wake up anyone sleeping in poll */ sk->sk_state_change(sk); out: @@ -712,7 +712,7 @@ static int llc_ui_accept(struct socket *sock, struct socket *newsock, goto out; /* wait for a connection to arrive. 
*/ if (skb_queue_empty(&sk->sk_receive_queue)) { - rc = llc_wait_data(sk, sk->sk_rcvtimeo); + rc = llc_wait_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 5b690a4d2969..4921416434f9 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -9492,7 +9492,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newsk->sk_sndbuf = sk->sk_sndbuf; newsk->sk_rcvbuf = sk->sk_rcvbuf; newsk->sk_lingertime = sk->sk_lingertime; - newsk->sk_rcvtimeo = sk->sk_rcvtimeo; + newsk->sk_rcvtimeo = READ_ONCE(sk->sk_rcvtimeo); newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo); newsk->sk_rxhash = sk->sk_rxhash; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 6375a86fe2b5..8d56e4db63e0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -487,7 +487,7 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, /* options we don't get control via setsockopt for */ nsk->sk_type = osk->sk_type; nsk->sk_sndtimeo = READ_ONCE(osk->sk_sndtimeo); - nsk->sk_rcvtimeo = osk->sk_rcvtimeo; + nsk->sk_rcvtimeo = READ_ONCE(osk->sk_rcvtimeo); nsk->sk_mark = READ_ONCE(osk->sk_mark); nsk->sk_priority = READ_ONCE(osk->sk_priority); nsk->sk_rcvlowat = osk->sk_rcvlowat; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 521f5df80e10..5a4db151fe95 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -688,7 +688,7 @@ out: int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, u8 expected_type, unsigned long timeout) { - long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo; + long rcvtimeo = READ_ONCE(smc->clcsock->sk->sk_rcvtimeo); struct sock *clc_sk = smc->clcsock->sk; struct smc_clc_msg_hdr *clcm = buf; struct msghdr msg = {NULL, 0}; @@ -707,7 +707,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, * sizeof(struct smc_clc_msg_hdr) */ krflags = MSG_PEEK | MSG_WAITALL; - clc_sk->sk_rcvtimeo = timeout; + WRITE_ONCE(clc_sk->sk_rcvtimeo, timeout); iov_iter_kvec(&msg.msg_iter, 
ITER_DEST, &vec, 1, sizeof(struct smc_clc_msg_hdr)); len = sock_recvmsg(smc->clcsock, &msg, krflags); @@ -795,7 +795,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen, } out: - clc_sk->sk_rcvtimeo = rcvtimeo; + WRITE_ONCE(clc_sk->sk_rcvtimeo, rcvtimeo); return reason_code; } diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index d946bfb424c7..43b1f558b33d 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -333,7 +333,7 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, struct strparser *strp = (struct strparser *)desc->arg.data; return __strp_recv(desc, orig_skb, orig_offset, orig_len, - strp->sk->sk_rcvbuf, strp->sk->sk_rcvtimeo); + strp->sk->sk_rcvbuf, READ_ONCE(strp->sk->sk_rcvtimeo)); } static int default_read_sock_done(struct strparser *strp, int err) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 1f8ae9f4a3f1..655d1e0ae25f 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -891,7 +891,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, if (sk->sk_state != TCP_LISTEN) goto out2; - rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); + rc = x25_wait_for_data(sk, READ_ONCE(sk->sk_rcvtimeo)); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); -- cgit v1.2.3 From e581b7fe62218d390520287e0095bfd6fe0454f8 Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Wed, 28 May 2025 11:14:11 +0530 Subject: wifi: mac80211: add support towards MLO handling of station statistics Currently, in supporting API's to fill sinfo structure from sta structure, is mapped to fill the fields from sta->deflink. However, for multi-link (ML) station, sinfo structure should be filled from corresponding link_id. Therefore, add link_id as an additional argument in supporting API's for filling sinfo structure correctly. Link_id is set to -1 for non-ML station and corresponding link_id for ML stations. 
In supporting API's for filling sinfo structure, check for link_id, if link_id < 0, fill the sinfo structure from sta->deflink, otherwise fill from sta->link[link_id]. Current, changes are done at the deflink level i.e, pass -1 as link_id. Actual link_id will be added in subsequent patches to support station statistics for MLO. Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250528054420.3050133-2-quic_sarishar@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/dvm/lib.c | 2 +- include/net/mac80211.h | 3 +- net/mac80211/ibss.c | 4 +- net/mac80211/sta_info.c | 81 +++++++++++++++++++--------- net/mac80211/sta_info.h | 2 +- net/mac80211/util.c | 14 ++++- 6 files changed, 74 insertions(+), 32 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c index 1dc974e2c511..48711dbcfa5a 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/lib.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/lib.c @@ -586,7 +586,7 @@ static bool iwlagn_fill_txpower_mode(struct iwl_priv *priv, return false; } - ave_rssi = ieee80211_ave_rssi(ctx->vif); + ave_rssi = ieee80211_ave_rssi(ctx->vif, -1); if (!ave_rssi) { /* no rssi data, no changes to reduce tx power */ IWL_DEBUG_COEX(priv, "no rssi data available\n"); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 82617579d910..a305e7f9c6b2 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -7242,13 +7242,14 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif); * ieee80211_ave_rssi - report the average RSSI for the specified interface * * @vif: the specified virtual interface + * @link_id: the link ID for MLO, or -1 for non-MLO * * Note: This function assumes that the given vif is valid. * * Return: The average RSSI value for the requested interface, or 0 if not * applicable. 
*/ -int ieee80211_ave_rssi(struct ieee80211_vif *vif); +int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id); /** * ieee80211_report_wowlan_wakeup - report WoWLAN wakeup diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 9ed87d6f5019..6e36b09fe97f 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -635,7 +635,7 @@ static int ieee80211_sta_active_ibss(struct ieee80211_sub_if_data *sdata) rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { - unsigned long last_active = ieee80211_sta_last_active(sta); + unsigned long last_active = ieee80211_sta_last_active(sta, -1); if (sta->sdata == sdata && time_is_after_jiffies(last_active + @@ -1228,7 +1228,7 @@ static void ieee80211_ibss_sta_expire(struct ieee80211_sub_if_data *sdata) lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { - unsigned long last_active = ieee80211_sta_last_active(sta); + unsigned long last_active = ieee80211_sta_last_active(sta, -1); if (sdata != sta->sdata) continue; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 61583173629e..6acbe1a7314b 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -1651,7 +1651,7 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { - unsigned long last_active = ieee80211_sta_last_active(sta); + unsigned long last_active = ieee80211_sta_last_active(sta, -1); if (sdata != sta->sdata) continue; @@ -2420,18 +2420,27 @@ void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, } static struct ieee80211_sta_rx_stats * -sta_get_last_rx_stats(struct sta_info *sta) +sta_get_last_rx_stats(struct sta_info *sta, int link_id) { - struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; + struct ieee80211_sta_rx_stats *stats; + struct link_sta_info *link_sta_info; int cpu; - if (!sta->deflink.pcpu_rx_stats) + if (link_id 
< 0) + link_sta_info = &sta->deflink; + else + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + + stats = &link_sta_info->rx_stats; + + if (!link_sta_info->pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; - cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); + cpustats = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; @@ -2499,9 +2508,10 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, } } -static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) +static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo, + int link_id) { - u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); + u32 rate = READ_ONCE(sta_get_last_rx_stats(sta, link_id)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; @@ -2526,20 +2536,28 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, - int tid) + int tid, int link_id) { struct ieee80211_local *local = sta->local; + struct link_sta_info *link_sta_info; int cpu; + if (link_id < 0) + link_sta_info = &sta->deflink; + else + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { - tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats, - tid); + tidstats->rx_msdu += + sta_get_tidstats_msdu(&link_sta_info->rx_stats, + tid); - if (sta->deflink.pcpu_rx_stats) { + if (link_sta_info->pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; - cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); @@ -2551,19 +2569,21 @@ static void sta_set_tidstats(struct 
sta_info *sta, if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); - tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid]; + tidstats->tx_msdu = link_sta_info->tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); - tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid]; + tidstats->tx_msdu_retries = + link_sta_info->status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); - tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; + tidstats->tx_msdu_failed = + link_sta_info->status_stats.msdu_failed[tid]; } if (tid < IEEE80211_NUM_TIDS) { @@ -2634,7 +2654,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; - last_rxstats = sta_get_last_rx_stats(sta); + last_rxstats = sta_get_last_rx_stats(sta, -1); sinfo->generation = sdata->local->sta_generation; @@ -2662,7 +2682,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = - jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); + jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, -1)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { @@ -2751,7 +2771,8 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); - sinfo->rx_beacon_signal_avg = 
ieee80211_ave_rssi(&sdata->vif); + sinfo->rx_beacon_signal_avg = + ieee80211_ave_rssi(&sdata->vif, -1); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || @@ -2800,13 +2821,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { - if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) + if (sta_set_rate_info_rx(sta, &sinfo->rxrate, -1) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) - sta_set_tidstats(sta, &sinfo->pertid[i], i); + sta_set_tidstats(sta, &sinfo->pertid[i], i, -1); } #ifdef CONFIG_MAC80211_MESH @@ -2889,14 +2910,24 @@ u32 sta_get_expected_throughput(struct sta_info *sta) return thr; } -unsigned long ieee80211_sta_last_active(struct sta_info *sta) +unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id) { - struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); + struct ieee80211_sta_rx_stats *stats; + struct link_sta_info *link_sta_info; + + stats = sta_get_last_rx_stats(sta, link_id); - if (!sta->deflink.status_stats.last_ack || - time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) + if (link_id < 0) + link_sta_info = &sta->deflink; + else + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + + if (!link_sta_info->status_stats.last_ack || + time_after(stats->last_rx, link_sta_info->status_stats.last_ack)) return stats->last_rx; - return sta->deflink.status_stats.last_ack; + + return link_sta_info->status_stats.last_ack; } int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 7a95d8d34fca..e5b91e60405b 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -936,7 +936,7 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta); void 
ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta); void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta); -unsigned long ieee80211_sta_last_active(struct sta_info *sta); +unsigned long ieee80211_sta_last_active(struct sta_info *sta, int link_id); void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ea73a38fb866..24c43a1ef2aa 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -3265,14 +3265,24 @@ int ieee80211_put_srates_elem(struct sk_buff *skb, return 0; } -int ieee80211_ave_rssi(struct ieee80211_vif *vif) +int ieee80211_ave_rssi(struct ieee80211_vif *vif, int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + struct ieee80211_link_data *link_data; if (WARN_ON_ONCE(sdata->vif.type != NL80211_IFTYPE_STATION)) return 0; - return -ewma_beacon_signal_read(&sdata->deflink.u.mgd.ave_beacon_signal); + if (link_id < 0) + link_data = &sdata->deflink; + else + link_data = wiphy_dereference(sdata->local->hw.wiphy, + sdata->link[link_id]); + + if (WARN_ON_ONCE(!link_data)) + return -99; + + return -ewma_beacon_signal_read(&link_data->u.mgd.ave_beacon_signal); } EXPORT_SYMBOL_GPL(ieee80211_ave_rssi); -- cgit v1.2.3 From d2329fff7e527e8b350086be2e7cbf0d190177a3 Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Wed, 28 May 2025 11:14:12 +0530 Subject: wifi: cfg80211: add link_station_info structure to support MLO statistics Current implementation of NL80211_GET_STATION does not work for multi-link operation(MLO) since in case of MLO only deflink (or one of the links) is considered and not all links. Therefore to support for MLO, add link_station_info structure to account link level statistics for station. Additionally, add valid_links in station_info structure to indicate bitmap of valid links for MLO. This will be helpful to check the link related statistics during MLO. 
Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250528054420.3050133-3-quic_sarishar@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 101 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 47b4235eea59..b008357cac03 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2017,6 +2017,99 @@ struct cfg80211_tid_stats { #define IEEE80211_MAX_CHAINS 4 +/** + * struct link_station_info - link station information + * + * Link station information filled by driver for get_station() and + * dump_station(). + * @filled: bit flag of flags using the bits of &enum nl80211_sta_info to + * indicate the relevant values in this struct for them + * @connected_time: time(in secs) since a link of station is last connected + * @inactive_time: time since last activity for link station(tx/rx) + * in milliseconds + * @assoc_at: bootime (ns) of the last association of link of station + * @rx_bytes: bytes (size of MPDUs) received from this link of station + * @tx_bytes: bytes (size of MPDUs) transmitted to this link of station + * @signal: The signal strength, type depends on the wiphy's signal_type. + * For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_. + * @signal_avg: Average signal strength, type depends on the wiphy's + * signal_type. 
For CFG80211_SIGNAL_TYPE_MBM, value is expressed in _dBm_ + * @chains: bitmask for filled values in @chain_signal, @chain_signal_avg + * @chain_signal: per-chain signal strength of last received packet in dBm + * @chain_signal_avg: per-chain signal strength average in dBm + * @txrate: current unicast bitrate from this link of station + * @rxrate: current unicast bitrate to this link of station + * @rx_packets: packets (MSDUs & MMPDUs) received from this link of station + * @tx_packets: packets (MSDUs & MMPDUs) transmitted to this link of station + * @tx_retries: cumulative retry counts (MPDUs) for this link of station + * @tx_failed: number of failed transmissions (MPDUs) (retries exceeded, no ACK) + * @rx_dropped_misc: Dropped for un-specified reason. + * @bss_param: current BSS parameters + * @beacon_loss_count: Number of times beacon loss event has triggered. + * @expected_throughput: expected throughput in kbps (including 802.11 headers) + * towards this station. + * @rx_beacon: number of beacons received from this peer + * @rx_beacon_signal_avg: signal strength average (in dBm) for beacons received + * from this peer + * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer + * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer + * @airtime_weight: current airtime scheduling weight + * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last + * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. + * Note that this doesn't use the @filled bit, but is used if non-NULL. + * @ack_signal: signal strength (in dBm) of the last ACK frame. + * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has + * been sent. + * @rx_mpdu_count: number of MPDUs received from this station + * @fcs_err_count: number of packets (MPDUs) received from this station with + * an FCS error. 
This counter should be incremented only when TA of the + * received packet with an FCS error matches the peer MAC address. + * @addr: For MLO STA connection, filled with address of the link of station. + */ +struct link_station_info { + u64 filled; + u32 connected_time; + u32 inactive_time; + u64 assoc_at; + u64 rx_bytes; + u64 tx_bytes; + s8 signal; + s8 signal_avg; + + u8 chains; + s8 chain_signal[IEEE80211_MAX_CHAINS]; + s8 chain_signal_avg[IEEE80211_MAX_CHAINS]; + + struct rate_info txrate; + struct rate_info rxrate; + u32 rx_packets; + u32 tx_packets; + u32 tx_retries; + u32 tx_failed; + u32 rx_dropped_misc; + struct sta_bss_parameters bss_param; + + u32 beacon_loss_count; + + u32 expected_throughput; + + u64 tx_duration; + u64 rx_duration; + u64 rx_beacon; + u8 rx_beacon_signal_avg; + + u16 airtime_weight; + + s8 ack_signal; + s8 avg_ack_signal; + struct cfg80211_tid_stats *pertid; + + u32 rx_mpdu_count; + u32 fcs_err_count; + + u8 addr[ETH_ALEN] __aligned(2); +}; + /** * struct station_info - station information * @@ -2101,6 +2194,11 @@ struct cfg80211_tid_stats { * dump_station() callbacks. User space needs this information to determine * the accepted and rejected affiliated links of the connected station. * @assoc_resp_ies_len: Length of @assoc_resp_ies buffer in octets. + * @valid_links: bitmap of valid links, or 0 for non-MLO. Drivers fill this + * information in cfg80211_new_sta(), cfg80211_del_sta_sinfo(), + * get_station() and dump_station() callbacks. + * @links: reference to Link sta entries for MLO STA, all link specific + * information is accessed through links[link_id]. 
*/ struct station_info { u64 filled; @@ -2165,6 +2263,9 @@ struct station_info { u8 mld_addr[ETH_ALEN] __aligned(2); const u8 *assoc_resp_ies; size_t assoc_resp_ies_len; + + u16 valid_links; + struct link_station_info *links[IEEE80211_MLD_MAX_NUM_LINKS]; }; /** -- cgit v1.2.3 From 49e47223ecc4af0bd15b5267184d46b3654d520b Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Wed, 28 May 2025 11:14:15 +0530 Subject: wifi: cfg80211: allocate memory for link_station info structure Currently, station_info structure is passed to fill station statistics from mac80211/drivers. After NL message send to user space for requested station statistics, memory for station statistics is freed in cfg80211. Therefore, memory allocation/free for link station statistics should also happen in cfg80211 only. Hence, allocate the memory for link_station structure for all possible links and free in cfg80211_sinfo_release_content(). Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250528054420.3050133-6-quic_sarishar@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ net/wireless/nl80211.c | 27 ++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b008357cac03..7bf0c97d2ab1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -8577,6 +8577,13 @@ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); static inline void cfg80211_sinfo_release_content(struct station_info *sinfo) { kfree(sinfo->pertid); + + for (int link_id = 0; link_id < ARRAY_SIZE(sinfo->links); link_id++) { + if (sinfo->links[link_id]) { + kfree(sinfo->links[link_id]->pertid); + kfree(sinfo->links[link_id]); + } + } } /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 83f6291eac92..5137824520a1 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7366,7 +7366,7 @@ static int nl80211_dump_station(struct 
sk_buff *skb, struct wireless_dev *wdev; u8 mac_addr[ETH_ALEN]; int sta_idx = cb->args[2]; - int err; + int err, i; err = nl80211_prepare_wdev_dump(cb, &rdev, &wdev, NULL); if (err) @@ -7386,6 +7386,16 @@ static int nl80211_dump_station(struct sk_buff *skb, while (1) { memset(&sinfo, 0, sizeof(sinfo)); + + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { + sinfo.links[i] = + kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL); + if (!sinfo.links[i]) { + err = -ENOMEM; + goto out_err; + } + } + err = rdev_dump_station(rdev, wdev->netdev, sta_idx, mac_addr, &sinfo); if (err == -ENOENT) @@ -7410,6 +7420,7 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[2] = sta_idx; err = skb->len; out_err: + cfg80211_sinfo_release_content(&sinfo); wiphy_unlock(&rdev->wiphy); return err; @@ -7422,7 +7433,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) struct station_info sinfo; struct sk_buff *msg; u8 *mac_addr = NULL; - int err; + int err, i; memset(&sinfo, 0, sizeof(sinfo)); @@ -7434,9 +7445,19 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->get_station) return -EOPNOTSUPP; + for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { + sinfo.links[i] = kzalloc(sizeof(*sinfo.links[0]), GFP_KERNEL); + if (!sinfo.links[i]) { + cfg80211_sinfo_release_content(&sinfo); + return -ENOMEM; + } + } + err = rdev_get_station(rdev, dev, mac_addr, &sinfo); - if (err) + if (err) { + cfg80211_sinfo_release_content(&sinfo); return err; + } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { -- cgit v1.2.3 From 505991fba9ec112770c79a0fea56b4c49a5ad2fa Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Wed, 28 May 2025 11:14:18 +0530 Subject: wifi: mac80211: extend support to fill link level sinfo structure Currently, sinfo structure is supported to fill information at deflink( or one of the links) level for station. This has problems when applied to fetch multi-link(ML) station information. 
Hence, if valid_links are present, support filling link_station structure for each link. This will be helpful to check the link related statistics during MLO. Additionally, TXQ stats for pertid are applicable at station level not at link level. Therefore, check that link_id is less than 0 before filling TXQ stats in pertid stats. Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250528054420.3050133-9-quic_sarishar@quicinc.com [fix some indentation] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 ++ net/mac80211/sta_info.c | 260 +++++++++++++++++++++++++++++++++++++++++++++++- net/wireless/util.c | 12 +++ 3 files changed, 281 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 7bf0c97d2ab1..eec066f4738a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -8566,6 +8566,17 @@ void cfg80211_tx_mgmt_expired(struct wireless_dev *wdev, u64 cookie, */ int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp); +/** + * cfg80211_link_sinfo_alloc_tid_stats - allocate per-tid statistics. + * + * @link_sinfo: the link station information + * @gfp: allocation flags + * + * Return: 0 on success. Non-zero on error. 
+ */ +int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, + gfp_t gfp); + /** * cfg80211_sinfo_release_content - release contents of station info * @sinfo: the station information diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cf80b2fc8898..67af43d2e09b 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2634,7 +2634,7 @@ static void sta_set_tidstats(struct sta_info *sta, link_sta_info->status_stats.msdu_failed[tid]; } - if (tid < IEEE80211_NUM_TIDS) { + if (link_id < 0 && tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); @@ -2719,13 +2719,249 @@ void sta_set_accumulated_removed_links_sinfo(struct sta_info *sta, } } +static void sta_set_link_sinfo(struct sta_info *sta, + struct link_station_info *link_sinfo, + struct ieee80211_link_data *link, + bool tidstats) +{ + struct ieee80211_sub_if_data *sdata = sta->sdata; + struct ieee80211_sta_rx_stats *last_rxstats; + int i, ac, cpu, link_id = link->link_id; + struct link_sta_info *link_sta_info; + u32 thr = 0; + + last_rxstats = sta_get_last_rx_stats(sta, link_id); + + link_sta_info = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + + /* do before driver, so beacon filtering drivers have a + * chance to e.g. just add the number of filtered beacons + * (or just modify the value entirely, of course) + */ + if (sdata->vif.type == NL80211_IFTYPE_STATION) + link_sinfo->rx_beacon = link->u.mgd.count_beacon_signal; + + ether_addr_copy(link_sinfo->addr, link_sta_info->addr); + + /* TODO: add drv_link_sta_statistics() ops to fill link_station + * statistics of station. 
+ */ + + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | + BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | + BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); + + if (sdata->vif.type == NL80211_IFTYPE_STATION) { + link_sinfo->beacon_loss_count = + link->u.mgd.beacon_loss_count; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); + } + + link_sinfo->inactive_time = + jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta, link_id)); + + if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { + link_sinfo->tx_bytes = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->tx_bytes += + link_sta_info->tx_stats.bytes[ac]; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { + link_sinfo->tx_packets = 0; + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->tx_packets += + link_sta_info->tx_stats.packets[ac]; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); + } + + if (!(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | + BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { + link_sinfo->rx_bytes += + sta_get_stats_bytes(&link_sta_info->rx_stats); + + if (link_sta_info->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, + cpu); + link_sinfo->rx_bytes += + sta_get_stats_bytes(cpurxs); + } + } + + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { + link_sinfo->rx_packets = link_sta_info->rx_stats.packets; + if (link_sta_info->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, + cpu); + link_sinfo->rx_packets += cpurxs->packets; + } + } + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); + } + + if (!(link_sinfo->filled & 
BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { + link_sinfo->tx_retries = + link_sta_info->status_stats.retry_count; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { + link_sinfo->tx_failed = + link_sta_info->status_stats.retry_failed; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->rx_duration += sta->airtime[ac].rx_airtime; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { + for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) + link_sinfo->tx_duration += sta->airtime[ac].tx_airtime; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { + link_sinfo->airtime_weight = sta->airtime_weight; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); + } + + link_sinfo->rx_dropped_misc = link_sta_info->rx_stats.dropped; + if (link_sta_info->pcpu_rx_stats) { + for_each_possible_cpu(cpu) { + struct ieee80211_sta_rx_stats *cpurxs; + + cpurxs = per_cpu_ptr(link_sta_info->pcpu_rx_stats, + cpu); + link_sinfo->rx_dropped_misc += cpurxs->dropped; + } + } + + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | + BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); + link_sinfo->rx_beacon_signal_avg = + ieee80211_ave_rssi(&sdata->vif, -1); + } + + if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || + ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { + link_sinfo->signal = (s8)last_rxstats->last_signal; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); + } + + if 
(!link_sta_info->pcpu_rx_stats && + !(link_sinfo->filled & + BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { + link_sinfo->signal_avg = + -ewma_signal_read(&link_sta_info->rx_stats_avg.signal); + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); + } + } + + /* for the average - if pcpu_rx_stats isn't set - rxstats must point to + * the sta->rx_stats struct, so the check here is fine with and without + * pcpu statistics + */ + if (last_rxstats->chains && + !(link_sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | + BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); + if (!link_sta_info->pcpu_rx_stats) + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); + + link_sinfo->chains = last_rxstats->chains; + + for (i = 0; i < ARRAY_SIZE(link_sinfo->chain_signal); i++) { + link_sinfo->chain_signal[i] = + last_rxstats->chain_signal_last[i]; + link_sinfo->chain_signal_avg[i] = + -ewma_signal_read( + &link_sta_info->rx_stats_avg.chain_signal[i]); + } + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && + ieee80211_rate_valid(&link_sta_info->tx_stats.last_rate)) { + sta_set_rate_info_tx(sta, &link_sta_info->tx_stats.last_rate, + &link_sinfo->txrate); + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE))) { + if (sta_set_rate_info_rx(sta, &link_sinfo->rxrate, + link_id) == 0) + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_RX_BITRATE); + } + + if (tidstats && !cfg80211_link_sinfo_alloc_tid_stats(link_sinfo, + GFP_KERNEL)) { + for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) + sta_set_tidstats(sta, &link_sinfo->pertid[i], i, + link_id); + } + + link_sinfo->bss_param.flags = 0; + if (sdata->vif.bss_conf.use_cts_prot) + link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; + if (sdata->vif.bss_conf.use_short_preamble) + link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; + if 
(sdata->vif.bss_conf.use_short_slot) + link_sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; + link_sinfo->bss_param.dtim_period = link->conf->dtim_period; + link_sinfo->bss_param.beacon_interval = link->conf->beacon_int; + + thr = sta_get_expected_throughput(sta); + + if (thr != 0) { + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); + link_sinfo->expected_throughput = thr; + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && + link_sta_info->status_stats.ack_signal_filled) { + link_sinfo->ack_signal = + link_sta_info->status_stats.last_ack_signal; + link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); + } + + if (!(link_sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && + link_sta_info->status_stats.ack_signal_filled) { + link_sinfo->avg_ack_signal = + -(s8)ewma_avg_signal_read( + &link_sta_info->status_stats.avg_ack_signal); + link_sinfo->filled |= + BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); + } +} + void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; - int i, ac, cpu; + int i, ac, cpu, link_id; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta, -1); @@ -2963,6 +3199,26 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } + + if (sta->sta.valid_links) { + struct ieee80211_link_data *link; + struct link_sta_info *link_sta; + + ether_addr_copy(sinfo->mld_addr, sta->addr); + for_each_valid_link(sinfo, link_id) { + link_sta = wiphy_dereference(sta->local->hw.wiphy, + sta->link[link_id]); + link = wiphy_dereference(sdata->local->hw.wiphy, + sdata->link[link_id]); + + if (!link_sta || !sinfo->links[link_id] || !link) + continue; + + sinfo->valid_links = sta->sta.valid_links; + sta_set_link_sinfo(sta, sinfo->links[link_id], + link, tidstats); + 
} + } } u32 sta_get_expected_throughput(struct sta_info *sta) diff --git a/net/wireless/util.c b/net/wireless/util.c index e438f883f085..5aff11c35303 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -2650,6 +2650,18 @@ bool cfg80211_does_bw_fit_range(const struct ieee80211_freq_range *freq_range, return false; } +int cfg80211_link_sinfo_alloc_tid_stats(struct link_station_info *link_sinfo, + gfp_t gfp) +{ + link_sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, + sizeof(*link_sinfo->pertid), gfp); + if (!link_sinfo->pertid) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(cfg80211_link_sinfo_alloc_tid_stats); + int cfg80211_sinfo_alloc_tid_stats(struct station_info *sinfo, gfp_t gfp) { sinfo->pertid = kcalloc(IEEE80211_NUM_TIDS + 1, -- cgit v1.2.3 From 4cb1ce7e254adeeeec7ccbb45125307aec4d0f0b Mon Sep 17 00:00:00 2001 From: Sarika Sharma Date: Wed, 28 May 2025 11:14:20 +0530 Subject: wifi: mac80211: add link_sta_statistics ops to fill link station statistics Currently, link station statistics for MLO are filled by mac80211. But some statistics kept by mac80211 might not be accurate, so let the driver pre-fill the link statistics. The driver can fill the values (indicating which fields are filled by setting the filled bitmap in the link_station structure). Statistics that the driver doesn't fill are filled by mac80211. Hence, add link_sta_statistics callback to fill link station statistics for MLO in sta_set_link_sinfo() by drivers. 
Signed-off-by: Sarika Sharma Link: https://patch.msgid.link/20250528054420.3050133-11-quic_sarishar@quicinc.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 13 +++++++++++++ net/mac80211/driver-ops.h | 19 +++++++++++++++++++ net/mac80211/sta_info.c | 6 +++--- net/mac80211/trace.h | 27 +++++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a305e7f9c6b2..fa2325692abf 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4133,6 +4133,15 @@ struct ieee80211_prep_tx_info { * Statistics that the driver doesn't fill will be filled by mac80211. * The callback can sleep. * + * @link_sta_statistics: Get link statistics for this station. For example with + * beacon filtering, the statistics kept by mac80211 might not be + * accurate, so let the driver pre-fill the statistics. The driver can + * fill most of the values (indicating which by setting the filled + * bitmap), but not all of them make sense - see the source for which + * ones are possible. + * Statistics that the driver doesn't fill will be filled by mac80211. + * The callback can sleep. + * * @conf_tx: Configure TX queue parameters (EDCF (aifs, cw_min, cw_max), * bursting) for a hardware TX queue. * Returns a negative error code on failure. 
@@ -4627,6 +4636,10 @@ struct ieee80211_ops { s64 offset); void (*reset_tsf)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); int (*tx_last_beacon)(struct ieee80211_hw *hw); + void (*link_sta_statistics)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct link_station_info *link_sinfo); /** * @ampdu_action: diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 307587c8a003..ba017bf3fd15 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -631,6 +631,25 @@ static inline void drv_sta_statistics(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline void drv_link_sta_statistics(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, + struct link_station_info *link_sinfo) +{ + might_sleep(); + lockdep_assert_wiphy(local->hw.wiphy); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + trace_drv_link_sta_statistics(local, sdata, link_sta); + if (local->ops->link_sta_statistics) + local->ops->link_sta_statistics(&local->hw, &sdata->vif, + link_sta, link_sinfo); + trace_drv_return_void(local); +} + int drv_conf_tx(struct ieee80211_local *local, struct ieee80211_link_data *link, u16 ac, const struct ieee80211_tx_queue_params *params); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 67af43d2e09b..89cf365b07e6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2744,9 +2744,9 @@ static void sta_set_link_sinfo(struct sta_info *sta, ether_addr_copy(link_sinfo->addr, link_sta_info->addr); - /* TODO: add drv_link_sta_statistics() ops to fill link_station - * statistics of station. 
- */ + drv_link_sta_statistics(sta->local, sdata, + link_sta_info->pub, + link_sinfo); link_sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 72fad8ea8bb9..8215ca58ce5e 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -1002,6 +1002,33 @@ DEFINE_EVENT(sta_event, drv_sta_statistics, TP_ARGS(local, sdata, sta) ); +TRACE_EVENT(drv_link_sta_statistics, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta), + + TP_ARGS(local, sdata, link_sta), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u32, link_id) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + STA_NAMED_ASSIGN(link_sta->sta); + __entry->link_id = link_sta->link_id; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " (link %d)", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id + ) +); + DEFINE_EVENT(sta_event, drv_sta_add, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, -- cgit v1.2.3 From b74947b4f6ff7c122a1bb6eb38bb7ecfbb1d3820 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Sun, 15 Jun 2025 13:53:09 +0530 Subject: wifi: cfg80211/mac80211: Add support to get radio index Currently, per-radio attributes are set on per-phy basis, i.e., all the radios present in a wiphy will take attributes values sent from user. But each radio in a wiphy can get different values from userspace based on its requirement. To extend support to set per-radio attributes, add support to get radio index from userspace. Add an NL attribute - NL80211_ATTR_WIPHY_RADIO_INDEX, to get user specified radio index for which attributes should be changed. Pass this to individual drivers, so that the drivers can use this radio index to change per-radio attributes when necessary. 
Currently, per-radio attributes identified are: NL80211_ATTR_WIPHY_TX_POWER_LEVEL NL80211_ATTR_WIPHY_ANTENNA_TX NL80211_ATTR_WIPHY_ANTENNA_RX NL80211_ATTR_WIPHY_RETRY_SHORT NL80211_ATTR_WIPHY_RETRY_LONG NL80211_ATTR_WIPHY_FRAG_THRESHOLD NL80211_ATTR_WIPHY_RTS_THRESHOLD NL80211_ATTR_WIPHY_COVERAGE_CLASS NL80211_ATTR_TXQ_LIMIT NL80211_ATTR_TXQ_MEMORY_LIMIT NL80211_ATTR_TXQ_QUANTUM By default, the radio index is set to -1. This means the attribute should be treated as a global configuration. If the user has not specified any index, then the radio index passed to individual drivers would be -1. This would indicate that the attribute applies to all radios in that wiphy. Signed-off-by: Roopni Devanathan Link: https://patch.msgid.link/20250615082312.619639-2-quic_rdevanat@quicinc.com Signed-off-by: Johannes Berg --- drivers/net/wireless/admtek/adm8211.c | 2 +- drivers/net/wireless/ath/ar5523/ar5523.c | 5 +- drivers/net/wireless/ath/ath10k/core.c | 2 +- drivers/net/wireless/ath/ath10k/hw.c | 1 + drivers/net/wireless/ath/ath10k/hw.h | 2 +- drivers/net/wireless/ath/ath10k/mac.c | 19 ++++-- drivers/net/wireless/ath/ath11k/mac.c | 14 ++-- drivers/net/wireless/ath/ath12k/mac.c | 14 ++-- drivers/net/wireless/ath/ath5k/mac80211-ops.c | 12 ++-- drivers/net/wireless/ath/ath6kl/cfg80211.c | 7 +- drivers/net/wireless/ath/ath9k/htc_drv_main.c | 10 +-- drivers/net/wireless/ath/ath9k/main.c | 9 ++- drivers/net/wireless/ath/carl9170/main.c | 2 +- drivers/net/wireless/ath/wcn36xx/main.c | 5 +- drivers/net/wireless/ath/wil6210/cfg80211.c | 3 +- drivers/net/wireless/atmel/at76c50x-usb.c | 2 +- drivers/net/wireless/broadcom/b43/main.c | 6 +- drivers/net/wireless/broadcom/b43legacy/main.c | 2 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 8 ++- .../broadcom/brcm80211/brcmsmac/mac80211_if.c | 3 +- drivers/net/wireless/intel/iwlegacy/common.c | 2 +- drivers/net/wireless/intel/iwlegacy/common.h | 2 +- drivers/net/wireless/intel/iwlwifi/dvm/agn.h | 2 +- 
drivers/net/wireless/intel/iwlwifi/dvm/rxon.c | 2 +- drivers/net/wireless/intel/iwlwifi/mld/mac80211.c | 6 +- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 9 ++- drivers/net/wireless/intel/iwlwifi/mvm/mvm.h | 12 ++-- drivers/net/wireless/intersil/p54/main.c | 3 +- drivers/net/wireless/marvell/libertas_tf/main.c | 2 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 11 ++- drivers/net/wireless/marvell/mwl8k.c | 12 ++-- drivers/net/wireless/mediatek/mt76/mac80211.c | 3 +- drivers/net/wireless/mediatek/mt76/mt76.h | 3 +- drivers/net/wireless/mediatek/mt76/mt7603/main.c | 5 +- drivers/net/wireless/mediatek/mt76/mt7615/main.c | 11 +-- drivers/net/wireless/mediatek/mt76/mt76x0/main.c | 2 +- drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h | 2 +- drivers/net/wireless/mediatek/mt76/mt76x02.h | 4 +- drivers/net/wireless/mediatek/mt76/mt76x02_util.c | 4 +- .../net/wireless/mediatek/mt76/mt76x2/pci_main.c | 6 +- .../net/wireless/mediatek/mt76/mt76x2/usb_main.c | 2 +- drivers/net/wireless/mediatek/mt76/mt7915/main.c | 13 ++-- drivers/net/wireless/mediatek/mt76/mt7921/main.c | 8 ++- drivers/net/wireless/mediatek/mt76/mt7925/main.c | 8 ++- drivers/net/wireless/mediatek/mt76/mt792x.h | 3 +- drivers/net/wireless/mediatek/mt76/mt792x_core.c | 3 +- drivers/net/wireless/mediatek/mt76/mt7996/main.c | 11 +-- drivers/net/wireless/mediatek/mt7601u/main.c | 5 +- drivers/net/wireless/microchip/wilc1000/cfg80211.c | 7 +- drivers/net/wireless/purelifi/plfxlc/mac.c | 5 +- drivers/net/wireless/quantenna/qtnfmac/cfg80211.c | 8 ++- drivers/net/wireless/ralink/rt2x00/rt2800lib.c | 2 +- drivers/net/wireless/ralink/rt2x00/rt2800lib.h | 3 +- drivers/net/wireless/ralink/rt2x00/rt2x00.h | 8 ++- drivers/net/wireless/ralink/rt2x00/rt2x00mac.c | 8 ++- drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c | 2 +- drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c | 2 +- drivers/net/wireless/realtek/rtl8xxxu/core.c | 8 ++- drivers/net/wireless/realtek/rtlwifi/core.c | 2 +- 
drivers/net/wireless/realtek/rtw88/mac80211.c | 9 ++- drivers/net/wireless/realtek/rtw88/main.h | 2 +- drivers/net/wireless/realtek/rtw88/rtw8822b.c | 1 + drivers/net/wireless/realtek/rtw88/rtw8822c.c | 1 + drivers/net/wireless/realtek/rtw89/mac80211.c | 10 +-- drivers/net/wireless/rsi/rsi_91x_mac80211.c | 9 ++- drivers/net/wireless/silabs/wfx/sta.c | 4 +- drivers/net/wireless/silabs/wfx/sta.h | 4 +- drivers/net/wireless/st/cw1200/sta.c | 5 +- drivers/net/wireless/st/cw1200/sta.h | 5 +- drivers/net/wireless/ti/wl1251/main.c | 5 +- drivers/net/wireless/ti/wlcore/main.c | 8 ++- drivers/net/wireless/virtual/mac80211_hwsim.c | 6 +- drivers/net/wireless/zydas/zd1211rw/zd_mac.c | 2 +- drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c | 6 +- include/net/cfg80211.h | 12 ++-- include/net/mac80211.h | 17 +++-- include/uapi/linux/nl80211.h | 10 +++ net/mac80211/cfg.c | 30 +++++--- net/mac80211/chan.c | 2 +- net/mac80211/driver-ops.h | 36 +++++----- net/mac80211/ieee80211_i.h | 5 +- net/mac80211/iface.c | 6 +- net/mac80211/main.c | 9 +-- net/mac80211/mlme.c | 12 ++-- net/mac80211/offchannel.c | 2 +- net/mac80211/pm.c | 2 +- net/mac80211/trace.h | 78 ++++++++++++++++----- net/mac80211/tx.c | 4 +- net/mac80211/util.c | 16 ++--- net/wireless/nl80211.c | 26 +++++-- net/wireless/rdev-ops.h | 39 ++++++----- net/wireless/trace.h | 79 +++++++++++++++------- net/wireless/wext-compat.c | 10 +-- 93 files changed, 520 insertions(+), 291 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/admtek/adm8211.c b/drivers/net/wireless/admtek/adm8211.c index a2d87c3ad196..e94a6b180314 100644 --- a/drivers/net/wireless/admtek/adm8211.c +++ b/drivers/net/wireless/admtek/adm8211.c @@ -1293,7 +1293,7 @@ static void adm8211_set_bssid(struct ieee80211_hw *dev, const u8 *bssid) ADM8211_CSR_WRITE(ABDA1, reg); } -static int adm8211_config(struct ieee80211_hw *dev, u32 changed) +static int adm8211_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { struct adm8211_priv *priv 
= dev->priv; struct ieee80211_conf *conf = &dev->conf; diff --git a/drivers/net/wireless/ath/ar5523/ar5523.c b/drivers/net/wireless/ath/ar5523/ar5523.c index 343c9de2749c..1230e6278f23 100644 --- a/drivers/net/wireless/ath/ar5523/ar5523.c +++ b/drivers/net/wireless/ath/ar5523/ar5523.c @@ -1083,7 +1083,8 @@ static void ar5523_stop(struct ieee80211_hw *hw, bool suspend) mutex_unlock(&ar->mutex); } -static int ar5523_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ar5523_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct ar5523 *ar = hw->priv; int ret; @@ -1137,7 +1138,7 @@ static void ar5523_remove_interface(struct ieee80211_hw *hw, ar->vif = NULL; } -static int ar5523_hwconfig(struct ieee80211_hw *hw, u32 changed) +static int ar5523_hwconfig(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ar5523 *ar = hw->priv; diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index fe3a8f4a1cc1..52163c2bfe7a 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -2606,7 +2606,7 @@ static void ath10k_core_set_coverage_class_work(struct work_struct *work) set_coverage_class_work); if (ar->hw_params.hw_ops->set_coverage_class) - ar->hw_params.hw_ops->set_coverage_class(ar, -1); + ar->hw_params.hw_ops->set_coverage_class(ar, -1, -1); } static int ath10k_core_init_firmware_features(struct ath10k *ar) diff --git a/drivers/net/wireless/ath/ath10k/hw.c b/drivers/net/wireless/ath/ath10k/hw.c index 84b35a22fc23..59b6cebfdd8f 100644 --- a/drivers/net/wireless/ath/ath10k/hw.c +++ b/drivers/net/wireless/ath/ath10k/hw.c @@ -590,6 +590,7 @@ void ath10k_hw_fill_survey_time(struct ath10k *ar, struct survey_info *survey, * function monitors and modifies the corresponding MAC registers. 
*/ static void ath10k_hw_qca988x_set_coverage_class(struct ath10k *ar, + int radio_idx, s16 value) { u32 slottime_reg; diff --git a/drivers/net/wireless/ath/ath10k/hw.h b/drivers/net/wireless/ath/ath10k/hw.h index 7ffa1fbe2874..fec56b916497 100644 --- a/drivers/net/wireless/ath/ath10k/hw.h +++ b/drivers/net/wireless/ath/ath10k/hw.h @@ -646,7 +646,7 @@ struct htt_rx_ring_rx_desc_offsets; /* Defines needed for Rx descriptor abstraction */ struct ath10k_hw_ops { - void (*set_coverage_class)(struct ath10k *ar, s16 value); + void (*set_coverage_class)(struct ath10k *ar, int radio_idx, s16 value); int (*enable_pll_clk)(struct ath10k *ar); int (*tx_data_rssi_pad_bytes)(struct htt_resp *htt); int (*is_rssi_enable)(struct htt_resp *resp); diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c index 07fe05384cdf..590d7a8dd399 100644 --- a/drivers/net/wireless/ath/ath10k/mac.c +++ b/drivers/net/wireless/ath/ath10k/mac.c @@ -4820,7 +4820,8 @@ void ath10k_halt(struct ath10k *ar) spin_unlock_bh(&ar->data_lock); } -static int ath10k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath10k_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath10k *ar = hw->priv; @@ -5067,7 +5068,8 @@ static int __ath10k_set_antenna(struct ath10k *ar, u32 tx_ant, u32 rx_ant) return 0; } -static int ath10k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath10k_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath10k *ar = hw->priv; int ret; @@ -5437,7 +5439,7 @@ static int ath10k_config_ps(struct ath10k *ar) return ret; } -static int ath10k_config(struct ieee80211_hw *hw, u32 changed) +static int ath10k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath10k *ar = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -6336,7 +6338,8 @@ static void ath10k_bss_info_changed(struct ieee80211_hw *hw, 
mutex_unlock(&ar->conf_mutex); } -static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, s16 value) +static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 value) { struct ath10k *ar = hw->priv; @@ -6347,7 +6350,7 @@ static void ath10k_mac_op_set_coverage_class(struct ieee80211_hw *hw, s16 value) WARN_ON_ONCE(1); return; } - ar->hw_params.hw_ops->set_coverage_class(ar, value); + ar->hw_params.hw_ops->set_coverage_class(ar, -1, value); } struct ath10k_mac_tdls_iter_data { @@ -8035,7 +8038,8 @@ static int ath10k_cancel_remain_on_channel(struct ieee80211_hw *hw, * in ath10k, but device-specific in mac80211. */ -static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct ath10k *ar = hw->priv; struct ath10k_vif *arvif; @@ -8058,7 +8062,8 @@ static int ath10k_set_rts_threshold(struct ieee80211_hw *hw, u32 value) return ret; } -static int ath10k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value) +static int ath10k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { /* Even though there's a WMI enum for fragmentation threshold no known * firmware actually implements it. 
Moreover it is not possible to rely diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c index 13301ca317a5..758ef6f26432 100644 --- a/drivers/net/wireless/ath/ath11k/mac.c +++ b/drivers/net/wireless/ath/ath11k/mac.c @@ -1283,7 +1283,7 @@ static int ath11k_mac_config_ps(struct ath11k *ar) return ret; } -static int ath11k_mac_op_config(struct ieee80211_hw *hw, u32 changed) +static int ath11k_mac_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath11k *ar = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -7044,7 +7044,8 @@ static void ath11k_mac_op_configure_filter(struct ieee80211_hw *hw, mutex_unlock(&ar->conf_mutex); } -static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath11k *ar = hw->priv; @@ -7058,7 +7059,8 @@ static int ath11k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 * return 0; } -static int ath11k_mac_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath11k_mac_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath11k *ar = hw->priv; int ret; @@ -8182,7 +8184,8 @@ ath11k_set_vdev_param_to_all_vifs(struct ath11k *ar, int param, u32 value) /* mac80211 stores device specific RTS/Fragmentation threshold value, * this is set interface specific to firmware from ath11k driver */ -static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { struct ath11k *ar = hw->priv; int param_id = WMI_VDEV_PARAM_RTS_THRESHOLD; @@ -8190,7 +8193,8 @@ static int ath11k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) return ath11k_set_vdev_param_to_all_vifs(ar, param_id, value); } -static int ath11k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 
value) +static int ath11k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { /* Even though there's a WMI vdev param for fragmentation threshold no * known firmware actually implements it. Moreover it is not possible to diff --git a/drivers/net/wireless/ath/ath12k/mac.c b/drivers/net/wireless/ath/ath12k/mac.c index 59ec422992d3..81c6b80fa890 100644 --- a/drivers/net/wireless/ath/ath12k/mac.c +++ b/drivers/net/wireless/ath/ath12k/mac.c @@ -1392,7 +1392,7 @@ err: return ret; } -static int ath12k_mac_op_config(struct ieee80211_hw *hw, u32 changed) +static int ath12k_mac_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } @@ -9354,7 +9354,8 @@ static void ath12k_mac_op_configure_filter(struct ieee80211_hw *hw, ar->filter_flags = *total_flags; } -static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath12k_hw *ah = ath12k_hw_to_ah(hw); int antennas_rx = 0, antennas_tx = 0; @@ -9374,7 +9375,8 @@ static int ath12k_mac_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 * return 0; } -static int ath12k_mac_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath12k_mac_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath12k_hw *ah = ath12k_hw_to_ah(hw); struct ath12k *ar; @@ -10735,7 +10737,8 @@ ath12k_set_vdev_param_to_all_vifs(struct ath12k *ar, int param, u32 value) /* mac80211 stores device specific RTS/Fragmentation threshold value, * this is set interface specific to firmware from ath12k driver */ -static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { struct ath12k_hw *ah = ath12k_hw_to_ah(hw); struct ath12k *ar; @@ -10760,7 +10763,8 @@ static int 
ath12k_mac_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) return ret; } -static int ath12k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value) +static int ath12k_mac_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { /* Even though there's a WMI vdev param for fragmentation threshold no * known firmware actually implements it. Moreover it is not possible to diff --git a/drivers/net/wireless/ath/ath5k/mac80211-ops.c b/drivers/net/wireless/ath/ath5k/mac80211-ops.c index d81b2ad0b095..eca8145d3874 100644 --- a/drivers/net/wireless/ath/ath5k/mac80211-ops.c +++ b/drivers/net/wireless/ath/ath5k/mac80211-ops.c @@ -192,7 +192,7 @@ ath5k_remove_interface(struct ieee80211_hw *hw, * TODO: Phy disable/diversity etc */ static int -ath5k_config(struct ieee80211_hw *hw, u32 changed) +ath5k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath5k_hw *ah = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -686,6 +686,7 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) * ath5k_set_coverage_class - Set IEEE 802.11 coverage class * * @hw: struct ieee80211_hw pointer + * @radio_idx: Radio index * @coverage_class: IEEE 802.11 coverage class number * * Mac80211 callback. Sets slot time, ACK timeout and CTS timeout for given @@ -693,7 +694,8 @@ ath5k_get_survey(struct ieee80211_hw *hw, int idx, struct survey_info *survey) * reset. 
*/ static void -ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +ath5k_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct ath5k_hw *ah = hw->priv; @@ -704,7 +706,8 @@ ath5k_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) static int -ath5k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +ath5k_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, + u32 rx_ant) { struct ath5k_hw *ah = hw->priv; @@ -721,7 +724,8 @@ ath5k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) static int -ath5k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +ath5k_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath5k_hw *ah = hw->priv; diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 8c2e8081112e..88f0197fc041 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -1376,7 +1376,8 @@ void ath6kl_cfg80211_tkip_micerr_event(struct ath6kl_vif *vif, u8 keyid, GFP_KERNEL); } -static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct ath6kl *ar = (struct ath6kl *)wiphy_priv(wiphy); struct ath6kl_vif *vif; @@ -1405,6 +1406,7 @@ static int ath6kl_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) static int ath6kl_cfg80211_set_txpower(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm) { @@ -1441,6 +1443,7 @@ static int ath6kl_cfg80211_set_txpower(struct wiphy *wiphy, static int ath6kl_cfg80211_get_txpower(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { @@ -3242,7 +3245,7 @@ static int ath6kl_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, wait, buf, len, no_cck); } -static int 
ath6kl_get_antenna(struct wiphy *wiphy, +static int ath6kl_get_antenna(struct wiphy *wiphy, int radio_idx, u32 *tx_ant, u32 *rx_ant) { struct ath6kl *ar = wiphy_priv(wiphy); diff --git a/drivers/net/wireless/ath/ath9k/htc_drv_main.c b/drivers/net/wireless/ath/ath9k/htc_drv_main.c index 19600018e562..0d6272ac0dac 100644 --- a/drivers/net/wireless/ath/ath9k/htc_drv_main.c +++ b/drivers/net/wireless/ath/ath9k/htc_drv_main.c @@ -1172,7 +1172,7 @@ static void ath9k_htc_remove_interface(struct ieee80211_hw *hw, mutex_unlock(&priv->mutex); } -static int ath9k_htc_config(struct ieee80211_hw *hw, u32 changed) +static int ath9k_htc_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath9k_htc_priv *priv = hw->priv; struct ath_common *common = ath9k_hw_common(priv->ah); @@ -1737,12 +1737,14 @@ static void ath9k_htc_sw_scan_complete(struct ieee80211_hw *hw, mutex_unlock(&priv->mutex); } -static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int ath9k_htc_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { return 0; } static void ath9k_htc_set_coverage_class(struct ieee80211_hw *hw, + int radio_idx, s16 coverage_class) { struct ath9k_htc_priv *priv = hw->priv; @@ -1841,8 +1843,8 @@ struct base_eep_header *ath9k_htc_get_eeprom_base(struct ath9k_htc_priv *priv) } -static int ath9k_htc_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, - u32 *rx_ant) +static int ath9k_htc_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath9k_htc_priv *priv = hw->priv; struct base_eep_header *pBase = ath9k_htc_get_eeprom_base(priv); diff --git a/drivers/net/wireless/ath/ath9k/main.c b/drivers/net/wireless/ath/ath9k/main.c index c56f4f3b8990..740a6fc7b067 100644 --- a/drivers/net/wireless/ath/ath9k/main.c +++ b/drivers/net/wireless/ath/ath9k/main.c @@ -1484,7 +1484,7 @@ static void ath9k_disable_ps(struct ath_softc *sc) ath_dbg(common, PS, "PowerSave disabled\n"); } -static int 
ath9k_config(struct ieee80211_hw *hw, u32 changed) +static int ath9k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ath_softc *sc = hw->priv; struct ath_hw *ah = sc->sc_ah; @@ -2114,6 +2114,7 @@ static void ath9k_enable_dynack(struct ath_softc *sc) } static void ath9k_set_coverage_class(struct ieee80211_hw *hw, + int radio_idx, s16 coverage_class) { struct ath_softc *sc = hw->priv; @@ -2338,7 +2339,8 @@ static bool validate_antenna_mask(struct ath_hw *ah, u32 val) } } -static int ath9k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +static int ath9k_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ath_softc *sc = hw->priv; struct ath_hw *ah = sc->sc_ah; @@ -2367,7 +2369,8 @@ static int ath9k_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) return 0; } -static int ath9k_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +static int ath9k_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ath_softc *sc = hw->priv; diff --git a/drivers/net/wireless/ath/carl9170/main.c b/drivers/net/wireless/ath/carl9170/main.c index 755c068e4197..a7a9345f3483 100644 --- a/drivers/net/wireless/ath/carl9170/main.c +++ b/drivers/net/wireless/ath/carl9170/main.c @@ -890,7 +890,7 @@ static void carl9170_stat_work(struct work_struct *work) round_jiffies(msecs_to_jiffies(CARL9170_STAT_WORK))); } -static int carl9170_op_config(struct ieee80211_hw *hw, u32 changed) +static int carl9170_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ar9170 *ar = hw->priv; int err = 0; diff --git a/drivers/net/wireless/ath/wcn36xx/main.c b/drivers/net/wireless/ath/wcn36xx/main.c index 94d08d6ae1a3..02a525645bfa 100644 --- a/drivers/net/wireless/ath/wcn36xx/main.c +++ b/drivers/net/wireless/ath/wcn36xx/main.c @@ -361,7 +361,7 @@ static void wcn36xx_change_opchannel(struct wcn36xx *wcn, int ch) return; } -static int wcn36xx_config(struct ieee80211_hw *hw, 
u32 changed) +static int wcn36xx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct wcn36xx *wcn = hw->priv; int ret; @@ -965,7 +965,8 @@ out: } /* this is required when using IEEE80211_HW_HAS_RATE_CONTROL */ -static int wcn36xx_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int wcn36xx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct wcn36xx *wcn = hw->priv; wcn36xx_dbg(WCN36XX_DBG_MAC, "mac set RTS threshold %d\n", value); diff --git a/drivers/net/wireless/ath/wil6210/cfg80211.c b/drivers/net/wireless/ath/wil6210/cfg80211.c index 5473c01cbe66..7703a0933a14 100644 --- a/drivers/net/wireless/ath/wil6210/cfg80211.c +++ b/drivers/net/wireless/ath/wil6210/cfg80211.c @@ -1408,7 +1408,8 @@ static int wil_cfg80211_disconnect(struct wiphy *wiphy, return rc; } -static int wil_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int wil_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct wil6210_priv *wil = wiphy_to_wil(wiphy); int rc; diff --git a/drivers/net/wireless/atmel/at76c50x-usb.c b/drivers/net/wireless/atmel/at76c50x-usb.c index 6842c2b02b39..aa683eacaf38 100644 --- a/drivers/net/wireless/atmel/at76c50x-usb.c +++ b/drivers/net/wireless/atmel/at76c50x-usb.c @@ -2002,7 +2002,7 @@ exit: return 0; } -static int at76_config(struct ieee80211_hw *hw, u32 changed) +static int at76_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct at76_priv *priv = hw->priv; diff --git a/drivers/net/wireless/broadcom/b43/main.c b/drivers/net/wireless/broadcom/b43/main.c index 7529afd24aed..f1a77c4c445f 100644 --- a/drivers/net/wireless/broadcom/b43/main.c +++ b/drivers/net/wireless/broadcom/b43/main.c @@ -3975,7 +3975,7 @@ static void b43_set_retry_limits(struct b43_wldev *dev, long_retry); } -static int b43_op_config(struct ieee80211_hw *hw, u32 changed) +static int b43_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct b43_wl *wl = 
hw_to_b43_wl(hw); struct b43_wldev *dev = wl->current_dev; @@ -5073,7 +5073,7 @@ static int b43_op_start(struct ieee80211_hw *hw) * may hang the system. */ if (!err) - b43_op_config(hw, ~0); + b43_op_config(hw, -1, ~0); return err; } @@ -5248,7 +5248,7 @@ out: } /* reload configuration */ - b43_op_config(wl->hw, ~0); + b43_op_config(wl->hw, -1, ~0); if (wl->vif) b43_op_bss_info_changed(wl->hw, wl->vif, &wl->vif->bss_conf, ~0); diff --git a/drivers/net/wireless/broadcom/b43legacy/main.c b/drivers/net/wireless/broadcom/b43legacy/main.c index 2370a2e6a2e3..aada342e0b80 100644 --- a/drivers/net/wireless/broadcom/b43legacy/main.c +++ b/drivers/net/wireless/broadcom/b43legacy/main.c @@ -2662,7 +2662,7 @@ static void b43legacy_set_retry_limits(struct b43legacy_wldev *dev, b43legacy_shm_write16(dev, B43legacy_SHM_WIRELESS, 0x0007, long_retry); } -static int b43legacy_op_dev_config(struct ieee80211_hw *hw, +static int b43legacy_op_dev_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct b43legacy_wl *wl = hw_to_b43legacy_wl(hw); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 5a0b252dfeaf..40a9a8177de6 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -1637,7 +1637,8 @@ static s32 brcmf_set_retry(struct net_device *ndev, u32 retry, bool l) return err; } -static s32 brcmf_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static s32 brcmf_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct net_device *ndev = cfg_to_ndev(cfg); @@ -2645,7 +2646,8 @@ brcmf_cfg80211_disconnect(struct wiphy *wiphy, struct net_device *ndev, static s32 brcmf_cfg80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, s32 mbm) + int radio_idx, enum 
nl80211_tx_power_setting type, + s32 mbm) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct net_device *ndev = cfg_to_ndev(cfg); @@ -2696,7 +2698,7 @@ done: static s32 brcmf_cfg80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, s32 *dbm) + int radio_idx, unsigned int link_id, s32 *dbm) { struct brcmf_cfg80211_info *cfg = wiphy_to_cfg(wiphy); struct brcmf_cfg80211_vif *vif = wdev_to_vif(wdev); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c index 1c3d29dca424..8ab452cf48c4 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmsmac/mac80211_if.c @@ -525,7 +525,8 @@ brcms_ops_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif) spin_unlock_bh(&wl->lock); } -static int brcms_ops_config(struct ieee80211_hw *hw, u32 changed) +static int brcms_ops_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { struct ieee80211_conf *conf = &hw->conf; struct brcms_info *wl = hw->priv; diff --git a/drivers/net/wireless/intel/iwlegacy/common.c b/drivers/net/wireless/intel/iwlegacy/common.c index 9a86688aea67..b7bd3ec4cc50 100644 --- a/drivers/net/wireless/intel/iwlegacy/common.c +++ b/drivers/net/wireless/intel/iwlegacy/common.c @@ -4990,7 +4990,7 @@ il_update_qos(struct il_priv *il) * il_mac_config - mac80211 config callback */ int -il_mac_config(struct ieee80211_hw *hw, u32 changed) +il_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct il_priv *il = hw->priv; const struct il_channel_info *ch_info; diff --git a/drivers/net/wireless/intel/iwlegacy/common.h b/drivers/net/wireless/intel/iwlegacy/common.h index 52610f5e57a3..4c9836ab11dd 100644 --- a/drivers/net/wireless/intel/iwlegacy/common.h +++ b/drivers/net/wireless/intel/iwlegacy/common.h @@ -1956,7 +1956,7 @@ il_get_hw_mode(struct il_priv *il, enum nl80211_band band) 
} /* mac80211 handlers */ -int il_mac_config(struct ieee80211_hw *hw, u32 changed); +int il_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); void il_mac_reset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void il_mac_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *bss_conf, u64 changes); diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h index 1ebc7effcc2a..b39bf401567f 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/agn.h +++ b/drivers/net/wireless/intel/iwlwifi/dvm/agn.h @@ -88,7 +88,7 @@ void iwl_connection_init_rx_config(struct iwl_priv *priv, int iwlagn_set_pan_params(struct iwl_priv *priv); int iwlagn_commit_rxon(struct iwl_priv *priv, struct iwl_rxon_context *ctx); void iwlagn_set_rxon_chain(struct iwl_priv *priv, struct iwl_rxon_context *ctx); -int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed); +int iwlagn_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); void iwlagn_bss_info_changed(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *bss_conf, diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c index 2d3c1627f283..e08e44cae434 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/rxon.c @@ -1149,7 +1149,7 @@ void iwlagn_config_ht40(struct ieee80211_conf *conf, } } -int iwlagn_mac_config(struct ieee80211_hw *hw, u32 changed) +int iwlagn_mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct iwl_priv *priv = IWL_MAC80211_GET_DVM(hw); struct iwl_rxon_context *ctx; diff --git a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c index 4ba050397632..76e7e3fa2d13 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/mac80211.c @@ -574,7 +574,8 @@ void 
iwl_mld_mac80211_stop(struct ieee80211_hw *hw, bool suspend) } static -int iwl_mld_mac80211_config(struct ieee80211_hw *hw, u32 changed) +int iwl_mld_mac80211_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { return 0; } @@ -1102,7 +1103,8 @@ void iwl_mld_unassign_vif_chanctx(struct ieee80211_hw *hw, } static -int iwl_mld_mac80211_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int iwl_mld_mac80211_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { return 0; } diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 956b491ae5a4..619d822efa5b 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -298,7 +298,8 @@ static const struct wiphy_iftype_ext_capab add_iftypes_ext_capa[] = { }, }; -int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); *tx_ant = iwl_mvm_get_valid_tx_ant(mvm); @@ -306,7 +307,8 @@ int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) return 0; } -int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, + u32 rx_ant) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); @@ -4249,7 +4251,8 @@ int iwl_mvm_mac_sta_state_common(struct ieee80211_hw *hw, return ret; } -int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct iwl_mvm *mvm = IWL_MAC80211_GET_MVM(hw); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index a4f412e750d0..5c8eaf1eacff 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ 
b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -2866,13 +2866,16 @@ void iwl_mvm_mac_wake_tx_queue(struct ieee80211_hw *hw, int iwl_mvm_mac_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_ampdu_params *params); -int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); -int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); +int iwl_mvm_op_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant); +int iwl_mvm_op_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, + u32 rx_ant); int iwl_mvm_mac_start(struct ieee80211_hw *hw); void iwl_mvm_mac_reconfig_complete(struct ieee80211_hw *hw, enum ieee80211_reconfig_type reconfig_type); void iwl_mvm_mac_stop(struct ieee80211_hw *hw, bool suspend); -static inline int iwl_mvm_mac_config(struct ieee80211_hw *hw, u32 changed) +static inline int iwl_mvm_mac_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { return 0; } @@ -2905,7 +2908,8 @@ iwl_mvm_mac_release_buffered_frames(struct ieee80211_hw *hw, int num_frames, enum ieee80211_frame_release_type reason, bool more_data); -int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int iwl_mvm_mac_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value); void iwl_mvm_sta_rc_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_link_sta *link_sta, u32 changed); void iwl_mvm_mac_mgd_prepare_tx(struct ieee80211_hw *hw, diff --git a/drivers/net/wireless/intersil/p54/main.c b/drivers/net/wireless/intersil/p54/main.c index 42111bb53f58..2ec3655f1a9c 100644 --- a/drivers/net/wireless/intersil/p54/main.c +++ b/drivers/net/wireless/intersil/p54/main.c @@ -313,7 +313,7 @@ static void p54_reset_stats(struct p54_common *priv) priv->survey_raw.tx = 0; } -static int p54_config(struct ieee80211_hw *dev, u32 changed) +static int p54_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { int ret = 0; struct 
p54_common *priv = dev->priv; @@ -692,6 +692,7 @@ static void p54_flush(struct ieee80211_hw *dev, struct ieee80211_vif *vif, } static void p54_set_coverage_class(struct ieee80211_hw *dev, + int radio_idx, s16 coverage_class) { struct p54_common *priv = dev->priv; diff --git a/drivers/net/wireless/marvell/libertas_tf/main.c b/drivers/net/wireless/marvell/libertas_tf/main.c index 50c0f6179e2d..d1067874428f 100644 --- a/drivers/net/wireless/marvell/libertas_tf/main.c +++ b/drivers/net/wireless/marvell/libertas_tf/main.c @@ -337,7 +337,7 @@ static void lbtf_op_remove_interface(struct ieee80211_hw *hw, lbtf_deb_leave(LBTF_DEB_MACOPS); } -static int lbtf_op_config(struct ieee80211_hw *hw, u32 changed) +static int lbtf_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct lbtf_private *priv = hw->priv; struct ieee80211_conf *conf = &hw->conf; diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 60c12328c2f3..286378770e9e 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -375,6 +375,7 @@ mwifiex_cfg80211_cancel_remain_on_channel(struct wiphy *wiphy, static int mwifiex_cfg80211_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm) { @@ -410,6 +411,7 @@ mwifiex_cfg80211_set_tx_power(struct wiphy *wiphy, static int mwifiex_cfg80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); @@ -737,7 +739,8 @@ mwifiex_set_rts(struct mwifiex_private *priv, u32 rts_thr) * Fragmentation threshold of the driver. 
*/ static int -mwifiex_cfg80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +mwifiex_cfg80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); struct mwifiex_private *priv; @@ -1939,7 +1942,8 @@ mwifiex_cfg80211_del_station(struct wiphy *wiphy, struct net_device *dev, } static int -mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) +mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, int radio_idx, u32 tx_ant, + u32 rx_ant) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); struct mwifiex_private *priv = mwifiex_get_priv(adapter, @@ -2002,7 +2006,8 @@ mwifiex_cfg80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) } static int -mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) +mwifiex_cfg80211_get_antenna(struct wiphy *wiphy, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct mwifiex_adapter *adapter = mwifiex_cfg80211_get_adapter(wiphy); struct mwifiex_private *priv = mwifiex_get_priv(adapter, diff --git a/drivers/net/wireless/marvell/mwl8k.c b/drivers/net/wireless/marvell/mwl8k.c index bab9ef37a1ab..bc34a025acd6 100644 --- a/drivers/net/wireless/marvell/mwl8k.c +++ b/drivers/net/wireless/marvell/mwl8k.c @@ -3369,7 +3369,8 @@ struct mwl8k_cmd_set_rts_threshold { } __packed; static int -mwl8k_cmd_set_rts_threshold(struct ieee80211_hw *hw, int rts_thresh) +mwl8k_cmd_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + int rts_thresh) { struct mwl8k_cmd_set_rts_threshold *cmd; int rc; @@ -4955,7 +4956,7 @@ fail: wiphy_err(hw->wiphy, "Firmware restart failed\n"); } -static int mwl8k_config(struct ieee80211_hw *hw, u32 changed) +static int mwl8k_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct ieee80211_conf *conf = &hw->conf; struct mwl8k_priv *priv = hw->priv; @@ -5321,9 +5322,10 @@ static void mwl8k_configure_filter(struct ieee80211_hw *hw, mwl8k_fw_unlock(hw); } 
-static int mwl8k_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int mwl8k_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { - return mwl8k_cmd_set_rts_threshold(hw, value); + return mwl8k_cmd_set_rts_threshold(hw, radio_idx, value); } static int mwl8k_sta_remove(struct ieee80211_hw *hw, @@ -6056,7 +6058,7 @@ static int mwl8k_reload_firmware(struct ieee80211_hw *hw, char *fw_image) if (rc) goto fail; - rc = mwl8k_config(hw, ~0); + rc = mwl8k_config(hw, -1, ~0); if (rc) goto fail; diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index 45c8db939d55..3afe4c4cd7bb 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -1892,7 +1892,8 @@ void mt76_sw_scan_complete(struct ieee80211_hw *hw, struct ieee80211_vif *vif) } EXPORT_SYMBOL_GPL(mt76_sw_scan_complete); -int mt76_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int mt76_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct mt76_phy *phy = hw->priv; struct mt76_dev *dev = phy->dev; diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 5f8d81cda6cd..14927a92f9d1 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -1513,7 +1513,8 @@ int mt76_get_sar_power(struct mt76_phy *phy, void mt76_csa_check(struct mt76_dev *dev); void mt76_csa_finish(struct mt76_dev *dev); -int mt76_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); +int mt76_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant); int mt76_set_tim(struct ieee80211_hw *hw, struct ieee80211_sta *sta, bool set); void mt76_insert_ccmp_hdr(struct sk_buff *skb, u8 key_id); int mt76_get_rate(struct mt76_dev *dev, diff --git a/drivers/net/wireless/mediatek/mt76/mt7603/main.c b/drivers/net/wireless/mediatek/mt76/mt7603/main.c 
index 3e8b1ec76169..0d7c84941cd0 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7603/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7603/main.c @@ -216,7 +216,7 @@ static int mt7603_set_sar_specs(struct ieee80211_hw *hw, } static int -mt7603_config(struct ieee80211_hw *hw, u32 changed) +mt7603_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt7603_dev *dev = hw->priv; int ret = 0; @@ -657,7 +657,8 @@ mt7603_sta_rate_tbl_update(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } static void -mt7603_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7603_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7603_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt7615/main.c b/drivers/net/wireless/mediatek/mt76/mt7615/main.c index 8a37fb37f77d..15fe155ac3f3 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7615/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7615/main.c @@ -420,7 +420,7 @@ static int mt7615_set_sar_specs(struct ieee80211_hw *hw, return mt76_update_channel(phy->mt76); } -static int mt7615_config(struct ieee80211_hw *hw, u32 changed) +static int mt7615_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt7615_dev *dev = mt7615_hw_dev(hw); struct mt7615_phy *phy = mt7615_hw_phy(hw); @@ -784,7 +784,8 @@ static void mt7615_tx(struct ieee80211_hw *hw, mt76_connac_pm_queue_skb(hw, &dev->pm, wcid, skb); } -static int mt7615_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7615_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt7615_dev *dev = mt7615_hw_dev(hw); struct mt7615_phy *phy = mt7615_hw_phy(hw); @@ -972,7 +973,8 @@ mt7615_offset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } static void -mt7615_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7615_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7615_phy *phy = 
mt7615_hw_phy(hw); struct mt7615_dev *dev = phy->dev; @@ -984,7 +986,8 @@ mt7615_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) } static int -mt7615_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7615_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt7615_dev *dev = mt7615_hw_dev(hw); struct mt7615_phy *phy = mt7615_hw_phy(hw); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c index 4aa2dcedc874..a5c40d350612 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/main.c @@ -57,7 +57,7 @@ out: } EXPORT_SYMBOL_GPL(mt76x0_set_sar_specs); -int mt76x0_config(struct ieee80211_hw *hw, u32 changed) +int mt76x0_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt76x02_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h b/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h index 50f755344968..e5bc14d4c712 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x0/mt76x0.h @@ -48,7 +48,7 @@ void mt76x0_chip_onoff(struct mt76x02_dev *dev, bool enable, bool reset); void mt76x0_mac_stop(struct mt76x02_dev *dev); -int mt76x0_config(struct ieee80211_hw *hw, u32 changed); +int mt76x0_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); int mt76x0_set_channel(struct mt76_phy *mphy); int mt76x0_set_sar_specs(struct ieee80211_hw *hw, const struct cfg80211_sar_specs *sar); diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02.h b/drivers/net/wireless/mediatek/mt76/mt76x02.h index 4cd63bacd742..2094c7d2af81 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02.h +++ b/drivers/net/wireless/mediatek/mt76/mt76x02.h @@ -183,8 +183,8 @@ void mt76x02_wdt_work(struct work_struct *work); void mt76x02_tx_set_txpwr_auto(struct mt76x02_dev *dev, s8 txpwr); void mt76x02_set_tx_ackto(struct 
mt76x02_dev *dev); void mt76x02_set_coverage_class(struct ieee80211_hw *hw, - s16 coverage_class); -int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, u32 val); + int radio_idx, s16 coverage_class); +int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val); void mt76x02_remove_hdr_pad(struct sk_buff *skb, int len); bool mt76x02_tx_status_data(struct mt76_dev *mdev, u8 *update); void mt76x02_queue_rx_skb(struct mt76_dev *mdev, enum mt76_rxq_id q, diff --git a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c index 4fb30589fa7a..7dfcb20c692c 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x02_util.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x02_util.c @@ -548,7 +548,7 @@ void mt76x02_set_tx_ackto(struct mt76x02_dev *dev) EXPORT_SYMBOL_GPL(mt76x02_set_tx_ackto); void mt76x02_set_coverage_class(struct ieee80211_hw *hw, - s16 coverage_class) + int radio_idx, s16 coverage_class) { struct mt76x02_dev *dev = hw->priv; @@ -559,7 +559,7 @@ void mt76x02_set_coverage_class(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(mt76x02_set_coverage_class); -int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +int mt76x02_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val) { struct mt76x02_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c index eb70130d2711..c5dfb06d81e8 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/pci_main.c @@ -54,7 +54,7 @@ int mt76x2e_set_channel(struct mt76_phy *phy) } static int -mt76x2_config(struct ieee80211_hw *hw, u32 changed) +mt76x2_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt76x02_dev *dev = hw->priv; @@ -99,8 +99,8 @@ mt76x2_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, { } -static int mt76x2_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, - u32 
rx_ant) +static int mt76x2_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt76x02_dev *dev = hw->priv; diff --git a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c index 83e7061b10e2..6671c53faf9f 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c +++ b/drivers/net/wireless/mediatek/mt76/mt76x2/usb_main.c @@ -50,7 +50,7 @@ int mt76x2u_set_channel(struct mt76_phy *mphy) } static int -mt76x2u_config(struct ieee80211_hw *hw, u32 changed) +mt76x2u_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt76x02_dev *dev = hw->priv; int err = 0; diff --git a/drivers/net/wireless/mediatek/mt76/mt7915/main.c b/drivers/net/wireless/mediatek/mt76/mt7915/main.c index 3aa31c5cefa6..fe0639c14bf9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7915/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7915/main.c @@ -449,7 +449,8 @@ out: return err; } -static int mt7915_config(struct ieee80211_hw *hw, u32 changed) +static int mt7915_config(struct ieee80211_hw *hw, int radio_idx, + u32 changed) { struct mt7915_dev *dev = mt7915_hw_dev(hw); struct mt7915_phy *phy = mt7915_hw_phy(hw); @@ -906,7 +907,8 @@ static void mt7915_tx(struct ieee80211_hw *hw, mt76_tx(mphy, control->sta, wcid, skb); } -static int mt7915_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7915_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt7915_dev *dev = mt7915_hw_dev(hw); struct mt7915_phy *phy = mt7915_hw_phy(hw); @@ -1102,7 +1104,8 @@ mt7915_offset_tsf(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } static void -mt7915_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7915_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7915_phy *phy = mt7915_hw_phy(hw); struct mt7915_dev *dev = phy->dev; @@ -1114,7 +1117,7 @@ mt7915_set_coverage_class(struct ieee80211_hw *hw, s16 
coverage_class) } static int -mt7915_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7915_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, u32 rx_ant) { struct mt7915_dev *dev = mt7915_hw_dev(hw); struct mt7915_phy *phy = mt7915_hw_phy(hw); @@ -1655,7 +1658,7 @@ mt7915_twt_teardown_request(struct ieee80211_hw *hw, } static int -mt7915_set_frag_threshold(struct ieee80211_hw *hw, u32 val) +mt7915_set_frag_threshold(struct ieee80211_hw *hw, int radio_idx, u32 val) { return 0; } diff --git a/drivers/net/wireless/mediatek/mt76/mt7921/main.c b/drivers/net/wireless/mediatek/mt76/mt7921/main.c index 1fffa43379b2..1678204296d7 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7921/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7921/main.c @@ -624,7 +624,7 @@ void mt7921_set_runtime_pm(struct mt792x_dev *dev) mt76_connac_mcu_set_deep_sleep(&dev->mt76, pm->ds_enable); } -static int mt7921_config(struct ieee80211_hw *hw, u32 changed) +static int mt7921_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt792x_dev *dev = mt792x_hw_dev(hw); struct mt792x_phy *phy = mt792x_hw_phy(hw); @@ -907,7 +907,8 @@ void mt7921_mac_sta_remove(struct mt76_dev *mdev, struct ieee80211_vif *vif, } EXPORT_SYMBOL_GPL(mt7921_mac_sta_remove); -static int mt7921_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7921_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt792x_dev *dev = mt792x_hw_dev(hw); @@ -1088,7 +1089,8 @@ mt7921_stop_sched_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) } static int -mt7921_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7921_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt792x_dev *dev = mt792x_hw_dev(hw); struct mt792x_phy *phy = mt792x_hw_phy(hw); diff --git a/drivers/net/wireless/mediatek/mt76/mt7925/main.c b/drivers/net/wireless/mediatek/mt76/mt7925/main.c index 94b0099dcd41..ed7cd75aa6bc 100644 --- 
a/drivers/net/wireless/mediatek/mt76/mt7925/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7925/main.c @@ -757,7 +757,7 @@ void mt7925_set_runtime_pm(struct mt792x_dev *dev) mt7925_mcu_set_deep_sleep(dev, pm->ds_enable); } -static int mt7925_config(struct ieee80211_hw *hw, u32 changed) +static int mt7925_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt792x_dev *dev = mt792x_hw_dev(hw); int ret = 0; @@ -1265,7 +1265,8 @@ void mt7925_mac_sta_remove(struct mt76_dev *mdev, struct ieee80211_vif *vif, } EXPORT_SYMBOL_GPL(mt7925_mac_sta_remove); -static int mt7925_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7925_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt792x_dev *dev = mt792x_hw_dev(hw); @@ -1507,7 +1508,8 @@ mt7925_stop_sched_scan(struct ieee80211_hw *hw, struct ieee80211_vif *vif) } static int -mt7925_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7925_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt792x_dev *dev = mt792x_hw_dev(hw); struct mt792x_phy *phy = mt792x_hw_phy(hw); diff --git a/drivers/net/wireless/mediatek/mt76/mt792x.h b/drivers/net/wireless/mediatek/mt76/mt792x.h index e0359d431eca..443d397d9961 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x.h +++ b/drivers/net/wireless/mediatek/mt76/mt792x.h @@ -412,7 +412,8 @@ void mt792x_sta_statistics(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct station_info *sinfo); -void mt792x_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class); +void mt792x_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class); void mt792x_dma_cleanup(struct mt792x_dev *dev); int mt792x_dma_enable(struct mt792x_dev *dev); int mt792x_wpdma_reset(struct mt792x_dev *dev, bool force); diff --git a/drivers/net/wireless/mediatek/mt76/mt792x_core.c b/drivers/net/wireless/mediatek/mt76/mt792x_core.c index 
a50c1723ca29..43a7ac0f718e 100644 --- a/drivers/net/wireless/mediatek/mt76/mt792x_core.c +++ b/drivers/net/wireless/mediatek/mt76/mt792x_core.c @@ -579,7 +579,8 @@ void mt792x_sta_statistics(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(mt792x_sta_statistics); -void mt792x_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +void mt792x_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt792x_phy *phy = mt792x_hw_phy(hw); struct mt792x_dev *dev = phy->dev; diff --git a/drivers/net/wireless/mediatek/mt76/mt7996/main.c b/drivers/net/wireless/mediatek/mt76/mt7996/main.c index 78ae9f5cb176..5283aee619a9 100644 --- a/drivers/net/wireless/mediatek/mt76/mt7996/main.c +++ b/drivers/net/wireless/mediatek/mt76/mt7996/main.c @@ -591,7 +591,7 @@ static int mt7996_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, return err; } -static int mt7996_config(struct ieee80211_hw *hw, u32 changed) +static int mt7996_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } @@ -1251,7 +1251,8 @@ unlock: rcu_read_unlock(); } -static int mt7996_set_rts_threshold(struct ieee80211_hw *hw, u32 val) +static int mt7996_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 val) { struct mt7996_dev *dev = mt7996_hw_dev(hw); int i, ret = 0; @@ -1491,7 +1492,8 @@ unlock: } static void -mt7996_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) +mt7996_set_coverage_class(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class) { struct mt7996_dev *dev = mt7996_hw_dev(hw); struct mt7996_phy *phy; @@ -1505,7 +1507,8 @@ mt7996_set_coverage_class(struct ieee80211_hw *hw, s16 coverage_class) } static int -mt7996_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +mt7996_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct mt7996_dev *dev = mt7996_hw_dev(hw); int i; diff --git a/drivers/net/wireless/mediatek/mt7601u/main.c 
b/drivers/net/wireless/mediatek/mt7601u/main.c index 7570c6ceecea..05ba43e1985c 100644 --- a/drivers/net/wireless/mediatek/mt7601u/main.c +++ b/drivers/net/wireless/mediatek/mt7601u/main.c @@ -78,7 +78,7 @@ static void mt7601u_remove_interface(struct ieee80211_hw *hw, dev->wcid_mask[wcid / BITS_PER_LONG] &= ~BIT(wcid % BITS_PER_LONG); } -static int mt7601u_config(struct ieee80211_hw *hw, u32 changed) +static int mt7601u_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct mt7601u_dev *dev = hw->priv; int ret = 0; @@ -334,7 +334,8 @@ mt7601u_set_key(struct ieee80211_hw *hw, enum set_key_cmd cmd, return mt76_mac_wcid_set_key(dev, msta->wcid.idx, key); } -static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int mt7601u_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct mt7601u_dev *dev = hw->priv; diff --git a/drivers/net/wireless/microchip/wilc1000/cfg80211.c b/drivers/net/wireless/microchip/wilc1000/cfg80211.c index e7aa0f991923..a395829ebadf 100644 --- a/drivers/net/wireless/microchip/wilc1000/cfg80211.c +++ b/drivers/net/wireless/microchip/wilc1000/cfg80211.c @@ -800,7 +800,7 @@ static int change_bss(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int set_wiphy_params(struct wiphy *wiphy, int radio_idx, u32 changed) { int ret = -EINVAL; struct cfg_param_attr cfg_param_val; @@ -1637,7 +1637,8 @@ static void wilc_set_wakeup(struct wiphy *wiphy, bool enabled) } static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm) + int radio_idx, enum nl80211_tx_power_setting type, + int mbm) { int ret; int srcu_idx; @@ -1669,7 +1670,7 @@ static int set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, } static int get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm) + int radio_idx, unsigned int link_id, int *dbm) { int 
ret; struct wilc_vif *vif = netdev_priv(wdev->netdev); diff --git a/drivers/net/wireless/purelifi/plfxlc/mac.c b/drivers/net/wireless/purelifi/plfxlc/mac.c index 82d1bf7edba2..d375ad60167f 100644 --- a/drivers/net/wireless/purelifi/plfxlc/mac.c +++ b/drivers/net/wireless/purelifi/plfxlc/mac.c @@ -531,7 +531,7 @@ static void plfxlc_op_remove_interface(struct ieee80211_hw *hw, mac->vif = NULL; } -static int plfxlc_op_config(struct ieee80211_hw *hw, u32 changed) +static int plfxlc_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } @@ -677,7 +677,8 @@ static void plfxlc_get_et_stats(struct ieee80211_hw *hw, data[1] = mac->crc_errors; } -static int plfxlc_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int plfxlc_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { return 0; } diff --git a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c index 0b2282528342..f1188368e66b 100644 --- a/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c +++ b/drivers/net/wireless/quantenna/qtnfmac/cfg80211.c @@ -370,7 +370,8 @@ static int qtnf_stop_ap(struct wiphy *wiphy, struct net_device *dev, return ret; } -static int qtnf_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int qtnf_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct qtnf_wmac *mac = wiphy_priv(wiphy); struct qtnf_vif *vif; @@ -881,7 +882,7 @@ static int qtnf_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, } static int qtnf_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm) + int radio_idx, unsigned int link_id, int *dbm) { struct qtnf_vif *vif = qtnf_netdev_get_priv(wdev->netdev); int ret; @@ -894,7 +895,8 @@ static int qtnf_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, } static int qtnf_set_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm) + int 
radio_idx, enum nl80211_tx_power_setting type, + int mbm) { struct qtnf_vif *vif; int ret; diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c index b7ea606bda08..4b5a7c9b6499 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.c @@ -12100,7 +12100,7 @@ void rt2800_get_key_seq(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(rt2800_get_key_seq); -int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int rt2800_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value) { struct rt2x00_dev *rt2x00dev = hw->priv; u32 reg; diff --git a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h index 194de676df8f..620a3d9872ce 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2800lib.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2800lib.h @@ -253,7 +253,8 @@ int rt2800_probe_hw(struct rt2x00_dev *rt2x00dev); void rt2800_get_key_seq(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); -int rt2800_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int rt2800_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value); int rt2800_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, unsigned int link_id, u16 queue_idx, diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h index dfb4bb370f01..09b9d1f9f793 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h @@ -1457,7 +1457,7 @@ int rt2x00mac_add_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif); void rt2x00mac_remove_interface(struct ieee80211_hw *hw, struct ieee80211_vif *vif); -int rt2x00mac_config(struct ieee80211_hw *hw, u32 changed); +int rt2x00mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); void rt2x00mac_configure_filter(struct ieee80211_hw *hw, unsigned int 
changed_flags, unsigned int *total_flags, @@ -1489,8 +1489,10 @@ int rt2x00mac_conf_tx(struct ieee80211_hw *hw, void rt2x00mac_rfkill_poll(struct ieee80211_hw *hw); void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop); -int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); -int rt2x00mac_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); +int rt2x00mac_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant); +int rt2x00mac_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant); void rt2x00mac_get_ringparam(struct ieee80211_hw *hw, u32 *tx, u32 *tx_max, u32 *rx, u32 *rx_max); bool rt2x00mac_tx_frames_pending(struct ieee80211_hw *hw); diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c index 451632488805..3bc0c1c906c9 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c @@ -304,7 +304,7 @@ void rt2x00mac_remove_interface(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(rt2x00mac_remove_interface); -int rt2x00mac_config(struct ieee80211_hw *hw, u32 changed) +int rt2x00mac_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rt2x00_dev *rt2x00dev = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -740,7 +740,8 @@ void rt2x00mac_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, } EXPORT_SYMBOL_GPL(rt2x00mac_flush); -int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +int rt2x00mac_set_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct rt2x00_dev *rt2x00dev = hw->priv; struct link_ant *ant = &rt2x00dev->link.ant; @@ -785,7 +786,8 @@ int rt2x00mac_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) } EXPORT_SYMBOL_GPL(rt2x00mac_set_antenna); -int rt2x00mac_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int 
rt2x00mac_get_antenna(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct rt2x00_dev *rt2x00dev = hw->priv; struct link_ant *ant = &rt2x00dev->link.ant; diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c index ded8d4d59289..2905baea6239 100644 --- a/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c +++ b/drivers/net/wireless/realtek/rtl818x/rtl8180/dev.c @@ -1370,7 +1370,7 @@ static void rtl8180_remove_interface(struct ieee80211_hw *dev, priv->vif = NULL; } -static int rtl8180_config(struct ieee80211_hw *dev, u32 changed) +static int rtl8180_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { struct rtl8180_priv *priv = dev->priv; struct ieee80211_conf *conf = &dev->conf; diff --git a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c index 220ac5bdf279..8857bb542c7f 100644 --- a/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c +++ b/drivers/net/wireless/realtek/rtl818x/rtl8187/dev.c @@ -1151,7 +1151,7 @@ static void rtl8187_remove_interface(struct ieee80211_hw *dev, mutex_unlock(&priv->conf_mutex); } -static int rtl8187_config(struct ieee80211_hw *dev, u32 changed) +static int rtl8187_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { struct rtl8187_priv *priv = dev->priv; struct ieee80211_conf *conf = &dev->conf; diff --git a/drivers/net/wireless/realtek/rtl8xxxu/core.c b/drivers/net/wireless/realtek/rtl8xxxu/core.c index 569856ca677f..496836f716aa 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/core.c @@ -4552,7 +4552,8 @@ static void rtl8xxxu_cam_write(struct rtl8xxxu_priv *priv, } static -int rtl8xxxu_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int rtl8xxxu_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct rtl8xxxu_priv *priv = hw->priv; @@ -6839,7 +6840,7 @@ static void 
rtl8xxxu_remove_interface(struct ieee80211_hw *hw, priv->vifs[rtlvif->port_num] = NULL; } -static int rtl8xxxu_config(struct ieee80211_hw *hw, u32 changed) +static int rtl8xxxu_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtl8xxxu_priv *priv = hw->priv; struct device *dev = &priv->udev->dev; @@ -6988,7 +6989,8 @@ static void rtl8xxxu_configure_filter(struct ieee80211_hw *hw, FIF_PROBE_REQ); } -static int rtl8xxxu_set_rts_threshold(struct ieee80211_hw *hw, u32 rts) +static int rtl8xxxu_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 rts) { if (rts > 2347 && rts != (u32)-1) return -EINVAL; diff --git a/drivers/net/wireless/realtek/rtlwifi/core.c b/drivers/net/wireless/realtek/rtlwifi/core.c index 819cf519e66e..22633c301564 100644 --- a/drivers/net/wireless/realtek/rtlwifi/core.c +++ b/drivers/net/wireless/realtek/rtlwifi/core.c @@ -566,7 +566,7 @@ static int rtl_op_resume(struct ieee80211_hw *hw) } #endif -static int rtl_op_config(struct ieee80211_hw *hw, u32 changed) +static int rtl_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtl_priv *rtlpriv = rtl_priv(hw); struct rtl_phy *rtlphy = &(rtlpriv->phy); diff --git a/drivers/net/wireless/realtek/rtw88/mac80211.c b/drivers/net/wireless/realtek/rtw88/mac80211.c index 77f9fbe1870c..766f22d31079 100644 --- a/drivers/net/wireless/realtek/rtw88/mac80211.c +++ b/drivers/net/wireless/realtek/rtw88/mac80211.c @@ -71,7 +71,7 @@ static void rtw_ops_stop(struct ieee80211_hw *hw, bool suspend) mutex_unlock(&rtwdev->mutex); } -static int rtw_ops_config(struct ieee80211_hw *hw, u32 changed) +static int rtw_ops_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtw_dev *rtwdev = hw->priv; int ret = 0; @@ -708,7 +708,8 @@ static void rtw_ops_mgd_prepare_tx(struct ieee80211_hw *hw, mutex_unlock(&rtwdev->mutex); } -static int rtw_ops_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int rtw_ops_set_rts_threshold(struct ieee80211_hw *hw, int 
radio_idx, + u32 value) { struct rtw_dev *rtwdev = hw->priv; @@ -797,6 +798,7 @@ static int rtw_ops_set_bitrate_mask(struct ieee80211_hw *hw, } static int rtw_ops_set_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 tx_antenna, u32 rx_antenna) { @@ -808,13 +810,14 @@ static int rtw_ops_set_antenna(struct ieee80211_hw *hw, return -EOPNOTSUPP; mutex_lock(&rtwdev->mutex); - ret = chip->ops->set_antenna(rtwdev, tx_antenna, rx_antenna); + ret = chip->ops->set_antenna(rtwdev, -1, tx_antenna, rx_antenna); mutex_unlock(&rtwdev->mutex); return ret; } static int rtw_ops_get_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 *tx_antenna, u32 *rx_antenna) { diff --git a/drivers/net/wireless/realtek/rtw88/main.h b/drivers/net/wireless/realtek/rtw88/main.h index b0f1fabe9554..7ae67143e909 100644 --- a/drivers/net/wireless/realtek/rtw88/main.h +++ b/drivers/net/wireless/realtek/rtw88/main.h @@ -873,7 +873,7 @@ struct rtw_chip_ops { void (*set_tx_power_index)(struct rtw_dev *rtwdev); int (*rsvd_page_dump)(struct rtw_dev *rtwdev, u8 *buf, u32 offset, u32 size); - int (*set_antenna)(struct rtw_dev *rtwdev, + int (*set_antenna)(struct rtw_dev *rtwdev, int radio_idx, u32 antenna_tx, u32 antenna_rx); void (*cfg_ldo25)(struct rtw_dev *rtwdev, bool enable); diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c index ab199eaea3c7..710126379e77 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c @@ -983,6 +983,7 @@ static bool rtw8822b_check_rf_path(u8 antenna) } static int rtw8822b_set_antenna(struct rtw_dev *rtwdev, + int radio_idx, u32 antenna_tx, u32 antenna_rx) { diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822c.c b/drivers/net/wireless/realtek/rtw88/rtw8822c.c index 017d959de3ce..0ce6aa10493e 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822c.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822c.c @@ -2767,6 +2767,7 @@ static void 
rtw8822c_set_tx_power_index(struct rtw_dev *rtwdev) } static int rtw8822c_set_antenna(struct rtw_dev *rtwdev, + int radio_idx, u32 antenna_tx, u32 antenna_rx) { diff --git a/drivers/net/wireless/realtek/rtw89/mac80211.c b/drivers/net/wireless/realtek/rtw89/mac80211.c index a47971003bd4..b9e046208424 100644 --- a/drivers/net/wireless/realtek/rtw89/mac80211.c +++ b/drivers/net/wireless/realtek/rtw89/mac80211.c @@ -72,7 +72,7 @@ static void rtw89_ops_stop(struct ieee80211_hw *hw, bool suspend) rtw89_core_stop(rtwdev); } -static int rtw89_ops_config(struct ieee80211_hw *hw, u32 changed) +static int rtw89_ops_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct rtw89_dev *rtwdev = hw->priv; @@ -1007,7 +1007,8 @@ static int rtw89_ops_ampdu_action(struct ieee80211_hw *hw, return 0; } -static int rtw89_ops_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int rtw89_ops_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct rtw89_dev *rtwdev = hw->priv; @@ -1119,7 +1120,7 @@ static int rtw89_ops_set_bitrate_mask(struct ieee80211_hw *hw, } static -int rtw89_ops_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) +int rtw89_ops_set_antenna(struct ieee80211_hw *hw, int radio_idx, u32 tx_ant, u32 rx_ant) { struct rtw89_dev *rtwdev = hw->priv; struct rtw89_hal *hal = &rtwdev->hal; @@ -1142,7 +1143,8 @@ int rtw89_ops_set_antenna(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant) } static -int rtw89_ops_get_antenna(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant) +int rtw89_ops_get_antenna(struct ieee80211_hw *hw, int radio_idx, u32 *tx_ant, + u32 *rx_ant) { struct rtw89_dev *rtwdev = hw->priv; struct rtw89_hal *hal = &rtwdev->hal; diff --git a/drivers/net/wireless/rsi/rsi_91x_mac80211.c b/drivers/net/wireless/rsi/rsi_91x_mac80211.c index 0e115b428f96..f3a853edfc11 100644 --- a/drivers/net/wireless/rsi/rsi_91x_mac80211.c +++ b/drivers/net/wireless/rsi/rsi_91x_mac80211.c @@ -656,11 +656,13 @@ static int 
rsi_config_power(struct ieee80211_hw *hw) * requests. The stack calls this function to * change hardware configuration, e.g., channel. * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index. * @changed: Changed flags set. * * Return: 0 on success, negative error code on failure. */ static int rsi_mac80211_config(struct ieee80211_hw *hw, + int radio_idx, u32 changed) { struct rsi_hw *adapter = hw->priv; @@ -1201,12 +1203,13 @@ unlock: /** * rsi_mac80211_set_rts_threshold() - This function sets rts threshold value. * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index. * @value: Rts threshold value. * * Return: 0 on success. */ static int rsi_mac80211_set_rts_threshold(struct ieee80211_hw *hw, - u32 value) + int radio_idx, u32 value) { struct rsi_hw *adapter = hw->priv; struct rsi_common *common = adapter->priv; @@ -1583,12 +1586,14 @@ static int rsi_mac80211_sta_remove(struct ieee80211_hw *hw, * rsi_mac80211_set_antenna() - This function is used to configure * tx and rx antennas. * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index * @tx_ant: Bitmap for tx antenna * @rx_ant: Bitmap for rx antenna * * Return: 0 on success, Negative error code on failure. */ static int rsi_mac80211_set_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 tx_ant, u32 rx_ant) { struct rsi_hw *adapter = hw->priv; @@ -1634,12 +1639,14 @@ fail_set_antenna: * tx and rx antennas. * * @hw: Pointer to the ieee80211_hw structure. + * @radio_idx: Radio index * @tx_ant: Bitmap for tx antenna * @rx_ant: Bitmap for rx antenna * * Return: 0 on success, negative error codes on failure. 
*/ static int rsi_mac80211_get_antenna(struct ieee80211_hw *hw, + int radio_idx, u32 *tx_ant, u32 *rx_ant) { struct rsi_hw *adapter = hw->priv; diff --git a/drivers/net/wireless/silabs/wfx/sta.c b/drivers/net/wireless/silabs/wfx/sta.c index e95b9ded17d9..d12fcc755701 100644 --- a/drivers/net/wireless/silabs/wfx/sta.c +++ b/drivers/net/wireless/silabs/wfx/sta.c @@ -220,7 +220,7 @@ int wfx_conf_tx(struct ieee80211_hw *hw, struct ieee80211_vif *vif, return 0; } -int wfx_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int wfx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value) { struct wfx_dev *wdev = hw->priv; struct wfx_vif *wvif = NULL; @@ -706,7 +706,7 @@ void wfx_unassign_vif_chanctx(struct ieee80211_hw *hw, struct ieee80211_vif *vif wvif->channel = NULL; } -int wfx_config(struct ieee80211_hw *hw, u32 changed) +int wfx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { return 0; } diff --git a/drivers/net/wireless/silabs/wfx/sta.h b/drivers/net/wireless/silabs/wfx/sta.h index 8702eed5267f..b4812b294f3c 100644 --- a/drivers/net/wireless/silabs/wfx/sta.h +++ b/drivers/net/wireless/silabs/wfx/sta.h @@ -21,8 +21,8 @@ struct wfx_sta_priv { /* mac80211 interface */ int wfx_start(struct ieee80211_hw *hw); void wfx_stop(struct ieee80211_hw *hw, bool suspend); -int wfx_config(struct ieee80211_hw *hw, u32 changed); -int wfx_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int wfx_config(struct ieee80211_hw *hw, int radio_idx, u32 changed); +int wfx_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, u32 value); void wfx_set_default_unicast_key(struct ieee80211_hw *hw, struct ieee80211_vif *vif, int idx); void wfx_configure_filter(struct ieee80211_hw *hw, unsigned int changed_flags, unsigned int *total_flags, u64 unused); diff --git a/drivers/net/wireless/st/cw1200/sta.c b/drivers/net/wireless/st/cw1200/sta.c index 5dd7f6a38900..b1dd76e8aecb 100644 --- a/drivers/net/wireless/st/cw1200/sta.c +++ 
b/drivers/net/wireless/st/cw1200/sta.c @@ -321,7 +321,7 @@ int cw1200_change_interface(struct ieee80211_hw *dev, return ret; } -int cw1200_config(struct ieee80211_hw *dev, u32 changed) +int cw1200_config(struct ieee80211_hw *dev, int radio_idx, u32 changed) { int ret = 0; struct cw1200_common *priv = dev->priv; @@ -857,7 +857,8 @@ void cw1200_wep_key_work(struct work_struct *work) wsm_unlock_tx(priv); } -int cw1200_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +int cw1200_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { int ret = 0; __le32 val32; diff --git a/drivers/net/wireless/st/cw1200/sta.h b/drivers/net/wireless/st/cw1200/sta.h index b955b92cfd73..b4f04371668d 100644 --- a/drivers/net/wireless/st/cw1200/sta.h +++ b/drivers/net/wireless/st/cw1200/sta.h @@ -22,7 +22,7 @@ int cw1200_change_interface(struct ieee80211_hw *dev, struct ieee80211_vif *vif, enum nl80211_iftype new_type, bool p2p); -int cw1200_config(struct ieee80211_hw *dev, u32 changed); +int cw1200_config(struct ieee80211_hw *dev, int radio_idx, u32 changed); void cw1200_configure_filter(struct ieee80211_hw *dev, unsigned int changed_flags, unsigned int *total_flags, @@ -36,7 +36,8 @@ int cw1200_set_key(struct ieee80211_hw *dev, enum set_key_cmd cmd, struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct ieee80211_key_conf *key); -int cw1200_set_rts_threshold(struct ieee80211_hw *hw, u32 value); +int cw1200_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value); void cw1200_flush(struct ieee80211_hw *hw, struct ieee80211_vif *vif, u32 queues, bool drop); diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c index bb53d681c11b..69fc51f183ad 100644 --- a/drivers/net/wireless/ti/wl1251/main.c +++ b/drivers/net/wireless/ti/wl1251/main.c @@ -589,7 +589,7 @@ static bool wl1251_can_do_pm(struct ieee80211_conf *conf, struct wl1251 *wl) return (conf->flags & IEEE80211_CONF_PS) && !wl->monitor_present; } -static int 
wl1251_op_config(struct ieee80211_hw *hw, u32 changed) +static int wl1251_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct wl1251 *wl = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -1051,7 +1051,8 @@ out: return ret; } -static int wl1251_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int wl1251_op_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct wl1251 *wl = hw->priv; int ret; diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index f93c95edd991..6116a8522d96 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -3166,7 +3166,7 @@ static int wl12xx_config_vif(struct wl1271 *wl, struct wl12xx_vif *wlvif, return 0; } -static int wl1271_op_config(struct ieee80211_hw *hw, u32 changed) +static int wl1271_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct wl1271 *wl = hw->priv; struct wl12xx_vif *wlvif; @@ -3895,7 +3895,8 @@ out: return 0; } -static int wl1271_op_set_frag_threshold(struct ieee80211_hw *hw, u32 value) +static int wl1271_op_set_frag_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { struct wl1271 *wl = hw->priv; int ret = 0; @@ -3924,7 +3925,8 @@ out: return ret; } -static int wl1271_op_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int wl1271_op_set_rts_threshold(struct ieee80211_hw *hw, int radio_idx, + u32 value) { struct wl1271 *wl = hw->priv; struct wl12xx_vif *wlvif; diff --git a/drivers/net/wireless/virtual/mac80211_hwsim.c b/drivers/net/wireless/virtual/mac80211_hwsim.c index f6add19d1da1..eefe8da3b14d 100644 --- a/drivers/net/wireless/virtual/mac80211_hwsim.c +++ b/drivers/net/wireless/virtual/mac80211_hwsim.c @@ -2381,7 +2381,8 @@ static const char * const hwsim_chanwidths[] = { [NL80211_CHAN_WIDTH_320] = "eht320", }; -static int mac80211_hwsim_config(struct ieee80211_hw *hw, u32 changed) +static int mac80211_hwsim_config(struct 
ieee80211_hw *hw, int radio_idx, + u32 changed) { struct mac80211_hwsim_data *data = hw->priv; struct ieee80211_conf *conf = &hw->conf; @@ -3338,7 +3339,8 @@ static int mac80211_hwsim_tx_last_beacon(struct ieee80211_hw *hw) return 1; } -static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, u32 value) +static int mac80211_hwsim_set_rts_threshold(struct ieee80211_hw *hw, + int radio_idx, u32 value) { return -EOPNOTSUPP; } diff --git a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c index 9653dbaac3c0..f7c56174424d 100644 --- a/drivers/net/wireless/zydas/zd1211rw/zd_mac.c +++ b/drivers/net/wireless/zydas/zd1211rw/zd_mac.c @@ -1133,7 +1133,7 @@ static void zd_op_remove_interface(struct ieee80211_hw *hw, zd_mac_free_cur_beacon(mac); } -static int zd_op_config(struct ieee80211_hw *hw, u32 changed) +static int zd_op_config(struct ieee80211_hw *hw, int radio_idx, u32 changed) { struct zd_mac *mac = zd_hw_mac(hw); struct ieee80211_conf *conf = &hw->conf; diff --git a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c index 7fcc46a0bb48..4e29652f8ee7 100644 --- a/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c +++ b/drivers/staging/rtl8723bs/os_dep/ioctl_cfg80211.c @@ -1298,7 +1298,8 @@ exit: return ret; } -static int cfg80211_rtw_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int cfg80211_rtw_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { return 0; } @@ -1795,7 +1796,7 @@ static int cfg80211_rtw_disconnect(struct wiphy *wiphy, struct net_device *ndev, } static int cfg80211_rtw_set_txpower(struct wiphy *wiphy, - struct wireless_dev *wdev, + struct wireless_dev *wdev, int radio_idx, enum nl80211_tx_power_setting type, int mbm) { return 0; @@ -1803,6 +1804,7 @@ static int cfg80211_rtw_set_txpower(struct wiphy *wiphy, static int cfg80211_rtw_get_txpower(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int 
link_id, int *dbm) { *dbm = (12); diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index eec066f4738a..ffd9564fc840 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4853,12 +4853,14 @@ struct cfg80211_ops { int (*set_mcast_rate)(struct wiphy *wiphy, struct net_device *dev, int rate[NUM_NL80211_BANDS]); - int (*set_wiphy_params)(struct wiphy *wiphy, u32 changed); + int (*set_wiphy_params)(struct wiphy *wiphy, int radio_idx, + u32 changed); int (*set_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, enum nl80211_tx_power_setting type, int mbm); int (*get_tx_power)(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id, int *dbm); + int radio_idx, unsigned int link_id, int *dbm); void (*rfkill_poll)(struct wiphy *wiphy); @@ -4920,8 +4922,10 @@ struct cfg80211_ops { struct wireless_dev *wdev, struct mgmt_frame_regs *upd); - int (*set_antenna)(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*sched_scan_start)(struct wiphy *wiphy, struct net_device *dev, diff --git a/include/net/mac80211.h b/include/net/mac80211.h index fa2325692abf..a0de0da4d79b 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4517,7 +4517,7 @@ struct ieee80211_ops { enum nl80211_iftype new_type, bool p2p); void (*remove_interface)(struct ieee80211_hw *hw, struct ieee80211_vif *vif); - int (*config)(struct ieee80211_hw *hw, u32 changed); + int (*config)(struct ieee80211_hw *hw, int radio_idx, u32 changed); void (*bss_info_changed)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_bss_conf *info, @@ -4580,8 +4580,10 @@ struct ieee80211_ops { void (*get_key_seq)(struct ieee80211_hw *hw, struct ieee80211_key_conf *key, struct ieee80211_key_seq *seq); - int 
(*set_frag_threshold)(struct ieee80211_hw *hw, u32 value); - int (*set_rts_threshold)(struct ieee80211_hw *hw, u32 value); + int (*set_frag_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); + int (*set_rts_threshold)(struct ieee80211_hw *hw, int radio_idx, + u32 value); int (*sta_add)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_sta *sta); int (*sta_remove)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, @@ -4678,7 +4680,8 @@ struct ieee80211_ops { int (*get_survey)(struct ieee80211_hw *hw, int idx, struct survey_info *survey); void (*rfkill_poll)(struct ieee80211_hw *hw); - void (*set_coverage_class)(struct ieee80211_hw *hw, s16 coverage_class); + void (*set_coverage_class)(struct ieee80211_hw *hw, int radio_idx, + s16 coverage_class); #ifdef CONFIG_NL80211_TESTMODE int (*testmode_cmd)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, void *data, int len); @@ -4693,8 +4696,10 @@ struct ieee80211_ops { void (*channel_switch)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, struct ieee80211_channel_switch *ch_switch); - int (*set_antenna)(struct ieee80211_hw *hw, u32 tx_ant, u32 rx_ant); - int (*get_antenna)(struct ieee80211_hw *hw, u32 *tx_ant, u32 *rx_ant); + int (*set_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 tx_ant, u32 rx_ant); + int (*get_antenna)(struct ieee80211_hw *hw, int radio_idx, + u32 *tx_ant, u32 *rx_ant); int (*remain_on_channel)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index a289014abe37..2a71149c3065 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2907,6 +2907,14 @@ enum nl80211_commands { * APs Support". Drivers may set additional flags that they support * in the kernel or device. * + * @NL80211_ATTR_WIPHY_RADIO_INDEX: (int) Integer attribute denoting the index + * of the radio in interest. 
Internally a value of -1 is used to + * indicate that the radio id is not given in user-space. This means + * that all the attributes are applicable to all the radios. If there is + * a radio index provided in user-space, the attributes will be + * applicable to that specific radio only. If the radio id is greater + * than or equal to the number of radios, error denoting invalid value is returned. + * + * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3464,6 +3472,8 @@ enum nl80211_attrs { NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS, + NL80211_ATTR_WIPHY_RADIO_INDEX, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 1a17d66dfa75..72cecc304658 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3045,7 +3045,8 @@ static int ieee80211_set_mcast_rate(struct wiphy *wiphy, struct net_device *dev, return 0; } -static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) +static int ieee80211_set_wiphy_params(struct wiphy *wiphy, int radio_idx, + u32 changed) { struct ieee80211_local *local = wiphy_priv(wiphy); int err; @@ -3053,7 +3054,8 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) if (changed & WIPHY_PARAM_FRAG_THRESHOLD) { ieee80211_check_fast_xmit_all(local); - err = drv_set_frag_threshold(local, wiphy->frag_threshold); + err = drv_set_frag_threshold(local, radio_idx, + wiphy->frag_threshold); if (err) { ieee80211_check_fast_xmit_all(local); @@ -3067,14 +3069,16 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) coverage_class = changed & WIPHY_PARAM_COVERAGE_CLASS ? 
wiphy->coverage_class : -1; - err = drv_set_coverage_class(local, coverage_class); + err = drv_set_coverage_class(local, radio_idx, + coverage_class); if (err) return err; } if (changed & WIPHY_PARAM_RTS_THRESHOLD) { - err = drv_set_rts_threshold(local, wiphy->rts_threshold); + err = drv_set_rts_threshold(local, radio_idx, + wiphy->rts_threshold); if (err) return err; @@ -3092,18 +3096,19 @@ static int ieee80211_set_wiphy_params(struct wiphy *wiphy, u32 changed) } if (changed & (WIPHY_PARAM_RETRY_SHORT | WIPHY_PARAM_RETRY_LONG)) - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_RETRY_LIMITS); + ieee80211_hw_config(local, radio_idx, + IEEE80211_CONF_CHANGE_RETRY_LIMITS); if (changed & (WIPHY_PARAM_TXQ_LIMIT | WIPHY_PARAM_TXQ_MEMORY_LIMIT | WIPHY_PARAM_TXQ_QUANTUM)) - ieee80211_txq_set_params(local); + ieee80211_txq_set_params(local, radio_idx); return 0; } static int ieee80211_set_tx_power(struct wiphy *wiphy, - struct wireless_dev *wdev, + struct wireless_dev *wdev, int radio_idx, enum nl80211_tx_power_setting type, int mbm) { struct ieee80211_local *local = wiphy_priv(wiphy); @@ -3231,6 +3236,7 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy, static int ieee80211_get_tx_power(struct wiphy *wiphy, struct wireless_dev *wdev, + int radio_idx, unsigned int link_id, int *dbm) { @@ -3409,7 +3415,7 @@ static int ieee80211_set_power_mgmt(struct wiphy *wiphy, struct net_device *dev, } if (ieee80211_hw_check(&local->hw, SUPPORTS_DYNAMIC_PS)) - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); ieee80211_recalc_ps(local); ieee80211_recalc_ps_vif(sdata); @@ -4305,7 +4311,8 @@ ieee80211_update_mgmt_frame_registrations(struct wiphy *wiphy, ieee80211_configure_filter(local); } -static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) +static int ieee80211_set_antenna(struct wiphy *wiphy, int radio_idx, + u32 tx_ant, u32 rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); int 
ret; @@ -4321,11 +4328,12 @@ static int ieee80211_set_antenna(struct wiphy *wiphy, u32 tx_ant, u32 rx_ant) return 0; } -static int ieee80211_get_antenna(struct wiphy *wiphy, u32 *tx_ant, u32 *rx_ant) +static int ieee80211_get_antenna(struct wiphy *wiphy, int radio_idx, + u32 *tx_ant, u32 *rx_ant) { struct ieee80211_local *local = wiphy_priv(wiphy); - return drv_get_antenna(local, tx_ant, rx_ant); + return drv_get_antenna(local, radio_idx, tx_ant, rx_ant); } static int ieee80211_set_rekey_data(struct wiphy *wiphy, diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index d62f91656a19..4bcbcf9d98b5 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -744,7 +744,7 @@ static int ieee80211_add_chanctx(struct ieee80211_local *local, /* turn idle off *before* setting channel -- some drivers need that */ changed = ieee80211_idle_off(local); if (changed) - ieee80211_hw_config(local, changed); + ieee80211_hw_config(local, -1, changed); err = drv_add_chanctx(local, ctx); if (err) { diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index ba017bf3fd15..8baebb5636ec 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -143,15 +143,16 @@ int drv_change_interface(struct ieee80211_local *local, void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata); -static inline int drv_config(struct ieee80211_local *local, u32 changed) +static inline int drv_config(struct ieee80211_local *local, int radio_idx, + u32 changed) { int ret; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_config(local, changed); - ret = local->ops->config(&local->hw, changed); + trace_drv_config(local, radio_idx, changed); + ret = local->ops->config(&local->hw, radio_idx, changed); trace_drv_return_int(local, ret); return ret; } @@ -387,45 +388,47 @@ static inline void drv_get_key_seq(struct ieee80211_local *local, } static inline int drv_set_frag_threshold(struct ieee80211_local *local, - u32 value) + int 
radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_frag_threshold(local, value); + trace_drv_set_frag_threshold(local, radio_idx, value); if (local->ops->set_frag_threshold) - ret = local->ops->set_frag_threshold(&local->hw, value); + ret = local->ops->set_frag_threshold(&local->hw, radio_idx, + value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_rts_threshold(struct ieee80211_local *local, - u32 value) + int radio_idx, u32 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_rts_threshold(local, value); + trace_drv_set_rts_threshold(local, radio_idx, value); if (local->ops->set_rts_threshold) - ret = local->ops->set_rts_threshold(&local->hw, value); + ret = local->ops->set_rts_threshold(&local->hw, radio_idx, + value); trace_drv_return_int(local, ret); return ret; } static inline int drv_set_coverage_class(struct ieee80211_local *local, - s16 value) + int radio_idx, s16 value) { int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); - trace_drv_set_coverage_class(local, value); + trace_drv_set_coverage_class(local, radio_idx, value); if (local->ops->set_coverage_class) - local->ops->set_coverage_class(&local->hw, value); + local->ops->set_coverage_class(&local->hw, radio_idx, value); else ret = -EOPNOTSUPP; @@ -772,20 +775,21 @@ static inline int drv_set_antenna(struct ieee80211_local *local, might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (local->ops->set_antenna) - ret = local->ops->set_antenna(&local->hw, tx_ant, rx_ant); + ret = local->ops->set_antenna(&local->hw, -1, tx_ant, rx_ant); trace_drv_set_antenna(local, tx_ant, rx_ant, ret); return ret; } -static inline int drv_get_antenna(struct ieee80211_local *local, +static inline int drv_get_antenna(struct ieee80211_local *local, int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if 
(local->ops->get_antenna) - ret = local->ops->get_antenna(&local->hw, tx_ant, rx_ant); - trace_drv_get_antenna(local, *tx_ant, *rx_ant, ret); + ret = local->ops->get_antenna(&local->hw, radio_idx, + tx_ant, rx_ant); + trace_drv_get_antenna(local, radio_idx, *tx_ant, *rx_ant, ret); return ret; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9b9c7209878b..f59a5b38e6f2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1872,7 +1872,8 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, struct ieee80211_rx_status *status, unsigned int mpdu_len, unsigned int mpdu_offset); -int ieee80211_hw_config(struct ieee80211_local *local, u32 changed); +int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx, + u32 changed); int ieee80211_hw_conf_chan(struct ieee80211_local *local); void ieee80211_hw_conf_init(struct ieee80211_local *local); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx); @@ -2542,7 +2543,7 @@ static inline bool ieee80211_can_run_worker(struct ieee80211_local *local) } int ieee80211_txq_setup_flows(struct ieee80211_local *local); -void ieee80211_txq_set_params(struct ieee80211_local *local); +void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx); void ieee80211_txq_teardown_flows(struct ieee80211_local *local); void ieee80211_txq_init(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7c27f3cd841c..7b2baebb8644 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -146,7 +146,7 @@ void ieee80211_recalc_idle(struct ieee80211_local *local) { u32 change = __ieee80211_recalc_idle(local, false); if (change) - ieee80211_hw_config(local, change); + ieee80211_hw_config(local, -1, change); } static int ieee80211_verify_mac(struct ieee80211_sub_if_data *sdata, u8 *addr, @@ -726,7 +726,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do /* do 
after stop to avoid reconfiguring when we stop anyway */ ieee80211_configure_filter(local); - ieee80211_hw_config(local, hw_reconf_flags); + ieee80211_hw_config(local, -1, hw_reconf_flags); if (local->virt_monitors == local->open_count) ieee80211_add_virtual_monitor(local); @@ -1491,7 +1491,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (local->open_count == 1) ieee80211_hw_conf_init(local); else if (hw_reconf_flags) - ieee80211_hw_config(local, hw_reconf_flags); + ieee80211_hw_config(local, -1, hw_reconf_flags); ieee80211_recalc_ps(local); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 6b6de43d9420..c1c758e76d2e 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -190,7 +190,8 @@ static u32 ieee80211_calc_hw_conf_chan(struct ieee80211_local *local, return changed; } -int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) +int ieee80211_hw_config(struct ieee80211_local *local, int radio_idx, + u32 changed) { int ret = 0; @@ -201,7 +202,7 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) IEEE80211_CONF_CHANGE_SMPS)); if (changed && local->open_count) { - ret = drv_config(local, changed); + ret = drv_config(local, radio_idx, changed); /* * Goal: * HW reconfiguration should never fail, the driver has told @@ -235,7 +236,7 @@ static int _ieee80211_hw_conf_chan(struct ieee80211_local *local, if (!changed) return 0; - return drv_config(local, changed); + return drv_config(local, -1, changed); } int ieee80211_hw_conf_chan(struct ieee80211_local *local) @@ -269,7 +270,7 @@ void ieee80211_hw_conf_init(struct ieee80211_local *local) ctx ? 
&ctx->conf : NULL); } - WARN_ON(drv_config(local, changed)); + WARN_ON(drv_config(local, -1, changed)); } int ieee80211_emulate_add_chanctx(struct ieee80211_hw *hw, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2d46d4af60d7..d526f2fe9fe5 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -3181,7 +3181,7 @@ static void ieee80211_enable_ps(struct ieee80211_local *local, return; conf->flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } @@ -3193,7 +3193,7 @@ static void ieee80211_change_ps(struct ieee80211_local *local) ieee80211_enable_ps(local, local->ps_sdata); } else if (conf->flags & IEEE80211_CONF_PS) { conf->flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); timer_delete_sync(&local->dynamic_ps_timer); wiphy_work_cancel(local->hw.wiphy, &local->dynamic_ps_enable_work); @@ -3302,7 +3302,7 @@ void ieee80211_dynamic_ps_disable_work(struct wiphy *wiphy, if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_wake_queues_by_reason(&local->hw, @@ -3377,7 +3377,7 @@ void ieee80211_dynamic_ps_enable_work(struct wiphy *wiphy, (ifmgd->flags & IEEE80211_STA_NULLFUNC_ACKED)) { ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; local->hw.conf.flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } @@ -3986,7 +3986,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, */ if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } local->ps_sdata = 
NULL; @@ -7340,7 +7340,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } ieee80211_send_nullfunc(local, sdata, false); diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 686d9f6e9b52..13df6321634d 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -39,7 +39,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) if (local->hw.conf.flags & IEEE80211_CONF_PS) { offchannel_ps_enabled = true; local->hw.conf.flags &= ~IEEE80211_CONF_PS; - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_PS); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } if (!offchannel_ps_enabled || diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index a9cc832240a5..5a508d99e84f 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -108,7 +108,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw, struct cfg80211_wowlan *wowlan) sdata->u.mgd.powersave && !(local->hw.conf.flags & IEEE80211_CONF_PS)) { local->hw.conf.flags |= IEEE80211_CONF_PS; - ieee80211_hw_config(local, + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_PS); } } diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 8215ca58ce5e..0bfbce157486 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -384,12 +384,14 @@ DEFINE_EVENT(local_sdata_addr_evt, drv_remove_interface, TRACE_EVENT(drv_config, TP_PROTO(struct ieee80211_local *local, + int radio_idx, u32 changed), - TP_ARGS(local, changed), + TP_ARGS(local, radio_idx, changed), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(u32, changed) __field(u32, flags) __field(int, power_level) @@ -403,6 +405,7 @@ TRACE_EVENT(drv_config, TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->changed 
= changed; __entry->flags = local->hw.conf.flags; __entry->power_level = local->hw.conf.power_level; @@ -417,8 +420,8 @@ TRACE_EVENT(drv_config, ), TP_printk( - LOCAL_PR_FMT " ch:%#x" CHANDEF_PR_FMT, - LOCAL_PR_ARG, __entry->changed, CHANDEF_PR_ARG + LOCAL_PR_FMT " radio_idx:%d ch:%#x" CHANDEF_PR_FMT, + LOCAL_PR_ARG, __entry->radio_idx, __entry->changed, CHANDEF_PR_ARG ) ); @@ -818,34 +821,71 @@ TRACE_EVENT(drv_get_key_seq, ) ); -DEFINE_EVENT(local_u32_evt, drv_set_frag_threshold, - TP_PROTO(struct ieee80211_local *local, u32 value), - TP_ARGS(local, value) +TRACE_EVENT(drv_set_frag_threshold, + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value), + + TP_ARGS(local, radio_idx, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, radio_idx) + __field(u32, value) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " radio_id:%d value:%u", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value + ) ); -DEFINE_EVENT(local_u32_evt, drv_set_rts_threshold, - TP_PROTO(struct ieee80211_local *local, u32 value), - TP_ARGS(local, value) +TRACE_EVENT(drv_set_rts_threshold, + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 value), + + TP_ARGS(local, radio_idx, value), + + TP_STRUCT__entry( + LOCAL_ENTRY + __field(int, radio_idx) + __field(u32, value) + ), + TP_fast_assign( + LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->value = value; + ), + + TP_printk( + LOCAL_PR_FMT " radio_id:%d value:%u", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value + ) ); TRACE_EVENT(drv_set_coverage_class, - TP_PROTO(struct ieee80211_local *local, s16 value), + TP_PROTO(struct ieee80211_local *local, int radio_idx, s16 value), - TP_ARGS(local, value), + TP_ARGS(local, radio_idx, value), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(s16, value) ), TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->value = value; ), TP_printk( - 
LOCAL_PR_FMT " value:%d", - LOCAL_PR_ARG, __entry->value + LOCAL_PR_FMT " radio_id:%d value:%d", + LOCAL_PR_ARG, __entry->radio_idx, __entry->value ) ); @@ -1318,12 +1358,14 @@ TRACE_EVENT(drv_set_antenna, ); TRACE_EVENT(drv_get_antenna, - TP_PROTO(struct ieee80211_local *local, u32 tx_ant, u32 rx_ant, int ret), + TP_PROTO(struct ieee80211_local *local, int radio_idx, u32 tx_ant, + u32 rx_ant, int ret), - TP_ARGS(local, tx_ant, rx_ant, ret), + TP_ARGS(local, radio_idx, tx_ant, rx_ant, ret), TP_STRUCT__entry( LOCAL_ENTRY + __field(int, radio_idx) __field(u32, tx_ant) __field(u32, rx_ant) __field(int, ret) @@ -1331,14 +1373,16 @@ TRACE_EVENT(drv_get_antenna, TP_fast_assign( LOCAL_ASSIGN; + __entry->radio_idx = radio_idx; __entry->tx_ant = tx_ant; __entry->rx_ant = rx_ant; __entry->ret = ret; ), TP_printk( - LOCAL_PR_FMT " tx_ant:%d rx_ant:%d ret:%d", - LOCAL_PR_ARG, __entry->tx_ant, __entry->rx_ant, __entry->ret + LOCAL_PR_FMT " radio_idx:%d tx_ant:%d rx_ant:%d ret:%d", + LOCAL_PR_ARG, __entry->radio_idx, __entry->tx_ant, + __entry->rx_ant, __entry->ret ) ); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d58b80813bdd..6278d55aeb2e 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1541,7 +1541,7 @@ void ieee80211_txq_purge(struct ieee80211_local *local, spin_unlock_bh(&local->active_txq_lock[txqi->txq.ac]); } -void ieee80211_txq_set_params(struct ieee80211_local *local) +void ieee80211_txq_set_params(struct ieee80211_local *local, int radio_idx) { if (local->hw.wiphy->txq_limit) local->fq.limit = local->hw.wiphy->txq_limit; @@ -1605,7 +1605,7 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) for (i = 0; i < fq->flows_cnt; i++) codel_vars_init(&local->cvars[i]); - ieee80211_txq_set_params(local); + ieee80211_txq_set_params(local, -1); return 0; } diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 24c43a1ef2aa..773c8da0acc9 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1826,13 +1826,13 @@ int 
ieee80211_reconfig(struct ieee80211_local *local) } /* setup fragmentation threshold */ - drv_set_frag_threshold(local, hw->wiphy->frag_threshold); + drv_set_frag_threshold(local, -1, hw->wiphy->frag_threshold); /* setup RTS threshold */ - drv_set_rts_threshold(local, hw->wiphy->rts_threshold); + drv_set_rts_threshold(local, -1, hw->wiphy->rts_threshold); /* reset coverage class */ - drv_set_coverage_class(local, hw->wiphy->coverage_class); + drv_set_coverage_class(local, -1, hw->wiphy->coverage_class); ieee80211_led_radio(local, true); ieee80211_mod_tpt_led_trig(local, @@ -1890,11 +1890,11 @@ int ieee80211_reconfig(struct ieee80211_local *local) ieee80211_assign_chanctx(local, sdata, &sdata->deflink); /* reconfigure hardware */ - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | - IEEE80211_CONF_CHANGE_MONITOR | - IEEE80211_CONF_CHANGE_PS | - IEEE80211_CONF_CHANGE_RETRY_LIMITS | - IEEE80211_CONF_CHANGE_IDLE); + ieee80211_hw_config(local, -1, IEEE80211_CONF_CHANGE_LISTEN_INTERVAL | + IEEE80211_CONF_CHANGE_MONITOR | + IEEE80211_CONF_CHANGE_PS | + IEEE80211_CONF_CHANGE_RETRY_LIMITS | + IEEE80211_CONF_CHANGE_IDLE); ieee80211_configure_filter(local); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9ef618baac9e..b40978549790 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -854,6 +854,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MLO_RECONF_REM_LINKS] = { .type = NLA_U16 }, [NL80211_ATTR_EPCS] = { .type = NLA_FLAG }, [NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 }, + [NL80211_ATTR_WIPHY_RADIO_INDEX] = { .type = NLA_U8 }, }; /* policy for the key attributes */ @@ -2639,7 +2640,7 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, u32 tx_ant = 0, rx_ant = 0; int res; - res = rdev_get_antenna(rdev, &tx_ant, &rx_ant); + res = rdev_get_antenna(rdev, -1, &tx_ant, &rx_ant); if (!res) { if (nla_put_u32(msg, NL80211_ATTR_WIPHY_ANTENNA_TX, @@ -3620,6 
+3621,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 frag_threshold = 0, rts_threshold = 0; u8 coverage_class = 0; u32 txq_limit = 0, txq_memory_limit = 0, txq_quantum = 0; + int radio_idx = -1; rtnl_lock(); /* @@ -3670,6 +3672,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (result) return result; + if (info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]) { + /* Radio idx is not expected for non-multi radio wiphy */ + if (rdev->wiphy.n_radio <= 0) + return -EINVAL; + + radio_idx = nla_get_u8( + info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]); + if (radio_idx >= rdev->wiphy.n_radio) + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { struct ieee80211_txq_params txq_params; struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1]; @@ -3759,7 +3772,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) mbm = nla_get_u32(info->attrs[idx]); } - result = rdev_set_tx_power(rdev, txp_wdev, type, mbm); + result = rdev_set_tx_power(rdev, txp_wdev, radio_idx, type, + mbm); if (result) return result; } @@ -3785,7 +3799,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) tx_ant = tx_ant & rdev->wiphy.available_antennas_tx; rx_ant = rx_ant & rdev->wiphy.available_antennas_rx; - result = rdev_set_antenna(rdev, tx_ant, rx_ant); + result = rdev_set_antenna(rdev, radio_idx, tx_ant, rx_ant); if (result) return result; } @@ -3911,7 +3925,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (changed & WIPHY_PARAM_TXQ_QUANTUM) rdev->wiphy.txq_quantum = txq_quantum; - result = rdev_set_wiphy_params(rdev, changed); + result = rdev_set_wiphy_params(rdev, radio_idx, changed); if (result) { rdev->wiphy.retry_short = old_retry_short; rdev->wiphy.retry_long = old_retry_long; @@ -4012,7 +4026,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_tx_power && !wdev->valid_links) { int dbm, ret; - 
ret = rdev_get_tx_power(rdev, wdev, 0, &dbm); + ret = rdev_get_tx_power(rdev, wdev, -1, 0, &dbm); if (ret == 0 && nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, DBM_TO_MBM(dbm))) @@ -4084,7 +4098,7 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag if (rdev->ops->get_tx_power) { int dbm, ret; - ret = rdev_get_tx_power(rdev, wdev, link_id, &dbm); + ret = rdev_get_tx_power(rdev, wdev, -1, link_id, &dbm); if (ret == 0 && nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL, DBM_TO_MBM(dbm))) diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 9f4783c2354c..803b39c26587 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -577,35 +577,40 @@ static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, } static inline int -rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) +rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, int radio_idx, + u32 changed) { int ret = -EOPNOTSUPP; - trace_rdev_set_wiphy_params(&rdev->wiphy, changed); + trace_rdev_set_wiphy_params(&rdev->wiphy, radio_idx, changed); if (rdev->ops->set_wiphy_params) - ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); + ret = rdev->ops->set_wiphy_params(&rdev->wiphy, radio_idx, + changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm) + struct wireless_dev *wdev, int radio_idx, + enum nl80211_tx_power_setting type, + int mbm) { int ret; - trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm); - ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm); + trace_rdev_set_tx_power(&rdev->wiphy, wdev, radio_idx, type, mbm); + ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, radio_idx, type, + mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device 
*rdev, - struct wireless_dev *wdev, unsigned int link_id, - int *dbm) + struct wireless_dev *wdev, int radio_idx, + unsigned int link_id, int *dbm) { int ret; - trace_rdev_get_tx_power(&rdev->wiphy, wdev, link_id); - ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, link_id, dbm); + trace_rdev_get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id); + ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, radio_idx, link_id, + dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } @@ -857,21 +862,21 @@ rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, - u32 tx_ant, u32 rx_ant) + int radio_idx, u32 tx_ant, u32 rx_ant) { int ret; - trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant); - ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant); + trace_rdev_set_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); + ret = rdev->ops->set_antenna(&rdev->wiphy, -1, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, - u32 *tx_ant, u32 *rx_ant) + int radio_idx, u32 *tx_ant, u32 *rx_ant) { int ret; - trace_rdev_get_antenna(&rdev->wiphy); - ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant); + trace_rdev_get_antenna(&rdev->wiphy, radio_idx); + ret = rdev->ops->get_antenna(&rdev->wiphy, radio_idx, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 61a5eca9c513..7e43ab9de923 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -406,9 +406,19 @@ DEFINE_EVENT(wiphy_only_evt, rdev_return_void, TP_ARGS(wiphy) ); -DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna, - TP_PROTO(struct wiphy *wiphy), - TP_ARGS(wiphy) +TRACE_EVENT(rdev_get_antenna, + TP_PROTO(struct wiphy *wiphy, int radio_idx), + TP_ARGS(wiphy, radio_idx), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(int, 
radio_idx) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; + ), + TP_printk(WIPHY_PR_FMT ", radio_idx: %d", + WIPHY_PR_ARG, __entry->radio_idx) ); DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll, @@ -1678,18 +1688,20 @@ TRACE_EVENT(rdev_join_ocb, ); TRACE_EVENT(rdev_set_wiphy_params, - TP_PROTO(struct wiphy *wiphy, u32 changed), - TP_ARGS(wiphy, changed), + TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 changed), + TP_ARGS(wiphy, radio_idx, changed), TP_STRUCT__entry( WIPHY_ENTRY + __field(int, radio_idx) __field(u32, changed) ), TP_fast_assign( WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; __entry->changed = changed; ), - TP_printk(WIPHY_PR_FMT ", changed: %u", - WIPHY_PR_ARG, __entry->changed) + TP_printk(WIPHY_PR_FMT ", radio_idx: %d, changed: %u", + WIPHY_PR_ARG, __entry->radio_idx, __entry->changed) ); DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, @@ -1710,30 +1722,51 @@ DECLARE_EVENT_CLASS(wiphy_wdev_link_evt, WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) ); -DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_tx_power, +TRACE_EVENT(rdev_get_tx_power, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - unsigned int link_id), - TP_ARGS(wiphy, wdev, link_id) + int radio_idx, unsigned int link_id), + TP_ARGS(wiphy, wdev, radio_idx, link_id), + TP_STRUCT__entry( + WIPHY_ENTRY + WDEV_ENTRY + __field(int, radio_idx) + __field(unsigned int, link_id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + WDEV_ASSIGN; + __entry->radio_idx = radio_idx; + __entry->link_id = link_id; + ), + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", radio_idx: %d, link_id: %u", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->radio_idx, __entry->link_id) ); TRACE_EVENT(rdev_set_tx_power, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, - enum nl80211_tx_power_setting type, int mbm), - TP_ARGS(wiphy, wdev, type, mbm), + int radio_idx, enum nl80211_tx_power_setting type, + int mbm), + TP_ARGS(wiphy, wdev, radio_idx, type, mbm), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY + __field(int, 
radio_idx) __field(enum nl80211_tx_power_setting, type) __field(int, mbm) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; + __entry->radio_idx = radio_idx; __entry->type = type; __entry->mbm = mbm; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", type: %u, mbm: %d", - WIPHY_PR_ARG, WDEV_PR_ARG,__entry->type, __entry->mbm) + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT + ", radio_idx: %d, type: %u, mbm: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, + __entry->radio_idx, __entry->type, __entry->mbm) ); TRACE_EVENT(rdev_return_int_int, @@ -1866,26 +1899,24 @@ TRACE_EVENT(rdev_return_void_tx_rx, __entry->rx_max) ); -DECLARE_EVENT_CLASS(tx_rx_evt, - TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, tx, rx), +TRACE_EVENT(rdev_set_antenna, + TP_PROTO(struct wiphy *wiphy, int radio_idx, u32 tx, u32 rx), + TP_ARGS(wiphy, radio_idx, tx, rx), TP_STRUCT__entry( WIPHY_ENTRY + __field(int, radio_idx) __field(u32, tx) __field(u32, rx) ), TP_fast_assign( WIPHY_ASSIGN; + __entry->radio_idx = radio_idx; __entry->tx = tx; __entry->rx = rx; ), - TP_printk(WIPHY_PR_FMT ", tx: %u, rx: %u ", - WIPHY_PR_ARG, __entry->tx, __entry->rx) -); - -DEFINE_EVENT(tx_rx_evt, rdev_set_antenna, - TP_PROTO(struct wiphy *wiphy, u32 tx, u32 rx), - TP_ARGS(wiphy, tx, rx) + TP_printk(WIPHY_PR_FMT ", radio_idx: %d, tx: %u, rx: %u ", + WIPHY_PR_ARG, __entry->radio_idx, + __entry->tx, __entry->rx) ); DECLARE_EVENT_CLASS(wiphy_netdev_id_evt, diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a74b1afc594e..1241fda78a68 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -263,7 +263,7 @@ int cfg80211_wext_siwrts(struct net_device *dev, else wdev->wiphy->rts_threshold = rts->value; - err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD); + err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_RTS_THRESHOLD); if (err) wdev->wiphy->rts_threshold = orts; return err; @@ -304,7 +304,7 @@ int cfg80211_wext_siwfrag(struct net_device *dev, wdev->wiphy->frag_threshold = 
frag->value & ~0x1; } - err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD); + err = rdev_set_wiphy_params(rdev, -1, WIPHY_PARAM_FRAG_THRESHOLD); if (err) wdev->wiphy->frag_threshold = ofrag; return err; @@ -355,7 +355,7 @@ static int cfg80211_wext_siwretry(struct net_device *dev, changed |= WIPHY_PARAM_RETRY_SHORT; } - err = rdev_set_wiphy_params(rdev, changed); + err = rdev_set_wiphy_params(rdev, -1, changed); if (err) { wdev->wiphy->retry_short = oshort; wdev->wiphy->retry_long = olong; @@ -890,7 +890,7 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev, guard(wiphy)(&rdev->wiphy); - return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm)); + return rdev_set_tx_power(rdev, wdev, -1, type, DBM_TO_MBM(dbm)); } static int cfg80211_wext_giwtxpower(struct net_device *dev, @@ -910,7 +910,7 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev, return -EOPNOTSUPP; scoped_guard(wiphy, &rdev->wiphy) { - err = rdev_get_tx_power(rdev, wdev, 0, &val); + err = rdev_get_tx_power(rdev, wdev, -1, 0, &val); } if (err) return err; -- cgit v1.2.3 From 264637941cf45cd3ffe070e25853d7e1a29f2004 Mon Sep 17 00:00:00 2001 From: Roopni Devanathan Date: Sun, 15 Jun 2025 13:53:10 +0530 Subject: wifi: cfg80211: Add Support to Set RTS Threshold for each Radio Currently, setting RTS threshold is done on a per-phy basis, i.e., all the radios present in a wiphy will take RTS threshold value to be the one sent from userspace. But each radio in a multi-radio wiphy can have different RTS threshold requirements. To extend support to set RTS threshold for each radio, get the radio for which RTS threshold needs to be changed from the user. Use the attribute in NL - NL80211_ATTR_WIPHY_RADIO_INDEX, to identify the radio of interest. Create a new structure - wiphy_radio_cfg and add rts_threshold in it as a u32 value to store RTS threshold of each radio in a wiphy and allocate memory for it during wiphy register based on the wiphy.n_radio updated by drivers.
Pass radio id received from the user to mac80211 drivers along with its corresponding RTS threshold. Signed-off-by: Roopni Devanathan Link: https://patch.msgid.link/20250615082312.619639-3-quic_rdevanat@quicinc.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 18 +++++++++++++++ net/wireless/core.c | 19 ++++++++++++++++ net/wireless/nl80211.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 96 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ffd9564fc840..0003733b1e77 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -5547,6 +5547,18 @@ struct wiphy_iftype_akm_suites { int n_akm_suites; }; +/** + * struct wiphy_radio_cfg - physical radio config of a wiphy + * This structure describes the configurations of a physical radio in a + * wiphy. It is used to denote per-radio attributes belonging to a wiphy. + * + * @rts_threshold: RTS threshold (dot11RTSThreshold); + * -1 (default) = RTS/CTS disabled + */ +struct wiphy_radio_cfg { + u32 rts_threshold; +}; + /** * struct wiphy_radio_freq_range - wiphy frequency range * @start_freq: start range edge frequency (kHz) @@ -5802,6 +5814,10 @@ struct wiphy_radio { * supports enabling HW timestamping for all peers (i.e. no need to * specify a mac address). * + * @radio_cfg: configuration of radios belonging to a multi-radio wiphy. This + * struct contains a list of all radio specific attributes and should be + * used only for multi-radio wiphy.
+ * * @radio: radios belonging to this wiphy * @n_radio: number of radios */ @@ -5891,6 +5907,8 @@ struct wiphy { void (*reg_notifier)(struct wiphy *wiphy, struct regulatory_request *request); + struct wiphy_radio_cfg *radio_cfg; + /* fields below are read-only, assigned by cfg80211 */ const struct ieee80211_regdomain __rcu *regd; diff --git a/net/wireless/core.c b/net/wireless/core.c index 5c3c72df0591..f3cd70757ef2 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -995,6 +995,24 @@ int wiphy_register(struct wiphy *wiphy) wiphy->max_num_akm_suites > CFG80211_MAX_NUM_AKM_SUITES) return -EINVAL; + /* Allocate radio configuration space for multi-radio wiphy */ + if (wiphy->n_radio > 0) { + int idx; + + wiphy->radio_cfg = kcalloc(wiphy->n_radio, + sizeof(*wiphy->radio_cfg), + GFP_KERNEL); + if (!wiphy->radio_cfg) + return -ENOMEM; + /* + * Initialize wiphy radio parameters to IEEE 802.11 + * MIB default values. RTS threshold is disabled by + * default with the special -1 value. 
+ */ + for (idx = 0; idx < wiphy->n_radio; idx++) + wiphy->radio_cfg[idx].rts_threshold = (u32)-1; + } + /* check and set up bitrates */ ieee80211_set_bitrate_flags(wiphy); @@ -1222,6 +1240,7 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev) void wiphy_free(struct wiphy *wiphy) { + kfree(wiphy->radio_cfg); put_device(&wiphy->dev); } EXPORT_SYMBOL(wiphy_free); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index b40978549790..b0176090182c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3609,6 +3609,33 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) return __nl80211_set_channel(rdev, netdev, info, link_id); } +static int nl80211_set_wiphy_radio(struct genl_info *info, + struct cfg80211_registered_device *rdev, + int radio_idx) +{ + u32 rts_threshold = 0, old_rts, changed = 0; + int result; + + if (!rdev->ops->set_wiphy_params) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]) { + rts_threshold = nla_get_u32( + info->attrs[NL80211_ATTR_WIPHY_RTS_THRESHOLD]); + changed |= WIPHY_PARAM_RTS_THRESHOLD; + } + + old_rts = rdev->wiphy.radio_cfg[radio_idx].rts_threshold; + + rdev->wiphy.radio_cfg[radio_idx].rts_threshold = rts_threshold; + + result = rdev_set_wiphy_params(rdev, radio_idx, changed); + if (result) + rdev->wiphy.radio_cfg[radio_idx].rts_threshold = old_rts; + + return 0; +} + static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = NULL; @@ -3681,6 +3708,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) info->attrs[NL80211_ATTR_WIPHY_RADIO_INDEX]); if (radio_idx >= rdev->wiphy.n_radio) return -EINVAL; + + return nl80211_set_wiphy_radio(info, rdev, radio_idx); } if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) { @@ -3893,16 +3922,30 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) if (changed) { u8 old_retry_short, old_retry_long; u32 
old_frag_threshold, old_rts_threshold; - u8 old_coverage_class; + u8 old_coverage_class, i; u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum; + u32 *old_radio_rts_threshold = NULL; if (!rdev->ops->set_wiphy_params) return -EOPNOTSUPP; + if (rdev->wiphy.n_radio) { + old_radio_rts_threshold = kcalloc(rdev->wiphy.n_radio, + sizeof(u32), + GFP_KERNEL); + if (!old_radio_rts_threshold) + return -ENOMEM; + } + old_retry_short = rdev->wiphy.retry_short; old_retry_long = rdev->wiphy.retry_long; old_frag_threshold = rdev->wiphy.frag_threshold; old_rts_threshold = rdev->wiphy.rts_threshold; + if (old_radio_rts_threshold) { + for (i = 0 ; i < rdev->wiphy.n_radio; i++) + old_radio_rts_threshold[i] = + rdev->wiphy.radio_cfg[i].rts_threshold; + } old_coverage_class = rdev->wiphy.coverage_class; old_txq_limit = rdev->wiphy.txq_limit; old_txq_memory_limit = rdev->wiphy.txq_memory_limit; @@ -3914,8 +3957,13 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.retry_long = retry_long; if (changed & WIPHY_PARAM_FRAG_THRESHOLD) rdev->wiphy.frag_threshold = frag_threshold; - if (changed & WIPHY_PARAM_RTS_THRESHOLD) + if ((changed & WIPHY_PARAM_RTS_THRESHOLD) && + old_radio_rts_threshold) { rdev->wiphy.rts_threshold = rts_threshold; + for (i = 0 ; i < rdev->wiphy.n_radio; i++) + rdev->wiphy.radio_cfg[i].rts_threshold = + rdev->wiphy.rts_threshold; + } if (changed & WIPHY_PARAM_COVERAGE_CLASS) rdev->wiphy.coverage_class = coverage_class; if (changed & WIPHY_PARAM_TXQ_LIMIT) @@ -3931,12 +3979,20 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.retry_long = old_retry_long; rdev->wiphy.frag_threshold = old_frag_threshold; rdev->wiphy.rts_threshold = old_rts_threshold; + if (old_radio_rts_threshold) { + for (i = 0 ; i < rdev->wiphy.n_radio; i++) + rdev->wiphy.radio_cfg[i].rts_threshold = + old_radio_rts_threshold[i]; + } rdev->wiphy.coverage_class = old_coverage_class; rdev->wiphy.txq_limit = old_txq_limit; 
rdev->wiphy.txq_memory_limit = old_txq_memory_limit; rdev->wiphy.txq_quantum = old_txq_quantum; - return result; } + + if (old_rts_threshold) + kfree(old_radio_rts_threshold); + return result; } return 0; -- cgit v1.2.3 From 5ea255673cdb4a9bf99dd3e4fc9ca1089f5692a3 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Tue, 17 Jun 2025 18:06:07 +1000 Subject: wifi: cfg80211: support configuration of S1G station capabilities Currently there is no support for initialising a peer's S1G capabilities; this patch adds support for configuring an S1G station. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250617080610.756048-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 +++- net/wireless/nl80211.c | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0003733b1e77..4a092da3a9de 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -560,7 +560,7 @@ struct ieee80211_sta_s1g_cap { * @vht_cap: VHT capabilities in this band * @s1g_cap: S1G capabilities in this band * @edmg_cap: EDMG capabilities in this band - * @s1g_cap: S1G capabilities in this band (S1B band only, of course) + * @s1g_cap: S1G capabilities in this band (S1G band only, of course) * @n_iftype_data: number of iftype data entries * @iftype_data: interface type data entries. Note that the bits in * @types_mask inside this structure cannot overlap (i.e.
only @@ -1653,6 +1653,7 @@ struct sta_txpwr { * @he_6ghz_capa: HE 6 GHz Band capabilities of station * @eht_capa: EHT capabilities of station * @eht_capa_len: the length of the EHT capabilities + * @s1g_capa: S1G capabilities of station */ struct link_station_parameters { const u8 *mld_mac; @@ -1671,6 +1672,7 @@ struct link_station_parameters { const struct ieee80211_he_6ghz_capa *he_6ghz_capa; const struct ieee80211_eht_cap_elem *eht_capa; u8 eht_capa_len; + const struct ieee80211_s1g_cap *s1g_capa; }; /** diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 70bfe2bfdcc7..70ca74a75f22 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7856,6 +7856,10 @@ static int nl80211_set_station_tdls(struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) + params->link_sta_params.s1g_capa = + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]); + err = nl80211_parse_sta_channel_info(info, params); if (err) return err; @@ -8182,6 +8186,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.link_sta_params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) + params.link_sta_params.s1g_capa = + nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]); + if (info->attrs[NL80211_ATTR_OPMODE_NOTIF]) { params.link_sta_params.opmode_notif_used = true; params.link_sta_params.opmode_notif = -- cgit v1.2.3 From 2a8a6b7c4cb03808a707ae19b2f0c5eb9b631e9e Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Tue, 17 Jun 2025 18:06:08 +1000 Subject: wifi: mac80211: handle station association response with S1G Add support for updating the station's S1G capabilities when an S1G association occurs.
Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250617080610.756048-3-lachlan.hodges@morsemicro.com [remove unused S1G_CAP3_MAX_MPDU_LEN_3895/_7791] Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/mlme.c | 6 ++++++ net/mac80211/s1g.c | 26 ++++++++++++++++++++++++++ 4 files changed, 37 insertions(+) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a0de0da4d79b..dcd5969bb559 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2428,6 +2428,7 @@ struct ieee80211_sta_aggregates { * @he_cap: HE capabilities of this STA * @he_6ghz_capa: on 6 GHz, holds the HE 6 GHz band capabilities * @eht_cap: EHT capabilities of this STA + * @s1g_cap: S1G capabilities of this STA * @agg: per-link data for multi-link aggregation * @bandwidth: current bandwidth the station can receive with * @rx_nss: in HT/VHT, the maximum number of spatial streams the @@ -2450,6 +2451,7 @@ struct ieee80211_link_sta { struct ieee80211_sta_he_cap he_cap; struct ieee80211_he_6ghz_capa he_6ghz_capa; struct ieee80211_sta_eht_cap eht_cap; + struct ieee80211_sta_s1g_cap s1g_cap; struct ieee80211_sta_aggregates agg; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f59a5b38e6f2..4ef7b3656aca 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -2270,6 +2270,9 @@ void ieee80211_s1g_rx_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_s1g_cap *s1g_cap_ie, + struct link_sta_info *link_sta); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d526f2fe9fe5..6001c8897d7c 100644 --- a/net/mac80211/mlme.c +++ 
b/net/mac80211/mlme.c @@ -5399,6 +5399,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, bss_conf->epcs_support = false; } + if (elems->s1g_oper && + link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G && + elems->s1g_capab) + ieee80211_s1g_cap_to_sta_s1g_cap(sdata, elems->s1g_capab, + link_sta); + bss_conf->twt_broadcast = ieee80211_twt_bcast_support(sdata, bss_conf, sband, link_sta); diff --git a/net/mac80211/s1g.c b/net/mac80211/s1g.c index d4ed0c0a335c..1f68df6e8067 100644 --- a/net/mac80211/s1g.c +++ b/net/mac80211/s1g.c @@ -194,3 +194,29 @@ void ieee80211_s1g_status_twt_action(struct ieee80211_sub_if_data *sdata, break; } } + +void ieee80211_s1g_cap_to_sta_s1g_cap(struct ieee80211_sub_if_data *sdata, + const struct ieee80211_s1g_cap *s1g_cap_ie, + struct link_sta_info *link_sta) +{ + struct ieee80211_sta_s1g_cap *s1g_cap = &link_sta->pub->s1g_cap; + + memset(s1g_cap, 0, sizeof(*s1g_cap)); + + memcpy(s1g_cap->cap, s1g_cap_ie->capab_info, sizeof(s1g_cap->cap)); + memcpy(s1g_cap->nss_mcs, s1g_cap_ie->supp_mcs_nss, + sizeof(s1g_cap->nss_mcs)); + + s1g_cap->s1g = true; + + /* Maximum MPDU length is 1 bit for S1G */ + if (s1g_cap->cap[3] & S1G_CAP3_MAX_MPDU_LEN) { + link_sta->pub->agg.max_amsdu_len = + IEEE80211_MAX_MPDU_LEN_VHT_7991; + } else { + link_sta->pub->agg.max_amsdu_len = + IEEE80211_MAX_MPDU_LEN_VHT_3895; + } + + ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); +} -- cgit v1.2.3 From c9e78afa688afec528784b79bb02d513cdcd6527 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Mon, 23 Jun 2025 12:53:55 +0200 Subject: udp_tunnel: fix deadlock in udp_tunnel_nic_set_port_priv() While configuring a vxlan tunnel in a system with an i40e NIC driver, I observe the following deadlock: WARNING: possible recursive locking detected 6.16.0-rc2.net-next-6.16_92d87230d899+ #13 Tainted: G E -------------------------------------------- kworker/u256:4/1125 is trying to acquire lock: ffff88921ab9c8c8 (&utn->lock){+.+.}-{4:4}, at:
i40e_udp_tunnel_set_port (/home/pabeni/net-next/include/net/udp_tunnel.h:343 /home/pabeni/net-next/drivers/net/ethernet/intel/i40e/i40e_main.c:13013) i40e but task is already holding lock: ffff88921ab9c8c8 (&utn->lock){+.+.}-{4:4}, at: udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:739) udp_tunnel other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&utn->lock); lock(&utn->lock); *** DEADLOCK *** May be due to missing lock nesting notation 4 locks held by kworker/u256:4/1125: #0: ffff8892910ca158 ((wq_completion)udp_tunnel_nic){+.+.}-{0:0}, at: process_one_work (/home/pabeni/net-next/kernel/workqueue.c:3213) #1: ffffc900244efd30 ((work_completion)(&utn->work)){+.+.}-{0:0}, at: process_one_work (/home/pabeni/net-next/kernel/workqueue.c:3214) #2: ffffffff9a14e290 (rtnl_mutex){+.+.}-{4:4}, at: udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:737) udp_tunnel #3: ffff88921ab9c8c8 (&utn->lock){+.+.}-{4:4}, at: udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:739) udp_tunnel stack backtrace: Hardware name: Dell Inc. 
PowerEdge R7525/0YHMCJ, BIOS 2.2.5 04/08/2021 i Call Trace: dump_stack_lvl (/home/pabeni/net-next/lib/dump_stack.c:123) print_deadlock_bug (/home/pabeni/net-next/kernel/locking/lockdep.c:3047) validate_chain (/home/pabeni/net-next/kernel/locking/lockdep.c:3901) __lock_acquire (/home/pabeni/net-next/kernel/locking/lockdep.c:5240) lock_acquire.part.0 (/home/pabeni/net-next/kernel/locking/lockdep.c:473 /home/pabeni/net-next/kernel/locking/lockdep.c:5873) __mutex_lock (/home/pabeni/net-next/kernel/locking/mutex.c:604 /home/pabeni/net-next/kernel/locking/mutex.c:747) i40e_udp_tunnel_set_port (/home/pabeni/net-next/include/net/udp_tunnel.h:343 /home/pabeni/net-next/drivers/net/ethernet/intel/i40e/i40e_main.c:13013) i40e udp_tunnel_nic_device_sync_by_port (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:230 /home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:249) udp_tunnel __udp_tunnel_nic_device_sync.part.0 (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:292) udp_tunnel udp_tunnel_nic_device_sync_work (/home/pabeni/net-next/net/ipv4/udp_tunnel_nic.c:742) udp_tunnel process_one_work (/home/pabeni/net-next/kernel/workqueue.c:3243) worker_thread (/home/pabeni/net-next/kernel/workqueue.c:3315 /home/pabeni/net-next/kernel/workqueue.c:3402) kthread (/home/pabeni/net-next/kernel/kthread.c:464) AFAICS all the existing callsites of udp_tunnel_nic_set_port_priv() are already under the utn lock scope, avoid (re-)acquiring it in such a function. 
Fixes: 1ead7501094c ("udp_tunnel: remove rtnl_lock dependency") Signed-off-by: Paolo Abeni Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/95a827621ec78c12d1564ec3209e549774f9657d.1750675978.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- include/net/udp_tunnel.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index cbd3a43074bd..9acef2fbd2fd 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -339,9 +339,8 @@ udp_tunnel_nic_set_port_priv(struct net_device *dev, unsigned int table, unsigned int idx, u8 priv) { if (udp_tunnel_nic_ops) { - udp_tunnel_nic_ops->lock(dev); + udp_tunnel_nic_ops->assert_locked(dev); udp_tunnel_nic_ops->set_port_priv(dev, table, idx, priv); - udp_tunnel_nic_ops->unlock(dev); } } -- cgit v1.2.3 From 4b70e2a069d90cdc447c6bf8437c8b99345852e9 Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 24 Jun 2025 09:43:27 +0800 Subject: net/sched: Remove unused functions Since commit c54e1d920f04 ("flow_offload: add ops to tc_action_ops for flow action setup") these are unused. 
Signed-off-by: Yue Haibing Acked-by: Cong Wang Link: https://patch.msgid.link/20250624014327.3686873-1-yuehaibing@huawei.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_csum.h | 9 --------- include/net/tc_act/tc_ct.h | 9 --------- include/net/tc_act/tc_gate.h | 9 --------- include/net/tc_act/tc_mpls.h | 9 --------- include/net/tc_act/tc_police.h | 9 --------- include/net/tc_act/tc_sample.h | 9 --------- include/net/tc_act/tc_vlan.h | 9 --------- 7 files changed, 63 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 68269e4581b7..2515da0142a6 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -18,15 +18,6 @@ struct tcf_csum { }; #define to_tcf_csum(a) ((struct tcf_csum *)a) -static inline bool is_tcf_csum(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_CSUM) - return true; -#endif - return false; -} - static inline u32 tcf_csum_update_flags(const struct tc_action *a) { u32 update_flags; diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 77f87c622a2e..e6b45cb27ebf 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -92,13 +92,4 @@ static inline void tcf_ct_flow_table_restore_skb(struct sk_buff *skb, unsigned long cookie) { } #endif -static inline bool is_tcf_ct(const struct tc_action *a) -{ -#if defined(CONFIG_NET_CLS_ACT) && IS_ENABLED(CONFIG_NF_CONNTRACK) - if (a->ops && a->ops->id == TCA_ID_CT) - return true; -#endif - return false; -} - #endif /* __NET_TC_CT_H */ diff --git a/include/net/tc_act/tc_gate.h b/include/net/tc_act/tc_gate.h index c8fa11ebb397..c1a67149c6b6 100644 --- a/include/net/tc_act/tc_gate.h +++ b/include/net/tc_act/tc_gate.h @@ -51,15 +51,6 @@ struct tcf_gate { #define to_gate(a) ((struct tcf_gate *)a) -static inline bool is_tcf_gate(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_GATE) - return true; 
-#endif - return false; -} - static inline s32 tcf_gate_prio(const struct tc_action *a) { s32 tcfg_prio; diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h index 721de4f5733a..d452e5e94fd0 100644 --- a/include/net/tc_act/tc_mpls.h +++ b/include/net/tc_act/tc_mpls.h @@ -27,15 +27,6 @@ struct tcf_mpls { }; #define to_mpls(a) ((struct tcf_mpls *)a) -static inline bool is_tcf_mpls(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_MPLS) - return true; -#endif - return false; -} - static inline u32 tcf_mpls_action(const struct tc_action *a) { u32 tcfm_action; diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 283bde711a42..490d88cb5233 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -44,15 +44,6 @@ struct tc_police_compat { struct tc_ratespec peakrate; }; -static inline bool is_tcf_police(const struct tc_action *act) -{ -#ifdef CONFIG_NET_CLS_ACT - if (act->ops && act->ops->id == TCA_ID_POLICE) - return true; -#endif - return false; -} - static inline u64 tcf_police_rate_bytes_ps(const struct tc_action *act) { struct tcf_police *police = to_police(act); diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h index b5d76305e854..abd163ca1864 100644 --- a/include/net/tc_act/tc_sample.h +++ b/include/net/tc_act/tc_sample.h @@ -17,15 +17,6 @@ struct tcf_sample { }; #define to_sample(a) ((struct tcf_sample *)a) -static inline bool is_tcf_sample(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - return a->ops && a->ops->id == TCA_ID_SAMPLE; -#else - return false; -#endif -} - static inline __u32 tcf_sample_rate(const struct tc_action *a) { return to_sample(a)->rate; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 904eddfc1826..3f5e9242b5e8 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -26,15 +26,6 @@ struct tcf_vlan { }; #define to_vlan(a) ((struct 
tcf_vlan *)a) -static inline bool is_tcf_vlan(const struct tc_action *a) -{ -#ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->id == TCA_ID_VLAN) - return true; -#endif - return false; -} - static inline u32 tcf_vlan_action(const struct tc_action *a) { u32 tcfv_action; -- cgit v1.2.3 From 8d68411a128705f86da7f037e1c33d81786fee96 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jun 2025 15:30:16 +0000 Subject: tcp: remove rtx_syn_ack field Now inet_rtx_syn_ack() is only used by TCP, it can directly call tcp_rtx_synack() instead of using an indirect call to req->rsk_ops->rtx_syn_ack(). Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250626153017.2156274-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 2 -- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp_ipv4.c | 1 - net/ipv6/tcp_ipv6.c | 1 - 4 files changed, 1 insertion(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b07b1cd14e9f..bad7d16a5515 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -30,8 +30,6 @@ struct request_sock_ops { unsigned int obj_size; struct kmem_cache *slab; char *slab_name; - int (*rtx_syn_ack)(const struct sock *sk, - struct request_sock *req); void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f4157d26ec9e..d61eef748851 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -886,7 +886,7 @@ static void syn_ack_recalc(struct request_sock *req, int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { - int err = req->rsk_ops->rtx_syn_ack(parent, req); + int err = tcp_rtx_synack(parent, req); if (!err) req->num_retrans++; diff --git a/net/ipv4/tcp_ipv4.c 
b/net/ipv4/tcp_ipv4.c index 429fb34b075e..56223338bc0f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1703,7 +1703,6 @@ static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct request_sock_ops tcp_request_sock_ops __read_mostly = { .family = PF_INET, .obj_size = sizeof(struct tcp_request_sock), - .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f0ce62549d90..9fb614e17bde 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -835,7 +835,6 @@ static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), - .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, -- cgit v1.2.3 From cf56a98202970adf298df5caaa225ed68350e9ab Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Jun 2025 15:30:17 +0000 Subject: tcp: remove inet_rtx_syn_ack() inet_rtx_syn_ack() is a simple wrapper around tcp_rtx_synack(), if we move req->num_retrans update. 
Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250626153017.2156274-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/request_sock.h | 2 -- net/ipv4/inet_connection_sock.c | 11 +---------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 1 + net/ipv4/tcp_timer.c | 2 +- 5 files changed, 4 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/request_sock.h b/include/net/request_sock.h index bad7d16a5515..6a5ec1418e85 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -39,8 +39,6 @@ struct request_sock_ops { void (*syn_ack_timeout)(const struct request_sock *req); }; -int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); - struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index d61eef748851..1e2df51427fe 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -884,15 +884,6 @@ static void syn_ack_recalc(struct request_sock *req, req->num_timeout >= rskq_defer_accept - 1; } -int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) -{ - int err = tcp_rtx_synack(parent, req); - - if (!err) - req->num_retrans++; - return err; -} - static struct request_sock * reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) @@ -1132,7 +1123,7 @@ static void reqsk_timer_handler(struct timer_list *t) req->rsk_ops->syn_ack_timeout(req); if (!expire && (!resend || - !inet_rtx_syn_ack(sk_listener, req) || + !tcp_rtx_synack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 43d7852ce07e..2994c9222c9c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -726,7 +726,7 @@ struct sock *tcp_check_req(struct 
sock *sk, struct sk_buff *skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && - !inet_rtx_syn_ack(sk, req)) { + !tcp_rtx_synack(sk, req)) { unsigned long expires = jiffies; expires += reqsk_timeout(req, TCP_RTO_MAX); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 28f840724fe8..b616776e3354 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4425,6 +4425,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); + req->num_retrans++; } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index bb37e24b97a7..a207877270fb 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -478,7 +478,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * regular retransmit because if the child socket has been accepted * it's not good to give up too easily. */ - inet_rtx_syn_ack(sk, req); + tcp_rtx_synack(sk, req); req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) -- cgit v1.2.3 From 03dc03fa0432a9160c4fcbdb86f274e6b4587972 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 26 Jun 2025 10:31:10 +0300 Subject: neighbor: Add NTF_EXT_VALIDATED flag for externally validated entries tl;dr ===== Add a new neighbor flag ("extern_valid") that can be used to indicate to the kernel that a neighbor entry was learned and determined to be valid externally. The kernel will not try to remove or invalidate such an entry, leaving these decisions to the user space control plane. This is needed for EVPN multi-homing where a neighbor entry for a multi-homed host needs to be synced across all the VTEPs among which the host is multi-homed. Background ========== In a typical EVPN multi-homing setup each host is multi-homed using a set of links called ES (Ethernet Segment, i.e., LAG) to multiple leaf switches (VTEPs). VTEPs that are connected to the same ES are called ES peers. 
When a neighbor entry is learned on a VTEP, it is distributed to both ES peers and remote VTEPs using EVPN MAC/IP advertisement routes. ES peers use the neighbor entry when routing traffic towards the multi-homed host and remote VTEPs use it for ARP/NS suppression. Motivation ========== If the ES link between a host and the VTEP on which the neighbor entry was locally learned goes down, the EVPN MAC/IP advertisement route will be withdrawn and the neighbor entries will be removed from both ES peers and remote VTEPs. Routing towards the multi-homed host and ARP/NS suppression can fail until another ES peer locally learns the neighbor entry and distributes it via an EVPN MAC/IP advertisement route. "draft-rbickhart-evpn-ip-mac-proxy-adv-03" [1] suggests avoiding these intermittent failures by having the ES peers install the neighbor entries as before, but also injecting EVPN MAC/IP advertisement routes with a proxy indication. When the previously mentioned ES link goes down and the original EVPN MAC/IP advertisement route is withdrawn, the ES peers will not withdraw their neighbor entries, but instead start aging timers for the proxy indication. If an ES peer locally learns the neighbor entry (i.e., it becomes "reachable"), it will restart its aging timer for the entry and emit an EVPN MAC/IP advertisement route without a proxy indication. An ES peer will stop its aging timer for the proxy indication if it observes the removal of the proxy indication from at least one of the ES peers advertising the entry. In the event that the aging timer for the proxy indication expired, an ES peer will withdraw its EVPN MAC/IP advertisement route. If the timer expired on all ES peers and they all withdrew their proxy advertisements, the neighbor entry will be completely removed from the EVPN fabric. 
Implementation ============== In the above scheme, when the control plane (e.g., FRR) advertises a neighbor entry with a proxy indication, it expects the corresponding entry in the data plane (i.e., the kernel) to remain valid and not be removed due to garbage collection or loss of carrier. The control plane also expects the kernel to notify it if the entry was learned locally (i.e., became "reachable") so that it can remove the proxy indication from the EVPN MAC/IP advertisement route. That is why these entries cannot be programmed with dummy states such as "permanent" or "noarp". Instead, add a new neighbor flag ("extern_valid") which indicates that the entry was learned and determined to be valid externally and should not be removed or invalidated by the kernel. The kernel can probe the entry and notify user space when it becomes "reachable" (it is initially installed as "stale"). However, if the kernel does not receive a confirmation, have it return the entry to the "stale" state instead of the "failed" state. In other words, an entry marked with the "extern_valid" flag behaves like any other dynamically learned entry, except that the kernel cannot remove or invalidate it. One can argue that the "extern_valid" flag should not prevent garbage collection and that instead a neighbor entry should be programmed with both the "extern_valid" and "extern_learn" flags. There are two reasons for not doing that: 1. It is unclear why a control plane would want to program an entry that the kernel cannot invalidate but can completely remove. 2. The "extern_learn" flag is used by FRR for neighbor entries learned on remote VTEPs (for ARP/NS suppression) whereas here we are concerned with local entries. This distinction is currently irrelevant for the kernel, but might be relevant in the future. Given that the flag only makes sense when the neighbor has a valid state, reject attempts to add a neighbor with an invalid state and with this flag set.
For example: # ip neigh add 192.0.2.1 nud none dev br0.10 extern_valid Error: Cannot create externally validated neighbor with an invalid state. # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid # ip neigh replace 192.0.2.1 nud failed dev br0.10 extern_valid Error: Cannot mark neighbor as externally validated with an invalid state. The above means that a neighbor cannot be created with the "extern_valid" flag and flags such as "use" or "managed" as they result in a neighbor being created with an invalid state ("none") and immediately getting probed: # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use Error: Cannot create externally validated neighbor with an invalid state. However, these flags can be used together with "extern_valid" after the neighbor was created with a valid state: # ip neigh add 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use One consequence of preventing the kernel from invalidating a neighbor entry is that by default it will only try to determine reachability using unicast probes. 
This can be changed using the "mcast_resolicit" sysctl: # sysctl net.ipv4.neigh.br0/10.mcast_resolicit 0 # tcpdump -nn -e -i br0.10 -Q out arp & # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 # sysctl -wq net.ipv4.neigh.br0/10.mcast_resolicit=3 # ip neigh replace 192.0.2.1 lladdr 00:11:22:33:44:55 nud stale dev br0.10 extern_valid use 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > 00:11:22:33:44:55, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 62:50:1d:11:93:6f > ff:ff:ff:ff:ff:ff, ethertype ARP (0x0806), length 42: Request who-has 192.0.2.1 tell 192.0.2.2, length 28 iproute2 patches can be found here [2]. 
[1] https://datatracker.ietf.org/doc/html/draft-rbickhart-evpn-ip-mac-proxy-adv-03 [2] https://github.com/idosch/iproute2/tree/submit/extern_valid_v1 Signed-off-by: Ido Schimmel Acked-by: Daniel Borkmann Link: https://patch.msgid.link/20250626073111.244534-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/rt-neigh.yaml | 1 + include/net/neighbour.h | 4 +- include/uapi/linux/neighbour.h | 5 ++ net/core/neighbour.c | 79 +++++++++++++++++++++++++++---- 4 files changed, 78 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/Documentation/netlink/specs/rt-neigh.yaml b/Documentation/netlink/specs/rt-neigh.yaml index 25cc2d528d2f..30a9ee16f128 100644 --- a/Documentation/netlink/specs/rt-neigh.yaml +++ b/Documentation/netlink/specs/rt-neigh.yaml @@ -79,6 +79,7 @@ definitions: entries: - managed - locked + - ext-validated - name: rtm-type type: enum diff --git a/include/net/neighbour.h b/include/net/neighbour.h index c7ce5ec7be23..7e865b14749d 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -261,13 +261,15 @@ static inline void *neighbour_priv(const struct neighbour *n) #define NEIGH_UPDATE_F_EXT_LEARNED BIT(5) #define NEIGH_UPDATE_F_ISROUTER BIT(6) #define NEIGH_UPDATE_F_ADMIN BIT(7) +#define NEIGH_UPDATE_F_EXT_VALIDATED BIT(8) /* In-kernel representation for NDA_FLAGS_EXT flags: */ #define NTF_OLD_MASK 0xff #define NTF_EXT_SHIFT 8 -#define NTF_EXT_MASK (NTF_EXT_MANAGED) +#define NTF_EXT_MASK (NTF_EXT_MANAGED | NTF_EXT_EXT_VALIDATED) #define NTF_MANAGED (NTF_EXT_MANAGED << NTF_EXT_SHIFT) +#define NTF_EXT_VALIDATED (NTF_EXT_EXT_VALIDATED << NTF_EXT_SHIFT) extern const struct nla_policy nda_policy[]; diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index b851c36ad25d..c34a81245f87 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -54,6 +54,7 @@ enum { /* Extended flags under NDA_FLAGS_EXT: */ #define NTF_EXT_MANAGED (1 << 0) #define 
NTF_EXT_LOCKED (1 << 1) +#define NTF_EXT_EXT_VALIDATED (1 << 2) /* * Neighbor Cache Entry States. @@ -92,6 +93,10 @@ enum { * bridge in response to a host trying to communicate via a locked bridge port * with MAB enabled. Their purpose is to notify user space that a host requires * authentication. + * + * NTF_EXT_EXT_VALIDATED flagged neighbor entries were externally validated by + * a user space control plane. The kernel will not remove or invalidate them, + * but it can probe them and notify user space when they become reachable. */ struct nda_cacheinfo { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8ad9898f8e42..e5f0992ac364 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -154,11 +154,12 @@ static void neigh_update_gc_list(struct neighbour *n) if (n->dead) goto out; - /* remove from the gc list if new state is permanent or if neighbor - * is externally learned; otherwise entry should be on the gc list + /* remove from the gc list if new state is permanent or if neighbor is + * externally learned / validated; otherwise entry should be on the gc + * list */ exempt_from_gc = n->nud_state & NUD_PERMANENT || - n->flags & NTF_EXT_LEARNED; + n->flags & (NTF_EXT_LEARNED | NTF_EXT_VALIDATED); on_gc_list = !list_empty(&n->gc_list); if (exempt_from_gc && on_gc_list) { @@ -205,6 +206,7 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, ndm_flags = (flags & NEIGH_UPDATE_F_EXT_LEARNED) ? NTF_EXT_LEARNED : 0; ndm_flags |= (flags & NEIGH_UPDATE_F_MANAGED) ? NTF_MANAGED : 0; + ndm_flags |= (flags & NEIGH_UPDATE_F_EXT_VALIDATED) ? 
NTF_EXT_VALIDATED : 0; if ((old_flags ^ ndm_flags) & NTF_EXT_LEARNED) { if (ndm_flags & NTF_EXT_LEARNED) @@ -222,6 +224,14 @@ static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, *notify = 1; *managed_update = true; } + if ((old_flags ^ ndm_flags) & NTF_EXT_VALIDATED) { + if (ndm_flags & NTF_EXT_VALIDATED) + neigh->flags |= NTF_EXT_VALIDATED; + else + neigh->flags &= ~NTF_EXT_VALIDATED; + *notify = 1; + *gc_update = true; + } } bool neigh_remove_one(struct neighbour *n) @@ -379,7 +389,9 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev, dev_head = neigh_get_dev_table(dev, tbl->family); hlist_for_each_entry_safe(n, tmp, dev_head, dev_list) { - if (skip_perm && n->nud_state & NUD_PERMANENT) + if (skip_perm && + (n->nud_state & NUD_PERMANENT || + n->flags & NTF_EXT_VALIDATED)) continue; hlist_del_rcu(&n->hash); @@ -942,7 +954,8 @@ static void neigh_periodic_work(struct work_struct *work) state = n->nud_state; if ((state & (NUD_PERMANENT | NUD_IN_TIMER)) || - (n->flags & NTF_EXT_LEARNED)) { + (n->flags & + (NTF_EXT_LEARNED | NTF_EXT_VALIDATED))) { write_unlock(&n->lock); continue; } @@ -1095,9 +1108,15 @@ static void neigh_timer_handler(struct timer_list *t) if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { - WRITE_ONCE(neigh->nud_state, NUD_FAILED); + if (neigh->nud_state == NUD_PROBE && + neigh->flags & NTF_EXT_VALIDATED) { + WRITE_ONCE(neigh->nud_state, NUD_STALE); + neigh->updated = jiffies; + } else { + WRITE_ONCE(neigh->nud_state, NUD_FAILED); + neigh_invalidate(neigh); + } notify = 1; - neigh_invalidate(neigh); goto out; } @@ -1245,6 +1264,8 @@ static void neigh_update_hhs(struct neighbour *neigh) NTF_ROUTER flag. NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as a router. + NEIGH_UPDATE_F_EXT_VALIDATED means that the entry will not be removed + or invalidated. Caller MUST hold reference count on the entry. 
*/ @@ -1979,7 +2000,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - if (ndm_flags & NTF_MANAGED) { + if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } @@ -2010,7 +2031,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (neigh == NULL) { bool ndm_permanent = ndm->ndm_state & NUD_PERMANENT; bool exempt_from_gc = ndm_permanent || - ndm_flags & NTF_EXT_LEARNED; + ndm_flags & (NTF_EXT_LEARNED | + NTF_EXT_VALIDATED); if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; @@ -2021,10 +2043,27 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, err = -EINVAL; goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED will result in the neighbor + * being created with an invalid state (NUD_NONE). + */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = NUD_NONE; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot create externally validated neighbor with an invalid state"); + err = -EINVAL; + goto out; + } + } neigh = ___neigh_create(tbl, dst, dev, ndm_flags & - (NTF_EXT_LEARNED | NTF_MANAGED), + (NTF_EXT_LEARNED | NTF_MANAGED | + NTF_EXT_VALIDATED), exempt_from_gc, true); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); @@ -2036,6 +2075,24 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, neigh_release(neigh); goto out; } + if (ndm_flags & NTF_EXT_VALIDATED) { + u8 state = ndm->ndm_state; + + /* NTF_USE and NTF_MANAGED do not update the existing + * state other than clearing it if it was + * NUD_PERMANENT. 
+ */ + if (ndm_flags & (NTF_USE | NTF_MANAGED)) + state = READ_ONCE(neigh->nud_state) & ~NUD_PERMANENT; + + if (!(state & NUD_VALID)) { + NL_SET_ERR_MSG(extack, + "Cannot mark neighbor as externally validated with an invalid state"); + err = -EINVAL; + neigh_release(neigh); + goto out; + } + } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) flags &= ~(NEIGH_UPDATE_F_OVERRIDE | @@ -2052,6 +2109,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, flags |= NEIGH_UPDATE_F_MANAGED; if (ndm_flags & NTF_USE) flags |= NEIGH_UPDATE_F_USE; + if (ndm_flags & NTF_EXT_VALIDATED) + flags |= NEIGH_UPDATE_F_EXT_VALIDATED; err = __neigh_update(neigh, lladdr, ndm->ndm_state, flags, NETLINK_CB(skb).portid, extack); -- cgit v1.2.3 From fbe346ce9d626680a4dd0f079e17c7b5dd32ffad Mon Sep 17 00:00:00 2001 From: Haiyang Zhang Date: Fri, 27 Jun 2025 13:26:23 -0700 Subject: net: mana: Handle Reset Request from MANA NIC Upon receiving the Reset Request, pause the connection and clean up queues, wait for the specified period, then resume the NIC. In the cleanup phase, the HWC is no longer responding, so set hwc_timeout to zero to skip waiting on the response. 
Signed-off-by: Haiyang Zhang Link: https://patch.msgid.link/1751055983-29760-1-git-send-email-haiyangz@linux.microsoft.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 127 ++++++++++++++++++----- drivers/net/ethernet/microsoft/mana/hw_channel.c | 4 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 37 +++++-- include/net/mana/gdma.h | 10 ++ 4 files changed, 143 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 55dd7dee718c..a468cd8e5f36 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -10,6 +10,7 @@ #include #include +#include struct dentry *mana_debugfs_root; @@ -68,6 +69,24 @@ static void mana_gd_init_registers(struct pci_dev *pdev) mana_gd_init_vf_regs(pdev); } +/* Suppress logging when we set timeout to zero */ +bool mana_need_log(struct gdma_context *gc, int err) +{ + struct hw_channel_context *hwc; + + if (err != -ETIMEDOUT) + return true; + + if (!gc) + return true; + + hwc = gc->hwc.driver_data; + if (hwc && hwc->hwc_timeout == 0) + return false; + + return true; +} + static int mana_gd_query_max_resources(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -278,8 +297,9 @@ static int mana_gd_disable_queue(struct gdma_queue *queue) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err, - resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to disable queue: %d, 0x%x\n", err, + resp.hdr.status); return err ? 
err : -EPROTO; } @@ -366,25 +386,12 @@ EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA"); #define MANA_SERVICE_PERIOD 10 -struct mana_serv_work { - struct work_struct serv_work; - struct pci_dev *pdev; -}; - -static void mana_serv_func(struct work_struct *w) +static void mana_serv_fpga(struct pci_dev *pdev) { - struct mana_serv_work *mns_wk; struct pci_bus *bus, *parent; - struct pci_dev *pdev; - - mns_wk = container_of(w, struct mana_serv_work, serv_work); - pdev = mns_wk->pdev; pci_lock_rescan_remove(); - if (!pdev) - goto out; - bus = pdev->bus; if (!bus) { dev_err(&pdev->dev, "MANA service: no bus\n"); @@ -405,7 +412,74 @@ static void mana_serv_func(struct work_struct *w) out: pci_unlock_rescan_remove(); +} + +static void mana_serv_reset(struct pci_dev *pdev) +{ + struct gdma_context *gc = pci_get_drvdata(pdev); + struct hw_channel_context *hwc; + + if (!gc) { + dev_err(&pdev->dev, "MANA service: no GC\n"); + return; + } + + hwc = gc->hwc.driver_data; + if (!hwc) { + dev_err(&pdev->dev, "MANA service: no HWC\n"); + goto out; + } + + /* HWC is not responding in this case, so don't wait */ + hwc->hwc_timeout = 0; + + dev_info(&pdev->dev, "MANA reset cycle start\n"); + mana_gd_suspend(pdev, PMSG_SUSPEND); + + msleep(MANA_SERVICE_PERIOD * 1000); + + mana_gd_resume(pdev); + + dev_info(&pdev->dev, "MANA reset cycle completed\n"); + +out: + gc->in_service = false; +} + +struct mana_serv_work { + struct work_struct serv_work; + struct pci_dev *pdev; + enum gdma_eqe_type type; +}; + +static void mana_serv_func(struct work_struct *w) +{ + struct mana_serv_work *mns_wk; + struct pci_dev *pdev; + + mns_wk = container_of(w, struct mana_serv_work, serv_work); + pdev = mns_wk->pdev; + + if (!pdev) + goto out; + + switch (mns_wk->type) { + case GDMA_EQE_HWC_FPGA_RECONFIG: + mana_serv_fpga(pdev); + break; + + case GDMA_EQE_HWC_RESET_REQUEST: + mana_serv_reset(pdev); + break; + + default: + dev_err(&pdev->dev, "MANA service: unknown type %d\n", + mns_wk->type); + break; + } + +out: 
pci_dev_put(pdev); kfree(mns_wk); module_put(THIS_MODULE); @@ -462,6 +536,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) break; case GDMA_EQE_HWC_FPGA_RECONFIG: + case GDMA_EQE_HWC_RESET_REQUEST: dev_info(gc->dev, "Recv MANA service type:%d\n", type); if (gc->in_service) { @@ -483,6 +558,7 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) dev_info(gc->dev, "Start MANA service type:%d\n", type); gc->in_service = true; mns_wk->pdev = to_pci_dev(gc->dev); + mns_wk->type = type; pci_dev_get(mns_wk->pdev); INIT_WORK(&mns_wk->serv_work, mana_serv_func); schedule_work(&mns_wk->serv_work); @@ -634,7 +710,8 @@ int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err) { - dev_err(dev, "test_eq failed: %d\n", err); + if (mana_need_log(gc, err)) + dev_err(dev, "test_eq failed: %d\n", err); goto out; } @@ -669,7 +746,7 @@ static void mana_gd_destroy_eq(struct gdma_context *gc, bool flush_evenets, if (flush_evenets) { err = mana_gd_test_eq(gc, queue); - if (err) + if (err && mana_need_log(gc, err)) dev_warn(gc->dev, "Failed to flush EQ: %d\n", err); } @@ -815,8 +892,9 @@ int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n", - err, resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n", + err, resp.hdr.status); return -EPROTO; } @@ -1116,8 +1194,9 @@ int mana_gd_deregister_device(struct gdma_dev *gd) err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); if (err || resp.hdr.status) { - dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n", - err, resp.hdr.status); + if (mana_need_log(gc, err)) + dev_err(gc->dev, "Failed to deregister device: %d, 0x%x\n", + err, resp.hdr.status); if (!err) err = -EPROTO; } @@ 
-1915,7 +1994,7 @@ static void mana_gd_remove(struct pci_dev *pdev) } /* The 'state' parameter is not used. */ -static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) { struct gdma_context *gc = pci_get_drvdata(pdev); @@ -1931,7 +2010,7 @@ static int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state) * fail -- if this happens, it's safer to just report an error than try to undo * what has been done. */ -static int mana_gd_resume(struct pci_dev *pdev) +int mana_gd_resume(struct pci_dev *pdev) { struct gdma_context *gc = pci_get_drvdata(pdev); int err; diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 650d22654d49..ef072e24c46d 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -880,7 +880,9 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, if (!wait_for_completion_timeout(&ctx->comp_event, (msecs_to_jiffies(hwc->hwc_timeout)))) { - dev_err(hwc->dev, "HWC: Request timed out!\n"); + if (hwc->hwc_timeout != 0) + dev_err(hwc->dev, "HWC: Request timed out!\n"); + err = -ETIMEDOUT; goto out; } diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index 016fd808ccad..a7973651ae51 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -47,6 +47,15 @@ static const struct file_operations mana_dbg_q_fops = { .read = mana_dbg_q_read, }; +static bool mana_en_need_log(struct mana_port_context *apc, int err) +{ + if (apc && apc->ac && apc->ac->gdma_dev && + apc->ac->gdma_dev->gdma_context) + return mana_need_log(apc->ac->gdma_dev->gdma_context, err); + else + return true; +} + /* Microsoft Azure Network Adapter (MANA) functions */ static int mana_open(struct net_device *ndev) @@ -854,7 +863,8 @@ static int 
mana_send_request(struct mana_context *ac, void *in_buf, if (err == -EOPNOTSUPP) return err; - if (req->req.msg_type != MANA_QUERY_PHY_STAT) + if (req->req.msg_type != MANA_QUERY_PHY_STAT && + mana_need_log(gc, err)) dev_err(dev, "Failed to send mana message: %d, 0x%x\n", err, resp->status); return err ? err : -EPROTO; @@ -931,8 +941,10 @@ static void mana_pf_deregister_hw_vport(struct mana_port_context *apc) err = mana_send_request(apc->ac, &req, sizeof(req), &resp, sizeof(resp)); if (err) { - netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n", - err); + if (mana_en_need_log(apc, err)) + netdev_err(apc->ndev, "Failed to unregister hw vPort: %d\n", + err); + return; } @@ -987,8 +999,10 @@ static void mana_pf_deregister_filter(struct mana_port_context *apc) err = mana_send_request(apc->ac, &req, sizeof(req), &resp, sizeof(resp)); if (err) { - netdev_err(apc->ndev, "Failed to unregister filter: %d\n", - err); + if (mana_en_need_log(apc, err)) + netdev_err(apc->ndev, "Failed to unregister filter: %d\n", + err); + return; } @@ -1218,7 +1232,9 @@ static int mana_cfg_vport_steering(struct mana_port_context *apc, err = mana_send_request(apc->ac, req, req_buf_size, &resp, sizeof(resp)); if (err) { - netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + if (mana_en_need_log(apc, err)) + netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; } @@ -1402,7 +1418,9 @@ void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, err = mana_send_request(apc->ac, &req, sizeof(req), &resp, sizeof(resp)); if (err) { - netdev_err(ndev, "Failed to destroy WQ object: %d\n", err); + if (mana_en_need_log(apc, err)) + netdev_err(ndev, "Failed to destroy WQ object: %d\n", err); + return; } @@ -3067,11 +3085,10 @@ static int mana_dealloc_queues(struct net_device *ndev) apc->rss_state = TRI_STATE_FALSE; err = mana_config_rss(apc, TRI_STATE_FALSE, false, false); - if (err) { + if (err && mana_en_need_log(apc, err)) netdev_err(ndev, "Failed to 
disable vPort: %d\n", err); - return err; - } + /* Even in err case, still need to cleanup the vPort */ mana_destroy_vport(apc); return 0; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 92ab85061df0..57df78cfbf82 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -62,6 +62,7 @@ enum gdma_eqe_type { GDMA_EQE_HWC_FPGA_RECONFIG = 132, GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, GDMA_EQE_HWC_SOC_SERVICE = 134, + GDMA_EQE_HWC_RESET_REQUEST = 135, GDMA_EQE_RNIC_QP_FATAL = 176, }; @@ -584,6 +585,9 @@ enum { /* Driver supports dynamic MSI-X vector allocation */ #define GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT BIT(13) +/* Driver can self reset on EQE notification */ +#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14) + /* Driver can self reset on FPGA Reconfig EQE notification */ #define GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE BIT(17) @@ -594,6 +598,7 @@ enum { GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \ GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \ GDMA_DRV_CAP_FLAG_1_DYNAMIC_IRQ_ALLOC_SUPPORT | \ + GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE | \ GDMA_DRV_CAP_FLAG_1_HANDLE_RECONFIG_EQE) #define GDMA_DRV_CAP_FLAGS2 0 @@ -921,4 +926,9 @@ void mana_unregister_debugfs(void); int mana_rdma_service_event(struct gdma_context *gc, enum gdma_service_type event); +int mana_gd_suspend(struct pci_dev *pdev, pm_message_t state); +int mana_gd_resume(struct pci_dev *pdev); + +bool mana_need_log(struct gdma_context *gc, int err); + #endif /* _GDMA_H */ -- cgit v1.2.3 From 3715b5df09b92168a4492b48bb7ea70d89f9d8f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 09:35:37 +0000 Subject: net: add struct net_aligned_data This structure will hold networking data that must consume a full cache line to avoid accidental false sharing. 
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250630093540.3052835-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/aligned_data.h | 16 ++++++++++++++++ net/core/hotdata.c | 3 +++ 2 files changed, 19 insertions(+) create mode 100644 include/net/aligned_data.h (limited to 'include/net') diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h new file mode 100644 index 000000000000..cf3329d7c227 --- /dev/null +++ b/include/net/aligned_data.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _NET_ALIGNED_DATA_H +#define _NET_ALIGNED_DATA_H + +#include + +/* Structure holding cacheline aligned fields on SMP builds. + * Each field or group should have an ____cacheline_aligned_in_smp + * attribute to ensure no accidental false sharing can happen. + */ +struct net_aligned_data { +}; + +extern struct net_aligned_data net_aligned_data; + +#endif /* _NET_ALIGNED_DATA_H */ diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 0bc893d5f07b..e9c03491ab00 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include @@ -22,3 +23,5 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); + +struct net_aligned_data net_aligned_data; -- cgit v1.2.3 From 998642e999d23324c5dbf38149606d09cec2c377 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 09:35:38 +0000 Subject: net: move net_cookie into net_aligned_data Using per-cpu data for net->net_cookie generation is overkill, because even busy hosts do not create hundreds of netns per second. Make sure to put net_cookie in a private cache line to avoid potential false sharing. 
Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250630093540.3052835-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/aligned_data.h | 2 ++ net/core/net_namespace.c | 8 ++------ 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h index cf3329d7c227..5c7badf71f04 100644 --- a/include/net/aligned_data.h +++ b/include/net/aligned_data.h @@ -2,6 +2,7 @@ #ifndef _NET_ALIGNED_DATA_H #define _NET_ALIGNED_DATA_H +#include #include /* Structure holding cacheline aligned fields on SMP builds. @@ -9,6 +10,7 @@ * attribute to ensure no accidental false sharing can happen. */ struct net_aligned_data { + atomic64_t net_cookie ____cacheline_aligned_in_smp; }; extern struct net_aligned_data net_aligned_data; diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 419604d9cf32..f58ef920a3a1 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -19,9 +19,9 @@ #include #include #include -#include #include +#include #include #include #include @@ -64,8 +64,6 @@ DECLARE_RWSEM(pernet_ops_rwsem); static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; -DEFINE_COOKIE(net_cookie); - static struct net_generic *net_alloc_generic(void) { unsigned int gen_ptrs = READ_ONCE(max_gen_ptrs); @@ -434,9 +432,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - preempt_disable(); - net->net_cookie = gen_cookie_next(&net_cookie); - preempt_enable(); + net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); -- cgit v1.2.3 From 83081337419cb692eca4ee475d936b1fdcfd49f6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 09:35:39 +0000 Subject: tcp: move tcp_memory_allocated into net_aligned_data ____cacheline_aligned_in_smp attribute only makes sure to align a field to a cache 
line. It does not prevent the linker from using the remainder of the cache line for other variables, causing potential false sharing. Move tcp_memory_allocated into a dedicated cache line. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250630093540.3052835-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/aligned_data.h | 3 +++ include/net/tcp.h | 1 - net/core/hotdata.c | 2 ++ net/ipv4/tcp.c | 2 -- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv6/tcp_ipv6.c | 3 ++- net/mptcp/protocol.c | 3 ++- 7 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h index 5c7badf71f04..bedb4f86b0fe 100644 --- a/include/net/aligned_data.h +++ b/include/net/aligned_data.h @@ -11,6 +11,9 @@ */ struct net_aligned_data { atomic64_t net_cookie ____cacheline_aligned_in_smp; +#if defined(CONFIG_INET) + atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; +#endif }; extern struct net_aligned_data net_aligned_data; diff --git a/include/net/tcp.h b/include/net/tcp.h index 761c4a0ad386..bc08de49805c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -267,7 +267,6 @@ extern long sysctl_tcp_mem[3]; #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ -extern atomic_long_t tcp_memory_allocated; DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); extern struct percpu_counter tcp_sockets_allocated; diff --git a/net/core/hotdata.c b/net/core/hotdata.c index e9c03491ab00..95d0a4df1006 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -4,6 +4,7 @@ #include #include #include +#include #include struct net_hotdata net_hotdata __cacheline_aligned = { @@ -25,3 +26,4 @@ struct net_hotdata net_hotdata __cacheline_aligned = { EXPORT_SYMBOL(net_hotdata); struct net_aligned_data net_aligned_data; +EXPORT_IPV6_MOD(net_aligned_data); diff --git a/net/ipv4/tcp.c
b/net/ipv4/tcp.c index 8a3c99246d2e..925b2c572ca2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -302,8 +302,6 @@ EXPORT_PER_CPU_SYMBOL_GPL(tcp_tw_isn); long sysctl_tcp_mem[3] __read_mostly; EXPORT_IPV6_MOD(sysctl_tcp_mem); -atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. */ -EXPORT_IPV6_MOD(tcp_memory_allocated); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 56223338bc0f..b406fd012b2e 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -59,6 +59,7 @@ #include #include +#include #include #include #include @@ -3390,7 +3391,7 @@ struct proto tcp_prot = { .sockets_allocated = &tcp_sockets_allocated, .orphan_count = &tcp_orphan_count, - .memory_allocated = &tcp_memory_allocated, + .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9fb614e17bde..ed0b891885d8 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -41,6 +41,7 @@ #include #include +#include #include #include #include @@ -2356,7 +2357,7 @@ struct proto tcpv6_prot = { .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .memory_allocated = &tcp_memory_allocated, + .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e7972e633236..5f904fc5ac4c 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -3729,7 +3730,7 @@ static struct proto mptcp_prot = { .stream_memory_free = mptcp_stream_memory_free, .sockets_allocated = &mptcp_sockets_allocated, - .memory_allocated = 
&tcp_memory_allocated, + .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, -- cgit v1.2.3 From e3d4825124bce0d1f72187fabcf972b7c0b6cb9b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 09:35:40 +0000 Subject: udp: move udp_memory_allocated into net_aligned_data ____cacheline_aligned_in_smp attribute only makes sure to align a field to a cache line. It does not prevent the linker to use the remaining of the cache line for other variables, causing potential false sharing. Signed-off-by: Eric Dumazet Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250630093540.3052835-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/aligned_data.h | 1 + include/net/udp.h | 1 - net/ipv4/udp.c | 4 +--- net/ipv4/udp_impl.h | 1 + net/ipv4/udplite.c | 2 +- net/ipv6/udp.c | 2 +- net/ipv6/udp_impl.h | 1 + net/ipv6/udplite.c | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/aligned_data.h b/include/net/aligned_data.h index bedb4f86b0fe..e1a1c8aedc79 100644 --- a/include/net/aligned_data.h +++ b/include/net/aligned_data.h @@ -13,6 +13,7 @@ struct net_aligned_data { atomic64_t net_cookie ____cacheline_aligned_in_smp; #if defined(CONFIG_INET) atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; + atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; #endif }; diff --git a/include/net/udp.h b/include/net/udp.h index a772510b2aa5..f8ae2c4ade14 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -205,7 +205,6 @@ static inline void udp_hash4_dec(struct udp_hslot *hslot2) extern struct proto udp_prot; -extern atomic_long_t udp_memory_allocated; DECLARE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); /* sysctl variables for udp */ diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 19573ee64a0f..49f43c54cfb0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -127,8 
+127,6 @@ struct udp_table udp_table __read_mostly; long sysctl_udp_mem[3] __read_mostly; EXPORT_IPV6_MOD(sysctl_udp_mem); -atomic_long_t udp_memory_allocated ____cacheline_aligned_in_smp; -EXPORT_IPV6_MOD(udp_memory_allocated); DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); @@ -3235,7 +3233,7 @@ struct proto udp_prot = { #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = udp_bpf_update_proto, #endif - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv4/udp_impl.h b/net/ipv4/udp_impl.h index e1ff3a375996..c7142213fc21 100644 --- a/net/ipv4/udp_impl.h +++ b/net/ipv4/udp_impl.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _UDP4_IMPL_H #define _UDP4_IMPL_H +#include #include #include #include diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index af37af3ab727..d3e621a11a1a 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -60,7 +60,7 @@ struct proto udplite_prot = { .rehash = udp_v4_rehash, .get_port = udp_v4_get_port, - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ebb95d8bc681..6bbdadbd5fec 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1925,7 +1925,7 @@ struct proto udpv6_prot = { .psock_update_sk_prot = udp_bpf_update_proto, #endif - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 0590f566379d..8a406be25a3a 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -1,6 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ #ifndef 
_UDP6_IMPL_H #define _UDP6_IMPL_H +#include #include #include #include diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index a60bec9b14f1..2cec542437f7 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -59,7 +59,7 @@ struct proto udplitev6_prot = { .rehash = udp_v6_rehash, .get_port = udp_v6_get_port, - .memory_allocated = &udp_memory_allocated, + .memory_allocated = &net_aligned_data.udp_memory_allocated, .per_cpu_fw_alloc = &udp_memory_per_cpu_fw_alloc, .sysctl_mem = sysctl_udp_mem, -- cgit v1.2.3 From 8a402bbe54760dea67f1b2980c727761b47994d7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:25 +0000 Subject: net: dst: annotate data-races around dst->obsolete (dst_entry)->obsolete is read locklessly, add corresponding annotations. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 2 +- net/core/dst.c | 2 +- net/core/dst_cache.c | 2 +- net/core/neighbour.c | 3 ++- net/core/sock.c | 4 ++-- net/ipv4/datagram.c | 2 +- net/ipv4/route.c | 15 ++++++++------- net/ipv6/datagram.c | 2 +- net/ipv6/route.c | 9 ++++----- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- net/sctp/transport.c | 2 +- net/xfrm/xfrm_policy.c | 4 ++-- 12 files changed, 25 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 78c78cdce0e9..76c30c3b22dd 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -476,7 +476,7 @@ INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); static inline struct dst_entry *dst_check(struct dst_entry *dst, u32 cookie) { - if (dst->obsolete) + if (READ_ONCE(dst->obsolete)) dst = INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie); return dst; diff --git a/net/core/dst.c b/net/core/dst.c index 795ca07e28a4..8f2a3138d60c 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -145,7 +145,7 
@@ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; - dst->obsolete = DST_OBSOLETE_DEAD; + WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); dst->input = dst_discard; diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c index 93a04d18e505..9ab4902324e1 100644 --- a/net/core/dst_cache.c +++ b/net/core/dst_cache.c @@ -52,7 +52,7 @@ static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache, if (unlikely(!time_after(idst->refresh_ts, READ_ONCE(dst_cache->reset_ts)) || - (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) { + (READ_ONCE(dst->obsolete) && !dst->ops->check(dst, idst->cookie)))) { dst_cache_per_cpu_dst_set(idst, NULL, 0); dst_release(dst); goto fail; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e5f0992ac364..d1de7f292eea 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1428,7 +1428,8 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, * we can reinject the packet there. 
*/ n2 = NULL; - if (dst && dst->obsolete != DST_OBSOLETE_DEAD) { + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { n2 = dst_neigh_lookup_skb(dst, skb); if (n2) n1 = n2; diff --git a/net/core/sock.c b/net/core/sock.c index 3a71d6c4ccf0..dc59fb7760a3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -602,7 +602,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_tx_queue_clear(sk); @@ -620,7 +620,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) { struct dst_entry *dst = sk_dst_get(sk); - if (dst && dst->obsolete && + if (dst && READ_ONCE(dst->obsolete) && INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, dst, cookie) == NULL) { sk_dst_reset(sk); diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 4b5bc6eb52e7..c2b2cda1a7e5 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -109,7 +109,7 @@ void ip4_datagram_release_cb(struct sock *sk) rcu_read_lock(); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || dst->ops->check(dst, 0)) { + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, 0)) { rcu_read_unlock(); return; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a2b7cadf66af..d32af8c16727 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -717,7 +717,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, */ rt = rcu_dereference(nhc->nhc_rth_input); if (rt) - rt->dst.obsolete = DST_OBSOLETE_KILL; + WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); for_each_possible_cpu(i) { struct rtable __rcu **prt; @@ -725,7 +725,7 @@ static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr, prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i); rt = rcu_dereference(*prt); if (rt) - rt->dst.obsolete = DST_OBSOLETE_KILL; + 
WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); } } @@ -797,7 +797,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow jiffies + ip_rt_gc_timeout); } if (kill_route) - rt->dst.obsolete = DST_OBSOLETE_KILL; + WRITE_ONCE(rt->dst.obsolete, DST_OBSOLETE_KILL); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); } neigh_release(n); @@ -842,7 +842,7 @@ static void ipv4_negative_advice(struct sock *sk, { struct rtable *rt = dst_rtable(dst); - if ((dst->obsolete > 0) || + if ((READ_ONCE(dst->obsolete) > 0) || (rt->rt_flags & RTCF_REDIRECTED) || rt->dst.expires) sk_dst_reset(sk); @@ -1136,7 +1136,7 @@ void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu) __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0); rt = dst_rtable(odst); - if (odst->obsolete && !odst->ops->check(odst, 0)) { + if (READ_ONCE(odst->obsolete) && !odst->ops->check(odst, 0)) { rt = ip_route_output_flow(sock_net(sk), &fl4, sk); if (IS_ERR(rt)) goto out; @@ -1211,7 +1211,8 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry *ipv4_dst_check(struct dst_entry *dst, * this is indicated by setting obsolete to DST_OBSOLETE_KILL or * DST_OBSOLETE_DEAD. 
*/ - if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt)) + if (READ_ONCE(dst->obsolete) != DST_OBSOLETE_FORCE_CHK || + rt_is_expired(rt)) return NULL; return dst; } @@ -1571,7 +1572,7 @@ void rt_flush_dev(struct net_device *dev) static bool rt_cache_valid(const struct rtable *rt) { return rt && - rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && !rt_is_expired(rt); } diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 281722817a65..972bf0426d59 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -127,7 +127,7 @@ void ip6_datagram_release_cb(struct sock *sk) rcu_read_lock(); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) { rcu_read_unlock(); return; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 46a4f9d1900f..ace2071f77bd 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -406,7 +406,7 @@ static bool rt6_check_expired(const struct rt6_info *rt) if (time_after(jiffies, rt->dst.expires)) return true; } else if (from) { - return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK || + return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK || fib6_check_expired(from); } return false; @@ -2777,11 +2777,10 @@ static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie) { if (!__rt6_check_expired(rt) && - rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK && + READ_ONCE(rt->dst.obsolete) == DST_OBSOLETE_FORCE_CHK && fib6_check(from, cookie)) return &rt->dst; - else - return NULL; + return NULL; } INDIRECT_CALLABLE_SCOPE struct dst_entry *ip6_dst_check(struct dst_entry *dst, @@ -3014,7 +3013,7 @@ void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu) sk_uid(sk)); dst = __sk_dst_get(sk); - if (!dst || !dst->obsolete || + if (!dst || !READ_ONCE(dst->obsolete) || dst->ops->check(dst, inet6_sk(sk)->dst_cookie)) return; diff --git 
a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 014f07740369..95af252b2939 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -97,7 +97,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest) if (!dest_dst) return NULL; dst = dest_dst->dst_cache; - if (dst->obsolete && + if (READ_ONCE(dst->obsolete) && dst->ops->check(dst, dest_dst->dst_cookie) == NULL) return NULL; return dest_dst; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 6946c1462793..4d258a6e8033 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -240,7 +240,7 @@ void sctp_transport_set_owner(struct sctp_transport *transport, void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk) { /* If we don't have a fresh route, look one up */ - if (!transport->dst || transport->dst->obsolete) { + if (!transport->dst || READ_ONCE(transport->dst->obsolete)) { sctp_transport_dst_release(transport); transport->af_specific->get_dst(transport, &transport->saddr, &transport->fl, sk); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 094d2454602e..c5035a9bc3bb 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3925,7 +3925,7 @@ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. 
*/ - if (dst->obsolete < 0 && !stale_bundle(dst)) + if (READ_ONCE(dst->obsolete) < 0 && !stale_bundle(dst)) return dst; return NULL; @@ -3953,7 +3953,7 @@ static void xfrm_link_failure(struct sk_buff *skb) static void xfrm_negative_advice(struct sock *sk, struct dst_entry *dst) { - if (dst->obsolete) + if (READ_ONCE(dst->obsolete)) sk_dst_reset(sk); } -- cgit v1.2.3 From 36229b2caca2228b834c03fb83867022485a0563 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:26 +0000 Subject: net: dst: annotate data-races around dst->expires (dst_entry)->expires is read and written locklessly, add corresponding annotations. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 8 +++++--- include/net/ip.h | 2 +- net/ipv4/route.c | 7 ++++--- net/ipv6/route.c | 13 ++++++------- 4 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 76c30c3b22dd..1efe1e5d51a9 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -431,13 +431,15 @@ static inline void dst_link_failure(struct sk_buff *skb) static inline void dst_set_expires(struct dst_entry *dst, int timeout) { - unsigned long expires = jiffies + timeout; + unsigned long old, expires = jiffies + timeout; if (expires == 0) expires = 1; - if (dst->expires == 0 || time_before(expires, dst->expires)) - dst->expires = expires; + old = READ_ONCE(dst->expires); + + if (!old || time_before(expires, old)) + WRITE_ONCE(dst->expires, expires); } static inline unsigned int dst_dev_overhead(struct dst_entry *dst, diff --git a/include/net/ip.h b/include/net/ip.h index 375304bb99f6..391af454422e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -477,7 +477,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, ip_mtu_locked(dst) || !forwarding) { mtu = rt->rt_pmtu; - if (mtu && 
time_before(jiffies, rt->dst.expires)) + if (mtu && time_before(jiffies, READ_ONCE(rt->dst.expires))) goto out; } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d32af8c16727..d7a534a5f1ff 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -844,7 +844,7 @@ static void ipv4_negative_advice(struct sock *sk, if ((READ_ONCE(dst->obsolete) > 0) || (rt->rt_flags & RTCF_REDIRECTED) || - rt->dst.expires) + READ_ONCE(rt->dst.expires)) sk_dst_reset(sk); } @@ -1033,7 +1033,8 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) } if (rt->rt_pmtu == mtu && !lock && - time_before(jiffies, dst->expires - net->ipv4.ip_rt_mtu_expires / 2)) + time_before(jiffies, READ_ONCE(dst->expires) - + net->ipv4.ip_rt_mtu_expires / 2)) goto out; if (fib_lookup(net, fl4, &res, 0) == 0) { @@ -3010,7 +3011,7 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, } } - expires = rt->dst.expires; + expires = READ_ONCE(rt->dst.expires); if (expires) { unsigned long now = jiffies; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ace2071f77bd..1014dcea1200 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -391,9 +391,8 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev) static bool __rt6_check_expired(const struct rt6_info *rt) { if (rt->rt6i_flags & RTF_EXPIRES) - return time_after(jiffies, rt->dst.expires); - else - return false; + return time_after(jiffies, READ_ONCE(rt->dst.expires)); + return false; } static bool rt6_check_expired(const struct rt6_info *rt) @@ -403,7 +402,7 @@ static bool rt6_check_expired(const struct rt6_info *rt) from = rcu_dereference(rt->from); if (rt->rt6i_flags & RTF_EXPIRES) { - if (time_after(jiffies, rt->dst.expires)) + if (time_after(jiffies, READ_ONCE(rt->dst.expires))) return true; } else if (from) { return READ_ONCE(rt->dst.obsolete) != DST_OBSOLETE_FORCE_CHK || @@ -2139,7 +2138,7 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, 
rt6_remove_exception(bucket, rt6_ex); return; } - } else if (time_after(jiffies, rt->dst.expires)) { + } else if (time_after(jiffies, READ_ONCE(rt->dst.expires))) { pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; @@ -2870,7 +2869,7 @@ static void rt6_update_expires(struct rt6_info *rt0, int timeout) rcu_read_lock(); from = rcu_dereference(rt0->from); if (from) - rt0->dst.expires = from->expires; + WRITE_ONCE(rt0->dst.expires, from->expires); rcu_read_unlock(); } @@ -5903,7 +5902,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, } if (rt6_flags & RTF_EXPIRES) { - expires = dst ? dst->expires : rt->expires; + expires = dst ? READ_ONCE(dst->expires) : rt->expires; expires -= jiffies; } -- cgit v1.2.3 From 8f2b2282d04a5d5bcbec22f91572bb6803cfc771 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:27 +0000 Subject: net: dst: annotate data-races around dst->lastuse (dst_entry)->lastuse is read and written locklessly, add corresponding annotations. 
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 4 ++-- net/core/rtnetlink.c | 4 +++- net/ipv6/route.c | 3 ++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index 1efe1e5d51a9..bef2f41c7220 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -240,9 +240,9 @@ static inline void dst_hold(struct dst_entry *dst) static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) { - if (unlikely(time != dst->lastuse)) { + if (unlikely(time != READ_ONCE(dst->lastuse))) { dst->__use++; - dst->lastuse = time; + WRITE_ONCE(dst->lastuse, time); } } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c57692eb8da9..a9555bfc372f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1026,9 +1026,11 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, .rta_error = error, .rta_id = id, }; + unsigned long delta; if (dst) { - ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + delta = jiffies - READ_ONCE(dst->lastuse); + ci.rta_lastuse = jiffies_delta_to_clock_t(delta); ci.rta_used = dst->__use; ci.rta_clntref = rcuref_read(&dst->__rcuref); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1014dcea1200..375112a59492 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2133,7 +2133,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, * expired, independently from their aging, as per RFC 8201 section 4 */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { - if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { + if (time_after_eq(now, READ_ONCE(rt->dst.lastuse) + + gc_args->timeout)) { pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; -- cgit v1.2.3 From f1c5fd34891a1c242885f48c2e4dc52df180f311 Mon Sep 17 00:00:00 2001 From: 
Eric Dumazet Date: Mon, 30 Jun 2025 12:19:28 +0000 Subject: net: dst: annotate data-races around dst->input dst_dev_put() can overwrite dst->input while other cpus might read this field (for instance from dst_input()) Add READ_ONCE()/WRITE_ONCE() annotations to suppress potential issues. We will likely need full RCU protection later. Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 2 +- include/net/lwtunnel.h | 4 ++-- net/core/dst.c | 2 +- net/ipv4/route.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index bef2f41c7220..c0f8b6d8e707 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -468,7 +468,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_local_deliver(struct sk_buff *)); /* Input packet from network to transport. 
*/ static inline int dst_input(struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->input, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->input), ip6_input, ip_local_deliver, skb); } diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index c306ebe379a0..eaac07d50595 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -142,8 +142,8 @@ static inline void lwtunnel_set_redirect(struct dst_entry *dst) dst->output = lwtunnel_output; } if (lwtunnel_input_redirect(dst->lwtstate)) { - dst->lwtstate->orig_input = dst->input; - dst->input = lwtunnel_input; + dst->lwtstate->orig_input = READ_ONCE(dst->input); + WRITE_ONCE(dst->input, lwtunnel_input); } } #else diff --git a/net/core/dst.c b/net/core/dst.c index 8f2a3138d60c..13c629dc7123 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -148,7 +148,7 @@ void dst_dev_put(struct dst_entry *dst) WRITE_ONCE(dst->obsolete, DST_OBSOLETE_DEAD); if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); - dst->input = dst_discard; + WRITE_ONCE(dst->input, dst_discard); dst->output = dst_discard_out; dst->dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, diff --git a/net/ipv4/route.c b/net/ipv4/route.c index d7a534a5f1ff..75a1f9eabd6b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1687,7 +1687,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) else if (rt->rt_gw_family == AF_INET6) new_rt->rt_gw6 = rt->rt_gw6; - new_rt->dst.input = rt->dst.input; + new_rt->dst.input = READ_ONCE(rt->dst.input); new_rt->dst.output = rt->dst.output; new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; -- cgit v1.2.3 From 2dce8c52a98995c4719def6f88629ab1581c0b82 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:29 +0000 Subject: net: dst: annotate data-races around dst->output dst_dev_put() can overwrite dst->output while other cpus might read this field (for instance from dst_output()) Add READ_ONCE()/WRITE_ONCE() 
annotations to suppress potential issues. We will likely need RCU protection in the future. Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 2 +- include/net/lwtunnel.h | 4 ++-- net/core/dst.c | 2 +- net/ipv4/route.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index c0f8b6d8e707..b6acfde7d587 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -458,7 +458,7 @@ INDIRECT_CALLABLE_DECLARE(int ip_output(struct net *, struct sock *, /* Output packet to network from transport. */ static inline int dst_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - return INDIRECT_CALL_INET(skb_dst(skb)->output, + return INDIRECT_CALL_INET(READ_ONCE(skb_dst(skb)->output), ip6_output, ip_output, net, sk, skb); } diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index eaac07d50595..26232f603e33 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -138,8 +138,8 @@ int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, static inline void lwtunnel_set_redirect(struct dst_entry *dst) { if (lwtunnel_output_redirect(dst->lwtstate)) { - dst->lwtstate->orig_output = dst->output; - dst->output = lwtunnel_output; + dst->lwtstate->orig_output = READ_ONCE(dst->output); + WRITE_ONCE(dst->output, lwtunnel_output); } if (lwtunnel_input_redirect(dst->lwtstate)) { dst->lwtstate->orig_input = READ_ONCE(dst->input); diff --git a/net/core/dst.c b/net/core/dst.c index 13c629dc7123..52e824e57c17 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -149,7 +149,7 @@ void dst_dev_put(struct dst_entry *dst) if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); WRITE_ONCE(dst->input, dst_discard); - dst->output = dst_discard_out; + WRITE_ONCE(dst->output, 
dst_discard_out); dst->dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 75a1f9eabd6b..ce6aba4f01ff 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1688,7 +1688,7 @@ struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt) new_rt->rt_gw6 = rt->rt_gw6; new_rt->dst.input = READ_ONCE(rt->dst.input); - new_rt->dst.output = rt->dst.output; + new_rt->dst.output = READ_ONCE(rt->dst.output); new_rt->dst.error = rt->dst.error; new_rt->dst.lastuse = jiffies; new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate); -- cgit v1.2.3 From 88fe14253e181878c2ddb51a298ae8c468a63010 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:30 +0000 Subject: net: dst: add four helpers to annotate data-races around dst->dev dst->dev is read locklessly in many contexts, and written in dst_dev_put(). Fixing all the races is going to need many changes. We probably will have to add full RCU protection. Add four helpers to ease this painful process.
static inline struct net_device *dst_dev(const struct dst_entry *dst) { return READ_ONCE(dst->dev); } static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) { return dst_dev(skb_dst(skb)); } static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) { return dev_net(skb_dst_dev(skb)); } static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) { return dev_net_rcu(skb_dst_dev(skb)); } Fixes: 4a6ce2b6f2ec ("net: introduce a new function dst_dev_put()") Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dst.h | 20 ++++++++++++++++++++ net/core/dst.c | 4 ++-- net/core/sock.c | 8 ++++---- 3 files changed, 26 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dst.h b/include/net/dst.h index b6acfde7d587..00467c1b5093 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -563,6 +563,26 @@ static inline void skb_dst_update_pmtu_no_confirm(struct sk_buff *skb, u32 mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } +static inline struct net_device *dst_dev(const struct dst_entry *dst) +{ + return READ_ONCE(dst->dev); +} + +static inline struct net_device *skb_dst_dev(const struct sk_buff *skb) +{ + return dst_dev(skb_dst(skb)); +} + +static inline struct net *skb_dst_dev_net(const struct sk_buff *skb) +{ + return dev_net(skb_dst_dev(skb)); +} + +static inline struct net *skb_dst_dev_net_rcu(const struct sk_buff *skb) +{ + return dev_net_rcu(skb_dst_dev(skb)); +} + struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie); void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh); diff --git a/net/core/dst.c b/net/core/dst.c index 52e824e57c17..e2de8b68c41d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, 
dev); WRITE_ONCE(dst->input, dst_discard); WRITE_ONCE(dst->output, dst_discard_out); - dst->dev = blackhole_netdev; + WRITE_ONCE(dst->dev, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } @@ -263,7 +263,7 @@ unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); - return mtu ? : dst->dev->mtu; + return mtu ? : dst_dev(dst)->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); diff --git a/net/core/sock.c b/net/core/sock.c index dc59fb7760a3..8b7623c7d547 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2588,8 +2588,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) : - READ_ONCE(dst->dev->gso_ipv4_max_size); + max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : + READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2600,7 +2600,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { u32 max_segs = 1; - sk->sk_route_caps = dst->dev->features; + sk->sk_route_caps = dst_dev(dst)->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2618,7 +2618,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; -- cgit v1.2.3 From a74fc62eec155ca5a6da8ff3856f3dc87fe24558 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:31 +0000 Subject: ipv4: adopt dst_dev, skb_dst_dev and skb_dst_dev_net[_rcu] 
Use the new helpers as a first step to deal with potential dst->dev races. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- include/net/ip.h | 11 ++++++----- include/net/route.h | 2 +- net/ipv4/icmp.c | 24 +++++++++++++----------- net/ipv4/igmp.c | 2 +- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_output.c | 6 +++--- net/ipv4/ip_vti.c | 4 ++-- net/ipv4/netfilter.c | 4 ++-- net/ipv4/route.c | 8 ++++---- net/ipv4/tcp_fastopen.c | 4 +++- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_metrics.c | 8 ++++---- net/ipv4/xfrm4_output.c | 2 +- 14 files changed, 43 insertions(+), 38 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ae09e91398a5..19dbd9081d5a 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -481,7 +481,7 @@ static inline struct sock *__inet_lookup_skb(struct inet_hashinfo *hashinfo, const int sdif, bool *refcounted) { - struct net *net = dev_net_rcu(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const struct iphdr *iph = ip_hdr(skb); struct sock *sk; diff --git a/include/net/ip.h b/include/net/ip.h index 391af454422e..befcba575129 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -472,7 +472,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); if (READ_ONCE(net->ipv4.sysctl_ip_fwd_use_pmtu) || ip_mtu_locked(dst) || !forwarding) { @@ -486,7 +486,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, if (mtu) goto out; - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dst_dev(dst)->mtu); if (unlikely(ip_mtu_locked(dst))) { if (rt->rt_uses_gateway && mtu > 576) @@ -506,16 +506,17 @@ out: static inline unsigned int ip_skb_dst_mtu(struct sock *sk, 
const struct sk_buff *skb) { + const struct dst_entry *dst = skb_dst(skb); unsigned int mtu; if (!sk || !sk_fullsock(sk) || ip_sk_use_pmtu(sk)) { bool forwarding = IPCB(skb)->flags & IPSKB_FORWARDED; - return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); + return ip_dst_mtu_maybe_forward(dst, forwarding); } - mtu = min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); - return mtu - lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); + mtu = min(READ_ONCE(dst_dev(dst)->mtu), IP_MAX_MTU); + return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, int fc_mx_len, diff --git a/include/net/route.h b/include/net/route.h index 3d3d6048ffca..7ea840daa775 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -390,7 +390,7 @@ static inline int ip4_dst_hoplimit(const struct dst_entry *dst) const struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); hoplimit = READ_ONCE(net->ipv4.sysctl_ip_default_ttl); rcu_read_unlock(); } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 717cb7d3607a..2ffe73ea644f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -311,18 +311,20 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; + struct net_device *dev; bool rc = true; if (!apply_ratelimit) return true; /* No rate limit on loopback */ - if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) + dev = dst_dev(dst); + if (dev && (dev->flags & IFF_LOOPBACK)) goto out; rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, - l3mdev_master_ifindex_rcu(dst->dev)); + l3mdev_master_ifindex_rcu(dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); rcu_read_unlock(); @@ -466,13 +468,13 @@ out_bh_enable: */ static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) { - struct net_device *route_lookup_dev = NULL; + struct net_device *dev = skb->dev; + const 
struct dst_entry *dst; - if (skb->dev) - route_lookup_dev = skb->dev; - else if (skb_dst(skb)) - route_lookup_dev = skb_dst(skb)->dev; - return route_lookup_dev; + if (dev) + return dev; + dst = skb_dst(skb); + return dst ? dst_dev(dst) : NULL; } static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, @@ -869,7 +871,7 @@ static enum skb_drop_reason icmp_unreach(struct sk_buff *skb) struct net *net; u32 info = 0; - net = dev_net_rcu(skb_dst(skb)->dev); + net = skb_dst_dev_net_rcu(skb); /* * Incomplete header ? @@ -1012,7 +1014,7 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) struct icmp_bxm icmp_param; struct net *net; - net = dev_net_rcu(skb_dst(skb)->dev); + net = skb_dst_dev_net_rcu(skb); /* should there be an ICMP stat for ignored echos? */ if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; @@ -1182,7 +1184,7 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) return SKB_NOT_DROPPED_YET; out_err: - __ICMP_INC_STATS(dev_net_rcu(skb_dst(skb)->dev), ICMP_MIB_INERRORS); + __ICMP_INC_STATS(skb_dst_dev_net_rcu(skb), ICMP_MIB_INERRORS); return SKB_DROP_REASON_PKT_TOO_SMALL; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d1769034b643..7182f1419c2a 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -427,7 +427,7 @@ static int igmpv3_sendpack(struct sk_buff *skb) pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen); - return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb); + return ip_local_out(skb_dst_dev_net(skb), skb->sk, skb); } static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 64b3fb3208af..b2584cce90ae 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -476,7 +476,7 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { - struct net_device *dev = skb->dev ? 
: skb_dst(skb)->dev; + struct net_device *dev = skb->dev ? : skb_dst_dev(skb); int vif = l3mdev_master_ifindex_rcu(dev); struct ipq *qp; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index a2705d454fd6..414b47a0d513 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -116,7 +116,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } @@ -199,7 +199,7 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s { struct dst_entry *dst = skb_dst(skb); struct rtable *rt = dst_rtable(dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); struct neighbour *neigh; bool is_v6gw = false; @@ -425,7 +425,7 @@ int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb) int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; + struct net_device *dev = skb_dst_dev(skb), *indev = skb->dev; skb->dev = dev; skb->protocol = htons(ETH_P_IP); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 686e4f3d83aa..95b6bb78fcd2 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -229,7 +229,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, goto tx_error_icmp; } - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { dst_release(dst); @@ -259,7 +259,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev, xmit: skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(dev))); skb_dst_set(skb, dst); - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); err = dst_output(tunnel->net, skb->sk, skb); if (net_xmit_eval(err) == 0) diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 08bc3f2c0078..0565f001120d 100644 
--- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -20,12 +20,12 @@ /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int addr_type) { + struct net_device *dev = skb_dst_dev(skb); const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct flowi4 fl4 = {}; __be32 saddr = iph->saddr; __u8 flags; - struct net_device *dev = skb_dst(skb)->dev; struct flow_keys flkeys; unsigned int hh_len; @@ -74,7 +74,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un #endif /* Change in oif may mean change in hh_len. */ - hh_len = skb_dst(skb)->dev->hard_header_len; + hh_len = skb_dst_dev(skb)->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ce6aba4f01ff..64ba377cd6cc 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -413,7 +413,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct neighbour *n; rcu_read_lock(); @@ -440,7 +440,7 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); const __be32 *pkey = daddr; if (rt->rt_gw_family == AF_INET) { @@ -1026,7 +1026,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1326,7 +1326,7 @@ 
static unsigned int ipv4_default_advmss(const struct dst_entry *dst) struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 5107121c5e37..f1884f0c9e52 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -559,6 +559,7 @@ bool tcp_fastopen_active_should_disable(struct sock *sk) void tcp_fastopen_active_disable_ofo_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + struct net_device *dev; struct dst_entry *dst; struct sk_buff *skb; @@ -576,7 +577,8 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { dst = sk_dst_get(sk); - if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) + dev = dst ? dst_dev(dst) : NULL; + if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); dst_release(dst); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b406fd012b2e..a847d894ace3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -788,7 +788,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, arg.iov[0].iov_base = (unsigned char *)&rep; arg.iov[0].iov_len = sizeof(rep.th); - net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); + net = sk ? 
sock_net(sk) : skb_dst_dev_net_rcu(skb); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), &md5_hash_location, &aoh)) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 4251670e328c..03c068ea27b6 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -166,11 +166,11 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, unsigned int hash) { struct tcp_metrics_block *tm; - struct net *net; bool reclaim = false; + struct net *net; spin_lock_bh(&tcp_metrics_lock); - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net_rcu(dst->dev); + net = dev_net_rcu(dst_dev(dst)); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 3cff51ba72bb..0ae67d537499 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -31,7 +31,7 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, skb->dev, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst_dev(skb), __xfrm4_output, !(IPCB(skb)->flags & IPSKB_REROUTED)); } -- cgit v1.2.3 From 1caf27297215a5241f9bfc9c07336349d9034ee3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:32 +0000 Subject: ipv6: adopt dst_dev() helper Use the new helper as a step to deal with 
potential dst->dev races. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/ip6_route.h | 4 ++-- net/ipv6/exthdrs.c | 2 +- net/ipv6/icmp.c | 4 +++- net/ipv6/ila/ila_lwt.c | 2 +- net/ipv6/ioam6_iptunnel.c | 4 ++-- net/ipv6/ip6_gre.c | 8 +++++--- net/ipv6/ip6_output.c | 19 ++++++++++--------- net/ipv6/ip6_tunnel.c | 4 ++-- net/ipv6/ip6_udp_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/ndisc.c | 6 ++++-- net/ipv6/netfilter/nf_dup_ipv6.c | 2 +- net/ipv6/output_core.c | 2 +- net/ipv6/route.c | 20 ++++++++++++-------- net/ipv6/rpl_iptunnel.c | 4 ++-- net/ipv6/seg6_iptunnel.c | 20 +++++++++++--------- net/ipv6/seg6_local.c | 2 +- 17 files changed, 60 insertions(+), 47 deletions(-) (limited to 'include/net') diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 6dbdf60b342f..9255f21818ee 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -274,7 +274,7 @@ static inline unsigned int ip6_skb_dst_mtu(const struct sk_buff *skb) unsigned int mtu; if (np && READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE) { - mtu = READ_ONCE(dst->dev->mtu); + mtu = READ_ONCE(dst_dev(dst)->mtu); mtu -= lwtunnel_headroom(dst->lwtstate, mtu); } else { mtu = dst_mtu(dst); @@ -337,7 +337,7 @@ static inline unsigned int ip6_dst_mtu_maybe_forward(const struct dst_entry *dst mtu = IPV6_MIN_MTU; rcu_read_lock(); - idev = __in6_dev_get(dst->dev); + idev = __in6_dev_get(dst_dev(dst)); if (idev) mtu = READ_ONCE(idev->cnf.mtu6); rcu_read_unlock(); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 457de0745a33..1947530fb20a 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -306,7 +306,7 @@ static int ipv6_destopt_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, skb_transport_offset(skb) + 8) || !pskb_may_pull(skb, (skb_transport_offset(skb) + ((skb_transport_header(skb)[1] + 1) << 3)))) { - __IP6_INC_STATS(dev_net(dst->dev), 
idev, + __IP6_INC_STATS(dev_net(dst_dev(dst)), idev, IPSTATS_MIB_INHDRERRORS); fail_and_free: kfree_skb(skb); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 3fd19a84b358..44550957fd4e 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -196,6 +196,7 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); + struct net_device *dev; struct dst_entry *dst; bool res = false; @@ -208,10 +209,11 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); + dev = dst_dev(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); - } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { + } else if (dev && (dev->flags & IFF_LOOPBACK)) { res = true; } else { struct rt6_info *rt = dst_rt6_info(dst); diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index 7d574f5132e2..7bb9edc5c28c 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -70,7 +70,7 @@ static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) */ memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = orig_dst->dev->ifindex; + fl6.flowi6_oif = dst_dev(orig_dst)->ifindex; fl6.flowi6_iif = LOOPBACK_IFINDEX; fl6.daddr = *rt6_nexthop(dst_rt6_info(orig_dst), &ip6h->daddr); diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c index 40df8bdfaacd..1fe7894f14dd 100644 --- a/net/ipv6/ioam6_iptunnel.c +++ b/net/ipv6/ioam6_iptunnel.c @@ -335,7 +335,7 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb, if (has_tunsrc) memcpy(&hdr->saddr, tunsrc, sizeof(*tunsrc)); else - ipv6_dev_get_saddr(net, dst->dev, &hdr->daddr, + ipv6_dev_get_saddr(net, dst_dev(dst), &hdr->daddr, IPV6_PREFER_SRC_PUBLIC, &hdr->saddr); skb_postpush_rcsum(skb, hdr, len); @@ -442,7 +442,7 @@ do_encap: dst_cache_set_ip6(&ilwt->cache, dst, &fl6.saddr); local_bh_enable(); - 
err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 2dc9dcffe2ca..a1210fd6404e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1085,9 +1085,11 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, htonl(atomic_fetch_inc(&t->o_seqno))); /* TooBig packet may have updated dst->dev's mtu */ - if (!t->parms.collect_md && dst && dst_mtu(dst) > dst->dev->mtu) - dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu, false); - + if (!t->parms.collect_md && dst) { + mtu = READ_ONCE(dst_dev(dst)->mtu); + if (dst_mtu(dst) > mtu) + dst->ops->update_pmtu(dst, NULL, skb, mtu, false); + } err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, NEXTHDR_GRE); if (err != 0) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 7bd29a9ff0db..f494b4ece6b7 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,7 +60,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; @@ -271,7 +271,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); @@ -503,7 +503,8 @@ int ip6_forward(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct ipv6hdr *hdr = ipv6_hdr(skb); struct inet6_skb_parm *opt = IP6CB(skb); - struct net *net = dev_net(dst->dev); + struct net *net = 
dev_net(dst_dev(dst)); + struct net_device *dev; struct inet6_dev *idev; SKB_DR(reason); u32 mtu; @@ -591,12 +592,12 @@ int ip6_forward(struct sk_buff *skb) goto drop; } dst = skb_dst(skb); - + dev = dst_dev(dst); /* IPv6 specs say nothing about it, but it is clear that we cannot send redirects to source routed frames. We don't send redirects to frames decapsulated from IPsec. */ - if (IP6CB(skb)->iif == dst->dev->ifindex && + if (IP6CB(skb)->iif == dev->ifindex && opt->srcrt == 0 && !skb_sec_path(skb)) { struct in6_addr *target = NULL; struct inet_peer *peer; @@ -644,7 +645,7 @@ int ip6_forward(struct sk_buff *skb) if (ip6_pkt_too_big(skb, mtu)) { /* Again, force OUTPUT device used as source address */ - skb->dev = dst->dev; + skb->dev = dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); __IP6_INC_STATS(net, ip6_dst_idev(dst), @@ -653,7 +654,7 @@ int ip6_forward(struct sk_buff *skb) return -EMSGSIZE; } - if (skb_cow(skb, dst->dev->hard_header_len)) { + if (skb_cow(skb, dev->hard_header_len)) { __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS); goto drop; @@ -666,7 +667,7 @@ int ip6_forward(struct sk_buff *skb) hdr->hop_limit--; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, - net, NULL, skb, skb->dev, dst->dev, + net, NULL, skb, skb->dev, dev, ip6_forward_finish); error: @@ -1093,7 +1094,7 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, #ifdef CONFIG_IPV6_SUBTREES ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || #endif - (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { + (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); dst = NULL; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 436e077061d1..cd8a4141e5c0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1179,7 +1179,7 @@ route_lookup: ndst = dst; } - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); 
@@ -1255,7 +1255,7 @@ route_lookup: /* Calculate max headroom for all the headers and adjust * needed_headroom if necessary. */ - max_headroom = LL_RESERVED_SPACE(dst->dev) + sizeof(struct ipv6hdr) + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; if (max_headroom > READ_ONCE(dev->needed_headroom)) WRITE_ONCE(dev->needed_headroom, max_headroom); diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 8ebe17a6058a..0ff547a4bff7 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -168,7 +168,7 @@ struct dst_entry *udp_tunnel6_dst_lookup(struct sk_buff *skb, netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr); return ERR_PTR(-ENETUNREACH); } - if (dst->dev == dev) { /* is this necessary? */ + if (dst_dev(dst) == dev) { /* is this necessary? */ netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr); dst_release(dst); return ERR_PTR(-ELOOP); diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 40464a88bca6..2a86de922d42 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -497,7 +497,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) (const struct in6_addr *)&x->id.daddr)) goto tx_err_link_failure; - tdev = dst->dev; + tdev = dst_dev(dst); if (tdev == dev) { DEV_STATS_INC(dev, collisions); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index ecb5c4b8518f..f2299b61221b 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -473,6 +473,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, { struct icmp6hdr *icmp6h = icmp6_hdr(skb); struct dst_entry *dst = skb_dst(skb); + struct net_device *dev; struct inet6_dev *idev; struct net *net; struct sock *sk; @@ -507,11 +508,12 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - idev = __in6_dev_get(dst->dev); + dev = dst_dev(dst); + idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, 
IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, dst->dev, + net, sk, skb, NULL, dev, dst_output); if (!err) { ICMP6MSGOUT_INC_STATS(net, idev, type); diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index b903c62c00c9..6da3102b7c1b 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -38,7 +38,7 @@ static bool nf_dup_ipv6_route(struct net *net, struct sk_buff *skb, } skb_dst_drop(skb); skb_dst_set(skb, dst); - skb->dev = dst->dev; + skb->dev = dst_dev(dst); skb->protocol = htons(ETH_P_IPV6); return true; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 806d4b5dd1e6..90a178dd24aa 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -105,7 +105,7 @@ int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); if (hoplimit == 0) { - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); struct inet6_dev *idev; rcu_read_lock(); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 375112a59492..dacfe1284918 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -228,13 +228,13 @@ static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst, const struct rt6_info *rt = dst_rt6_info(dst); return ip6_neigh_lookup(rt6_nexthop(rt, &in6addr_any), - dst->dev, skb, daddr); + dst_dev(dst), skb, daddr); } static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct rt6_info *rt = dst_rt6_info(dst); - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); daddr = choose_neigh_daddr(rt6_nexthop(rt, &in6addr_any), NULL, daddr); if (!daddr) @@ -2943,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst->dev, + .dev = dst_dev(dst), .gw = &rt6->rt6i_gateway, }; @@ -3238,7 +3238,7 @@ 
EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - struct net_device *dev = dst->dev; + struct net_device *dev = dst_dev(dst); unsigned int mtu = dst_mtu(dst); struct net *net; @@ -4301,7 +4301,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst->dev, + .dev = dst_dev(dst), .gw = &rt->rt6i_gateway, }; @@ -4587,13 +4587,14 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg) static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes) { struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct inet6_dev *idev; SKB_DR(reason); int type; if (netif_is_l3_master(skb->dev) || - dst->dev == net->loopback_dev) + dev == net->loopback_dev) idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); else idev = ip6_dst_idev(dst); @@ -5844,11 +5845,14 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, * each as a nexthop within RTA_MULTIPATH. 
*/ if (rt6) { + struct net_device *dev; + if (rt6_flags & RTF_GATEWAY && nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway)) goto nla_put_failure; - if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex)) + dev = dst_dev(dst); + if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex)) goto nla_put_failure; if (lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c index 7c05ac846646..1f41f53fbaff 100644 --- a/net/ipv6/rpl_iptunnel.c +++ b/net/ipv6/rpl_iptunnel.c @@ -242,7 +242,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb) local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } @@ -297,7 +297,7 @@ static int rpl_input(struct sk_buff *skb) local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 51583461ae29..27918fc0c972 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -128,7 +128,8 @@ static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, int proto, struct dst_entry *cache_dst) { struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; struct ipv6_sr_hdr *isrh; int hdrlen, tot_len, err; @@ -181,7 +182,7 @@ static int __seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, isrh->nexthdr = proto; hdr->daddr = isrh->segments[isrh->first_segment]; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { @@ -212,7 +213,8 @@ static int seg6_do_srh_encap_red(struct sk_buff 
*skb, { __u8 first_seg = osrh->first_segment; struct dst_entry *dst = skb_dst(skb); - struct net *net = dev_net(dst->dev); + struct net_device *dev = dst_dev(dst); + struct net *net = dev_net(dev); struct ipv6hdr *hdr, *inner_hdr; int hdrlen = ipv6_optlen(osrh); int red_tlv_offset, tlv_offset; @@ -270,7 +272,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, if (skip_srh) { hdr->nexthdr = proto; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); goto out; } @@ -306,7 +308,7 @@ static int seg6_do_srh_encap_red(struct sk_buff *skb, srcaddr: isrh->nexthdr = proto; - set_tun_src(net, dst->dev, &hdr->daddr, &hdr->saddr); + set_tun_src(net, dev, &hdr->daddr, &hdr->saddr); #ifdef CONFIG_IPV6_SEG6_HMAC if (unlikely(!skip_srh && sr_has_hmac(isrh))) { @@ -507,7 +509,7 @@ static int seg6_input_core(struct net *net, struct sock *sk, local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } else { @@ -518,7 +520,7 @@ static int seg6_input_core(struct net *net, struct sock *sk, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, dev_net(skb->dev), NULL, skb, NULL, - skb_dst(skb)->dev, seg6_input_finish); + skb_dst_dev(skb), seg6_input_finish); return seg6_input_finish(dev_net(skb->dev), NULL, skb); drop: @@ -593,7 +595,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, local_bh_enable(); } - err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst_dev(dst))); if (unlikely(err)) goto drop; } @@ -603,7 +605,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, if (static_branch_unlikely(&nf_hooks_lwtunnel_enabled)) return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, - NULL, skb_dst(skb)->dev, dst_output); + NULL, dst_dev(dst), dst_output); return dst_output(net, sk, skb); 
drop: diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 4834d72624cf..2b41e4c0dddd 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -313,7 +313,7 @@ seg6_lookup_any_nexthop(struct sk_buff *skb, struct in6_addr *nhaddr, if (!local_delivery) dev_flags |= IFF_LOOPBACK; - if (dst && (dst->dev->flags & dev_flags) && !dst->error) { + if (dst && (dst_dev(dst)->flags & dev_flags) && !dst->error) { dst_release(dst); dst = NULL; } -- cgit v1.2.3 From 93d1cff35adc522a5d21e722eee1071f3f7dc716 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Jun 2025 12:19:33 +0000 Subject: ipv6: adopt skb_dst_dev() and skb_dst_dev_net[_rcu]() helpers Use the new helpers as a step to deal with potential dst->dev races. v2: fix typo in ipv6_rthdr_rcv() (kernel test robot ) Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250630121934.3399505-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet6_hashtables.h | 2 +- include/net/ip6_tunnel.h | 2 +- net/ipv6/exthdrs.c | 8 ++++---- net/ipv6/ioam6.c | 17 +++++++++-------- net/ipv6/ip6_input.c | 6 ++++-- net/ipv6/ip6_output.c | 5 +++-- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/ip6_vti.c | 2 +- net/ipv6/netfilter.c | 4 ++-- net/ipv6/netfilter/nf_reject_ipv6.c | 2 +- net/ipv6/output_core.c | 2 +- net/ipv6/reassembly.c | 10 +++++----- net/ipv6/route.c | 4 ++-- net/ipv6/seg6_iptunnel.c | 6 +++--- net/ipv6/tcp_ipv6.c | 4 ++-- net/ipv6/xfrm6_output.c | 2 +- 16 files changed, 41 insertions(+), 37 deletions(-) (limited to 'include/net') diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index c32878c69179..ab3929a2a956 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -150,7 +150,7 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, int iif, int sdif, bool *refcounted) { - struct net *net = dev_net_rcu(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net_rcu(skb); const 
struct ipv6hdr *ip6h = ipv6_hdr(skb); struct sock *sk; diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index dd163495f353..120db2865811 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -159,7 +159,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); IP6CB(skb)->flags = ip6cb_flags; pkt_len = skb->len - skb_inner_network_offset(skb); - err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); + err = ip6_local_out(skb_dst_dev_net(skb), sk, skb); if (dev) { if (unlikely(net_xmit_eval(err))) diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 1947530fb20a..d1ef9644f826 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -460,7 +460,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, @@ -621,7 +621,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, @@ -783,7 +783,7 @@ looped_back: kfree_skb(skb); return -1; } - if (!ipv6_chk_home_addr(dev_net(skb_dst(skb)->dev), addr)) { + if (!ipv6_chk_home_addr(skb_dst_dev_net(skb), addr)) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); kfree_skb(skb); return -1; @@ -809,7 +809,7 @@ looped_back: return -1; } - if (skb_dst(skb)->dev->flags&IFF_LOOPBACK) { + if (skb_dst_dev(skb)->flags & IFF_LOOPBACK) { if (ipv6_hdr(skb)->hop_limit <= 1) { __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, diff --git a/net/ipv6/ioam6.c b/net/ipv6/ioam6.c index a84d332f952f..9553a3200081 100644 --- a/net/ipv6/ioam6.c +++ b/net/ipv6/ioam6.c @@ -696,6 +696,7 @@ 
static void __ioam6_fill_trace_data(struct sk_buff *skb, struct ioam6_schema *sc, u8 sclen, bool is_input) { + struct net_device *dev = skb_dst_dev(skb); struct timespec64 ts; ktime_t tstamp; u64 raw64; @@ -712,7 +713,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (is_input) byte--; - raw32 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id; + raw32 = dev_net(dev)->ipv6.sysctl.ioam6_id; *(__be32 *)data = cpu_to_be32((byte << 24) | raw32); data += sizeof(__be32); @@ -728,10 +729,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + if (dev->flags & IFF_LOOPBACK) raw16 = IOAM6_U16_UNAVAILABLE; else - raw16 = (__force u16)READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id); + raw16 = (__force u16)READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id); *(__be16 *)data = cpu_to_be16(raw16); data += sizeof(__be16); @@ -783,10 +784,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, struct Qdisc *qdisc; __u32 qlen, backlog; - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) { + if (dev->flags & IFF_LOOPBACK) { *(__be32 *)data = cpu_to_be32(IOAM6_U32_UNAVAILABLE); } else { - queue = skb_get_tx_queue(skb_dst(skb)->dev, skb); + queue = skb_get_tx_queue(dev, skb); qdisc = rcu_dereference(queue->qdisc); qdisc_qstats_qlen_backlog(qdisc, &qlen, &backlog); @@ -807,7 +808,7 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, if (is_input) byte--; - raw64 = dev_net(skb_dst(skb)->dev)->ipv6.sysctl.ioam6_id_wide; + raw64 = dev_net(dev)->ipv6.sysctl.ioam6_id_wide; *(__be64 *)data = cpu_to_be64(((u64)byte << 56) | raw64); data += sizeof(__be64); @@ -823,10 +824,10 @@ static void __ioam6_fill_trace_data(struct sk_buff *skb, *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); - if (skb_dst(skb)->dev->flags & IFF_LOOPBACK) + if (dev->flags & IFF_LOOPBACK) raw32 = IOAM6_U32_UNAVAILABLE; else - raw32 = 
READ_ONCE(__in6_dev_get(skb_dst(skb)->dev)->cnf.ioam6_id_wide); + raw32 = READ_ONCE(__in6_dev_get(dev)->cnf.ioam6_id_wide); *(__be32 *)data = cpu_to_be32(raw32); data += sizeof(__be32); diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 39da6a7ce5f1..16953bd00960 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -187,7 +187,9 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, * arrived via the sending interface (ethX), because of the * nature of scoping architecture. --yoshfuji */ - IP6CB(skb)->iif = skb_valid_dst(skb) ? ip6_dst_idev(skb_dst(skb))->dev->ifindex : dev->ifindex; + IP6CB(skb)->iif = skb_valid_dst(skb) ? + ip6_dst_idev(skb_dst(skb))->dev->ifindex : + dev->ifindex; if (unlikely(!pskb_may_pull(skb, sizeof(*hdr)))) goto err; @@ -504,7 +506,7 @@ int ip6_mc_input(struct sk_buff *skb) struct net_device *dev; bool deliver; - __IP6_UPD_PO_STATS(dev_net(skb_dst(skb)->dev), + __IP6_UPD_PO_STATS(skb_dst_dev_net(skb), __in6_dev_get_safely(skb->dev), IPSTATS_MIB_INMCAST, skb->len); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f494b4ece6b7..877bee7ffee9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -232,8 +232,9 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; - struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); + struct dst_entry *dst = skb_dst(skb); + struct net_device *dev = dst_dev(dst), *indev = skb->dev; + struct inet6_dev *idev = ip6_dst_idev(dst); skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cd8a4141e5c0..3262e81223df 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -632,7 +632,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else { if (ip_route_input(skb2, eiph->daddr, eiph->saddr, 
ip4h_dscp(eiph), skb2->dev) || - skb_dst(skb2)->dev->type != ARPHRD_TUNNEL6) + skb_dst_dev(skb2)->type != ARPHRD_TUNNEL6) goto out; } diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 2a86de922d42..ad5290be4dd6 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -529,7 +529,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) xmit: skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev))); skb_dst_set(skb, dst); - skb->dev = skb_dst(skb)->dev; + skb->dev = dst_dev(dst); err = dst_output(t->net, skb->sk, skb); if (net_xmit_eval(err) == 0) diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 4541836ee3da..45f9105f9ac1 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -24,7 +24,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff { const struct ipv6hdr *iph = ipv6_hdr(skb); struct sock *sk = sk_to_full_sk(sk_partial); - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); struct flow_keys flkeys; unsigned int hh_len; struct dst_entry *dst; @@ -72,7 +72,7 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff #endif /* Change in oif may mean change in hh_len. 
*/ - hh_len = skb_dst(skb)->dev->hard_header_len; + hh_len = skb_dst_dev(skb)->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)), 0, GFP_ATOMIC)) diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 9ae2b2725bf9..838295fa32e3 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -300,7 +300,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, skb_dst_set(oldskb, dst); } - fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst(oldskb)->dev); + fl6.flowi6_oif = l3mdev_master_ifindex(skb_dst_dev(oldskb)); fl6.flowi6_mark = IP6_REPLY_MARK(net, oldskb->mark); security_skb_classify_flow(oldskb, flowi6_to_flowi_common(&fl6)); dst = ip6_route_output(net, NULL, &fl6); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 90a178dd24aa..d21fe27fe21e 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -141,7 +141,7 @@ int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) skb->protocol = htons(ETH_P_IPV6); return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, sk, skb, NULL, skb_dst(skb)->dev, + net, sk, skb, NULL, skb_dst_dev(skb), dst_output); } EXPORT_SYMBOL_GPL(__ip6_local_out); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 7d4bcf3fda5b..25ec8001898d 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -104,11 +104,11 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) return container_of(q, struct frag_queue, q); } -static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, +static int ip6_frag_queue(struct net *net, + struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, u32 *prob_offset, int *refs) { - struct net *net = dev_net(skb_dst(skb)->dev); int offset, end, fragsize; struct sk_buff *prev_tail; struct net_device *dev; @@ -324,10 +324,10 @@ out_fail: static int 
ipv6_frag_rcv(struct sk_buff *skb) { + const struct ipv6hdr *hdr = ipv6_hdr(skb); + struct net *net = skb_dst_dev_net(skb); struct frag_hdr *fhdr; struct frag_queue *fq; - const struct ipv6hdr *hdr = ipv6_hdr(skb); - struct net *net = dev_net(skb_dst(skb)->dev); u8 nexthdr; int iif; @@ -384,7 +384,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) spin_lock(&fq->q.lock); fq->iif = iif; - ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, + ret = ip6_frag_queue(net, fq, skb, fhdr, IP6CB(skb)->nhoff, &prob_offset, &refs); spin_unlock(&fq->q.lock); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dacfe1284918..3fbe0885c21c 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4631,7 +4631,7 @@ static int ip6_pkt_discard(struct sk_buff *skb) static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES); } @@ -4642,7 +4642,7 @@ static int ip6_pkt_prohibit(struct sk_buff *skb) static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - skb->dev = skb_dst(skb)->dev; + skb->dev = skb_dst_dev(skb); return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES); } diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 27918fc0c972..3e1b9991131a 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -364,7 +364,7 @@ static int __seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh, #ifdef CONFIG_IPV6_SEG6_HMAC if (sr_has_hmac(isrh)) { - struct net *net = dev_net(skb_dst(skb)->dev); + struct net *net = skb_dst_dev_net(skb); err = seg6_push_hmac(net, &hdr->saddr, isrh); if (unlikely(err)) @@ -530,7 +530,7 @@ drop: static int seg6_input_nf(struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); struct net *net = dev_net(skb->dev); switch (skb->protocol) { @@ -616,7 +616,7 
@@ drop: static int seg6_output_nf(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_device *dev = skb_dst(skb)->dev; + struct net_device *dev = skb_dst_dev(skb); switch (skb->protocol) { case htons(ETH_P_IP): diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ed0b891885d8..8f2c3cba1f1f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -868,7 +868,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { - struct net *net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); + struct net *net = sk ? sock_net(sk) : skb_dst_dev_net_rcu(skb); unsigned int tot_len = sizeof(struct tcphdr); struct sock *ctl_sk = net->ipv6.tcp_sk; const struct tcphdr *th = tcp_hdr(skb); @@ -1043,7 +1043,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, if (!sk && !ipv6_unicast_destination(skb)) return; - net = sk ? sock_net(sk) : dev_net_rcu(skb_dst(skb)->dev); + net = sk ? 
sock_net(sk) : skb_dst_dev_net_rcu(skb); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index b3d5d1f266ee..512bdaf13699 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -106,7 +106,7 @@ skip_frag: int xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, skb->dev, skb_dst(skb)->dev, + net, sk, skb, skb->dev, skb_dst_dev(skb), __xfrm6_output, !(IP6CB(skb)->flags & IP6SKB_REROUTED)); } -- cgit v1.2.3 From 42401c42389622424f2973ec57414f033ae6be8f Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 29 Jun 2025 17:21:31 +0300 Subject: netlink: introduce type-checking attribute iteration for nlmsg Add the nlmsg_for_each_attr_type() macro to simplify iteration over attributes of a specific type in a Netlink message. Convert existing users in vxlan and nfsd to use the new macro. 
Suggested-by: Jakub Kicinski Signed-off-by: Carolina Jubran Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250629142138.361537-2-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/vxlan/vxlan_vnifilter.c | 13 ++++--------- fs/nfsd/nfsctl.c | 36 ++++++++++++++---------------------- include/net/netlink.h | 14 ++++++++++++++ 3 files changed, 32 insertions(+), 31 deletions(-) (limited to 'include/net') diff --git a/drivers/net/vxlan/vxlan_vnifilter.c b/drivers/net/vxlan/vxlan_vnifilter.c index 4ff56d9f8f28..adc89e651e27 100644 --- a/drivers/net/vxlan/vxlan_vnifilter.c +++ b/drivers/net/vxlan/vxlan_vnifilter.c @@ -971,15 +971,10 @@ static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh, if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)) return -EOPNOTSUPP; - nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) { - switch (nla_type(attr)) { - case VXLAN_VNIFILTER_ENTRY: - err = vxlan_process_vni_filter(vxlan, attr, - nlh->nlmsg_type, extack); - break; - default: - continue; - } + nlmsg_for_each_attr_type(attr, VXLAN_VNIFILTER_ENTRY, nlh, + sizeof(*tmsg), rem) { + err = vxlan_process_vni_filter(vxlan, attr, nlh->nlmsg_type, + extack); vnis++; if (err) break; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6a42cc7a845a..657d44afc062 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1621,10 +1621,9 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; /* count number of SERVER_THREADS values */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { - if (nla_type(attr) == NFSD_A_SERVER_THREADS) - nrpools++; - } + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) + nrpools++; mutex_lock(&nfsd_mutex); @@ -1635,12 +1634,11 @@ int nfsd_nl_threads_set_doit(struct sk_buff *skb, struct genl_info *info) } i = 0; - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { - if (nla_type(attr) == NFSD_A_SERVER_THREADS) { - nthreads[i++] = 
nla_get_u32(attr); - if (i >= nrpools) - break; - } + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_THREADS, info->nlhdr, + GENL_HDRLEN, rem) { + nthreads[i++] = nla_get_u32(attr); + if (i >= nrpools) + break; } if (info->attrs[NFSD_A_SERVER_GRACETIME] || @@ -1781,14 +1779,12 @@ int nfsd_nl_version_set_doit(struct sk_buff *skb, struct genl_info *info) for (i = 0; i <= NFSD_SUPPORTED_MINOR_VERSION; i++) nfsd_minorversion(nn, i, NFSD_CLEAR); - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_PROTO_VERSION, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_VERSION_MAX + 1]; u32 major, minor = 0; bool enabled; - if (nla_type(attr) != NFSD_A_SERVER_PROTO_VERSION) - continue; - if (nla_parse_nested(tb, NFSD_A_VERSION_MAX, attr, nfsd_version_nl_policy, info->extack) < 0) continue; @@ -1939,14 +1935,12 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * Walk the list of server_socks from userland and move any that match * back to sv_permsocks */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; - if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) - continue; - if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, nfsd_sock_nl_policy, info->extack) < 0) continue; @@ -2001,15 +1995,13 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) svc_xprt_destroy_all(serv, net); /* walk list of addrs again, open any that still don't exist */ - nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { + nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, + GENL_HDRLEN, rem) { struct nlattr *tb[NFSD_A_SOCK_MAX + 1]; const char *xcl_name; struct sockaddr *sa; int ret; - if (nla_type(attr) != NFSD_A_SERVER_SOCK_ADDR) - continue; - if (nla_parse_nested(tb, NFSD_A_SOCK_MAX, attr, 
nfsd_sock_nl_policy, info->extack) < 0) continue; diff --git a/include/net/netlink.h b/include/net/netlink.h index 90a560dc167a..1a8356ca4b78 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -68,6 +68,8 @@ * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes + * nlmsg_for_each_attr_type() loop over all attributes with the + * given type * * Misc: * nlmsg_report() report back to application? @@ -966,6 +968,18 @@ static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \ nlmsg_attrlen(nlh, hdrlen), rem) +/** + * nlmsg_for_each_attr_type - iterate over a stream of attributes + * @pos: loop counter, set to the current attribute + * @type: required attribute type for @pos + * @nlh: netlink message header + * @hdrlen: length of the family specific header + * @rem: initialized to len, holds bytes currently remaining in stream + */ +#define nlmsg_for_each_attr_type(pos, type, nlh, hdrlen, rem) \ + nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ + if (nla_type(pos) == type) + /** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in -- cgit v1.2.3 From 566e8f108fc7847f2a8676ec6a101d37b7dd0fb4 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 29 Jun 2025 17:21:32 +0300 Subject: devlink: Extend devlink rate API with traffic classes bandwidth management Introduce support for specifying relative bandwidth shares between traffic classes (TC) in the devlink-rate API. This new option allows users to allocate bandwidth across multiple traffic classes in a single command. This feature provides a more granular control over traffic management, especially for scenarios requiring Enhanced Transmission Selection. Users can now define a relative bandwidth share for each traffic class. 
For example, assigning share values of 20 to TC0 (TCP/UDP) and 80 to TC5 (RoCE) will result in TC0 receiving 20% and TC5 receiving 80% of the total bandwidth. The actual percentage each class receives depends on the ratio of its share value to the sum of all shares. Example: DEV=pci/0000:08:00.0 $ devlink port function rate add $DEV/vfs_group tx_share 10Gbit \ tx_max 50Gbit tc-bw 0:20 1:0 2:0 3:0 4:0 5:80 6:0 7:0 $ devlink port function rate set $DEV/vfs_group \ tc-bw 0:20 1:0 2:0 3:0 4:0 5:20 6:60 7:0 Example usage with ynl: ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-set --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1, "rate-tc-bws": [ {"rate-tc-index": 0, "rate-tc-bw": 50}, {"rate-tc-index": 1, "rate-tc-bw": 50}, {"rate-tc-index": 2, "rate-tc-bw": 0}, {"rate-tc-index": 3, "rate-tc-bw": 0}, {"rate-tc-index": 4, "rate-tc-bw": 0}, {"rate-tc-index": 5, "rate-tc-bw": 0}, {"rate-tc-index": 6, "rate-tc-bw": 0}, {"rate-tc-index": 7, "rate-tc-bw": 0} ] }' ./tools/net/ynl/cli.py --spec Documentation/netlink/specs/devlink.yaml \ --do rate-get --json '{ "bus-name": "pci", "dev-name": "0000:08:00.0", "port-index": 1 }' output for rate-get: {'bus-name': 'pci', 'dev-name': '0000:08:00.0', 'port-index': 1, 'rate-tc-bws': [{'rate-tc-bw': 50, 'rate-tc-index': 0}, {'rate-tc-bw': 50, 'rate-tc-index': 1}, {'rate-tc-bw': 0, 'rate-tc-index': 2}, {'rate-tc-bw': 0, 'rate-tc-index': 3}, {'rate-tc-bw': 0, 'rate-tc-index': 4}, {'rate-tc-bw': 0, 'rate-tc-index': 5}, {'rate-tc-bw': 0, 'rate-tc-index': 6}, {'rate-tc-bw': 0, 'rate-tc-index': 7}], 'rate-tx-max': 0, 'rate-tx-priority': 0, 'rate-tx-share': 0, 'rate-tx-weight': 0, 'rate-type': 'leaf'} Signed-off-by: Carolina Jubran Reviewed-by: Cosmin Ratiu Reviewed-by: Jiri Pirko Signed-off-by: Tariq Toukan Signed-off-by: Mark Bloch Link: https://patch.msgid.link/20250629142138.361537-3-mbloch@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/netlink/specs/devlink.yaml | 32 
+++++- Documentation/networking/devlink/devlink-port.rst | 8 ++ include/net/devlink.h | 8 ++ include/uapi/linux/devlink.h | 9 ++ net/devlink/netlink_gen.c | 15 ++- net/devlink/netlink_gen.h | 1 + net/devlink/rate.c | 127 ++++++++++++++++++++++ 7 files changed, 195 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/Documentation/netlink/specs/devlink.yaml b/Documentation/netlink/specs/devlink.yaml index bfba466d694a..1c4bb0cbe5f0 100644 --- a/Documentation/netlink/specs/devlink.yaml +++ b/Documentation/netlink/specs/devlink.yaml @@ -224,6 +224,10 @@ definitions: value: 10 - name: binary + - + name: rate-tc-index-max + type: const + value: 7 attribute-sets: - @@ -844,7 +848,23 @@ attribute-sets: - name: region-direct type: flag - + - + name: rate-tc-bws + type: nest + multi-attr: true + nested-attributes: dl-rate-tc-bws + - + name: rate-tc-index + type: u8 + checks: + max: rate-tc-index-max + - + name: rate-tc-bw + type: u32 + doc: | + Specifies the bandwidth share assigned to the Traffic Class. + The bandwidth for the traffic class is determined + in proportion to the sum of the shares of all configured classes. 
- name: dl-dev-stats subset-of: devlink @@ -1249,6 +1269,14 @@ attribute-sets: - name: flash type: flag + - + name: dl-rate-tc-bws + subset-of: devlink + attributes: + - + name: rate-tc-index + - + name: rate-tc-bw operations: enum-model: directional @@ -2176,6 +2204,7 @@ operations: - rate-tx-priority - rate-tx-weight - rate-parent-node-name + - rate-tc-bws - name: rate-new @@ -2196,6 +2225,7 @@ operations: - rate-tx-priority - rate-tx-weight - rate-parent-node-name + - rate-tc-bws - name: rate-del diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 9d22d41a7cd1..5e397798a402 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -418,6 +418,14 @@ API allows to configure following rate object's parameters: to all node children limits. ``tx_max`` is an upper limit for children. ``tx_share`` is a total bandwidth distributed among children. +``tc_bw`` + Allow users to set the bandwidth allocation per traffic class on rate + objects. This enables fine-grained QoS configurations by assigning a relative + share value to each traffic class. The bandwidth is distributed in proportion + to the share value for each class, relative to the sum of all shares. + When applied to a non-leaf node, tc_bw determines how bandwidth is shared + among its child elements. + ``tx_priority`` and ``tx_weight`` can be used simultaneously. In that case nodes with the same priority form a WFQ subgroup in the sibling group and arbitration among them is based on assigned weights. 
diff --git a/include/net/devlink.h b/include/net/devlink.h index 63517646a497..d0ce5a7e984c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -118,6 +118,8 @@ struct devlink_rate { u32 tx_priority; u32 tx_weight; + + u32 tc_bw[DEVLINK_RATE_TCS_MAX]; }; struct devlink_port { @@ -1486,6 +1488,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_leaf_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, @@ -1494,6 +1499,9 @@ struct devlink_ops { u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_weight, struct netlink_ext_ack *extack); + int (*rate_node_tc_bw_set)(struct devlink_rate *devlink_rate, + void *priv, u32 *tc_bw, + struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a5ee0f13740a..e72bcc239afd 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -221,6 +221,11 @@ enum devlink_port_flavour { */ }; +/* IEEE 802.1Qaz standard supported values. 
*/ + +#define DEVLINK_RATE_TCS_MAX 8 +#define DEVLINK_RATE_TC_INDEX_MAX (DEVLINK_RATE_TCS_MAX - 1) + enum devlink_rate_type { DEVLINK_RATE_TYPE_LEAF, DEVLINK_RATE_TYPE_NODE, @@ -629,6 +634,10 @@ enum devlink_attr { DEVLINK_ATTR_REGION_DIRECT, /* flag */ + DEVLINK_ATTR_RATE_TC_BWS, /* nested */ + DEVLINK_ATTR_RATE_TC_INDEX, /* u8 */ + DEVLINK_ATTR_RATE_TC_BW, /* u32 */ + /* Add new attributes above here, update the spec in * Documentation/netlink/specs/devlink.yaml and re-generate * net/devlink/netlink_gen.c. diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index e340d955cf3b..c50436433c18 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -45,6 +45,11 @@ const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_ [DEVLINK_PORT_FN_ATTR_CAPS] = NLA_POLICY_BITFIELD32(15), }; +const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1] = { + [DEVLINK_ATTR_RATE_TC_INDEX] = NLA_POLICY_MAX(NLA_U8, DEVLINK_RATE_TC_INDEX_MAX), + [DEVLINK_ATTR_RATE_TC_BW] = { .type = NLA_U32, }, +}; + const struct nla_policy devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1] = { [DEVLINK_ATTR_SELFTEST_ID_FLASH] = { .type = NLA_FLAG, }, }; @@ -523,7 +528,7 @@ static const struct nla_policy devlink_rate_get_dump_nl_policy[DEVLINK_ATTR_DEV_ }; /* DEVLINK_CMD_RATE_SET - do */ -static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -532,10 +537,11 @@ static const struct nla_policy devlink_rate_set_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = 
NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_NEW - do */ -static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_WEIGHT + 1] = { +static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TC_BWS + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_RATE_NODE_NAME] = { .type = NLA_NUL_STRING, }, @@ -544,6 +550,7 @@ static const struct nla_policy devlink_rate_new_nl_policy[DEVLINK_ATTR_RATE_TX_W [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32, }, [DEVLINK_ATTR_RATE_PARENT_NODE_NAME] = { .type = NLA_NUL_STRING, }, + [DEVLINK_ATTR_RATE_TC_BWS] = NLA_POLICY_NESTED(devlink_dl_rate_tc_bws_nl_policy), }; /* DEVLINK_CMD_RATE_DEL - do */ @@ -1191,7 +1198,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_set_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { @@ -1201,7 +1208,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_rate_new_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_rate_new_nl_policy, - .maxattr = DEVLINK_ATTR_RATE_TX_WEIGHT, + .maxattr = DEVLINK_ATTR_RATE_TC_BWS, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 8f2bd50ddf5e..fb733b5d4ff1 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -13,6 +13,7 @@ /* Common nested types */ extern const struct nla_policy devlink_dl_port_function_nl_policy[DEVLINK_PORT_FN_ATTR_CAPS + 1]; +extern const struct nla_policy devlink_dl_rate_tc_bws_nl_policy[DEVLINK_ATTR_RATE_TC_BW + 1]; extern const struct nla_policy 
devlink_dl_selftest_id_nl_policy[DEVLINK_ATTR_SELFTEST_ID_FLASH + 1]; /* Ops table for devlink */ diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 8828ffaf6cbc..d39300a9b3d4 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -80,6 +80,29 @@ devlink_rate_get_from_info(struct devlink *devlink, struct genl_info *info) return ERR_PTR(-EINVAL); } +static int devlink_rate_put_tc_bws(struct sk_buff *msg, u32 *tc_bw) +{ + struct nlattr *nla_tc_bw; + int i; + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + nla_tc_bw = nla_nest_start(msg, DEVLINK_ATTR_RATE_TC_BWS); + if (!nla_tc_bw) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_RATE_TC_INDEX, i) || + nla_put_u32(msg, DEVLINK_ATTR_RATE_TC_BW, tc_bw[i])) + goto nla_put_failure; + + nla_nest_end(msg, nla_tc_bw); + } + return 0; + +nla_put_failure: + nla_nest_cancel(msg, nla_tc_bw); + return -EMSGSIZE; +} + static int devlink_nl_rate_fill(struct sk_buff *msg, struct devlink_rate *devlink_rate, enum devlink_command cmd, u32 portid, u32 seq, @@ -129,6 +152,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->parent->name)) goto nla_put_failure; + if (devlink_rate_put_tc_bws(msg, devlink_rate->tc_bw)) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -316,6 +342,87 @@ devlink_nl_rate_parent_node_set(struct devlink_rate *devlink_rate, return 0; } +static int devlink_nl_rate_tc_bw_parse(struct nlattr *parent_nest, u32 *tc_bw, + unsigned long *bitmap, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[DEVLINK_ATTR_MAX + 1]; + u8 tc_index; + int err; + + err = nla_parse_nested(tb, DEVLINK_ATTR_MAX, parent_nest, + devlink_dl_rate_tc_bws_nl_policy, extack); + if (err) + return err; + + if (!tb[DEVLINK_ATTR_RATE_TC_INDEX]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + DEVLINK_ATTR_RATE_TC_INDEX); + return -EINVAL; + } + + tc_index = nla_get_u8(tb[DEVLINK_ATTR_RATE_TC_INDEX]); + + if (!tb[DEVLINK_ATTR_RATE_TC_BW]) { + NL_SET_ERR_ATTR_MISS(extack, parent_nest, + 
DEVLINK_ATTR_RATE_TC_BW); + return -EINVAL; + } + + if (test_and_set_bit(tc_index, bitmap)) { + NL_SET_ERR_MSG_FMT(extack, + "Duplicate traffic class index specified (%u)", + tc_index); + return -EINVAL; + } + + tc_bw[tc_index] = nla_get_u32(tb[DEVLINK_ATTR_RATE_TC_BW]); + + return 0; +} + +static int devlink_nl_rate_tc_bw_set(struct devlink_rate *devlink_rate, + struct genl_info *info) +{ + DECLARE_BITMAP(bitmap, DEVLINK_RATE_TCS_MAX) = {}; + struct devlink *devlink = devlink_rate->devlink; + const struct devlink_ops *ops = devlink->ops; + u32 tc_bw[DEVLINK_RATE_TCS_MAX] = {}; + int rem, err = -EOPNOTSUPP, i; + struct nlattr *attr; + + nlmsg_for_each_attr_type(attr, DEVLINK_ATTR_RATE_TC_BWS, info->nlhdr, + GENL_HDRLEN, rem) { + err = devlink_nl_rate_tc_bw_parse(attr, tc_bw, bitmap, + info->extack); + if (err) + return err; + } + + for (i = 0; i < DEVLINK_RATE_TCS_MAX; i++) { + if (!test_bit(i, bitmap)) { + NL_SET_ERR_MSG_FMT(info->extack, + "Bandwidth values must be specified for all %u traffic classes", + DEVLINK_RATE_TCS_MAX); + return -EINVAL; + } + } + + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tc_bw_set(devlink_rate, devlink_rate->priv, + tc_bw, info->extack); + + if (err) + return err; + + memcpy(devlink_rate->tc_bw, tc_bw, sizeof(tc_bw)); + + return 0; +} + static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, const struct devlink_ops *ops, struct genl_info *info) @@ -388,6 +495,12 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, return err; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS]) { + err = devlink_nl_rate_tc_bw_set(devlink_rate, info); + if (err) + return err; + } + return 0; } @@ -423,6 +536,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the leafs"); return false; } + if 
(attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_leaf_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG(info->extack, "TX share set isn't supported for the nodes"); @@ -449,6 +569,13 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX weight set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TC_BWS] && + !ops->rate_node_tc_bw_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TC_BWS], + "TC bandwidth set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; -- cgit v1.2.3 From fd72f265bb00d2dd2a3bbad7ec45520025e3a926 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 22 May 2025 16:52:23 +0200 Subject: netfilter: conntrack: remove DCCP protocol support The DCCP socket family has now been removed from this tree, see: 8bb3212be4b4 ("Merge branch 'net-retire-dccp-socket'") Remove connection tracking and NAT support for this protocol, this should not pose a problem because no DCCP traffic is expected to be seen on the wire. As for the code for matching on dccp header for iptables and nftables, mark it as deprecated and keep it in place. Ruleset restoration is an atomic operation. Without dccp matching support, an astray match on dccp could break this operation leaving your computer with no policy in place, so let's follow a more conservative approach for matches. Add CONFIG_NFT_EXTHDR_DCCP which is set to 'n' by default to deprecate dccp extension support. Similarly, label CONFIG_NETFILTER_XT_MATCH_DCCP as deprecated too and also set it to 'n' by default. 
Code to match on DCCP protocol from ebtables also remains in place, this is just a few checks on IPPROTO_DCCP from _check() path which is exercised when ruleset is loaded. There is another use of IPPROTO_DCCP from the _check() path in the iptables multiport match. Another check for IPPROTO_DCCP from the packet in the reject target is also removed. So let's schedule removal of the dccp matching for a second stage, this should not interfere with the dccp retirement since this is only matching on the dccp header. Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Kuniyuki Iwashima Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/nf_conntrack-sysctl.rst | 1 - arch/arm/configs/omap2plus_defconfig | 1 - arch/loongarch/configs/loongson3_defconfig | 1 - arch/m68k/configs/amiga_defconfig | 1 - arch/m68k/configs/apollo_defconfig | 1 - arch/m68k/configs/atari_defconfig | 1 - arch/m68k/configs/bvme6000_defconfig | 1 - arch/m68k/configs/hp300_defconfig | 1 - arch/m68k/configs/mac_defconfig | 1 - arch/m68k/configs/multi_defconfig | 1 - arch/m68k/configs/mvme147_defconfig | 1 - arch/m68k/configs/mvme16x_defconfig | 1 - arch/m68k/configs/q40_defconfig | 1 - arch/m68k/configs/sun3_defconfig | 1 - arch/m68k/configs/sun3x_defconfig | 1 - arch/mips/configs/fuloong2e_defconfig | 1 - arch/mips/configs/ip22_defconfig | 1 - arch/mips/configs/loongson2k_defconfig | 1 - arch/mips/configs/loongson3_defconfig | 1 - arch/mips/configs/malta_defconfig | 1 - arch/mips/configs/malta_kvm_defconfig | 1 - arch/mips/configs/maltaup_xpa_defconfig | 1 - arch/mips/configs/rb532_defconfig | 1 - arch/mips/configs/rm200_defconfig | 1 - arch/powerpc/configs/cell_defconfig | 1 - arch/s390/configs/debug_defconfig | 1 - arch/s390/configs/defconfig | 1 - arch/sh/configs/titan_defconfig | 1 - include/linux/netfilter/nf_conntrack_dccp.h | 38 -- include/net/netfilter/ipv4/nf_conntrack_ipv4.h | 3 - include/net/netfilter/nf_conntrack.h | 2 -
include/net/netfilter/nf_conntrack_l4proto.h | 13 - include/net/netfilter/nf_reject.h | 1 - include/net/netns/conntrack.h | 13 - net/netfilter/Kconfig | 20 +- net/netfilter/Makefile | 1 - net/netfilter/nf_conntrack_core.c | 8 - net/netfilter/nf_conntrack_netlink.c | 1 - net/netfilter/nf_conntrack_proto.c | 6 - net/netfilter/nf_conntrack_proto_dccp.c | 826 ----------------------- net/netfilter/nf_conntrack_standalone.c | 92 --- net/netfilter/nf_nat_core.c | 6 - net/netfilter/nf_nat_proto.c | 43 -- net/netfilter/nfnetlink_cttimeout.c | 5 - net/netfilter/nft_exthdr.c | 8 + 45 files changed, 16 insertions(+), 1098 deletions(-) delete mode 100644 include/linux/netfilter/nf_conntrack_dccp.h delete mode 100644 net/netfilter/nf_conntrack_proto_dccp.c (limited to 'include/net') diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index 238b66d0e059..35f889259fcd 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -85,7 +85,6 @@ nf_conntrack_log_invalid - INTEGER - 1 - log ICMP packets - 6 - log TCP packets - 17 - log UDP packets - - 33 - log DCCP packets - 41 - log ICMPv6 packets - 136 - log UDPLITE packets - 255 - log packets of any protocol diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 9f9780c8e62a..fee43d156622 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -142,7 +142,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index 0d59af6007b7..68e337aed2bb 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ 
b/arch/loongarch/configs/loongson3_defconfig @@ -225,7 +225,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index d05690289e33..83eab331872f 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -85,7 +85,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index a1747fbe23fb..0e5de7edd544 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -81,7 +81,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 74293551f66b..35fc466095f4 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -88,7 +88,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 419b13ae950a..53b7844cf301 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -78,7 +78,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m 
CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 4c81d756587c..560fdf3ed106 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -80,7 +80,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index daa01d7fb462..2e28e54b52f8 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -79,7 +79,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 641ca22eb3b2..f5f6b8e65c26 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -99,7 +99,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index f98ffa7a1640..36bbf98d6aa4 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -77,7 +77,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/mvme16x_defconfig 
b/arch/m68k/configs/mvme16x_defconfig index 2bfc3f4b48f9..e247bff8f1a4 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -78,7 +78,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 2bd46cbcca2a..27aa4eb5d3f4 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -79,7 +79,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index dc7fc94fc669..b338f2043d97 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -74,7 +74,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index b026a54867f5..87ee47da4e31 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -75,7 +75,6 @@ CONFIG_NETFILTER=y CONFIG_NETFILTER_NETLINK_HOOK=m CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_ZONES=y -# CONFIG_NF_CT_PROTO_DCCP is not set CONFIG_NF_CONNTRACK_AMANDA=m CONFIG_NF_CONNTRACK_FTP=m CONFIG_NF_CONNTRACK_H323=m diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig index 114fcd67898d..cdedbb8a8f53 100644 --- a/arch/mips/configs/fuloong2e_defconfig +++ b/arch/mips/configs/fuloong2e_defconfig @@ -44,7 +44,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m 
CONFIG_NETFILTER_XT_TARGET_TRACE=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_IPRANGE=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig index f1a8ccf2c459..2decf8b98d31 100644 --- a/arch/mips/configs/ip22_defconfig +++ b/arch/mips/configs/ip22_defconfig @@ -79,7 +79,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m diff --git a/arch/mips/configs/loongson2k_defconfig b/arch/mips/configs/loongson2k_defconfig index 4b7f914d01d0..6aea6a5b1b66 100644 --- a/arch/mips/configs/loongson2k_defconfig +++ b/arch/mips/configs/loongson2k_defconfig @@ -52,7 +52,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m CONFIG_NETFILTER_XT_TARGET_MARK=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_LIMIT=m diff --git a/arch/mips/configs/loongson3_defconfig b/arch/mips/configs/loongson3_defconfig index 98844b457b7f..43a72c410538 100644 --- a/arch/mips/configs/loongson3_defconfig +++ b/arch/mips/configs/loongson3_defconfig @@ -72,7 +72,6 @@ CONFIG_NETFILTER_XT_TARGET_MARK=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_LIMIT=m diff --git a/arch/mips/configs/malta_defconfig b/arch/mips/configs/malta_defconfig index 869a14b3184f..9fcbac829920 100644 --- a/arch/mips/configs/malta_defconfig +++ b/arch/mips/configs/malta_defconfig @@ 
-80,7 +80,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m diff --git a/arch/mips/configs/malta_kvm_defconfig b/arch/mips/configs/malta_kvm_defconfig index 41e1fea303ea..19102386a81c 100644 --- a/arch/mips/configs/malta_kvm_defconfig +++ b/arch/mips/configs/malta_kvm_defconfig @@ -84,7 +84,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m diff --git a/arch/mips/configs/maltaup_xpa_defconfig b/arch/mips/configs/maltaup_xpa_defconfig index 13ff1877e26e..1dd07c9d1812 100644 --- a/arch/mips/configs/maltaup_xpa_defconfig +++ b/arch/mips/configs/maltaup_xpa_defconfig @@ -82,7 +82,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_HELPER=m diff --git a/arch/mips/configs/rb532_defconfig b/arch/mips/configs/rb532_defconfig index 9fb114ef5e2d..30d18b084cda 100644 --- a/arch/mips/configs/rb532_defconfig +++ b/arch/mips/configs/rb532_defconfig @@ -56,7 +56,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m CONFIG_NETFILTER_XT_TARGET_TRACE=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m CONFIG_NETFILTER_XT_MATCH_LIMIT=y CONFIG_NETFILTER_XT_MATCH_MULTIPORT=y diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig index 7b5a5591ccc9..39a2419e1f3e 
100644 --- a/arch/mips/configs/rm200_defconfig +++ b/arch/mips/configs/rm200_defconfig @@ -64,7 +64,6 @@ CONFIG_NETFILTER_XT_MATCH_COMMENT=m CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m diff --git a/arch/powerpc/configs/cell_defconfig b/arch/powerpc/configs/cell_defconfig index 3347192b77b8..7a31b52e92e1 100644 --- a/arch/powerpc/configs/cell_defconfig +++ b/arch/powerpc/configs/cell_defconfig @@ -62,7 +62,6 @@ CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m CONFIG_NETFILTER_XT_TARGET_TCPMSS=m CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 8ecad727497e..0808a3718298 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -248,7 +248,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig index c13a77765162..6118e3105adb 100644 --- a/arch/s390/configs/defconfig +++ b/arch/s390/configs/defconfig @@ -239,7 +239,6 @@ CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m CONFIG_NETFILTER_XT_MATCH_CONNMARK=m CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m CONFIG_NETFILTER_XT_MATCH_DSCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index 
f022ada363b5..8ef72b8dbcd3 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -61,7 +61,6 @@ CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m CONFIG_NETFILTER_XT_TARGET_MARK=m CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m CONFIG_NETFILTER_XT_MATCH_ESP=m CONFIG_NETFILTER_XT_MATCH_LENGTH=m CONFIG_NETFILTER_XT_MATCH_LIMIT=m diff --git a/include/linux/netfilter/nf_conntrack_dccp.h b/include/linux/netfilter/nf_conntrack_dccp.h deleted file mode 100644 index c509ed76e714..000000000000 --- a/include/linux/netfilter/nf_conntrack_dccp.h +++ /dev/null @@ -1,38 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _NF_CONNTRACK_DCCP_H -#define _NF_CONNTRACK_DCCP_H - -/* Exposed to userspace over nfnetlink */ -enum ct_dccp_states { - CT_DCCP_NONE, - CT_DCCP_REQUEST, - CT_DCCP_RESPOND, - CT_DCCP_PARTOPEN, - CT_DCCP_OPEN, - CT_DCCP_CLOSEREQ, - CT_DCCP_CLOSING, - CT_DCCP_TIMEWAIT, - CT_DCCP_IGNORE, - CT_DCCP_INVALID, - __CT_DCCP_MAX -}; -#define CT_DCCP_MAX (__CT_DCCP_MAX - 1) - -enum ct_dccp_roles { - CT_DCCP_ROLE_CLIENT, - CT_DCCP_ROLE_SERVER, - __CT_DCCP_ROLE_MAX -}; -#define CT_DCCP_ROLE_MAX (__CT_DCCP_ROLE_MAX - 1) - -#include - -struct nf_ct_dccp { - u_int8_t role[IP_CT_DIR_MAX]; - u_int8_t state; - u_int8_t last_pkt; - u_int8_t last_dir; - u_int64_t handshake_seq; -}; - -#endif /* _NF_CONNTRACK_DCCP_H */ diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 2c8c2b023848..8d65ffbf57de 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -13,9 +13,6 @@ extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp; extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP -extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp; -#endif #ifdef 
CONFIG_NF_CT_PROTO_SCTP extern const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp; #endif diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3f02a45773e8..a844aa46d076 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -18,7 +18,6 @@ #include #include -#include #include #include @@ -31,7 +30,6 @@ struct nf_ct_udp { /* per conntrack: protocol private data */ union nf_conntrack_proto { /* insert conntrack proto private data here */ - struct nf_ct_dccp dccp; struct ip_ct_sctp sctp; struct ip_ct_tcp tcp; struct nf_ct_udp udp; diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 1f47bef51722..6929f8daf1ed 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -117,11 +117,6 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, unsigned int dataoff, enum ip_conntrack_info ctinfo, const struct nf_hook_state *state); -int nf_conntrack_dccp_packet(struct nf_conn *ct, - struct sk_buff *skb, - unsigned int dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state); int nf_conntrack_sctp_packet(struct nf_conn *ct, struct sk_buff *skb, unsigned int dataoff, @@ -137,7 +132,6 @@ void nf_conntrack_generic_init_net(struct net *net); void nf_conntrack_tcp_init_net(struct net *net); void nf_conntrack_udp_init_net(struct net *net); void nf_conntrack_gre_init_net(struct net *net); -void nf_conntrack_dccp_init_net(struct net *net); void nf_conntrack_sctp_init_net(struct net *net); void nf_conntrack_icmp_init_net(struct net *net); void nf_conntrack_icmpv6_init_net(struct net *net); @@ -223,13 +217,6 @@ static inline bool nf_conntrack_tcp_established(const struct nf_conn *ct) } #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP -static inline struct nf_dccp_net *nf_dccp_pernet(struct net *net) -{ - return &net->ct.nf_ct_proto.dccp; -} -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP 
static inline struct nf_sctp_net *nf_sctp_pernet(struct net *net) { diff --git a/include/net/netfilter/nf_reject.h b/include/net/netfilter/nf_reject.h index 7c669792fb9c..f1db33bc6bf8 100644 --- a/include/net/netfilter/nf_reject.h +++ b/include/net/netfilter/nf_reject.h @@ -34,7 +34,6 @@ static inline bool nf_reject_verify_csum(struct sk_buff *skb, int dataoff, /* Protocols with partial checksums. */ case IPPROTO_UDPLITE: - case IPPROTO_DCCP: return false; } return true; diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index bae914815aa3..ab74b5ed0b01 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -7,9 +7,6 @@ #include #include #include -#ifdef CONFIG_NF_CT_PROTO_DCCP -#include -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP #include #endif @@ -50,13 +47,6 @@ struct nf_icmp_net { unsigned int timeout; }; -#ifdef CONFIG_NF_CT_PROTO_DCCP -struct nf_dccp_net { - u8 dccp_loose; - unsigned int dccp_timeout[CT_DCCP_MAX + 1]; -}; -#endif - #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net { unsigned int timeouts[SCTP_CONNTRACK_MAX]; @@ -82,9 +72,6 @@ struct nf_ip_net { struct nf_udp_net udp; struct nf_icmp_net icmp; struct nf_icmp_net icmpv6; -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP struct nf_sctp_net sctp; #endif diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 2560416218d0..ba60b48d7567 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -195,16 +195,6 @@ config NF_CONNTRACK_LABELS config NF_CONNTRACK_OVS bool -config NF_CT_PROTO_DCCP - bool 'DCCP protocol connection tracking support' - depends on NETFILTER_ADVANCED - default y - help - With this option enabled, the layer 3 independent connection - tracking code will be able to do state tracking on DCCP connections. - - If unsure, say Y. 
- config NF_CT_PROTO_GRE bool @@ -516,6 +506,12 @@ config NFT_CT This option adds the "ct" expression that you can use to match connection tracking information such as the flow state. +config NFT_EXTHDR_DCCP + bool "Netfilter nf_tables exthdr DCCP support (DEPRECATED)" + default n + help + This option adds support for matching on DCCP extension headers. + config NFT_FLOW_OFFLOAD depends on NF_CONNTRACK && NF_FLOW_TABLE tristate "Netfilter nf_tables hardware flow offload module" @@ -1278,9 +1274,9 @@ config NETFILTER_XT_MATCH_CPU To compile it as a module, choose M here. If unsure, say N. config NETFILTER_XT_MATCH_DCCP - tristate '"dccp" protocol match support' + tristate '"dccp" protocol match support (DEPRECATED)' depends on NETFILTER_ADVANCED - default IP_DCCP + default n help With this option enabled, you will be able to use the iptables `dccp' match in order to match on DCCP source/destination ports diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index f0aa4d7ef499..e43e20f529f8 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -12,7 +12,6 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o nf_conntrack-$(CONFIG_NF_CONNTRACK_LABELS) += nf_conntrack_labels.o nf_conntrack-$(CONFIG_NF_CONNTRACK_OVS) += nf_conntrack_ovs.o -nf_conntrack-$(CONFIG_NF_CT_PROTO_DCCP) += nf_conntrack_proto_dccp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_SCTP) += nf_conntrack_proto_sctp.o nf_conntrack-$(CONFIG_NF_CT_PROTO_GRE) += nf_conntrack_proto_gre.o ifeq ($(CONFIG_NF_CONNTRACK),m) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 201d3c4ec623..1097f26a6788 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -328,9 +328,6 @@ nf_ct_get_tuple(const struct sk_buff *skb, #endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: -#endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: #endif /* 
fallthrough */ return nf_ct_get_tuple_ports(skb, dataoff, tuple); @@ -1982,11 +1979,6 @@ static int nf_conntrack_handle_packet(struct nf_conn *ct, return nf_conntrack_sctp_packet(ct, skb, dataoff, ctinfo, state); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: - return nf_conntrack_dccp_packet(ct, skb, dataoff, - ctinfo, state); -#endif #ifdef CONFIG_NF_CT_PROTO_GRE case IPPROTO_GRE: return nf_conntrack_gre_packet(ct, skb, dataoff, diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 2cc0fde23344..486d52b45fe5 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2036,7 +2036,6 @@ static void ctnetlink_change_mark(struct nf_conn *ct, static const struct nla_policy protoinfo_policy[CTA_PROTOINFO_MAX+1] = { [CTA_PROTOINFO_TCP] = { .type = NLA_NESTED }, - [CTA_PROTOINFO_DCCP] = { .type = NLA_NESTED }, [CTA_PROTOINFO_SCTP] = { .type = NLA_NESTED }, }; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index f36727ed91e1..bc1d96686b9c 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -100,9 +100,6 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) case IPPROTO_UDP: return &nf_conntrack_l4proto_udp; case IPPROTO_TCP: return &nf_conntrack_l4proto_tcp; case IPPROTO_ICMP: return &nf_conntrack_l4proto_icmp; -#ifdef CONFIG_NF_CT_PROTO_DCCP - case IPPROTO_DCCP: return &nf_conntrack_l4proto_dccp; -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP case IPPROTO_SCTP: return &nf_conntrack_l4proto_sctp; #endif @@ -681,9 +678,6 @@ void nf_conntrack_proto_pernet_init(struct net *net) #if IS_ENABLED(CONFIG_IPV6) nf_conntrack_icmpv6_init_net(net); #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - nf_conntrack_dccp_init_net(net); -#endif #ifdef CONFIG_NF_CT_PROTO_SCTP nf_conntrack_sctp_init_net(net); #endif diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c deleted file mode 
100644 index ebc4f733bb2e..000000000000 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ /dev/null @@ -1,826 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * DCCP connection tracking protocol helper - * - * Copyright (c) 2005, 2006, 2008 Patrick McHardy - */ -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -/* Timeouts are based on values from RFC4340: - * - * - REQUEST: - * - * 8.1.2. Client Request - * - * A client MAY give up on its DCCP-Requests after some time - * (3 minutes, for example). - * - * - RESPOND: - * - * 8.1.3. Server Response - * - * It MAY also leave the RESPOND state for CLOSED after a timeout of - * not less than 4MSL (8 minutes); - * - * - PARTOPEN: - * - * 8.1.5. Handshake Completion - * - * If the client remains in PARTOPEN for more than 4MSL (8 minutes), - * it SHOULD reset the connection with Reset Code 2, "Aborted". - * - * - OPEN: - * - * The DCCP timestamp overflows after 11.9 hours. If the connection - * stays idle this long the sequence number won't be recognized - * as valid anymore. - * - * - CLOSEREQ/CLOSING: - * - * 8.3. Termination - * - * The retransmission timer should initially be set to go off in two - * round-trip times and should back off to not less than once every - * 64 seconds ... - * - * - TIMEWAIT: - * - * 4.3. States - * - * A server or client socket remains in this state for 2MSL (4 minutes) - * after the connection has been town down, ... 
- */ - -#define DCCP_MSL (2 * 60 * HZ) - -#ifdef CONFIG_NF_CONNTRACK_PROCFS -static const char * const dccp_state_names[] = { - [CT_DCCP_NONE] = "NONE", - [CT_DCCP_REQUEST] = "REQUEST", - [CT_DCCP_RESPOND] = "RESPOND", - [CT_DCCP_PARTOPEN] = "PARTOPEN", - [CT_DCCP_OPEN] = "OPEN", - [CT_DCCP_CLOSEREQ] = "CLOSEREQ", - [CT_DCCP_CLOSING] = "CLOSING", - [CT_DCCP_TIMEWAIT] = "TIMEWAIT", - [CT_DCCP_IGNORE] = "IGNORE", - [CT_DCCP_INVALID] = "INVALID", -}; -#endif - -#define sNO CT_DCCP_NONE -#define sRQ CT_DCCP_REQUEST -#define sRS CT_DCCP_RESPOND -#define sPO CT_DCCP_PARTOPEN -#define sOP CT_DCCP_OPEN -#define sCR CT_DCCP_CLOSEREQ -#define sCG CT_DCCP_CLOSING -#define sTW CT_DCCP_TIMEWAIT -#define sIG CT_DCCP_IGNORE -#define sIV CT_DCCP_INVALID - -/* - * DCCP state transition table - * - * The assumption is the same as for TCP tracking: - * - * We are the man in the middle. All the packets go through us but might - * get lost in transit to the destination. It is assumed that the destination - * can't receive segments we haven't seen. - * - * The following states exist: - * - * NONE: Initial state, expecting Request - * REQUEST: Request seen, waiting for Response from server - * RESPOND: Response from server seen, waiting for Ack from client - * PARTOPEN: Ack after Response seen, waiting for packet other than Response, - * Reset or Sync from server - * OPEN: Packet other than Response, Reset or Sync seen - * CLOSEREQ: CloseReq from server seen, expecting Close from client - * CLOSING: Close seen, expecting Reset - * TIMEWAIT: Reset seen - * IGNORE: Not determinable whether packet is valid - * - * Some states exist only on one side of the connection: REQUEST, RESPOND, - * PARTOPEN, CLOSEREQ. For the other side these states are equivalent to - * the one it was in before. 
- * - * Packets are marked as ignored (sIG) if we don't know if they're valid - * (for example a reincarnation of a connection we didn't notice is dead - * already) and the server may send back a connection closing Reset or a - * Response. They're also used for Sync/SyncAck packets, which we don't - * care about. - */ -static const u_int8_t -dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] = { - [CT_DCCP_ROLE_CLIENT] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sRQ Regular Request - * sRQ -> sRQ Retransmitted Request or reincarnation - * sRS -> sRS Retransmitted Request (apparently Response - * got lost after we saw it) or reincarnation - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation - * - * sNO, sRQ, sRS, sPO. sOP, sCR, sCG, sTW, */ - sRQ, sRQ, sRS, sIG, sIG, sIG, sIG, sRQ, - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, might be response to ignored Request - * sRS -> sIG Ignore, might be response to ignored Request - * sPO -> sIG Ignore, might be response to ignored Request - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, reincarnation in reverse direction - * goes through sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIV, - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) - * sPO -> sPO Retransmitted Ack for Response, remain in PARTOPEN - * sOP -> sOP Regular ACK, remain in OPEN - * sCR -> sCR Ack in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) 
- * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sIV MUST use DataAck in PARTOPEN state (8.1.5.) - * sOP -> sOP Regular Data packet - * sCR -> sCR Data in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sOP, sCR, sCG, sIV, - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sPO Ack for Response, move to PARTOPEN (8.1.5.) - * sPO -> sPO Remain in PARTOPEN state - * sOP -> sOP Regular DataAck packet in OPEN state - * sCR -> sCR DataAck in CLOSEREQ MAY be processed (8.3.) - * sCG -> sCG DataAck in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sPO, sPO, sOP, sCR, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * CLOSEREQ may only be sent by the server. - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sCG Client-initiated close - * sOP -> sCG Client-initiated close - * sCR -> sCG Close in response to CloseReq (8.3.) - * sCG -> sCG Retransmit - * sTW -> sIV Late retransmit, already in TIME_WAIT - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sCG, sIV, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Sync received or timeout, SHOULD send Reset (8.1.1.) - * sRS -> sTW Response received without Request - * sPO -> sTW Timeout, SHOULD send Reset (8.1.5.) 
- * sOP -> sTW Connection reset - * sCR -> sTW Connection reset - * sCG -> sTW Connection reset - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, - [CT_DCCP_ROLE_SERVER] = { - [DCCP_PKT_REQUEST] = { - /* - * sNO -> sIV Invalid - * sRQ -> sIG Ignore, conntrack might be out of sync - * sRS -> sIG Ignore, conntrack might be out of sync - * sPO -> sIG Ignore, conntrack might be out of sync - * sOP -> sIG Ignore, conntrack might be out of sync - * sCR -> sIG Ignore, conntrack might be out of sync - * sCG -> sIG Ignore, conntrack might be out of sync - * sTW -> sRQ Reincarnation, must reverse roles - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sRQ - }, - [DCCP_PKT_RESPONSE] = { - /* - * sNO -> sIV Response without Request - * sRQ -> sRS Response to clients Request - * sRS -> sRS Retransmitted Response (8.1.3. SHOULD NOT) - * sPO -> sIG Response to an ignored Request or late retransmit - * sOP -> sIG Ignore, might be response to ignored Request - * sCR -> sIG Ignore, might be response to ignored Request - * sCG -> sIG Ignore, might be response to ignored Request - * sTW -> sIV Invalid, Request from client in sTW moves to sRQ - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sRS, sRS, sIG, sIG, sIG, sIG, sIV - }, - [DCCP_PKT_ACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Ack in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Ack in CLOSING MAY be processed (8.3.) 
- * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATA] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular Data packet in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_DATAACK] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP Enter OPEN state (8.1.5.) - * sOP -> sOP Regular DataAck in OPEN state - * sCR -> sIV Waiting for Close from client - * sCG -> sCG Data in CLOSING MAY be processed (8.3.) - * sTW -> sIV - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sOP, sOP, sIV, sCG, sIV - }, - [DCCP_PKT_CLOSEREQ] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCR Move directly to CLOSEREQ (8.1.5.) - * sOP -> sCR CloseReq in OPEN state - * sCR -> sCR Retransmit - * sCG -> sCR Simultaneous close, client sends another Close - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCR, sCR, sCR, sCR, sIV - }, - [DCCP_PKT_CLOSE] = { - /* - * sNO -> sIV No connection - * sRQ -> sIV No connection - * sRS -> sIV No connection - * sPO -> sOP -> sCG Move direcly to CLOSING - * sOP -> sCG Move to CLOSING - * sCR -> sIV Close after CloseReq is invalid - * sCG -> sCG Retransmit - * sTW -> sIV Already closed - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIV, sIV, sCG, sCG, sIV, sCG, sIV - }, - [DCCP_PKT_RESET] = { - /* - * sNO -> sIV No connection - * sRQ -> sTW Reset in response to Request - * sRS -> sTW Timeout, SHOULD send Reset (8.1.3.) - * sPO -> sTW Timeout, SHOULD send Reset (8.1.3.) 
- * sOP -> sTW - * sCR -> sTW - * sCG -> sTW - * sTW -> sIG Ignore (don't refresh timer) - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW, sTW */ - sIV, sTW, sTW, sTW, sTW, sTW, sTW, sTW, sIG - }, - [DCCP_PKT_SYNC] = { - /* - * We currently ignore Sync packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - [DCCP_PKT_SYNCACK] = { - /* - * We currently ignore SyncAck packets - * - * sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */ - sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, - }, - }, -}; - -static noinline bool -dccp_new(struct nf_conn *ct, const struct sk_buff *skb, - const struct dccp_hdr *dh, - const struct nf_hook_state *hook_state) -{ - struct net *net = nf_ct_net(ct); - struct nf_dccp_net *dn; - const char *msg; - u_int8_t state; - - state = dccp_state_table[CT_DCCP_ROLE_CLIENT][dh->dccph_type][CT_DCCP_NONE]; - switch (state) { - default: - dn = nf_dccp_pernet(net); - if (dn->dccp_loose == 0) { - msg = "not picking up existing connection "; - goto out_invalid; - } - break; - case CT_DCCP_REQUEST: - break; - case CT_DCCP_INVALID: - msg = "invalid state transition "; - goto out_invalid; - } - - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.state = CT_DCCP_NONE; - ct->proto.dccp.last_pkt = DCCP_PKT_REQUEST; - ct->proto.dccp.last_dir = IP_CT_DIR_ORIGINAL; - ct->proto.dccp.handshake_seq = 0; - return true; - -out_invalid: - nf_ct_l4proto_log_invalid(skb, ct, hook_state, "%s", msg); - return false; -} - -static u64 dccp_ack_seq(const struct dccp_hdr *dh) -{ - const struct dccp_hdr_ack_bits *dhack; - - dhack = (void *)dh + __dccp_basic_hdr_len(dh); - return ((u64)ntohs(dhack->dccph_ack_nr_high) << 32) + - ntohl(dhack->dccph_ack_nr_low); -} - -static bool dccp_error(const struct dccp_hdr *dh, - struct sk_buff *skb, unsigned int dataoff, - const struct nf_hook_state *state) -{ - static const unsigned long require_seq48 = 1 << 
DCCP_PKT_REQUEST | - 1 << DCCP_PKT_RESPONSE | - 1 << DCCP_PKT_CLOSEREQ | - 1 << DCCP_PKT_CLOSE | - 1 << DCCP_PKT_RESET | - 1 << DCCP_PKT_SYNC | - 1 << DCCP_PKT_SYNCACK; - unsigned int dccp_len = skb->len - dataoff; - unsigned int cscov; - const char *msg; - u8 type; - - BUILD_BUG_ON(DCCP_PKT_INVALID >= BITS_PER_LONG); - - if (dh->dccph_doff * 4 < sizeof(struct dccp_hdr) || - dh->dccph_doff * 4 > dccp_len) { - msg = "nf_ct_dccp: truncated/malformed packet "; - goto out_invalid; - } - - cscov = dccp_len; - if (dh->dccph_cscov) { - cscov = (dh->dccph_cscov - 1) * 4; - if (cscov > dccp_len) { - msg = "nf_ct_dccp: bad checksum coverage "; - goto out_invalid; - } - } - - if (state->hook == NF_INET_PRE_ROUTING && - state->net->ct.sysctl_checksum && - nf_checksum_partial(skb, state->hook, dataoff, cscov, - IPPROTO_DCCP, state->pf)) { - msg = "nf_ct_dccp: bad checksum "; - goto out_invalid; - } - - type = dh->dccph_type; - if (type >= DCCP_PKT_INVALID) { - msg = "nf_ct_dccp: reserved packet type "; - goto out_invalid; - } - - if (test_bit(type, &require_seq48) && !dh->dccph_x) { - msg = "nf_ct_dccp: type lacks 48bit sequence numbers"; - goto out_invalid; - } - - return false; -out_invalid: - nf_l4proto_log_invalid(skb, state, IPPROTO_DCCP, "%s", msg); - return true; -} - -struct nf_conntrack_dccp_buf { - struct dccp_hdr dh; /* generic header part */ - struct dccp_hdr_ext ext; /* optional depending dh->dccph_x */ - union { /* depends on header type */ - struct dccp_hdr_ack_bits ack; - struct dccp_hdr_request req; - struct dccp_hdr_response response; - struct dccp_hdr_reset rst; - } u; -}; - -static struct dccp_hdr * -dccp_header_pointer(const struct sk_buff *skb, int offset, const struct dccp_hdr *dh, - struct nf_conntrack_dccp_buf *buf) -{ - unsigned int hdrlen = __dccp_hdr_len(dh); - - if (hdrlen > sizeof(*buf)) - return NULL; - - return skb_header_pointer(skb, offset, hdrlen, buf); -} - -int nf_conntrack_dccp_packet(struct nf_conn *ct, struct sk_buff *skb, - unsigned int 
dataoff, - enum ip_conntrack_info ctinfo, - const struct nf_hook_state *state) -{ - enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); - struct nf_conntrack_dccp_buf _dh; - u_int8_t type, old_state, new_state; - enum ct_dccp_roles role; - unsigned int *timeouts; - struct dccp_hdr *dh; - - dh = skb_header_pointer(skb, dataoff, sizeof(*dh), &_dh.dh); - if (!dh) - return -NF_ACCEPT; - - if (dccp_error(dh, skb, dataoff, state)) - return -NF_ACCEPT; - - /* pull again, including possible 48 bit sequences and subtype header */ - dh = dccp_header_pointer(skb, dataoff, dh, &_dh); - if (!dh) - return -NF_ACCEPT; - - type = dh->dccph_type; - if (!nf_ct_is_confirmed(ct) && !dccp_new(ct, skb, dh, state)) - return -NF_ACCEPT; - - if (type == DCCP_PKT_RESET && - !test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { - /* Tear down connection immediately if only reply is a RESET */ - nf_ct_kill_acct(ct, ctinfo, skb); - return NF_ACCEPT; - } - - spin_lock_bh(&ct->lock); - - role = ct->proto.dccp.role[dir]; - old_state = ct->proto.dccp.state; - new_state = dccp_state_table[role][type][old_state]; - - switch (new_state) { - case CT_DCCP_REQUEST: - if (old_state == CT_DCCP_TIMEWAIT && - role == CT_DCCP_ROLE_SERVER) { - /* Reincarnation in the reverse direction: reopen and - * reverse client/server roles. */ - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_SERVER; - } - break; - case CT_DCCP_RESPOND: - if (old_state == CT_DCCP_REQUEST) - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - break; - case CT_DCCP_PARTOPEN: - if (old_state == CT_DCCP_RESPOND && - type == DCCP_PKT_ACK && - dccp_ack_seq(dh) == ct->proto.dccp.handshake_seq) - set_bit(IPS_ASSURED_BIT, &ct->status); - break; - case CT_DCCP_IGNORE: - /* - * Connection tracking might be out of sync, so we ignore - * packets that might establish a new connection and resync - * if the server responds with a valid Response. 
- */ - if (ct->proto.dccp.last_dir == !dir && - ct->proto.dccp.last_pkt == DCCP_PKT_REQUEST && - type == DCCP_PKT_RESPONSE) { - ct->proto.dccp.role[!dir] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[dir] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.handshake_seq = dccp_hdr_seq(dh); - new_state = CT_DCCP_RESPOND; - break; - } - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid packet"); - return NF_ACCEPT; - case CT_DCCP_INVALID: - spin_unlock_bh(&ct->lock); - nf_ct_l4proto_log_invalid(skb, ct, state, "%s", "invalid state transition"); - return -NF_ACCEPT; - } - - ct->proto.dccp.last_dir = dir; - ct->proto.dccp.last_pkt = type; - ct->proto.dccp.state = new_state; - spin_unlock_bh(&ct->lock); - - if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, ct); - - timeouts = nf_ct_timeout_lookup(ct); - if (!timeouts) - timeouts = nf_dccp_pernet(nf_ct_net(ct))->dccp_timeout; - nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]); - - return NF_ACCEPT; -} - -static bool dccp_can_early_drop(const struct nf_conn *ct) -{ - switch (ct->proto.dccp.state) { - case CT_DCCP_CLOSEREQ: - case CT_DCCP_CLOSING: - case CT_DCCP_TIMEWAIT: - return true; - default: - break; - } - - return false; -} - -#ifdef CONFIG_NF_CONNTRACK_PROCFS -static void dccp_print_conntrack(struct seq_file *s, struct nf_conn *ct) -{ - seq_printf(s, "%s ", dccp_state_names[ct->proto.dccp.state]); -} -#endif - -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) -static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - struct nf_conn *ct, bool destroy) -{ - struct nlattr *nest_parms; - - spin_lock_bh(&ct->lock); - nest_parms = nla_nest_start(skb, CTA_PROTOINFO_DCCP); - if (!nest_parms) - goto nla_put_failure; - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_STATE, ct->proto.dccp.state)) - goto nla_put_failure; - - if (destroy) - goto skip_state; - - if (nla_put_u8(skb, CTA_PROTOINFO_DCCP_ROLE, - 
ct->proto.dccp.role[IP_CT_DIR_ORIGINAL]) || - nla_put_be64(skb, CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ, - cpu_to_be64(ct->proto.dccp.handshake_seq), - CTA_PROTOINFO_DCCP_PAD)) - goto nla_put_failure; -skip_state: - nla_nest_end(skb, nest_parms); - spin_unlock_bh(&ct->lock); - - return 0; - -nla_put_failure: - spin_unlock_bh(&ct->lock); - return -1; -} - -static const struct nla_policy dccp_nla_policy[CTA_PROTOINFO_DCCP_MAX + 1] = { - [CTA_PROTOINFO_DCCP_STATE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_ROLE] = { .type = NLA_U8 }, - [CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ] = { .type = NLA_U64 }, - [CTA_PROTOINFO_DCCP_PAD] = { .type = NLA_UNSPEC }, -}; - -#define DCCP_NLATTR_SIZE ( \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + 1) + \ - NLA_ALIGN(NLA_HDRLEN + sizeof(u64)) + \ - NLA_ALIGN(NLA_HDRLEN + 0)) - -static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) -{ - struct nlattr *attr = cda[CTA_PROTOINFO_DCCP]; - struct nlattr *tb[CTA_PROTOINFO_DCCP_MAX + 1]; - int err; - - if (!attr) - return 0; - - err = nla_parse_nested_deprecated(tb, CTA_PROTOINFO_DCCP_MAX, attr, - dccp_nla_policy, NULL); - if (err < 0) - return err; - - if (!tb[CTA_PROTOINFO_DCCP_STATE] || - !tb[CTA_PROTOINFO_DCCP_ROLE] || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) > CT_DCCP_ROLE_MAX || - nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]) >= CT_DCCP_IGNORE) { - return -EINVAL; - } - - spin_lock_bh(&ct->lock); - ct->proto.dccp.state = nla_get_u8(tb[CTA_PROTOINFO_DCCP_STATE]); - if (nla_get_u8(tb[CTA_PROTOINFO_DCCP_ROLE]) == CT_DCCP_ROLE_CLIENT) { - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_CLIENT; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_SERVER; - } else { - ct->proto.dccp.role[IP_CT_DIR_ORIGINAL] = CT_DCCP_ROLE_SERVER; - ct->proto.dccp.role[IP_CT_DIR_REPLY] = CT_DCCP_ROLE_CLIENT; - } - if (tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ]) { - ct->proto.dccp.handshake_seq = - be64_to_cpu(nla_get_be64(tb[CTA_PROTOINFO_DCCP_HANDSHAKE_SEQ])); - } - spin_unlock_bh(&ct->lock); - 
return 0; -} -#endif - -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - -#include -#include - -static int dccp_timeout_nlattr_to_obj(struct nlattr *tb[], - struct net *net, void *data) -{ - struct nf_dccp_net *dn = nf_dccp_pernet(net); - unsigned int *timeouts = data; - int i; - - if (!timeouts) - timeouts = dn->dccp_timeout; - - /* set default DCCP timeouts. */ - for (i=0; idccp_timeout[i]; - - /* there's a 1:1 mapping between attributes and protocol states. */ - for (i=CTA_TIMEOUT_DCCP_UNSPEC+1; idccp_loose = 1; - dn->dccp_timeout[CT_DCCP_REQUEST] = 2 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_RESPOND] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_PARTOPEN] = 4 * DCCP_MSL; - dn->dccp_timeout[CT_DCCP_OPEN] = 12 * 3600 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSEREQ] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_CLOSING] = 64 * HZ; - dn->dccp_timeout[CT_DCCP_TIMEWAIT] = 2 * DCCP_MSL; - - /* timeouts[0] is unused, make it same as SYN_SENT so - * ->timeouts[0] contains 'new' timeout, like udp or icmp. - */ - dn->dccp_timeout[CT_DCCP_NONE] = dn->dccp_timeout[CT_DCCP_REQUEST]; -} - -const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp = { - .l4proto = IPPROTO_DCCP, - .can_early_drop = dccp_can_early_drop, -#ifdef CONFIG_NF_CONNTRACK_PROCFS - .print_conntrack = dccp_print_conntrack, -#endif -#if IS_ENABLED(CONFIG_NF_CT_NETLINK) - .nlattr_size = DCCP_NLATTR_SIZE, - .to_nlattr = dccp_to_nlattr, - .from_nlattr = nlattr_to_dccp, - .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, - .nlattr_tuple_size = nf_ct_port_nlattr_tuple_size, - .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, - .nla_policy = nf_ct_port_nla_policy, -#endif -#ifdef CONFIG_NF_CONNTRACK_TIMEOUT - .ctnl_timeout = { - .nlattr_to_obj = dccp_timeout_nlattr_to_obj, - .obj_to_nlattr = dccp_timeout_obj_to_nlattr, - .nlattr_max = CTA_TIMEOUT_DCCP_MAX, - .obj_size = sizeof(unsigned int) * CT_DCCP_MAX, - .nla_policy = dccp_timeout_nla_policy, - }, -#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ -}; diff --git a/net/netfilter/nf_conntrack_standalone.c 
b/net/netfilter/nf_conntrack_standalone.c index 6c4cff10357d..829f60496008 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -67,11 +67,6 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, ntohs(tuple->dst.u.udp.port)); break; - case IPPROTO_DCCP: - seq_printf(s, "sport=%hu dport=%hu ", - ntohs(tuple->src.u.dccp.port), - ntohs(tuple->dst.u.dccp.port)); - break; case IPPROTO_SCTP: seq_printf(s, "sport=%hu dport=%hu ", ntohs(tuple->src.u.sctp.port), @@ -279,7 +274,6 @@ static const char* l4proto_name(u16 proto) case IPPROTO_ICMP: return "icmp"; case IPPROTO_TCP: return "tcp"; case IPPROTO_UDP: return "udp"; - case IPPROTO_DCCP: return "dccp"; case IPPROTO_GRE: return "gre"; case IPPROTO_SCTP: return "sctp"; case IPPROTO_UDPLITE: return "udplite"; @@ -612,16 +606,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_SHUTDOWN_ACK_SENT, NF_SYSCTL_CT_PROTO_TIMEOUT_SCTP_HEARTBEAT_SENT, #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING, - NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT, - NF_SYSCTL_CT_PROTO_DCCP_LOOSE, -#endif #ifdef CONFIG_NF_CT_PROTO_GRE NF_SYSCTL_CT_PROTO_TIMEOUT_GRE, NF_SYSCTL_CT_PROTO_TIMEOUT_GRE_STREAM, @@ -895,58 +879,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .proc_handler = proc_dointvec_jiffies, }, #endif -#ifdef CONFIG_NF_CT_PROTO_DCCP - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_REQUEST] = { - .procname = "nf_conntrack_dccp_timeout_request", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_RESPOND] = { - .procname = "nf_conntrack_dccp_timeout_respond", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, 
- [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_PARTOPEN] = { - .procname = "nf_conntrack_dccp_timeout_partopen", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_OPEN] = { - .procname = "nf_conntrack_dccp_timeout_open", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSEREQ] = { - .procname = "nf_conntrack_dccp_timeout_closereq", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_CLOSING] = { - .procname = "nf_conntrack_dccp_timeout_closing", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_TIMEWAIT] = { - .procname = "nf_conntrack_dccp_timeout_timewait", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, - [NF_SYSCTL_CT_PROTO_DCCP_LOOSE] = { - .procname = "nf_conntrack_dccp_loose", - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, -#endif #ifdef CONFIG_NF_CT_PROTO_GRE [NF_SYSCTL_CT_PROTO_TIMEOUT_GRE] = { .procname = "nf_conntrack_gre_timeout", @@ -1032,29 +964,6 @@ static void nf_conntrack_standalone_init_sctp_sysctl(struct net *net, #endif } -static void nf_conntrack_standalone_init_dccp_sysctl(struct net *net, - struct ctl_table *table) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct nf_dccp_net *dn = nf_dccp_pernet(net); - -#define XASSIGN(XNAME, dn) \ - table[NF_SYSCTL_CT_PROTO_TIMEOUT_DCCP_ ## XNAME].data = \ - &(dn)->dccp_timeout[CT_DCCP_ ## XNAME] - - XASSIGN(REQUEST, dn); - XASSIGN(RESPOND, dn); - XASSIGN(PARTOPEN, dn); - XASSIGN(OPEN, dn); - XASSIGN(CLOSEREQ, dn); - XASSIGN(CLOSING, dn); - XASSIGN(TIMEWAIT, dn); -#undef XASSIGN - - table[NF_SYSCTL_CT_PROTO_DCCP_LOOSE].data = &dn->dccp_loose; -#endif -} - static void 
nf_conntrack_standalone_init_gre_sysctl(struct net *net, struct ctl_table *table) { @@ -1100,7 +1009,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) nf_conntrack_standalone_init_tcp_sysctl(net, table); nf_conntrack_standalone_init_sctp_sysctl(net, table); - nf_conntrack_standalone_init_dccp_sysctl(net, table); nf_conntrack_standalone_init_gre_sysctl(net, table); /* Don't allow non-init_net ns to alter global sysctls */ diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index f391cd267922..78a61dac4ade 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -69,7 +69,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } @@ -81,7 +80,6 @@ static void nf_nat_ipv4_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } @@ -102,7 +100,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } @@ -114,7 +111,6 @@ static void nf_nat_ipv6_decode_session(struct sk_buff *skb, if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || - t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } @@ -432,7 +428,6 @@ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: - case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == 
NF_NAT_MANIP_SRC) port = tuple->src.u.all; @@ -632,7 +627,6 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: - case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index dc450cc81222..b14a434b9561 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -179,46 +179,6 @@ tcp_manip_pkt(struct sk_buff *skb, return true; } -static bool -dccp_manip_pkt(struct sk_buff *skb, - unsigned int iphdroff, unsigned int hdroff, - const struct nf_conntrack_tuple *tuple, - enum nf_nat_manip_type maniptype) -{ -#ifdef CONFIG_NF_CT_PROTO_DCCP - struct dccp_hdr *hdr; - __be16 *portptr, oldport, newport; - int hdrsize = 8; /* DCCP connection tracking guarantees this much */ - - if (skb->len >= hdroff + sizeof(struct dccp_hdr)) - hdrsize = sizeof(struct dccp_hdr); - - if (skb_ensure_writable(skb, hdroff + hdrsize)) - return false; - - hdr = (struct dccp_hdr *)(skb->data + hdroff); - - if (maniptype == NF_NAT_MANIP_SRC) { - newport = tuple->src.u.dccp.port; - portptr = &hdr->dccph_sport; - } else { - newport = tuple->dst.u.dccp.port; - portptr = &hdr->dccph_dport; - } - - oldport = *portptr; - *portptr = newport; - - if (hdrsize < sizeof(*hdr)) - return true; - - nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); - inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, - false); -#endif - return true; -} - static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, @@ -338,9 +298,6 @@ static bool l4proto_manip_pkt(struct sk_buff *skb, case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); - case IPPROTO_DCCP: - return dccp_manip_pkt(skb, iphdroff, hdroff, - tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); diff --git 
a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index eab4f476b47f..38d75484e531 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -461,11 +461,6 @@ static int cttimeout_default_get(struct sk_buff *skb, case IPPROTO_UDPLITE: timeouts = nf_udp_pernet(info->net)->timeouts; break; - case IPPROTO_DCCP: -#ifdef CONFIG_NF_CT_PROTO_DCCP - timeouts = nf_dccp_pernet(info->net)->dccp_timeout; -#endif - break; case IPPROTO_ICMPV6: timeouts = &nf_icmpv6_pernet(info->net)->timeout; break; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index c74012c99125..7eedf4e3ae9c 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -407,6 +407,7 @@ err: regs->verdict.code = NFT_BREAK; } +#ifdef CONFIG_NFT_EXTHDR_DCCP static void nft_exthdr_dccp_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -482,6 +483,7 @@ static void nft_exthdr_dccp_eval(const struct nft_expr *expr, err: *dest = 0; } +#endif static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, @@ -634,6 +636,7 @@ static int nft_exthdr_ipv4_init(const struct nft_ctx *ctx, return 0; } +#ifdef CONFIG_NFT_EXTHDR_DCCP static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -649,6 +652,7 @@ static int nft_exthdr_dccp_init(const struct nft_ctx *ctx, return 0; } +#endif static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { @@ -779,6 +783,7 @@ static const struct nft_expr_ops nft_exthdr_sctp_ops = { .reduce = nft_exthdr_reduce, }; +#ifdef CONFIG_NFT_EXTHDR_DCCP static const struct nft_expr_ops nft_exthdr_dccp_ops = { .type = &nft_exthdr_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), @@ -787,6 +792,7 @@ static const struct nft_expr_ops nft_exthdr_dccp_ops = { .dump = nft_exthdr_dump, .reduce = nft_exthdr_reduce, }; 
+#endif static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, @@ -822,10 +828,12 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_sctp_ops; break; +#ifdef CONFIG_NFT_EXTHDR_DCCP case NFT_EXTHDR_OP_DCCP: if (tb[NFTA_EXTHDR_DREG]) return &nft_exthdr_dccp_ops; break; +#endif } return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3 From 59710a26a289ad4e7ef227d22063e964930928b0 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 30 Jun 2025 15:37:46 -0400 Subject: Bluetooth: hci_core: Remove check of BDADDR_ANY in hci_conn_hash_lookup_big_state The check for destination to be BDADDR_ANY is no longer necessary with the introduction of BIS_LINK. Fixes: 23205562ffc8 ("Bluetooth: separate CIS_LINK and BIS_LINK link types") Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9fc8f544e20e..0da011fc8146 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1350,8 +1350,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != BIS_LINK || bacmp(&c->dst, BDADDR_ANY) || - c->state != state) + if (c->type != BIS_LINK || c->state != state) continue; if (handle == c->iso_qos.bcast.big) { -- cgit v1.2.3 From 2b9996417e4ec231c91818f9ea8107ae62ef75ad Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:08 +0200 Subject: af_unix/scm: fix whitespace errors Fix whitespace/formatting errors. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. 
Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-5-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- include/net/scm.h | 4 ++-- net/unix/af_unix.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/scm.h b/include/net/scm.h index 84c4707e78a5..c52519669349 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -69,7 +69,7 @@ static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_co static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { - scm->pid = get_pid(pid); + scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; @@ -78,7 +78,7 @@ static __inline__ void scm_set_cred(struct scm_cookie *scm, static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); - scm->pid = NULL; + scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index df2174d9904d..323e4fc85d4b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1929,7 +1929,7 @@ static void unix_destruct_scm(struct sk_buff *skb) struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; + scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); -- cgit v1.2.3 From 1e3b66e326015f77bc4b36976bebeedc2ac0f588 Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 3 Jul 2025 13:23:29 +0200 Subject: vsock: fix `vsock_proto` declaration From commit 634f1a7110b4 ("vsock: support sockmap"), `struct proto vsock_proto`, defined in af_vsock.c, is not static anymore, since it's used by 
vsock_bpf.c. If CONFIG_BPF_SYSCALL is not defined, `make C=2` will print a warning: $ make O=build C=2 W=1 net/vmw_vsock/ ... CC [M] net/vmw_vsock/af_vsock.o CHECK ../net/vmw_vsock/af_vsock.c ../net/vmw_vsock/af_vsock.c:123:14: warning: symbol 'vsock_proto' was not declared. Should it be static? Declare `vsock_proto` regardless of CONFIG_BPF_SYSCALL, since it's defined in af_vsock.c, which is built regardless of CONFIG_BPF_SYSCALL. Fixes: 634f1a7110b4 ("vsock: support sockmap") Signed-off-by: Stefano Garzarella Acked-by: Michael S. Tsirkin Link: https://patch.msgid.link/20250703112329.28365-1-sgarzare@redhat.com Signed-off-by: Jakub Kicinski --- include/net/af_vsock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index d56e6e135158..d40e978126e3 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -243,8 +243,8 @@ int __vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, int vsock_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); -#ifdef CONFIG_BPF_SYSCALL extern struct proto vsock_proto; +#ifdef CONFIG_BPF_SYSCALL int vsock_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void __init vsock_bpf_build_proto(void); #else -- cgit v1.2.3 From 4369d40da2f28ae1d3caadd4eb5d7b7f49a3776f Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 2 Jul 2025 14:32:55 +0900 Subject: netmem: use _Generic to cover const casting for page_to_netmem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current page_to_netmem() doesn't cover const casting, so trying to cast const struct page * to const netmem_ref fails. To cover the case, change page_to_netmem() to use a macro and _Generic.
Signed-off-by: Byungchul Park Reviewed-by: Mina Almasry Reviewed-by: Toke Høiland-Jørgensen Reviewed-by: Pavel Begunkov Link: https://patch.msgid.link/20250702053256.4594-5-byungchul@sk.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netmem.h b/include/net/netmem.h index 7a1dafa3f080..de1d95f04076 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -139,10 +139,9 @@ static inline netmem_ref net_iov_to_netmem(struct net_iov *niov) return (__force netmem_ref)((unsigned long)niov | NET_IOV); } -static inline netmem_ref page_to_netmem(const struct page *page) -{ - return (__force netmem_ref)page; -} +#define page_to_netmem(p) (_Generic((p), \ + const struct page * : (__force const netmem_ref)(p), \ + struct page * : (__force netmem_ref)(p))) /** * virt_to_netmem - convert virtual memory pointer to a netmem reference -- cgit v1.2.3 From d8bf56a0ca10af7936de8bbdd510c33041dacecc Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Wed, 2 Jul 2025 14:32:56 +0900 Subject: page_pool: make page_pool_get_dma_addr() just wrap page_pool_get_dma_addr_netmem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page pool members in struct page cannot be removed unless it's not allowed to access any of them via struct page. Do not access 'page->dma_addr' directly in page_pool_get_dma_addr() but just wrap page_pool_get_dma_addr_netmem() safely. 
Signed-off-by: Byungchul Park Reviewed-by: Mina Almasry Reviewed-by: Ilias Apalodimas Reviewed-by: Toke Høiland-Jørgensen Reviewed-by: Pavel Begunkov Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20250702053256.4594-6-byungchul@sk.com Signed-off-by: Jakub Kicinski --- include/net/page_pool/helpers.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/page_pool/helpers.h b/include/net/page_pool/helpers.h index 773fc65780b5..db180626be06 100644 --- a/include/net/page_pool/helpers.h +++ b/include/net/page_pool/helpers.h @@ -444,12 +444,7 @@ static inline dma_addr_t page_pool_get_dma_addr_netmem(netmem_ref netmem) */ static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { - dma_addr_t ret = page->dma_addr; - - if (PAGE_POOL_32BIT_ARCH_WITH_64BIT_DMA) - ret <<= PAGE_SHIFT; - - return ret; + return page_pool_get_dma_addr_netmem(page_to_netmem(page)); } static inline void __page_pool_dma_sync_for_cpu(const struct page_pool *pool, -- cgit v1.2.3 From ce7a381697cb3958ffe0b45e5028ac69444e9288 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Fri, 27 Jun 2025 21:49:28 +0800 Subject: net: bonding: add broadcast_neighbor option for 802.3ad Stacking technology is a type of technology used to expand ports on Ethernet switches. It is widely used as a common access method in large-scale Internet data center architectures. Years of practice have proved that stacking technology has advantages and disadvantages in high-reliability network architecture scenarios. For instance, in stacking networking arch, conventional switch system upgrades require multiple stacked devices to restart at the same time. Therefore, it is inevitable that the business will be interrupted for a while. It is for this reason that "no-stacking" in data centers has become a trend. Additionally, when the stacking link connecting the switches fails or is abnormal, the stack will split. 
Although it is not common, it still happens in actual operation. The problem is that after the split, it is equivalent to two switches with the same configuration appearing in the network, causing network configuration conflicts and ultimately interrupting the services carried by the stacking system. To improve network stability, "non-stacking" solutions have been increasingly adopted, particularly by public cloud providers and tech companies like Alibaba, Tencent, and Didi. "Non-stacking" is a method of mimicking switch stacking that convinces a LACP peer, bonding in this case, connected to a set of "non-stacked" switches that all of its ports are connected to a single switch (i.e., LACP aggregator), as if those switches were stacked. This enables the LACP peer's ports to aggregate together, and requires (a) special switch configuration, described in the linked article, and (b) modifications to the bonding 802.3ad (LACP) mode to send all ARP/ND packets across all ports of the active aggregator. Note that, with multiple aggregators, the current broadcast mode logic will send packets only to the selected aggregator(s). +-----------+ +-----------+ | switch1 | | switch2 | +-----------+ +-----------+ ^ ^ | | +-----------------+ | bond4 lacp | +-----------------+ | | | NIC1 | NIC2 +-----------------+ | server | +-----------------+ - https://www.ruijie.com/fr-fr/support/tech-gallery/de-stack-data-center-network-architecture/ Cc: Jay Vosburgh Cc: "David S. 
Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Jonathan Corbet Cc: Andrew Lunn Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Nikolay Aleksandrov Signed-off-by: Tonghao Zhang Signed-off-by: Zengbing Tu Link: https://patch.msgid.link/84d0a044514157bb856a10b6d03a1028c4883561.1751031306.git.tonghao@bamaicloud.com Signed-off-by: Paolo Abeni --- Documentation/networking/bonding.rst | 6 ++++ drivers/net/bonding/bond_main.c | 66 ++++++++++++++++++++++++++++++++---- drivers/net/bonding/bond_options.c | 42 +++++++++++++++++++++++ include/net/bond_options.h | 1 + include/net/bonding.h | 3 ++ 5 files changed, 112 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/bonding.rst b/Documentation/networking/bonding.rst index a4c1291d2561..14f7593d888d 100644 --- a/Documentation/networking/bonding.rst +++ b/Documentation/networking/bonding.rst @@ -562,6 +562,12 @@ lacp_rate The default is slow. +broadcast_neighbor + + Option specifying whether to broadcast ARP/ND packets to all + active slaves. This option has no effect in modes other than + 802.3ad mode. The default is off (0). 
+ max_bonds Specifies the number of bonding devices to create for this diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index c4d53e8e7c15..12046ef51569 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -212,6 +212,8 @@ atomic_t netpoll_block_tx = ATOMIC_INIT(0); unsigned int bond_net_id __read_mostly; +DEFINE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); + static const struct flow_dissector_key flow_keys_bonding_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, @@ -4456,6 +4458,9 @@ static int bond_open(struct net_device *bond_dev) bond_for_each_slave(bond, slave, iter) dev_mc_add(slave->dev, lacpdu_mcast_addr); + + if (bond->params.broadcast_neighbor) + static_branch_inc(&bond_bcast_neigh_enabled); } if (bond_mode_can_use_xmit_hash(bond)) @@ -4475,6 +4480,10 @@ static int bond_close(struct net_device *bond_dev) bond_alb_deinitialize(bond); bond->recv_probe = NULL; + if (BOND_MODE(bond) == BOND_MODE_8023AD && + bond->params.broadcast_neighbor) + static_branch_dec(&bond_bcast_neigh_enabled); + if (bond_uses_primary(bond)) { rcu_read_lock(); slave = rcu_dereference(bond->curr_active_slave); @@ -5310,6 +5319,37 @@ static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond, return slaves->arr[hash % count]; } +static bool bond_should_broadcast_neighbor(struct sk_buff *skb, + struct net_device *dev) +{ + struct bonding *bond = netdev_priv(dev); + struct { + struct ipv6hdr ip6; + struct icmp6hdr icmp6; + } *combined, _combined; + + if (!static_branch_unlikely(&bond_bcast_neigh_enabled)) + return false; + + if (!bond->params.broadcast_neighbor) + return false; + + if (skb->protocol == htons(ETH_P_ARP)) + return true; + + if (skb->protocol == htons(ETH_P_IPV6)) { + combined = skb_header_pointer(skb, skb_mac_header_len(skb), + sizeof(_combined), + &_combined); + if (combined && combined->ip6.nexthdr == NEXTHDR_ICMP && + (combined->icmp6.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || + 
combined->icmp6.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) + return true; + } + + return false; +} + /* Use this Xmit function for 3AD as well as XOR modes. The current * usable slave array is formed in the control path. The xmit function * just calculates hash and sends the packet out. @@ -5329,17 +5369,27 @@ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb, return bond_tx_drop(dev, skb); } -/* in broadcast mode, we send everything to all usable interfaces. */ +/* in broadcast mode, we send everything to all or usable slave interfaces. + * under rcu_read_lock when this function is called. + */ static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb, - struct net_device *bond_dev) + struct net_device *bond_dev, + bool all_slaves) { struct bonding *bond = netdev_priv(bond_dev); - struct slave *slave = NULL; - struct list_head *iter; + struct bond_up_slave *slaves; bool xmit_suc = false; bool skb_used = false; + int slaves_count, i; - bond_for_each_slave_rcu(bond, slave, iter) { + if (all_slaves) + slaves = rcu_dereference(bond->all_slaves); + else + slaves = rcu_dereference(bond->usable_slaves); + + slaves_count = slaves ? 
READ_ONCE(slaves->count) : 0; + for (i = 0; i < slaves_count; i++) { + struct slave *slave = slaves->arr[i]; struct sk_buff *skb2; if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)) @@ -5577,10 +5627,13 @@ static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev case BOND_MODE_ACTIVEBACKUP: return bond_xmit_activebackup(skb, dev); case BOND_MODE_8023AD: + if (bond_should_broadcast_neighbor(skb, dev)) + return bond_xmit_broadcast(skb, dev, false); + fallthrough; case BOND_MODE_XOR: return bond_3ad_xor_xmit(skb, dev); case BOND_MODE_BROADCAST: - return bond_xmit_broadcast(skb, dev); + return bond_xmit_broadcast(skb, dev, true); case BOND_MODE_ALB: return bond_alb_xmit(skb, dev); case BOND_MODE_TLB: @@ -6456,6 +6509,7 @@ static int __init bond_check_params(struct bond_params *params) eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; params->coupled_control = 1; + params->broadcast_neighbor = 0; if (packets_per_slave > 0) { params->reciprocal_packets_per_slave = reciprocal_value(packets_per_slave); diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 91893c29b899..1d639a3be6ba 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -87,6 +87,8 @@ static int bond_option_missed_max_set(struct bonding *bond, const struct bond_opt_value *newval); static int bond_option_coupled_control_set(struct bonding *bond, const struct bond_opt_value *newval); +static int bond_option_broadcast_neigh_set(struct bonding *bond, + const struct bond_opt_value *newval); static const struct bond_opt_value bond_mode_tbl[] = { { "balance-rr", BOND_MODE_ROUNDROBIN, BOND_VALFLAG_DEFAULT}, @@ -240,6 +242,12 @@ static const struct bond_opt_value bond_coupled_control_tbl[] = { { NULL, -1, 0}, }; +static const struct bond_opt_value bond_broadcast_neigh_tbl[] = { + { "off", 0, BOND_VALFLAG_DEFAULT}, + { "on", 1, 0}, + { NULL, -1, 0} +}; + static const struct 
bond_option bond_opts[BOND_OPT_LAST] = { [BOND_OPT_MODE] = { .id = BOND_OPT_MODE, @@ -513,6 +521,14 @@ static const struct bond_option bond_opts[BOND_OPT_LAST] = { .flags = BOND_OPTFLAG_IFDOWN, .values = bond_coupled_control_tbl, .set = bond_option_coupled_control_set, + }, + [BOND_OPT_BROADCAST_NEIGH] = { + .id = BOND_OPT_BROADCAST_NEIGH, + .name = "broadcast_neighbor", + .desc = "Broadcast neighbor packets to all active slaves", + .unsuppmodes = BOND_MODE_ALL_EX(BIT(BOND_MODE_8023AD)), + .values = bond_broadcast_neigh_tbl, + .set = bond_option_broadcast_neigh_set, } }; @@ -894,6 +910,13 @@ static int bond_option_mode_set(struct bonding *bond, bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; bond->params.mode = newval->value; + /* When changing mode, the bond device is down, we may reduce + * the bond_bcast_neigh_enabled in bond_close() if broadcast_neighbor + * enabled in 8023ad mode. Therefore, only clear broadcast_neighbor + * to 0. + */ + bond->params.broadcast_neighbor = 0; + if (bond->dev->reg_state == NETREG_REGISTERED) { bool update = false; @@ -1840,3 +1863,22 @@ static int bond_option_coupled_control_set(struct bonding *bond, bond->params.coupled_control = newval->value; return 0; } + +static int bond_option_broadcast_neigh_set(struct bonding *bond, + const struct bond_opt_value *newval) +{ + if (bond->params.broadcast_neighbor == newval->value) + return 0; + + bond->params.broadcast_neighbor = newval->value; + if (bond->dev->flags & IFF_UP) { + if (bond->params.broadcast_neighbor) + static_branch_inc(&bond_bcast_neigh_enabled); + else + static_branch_dec(&bond_bcast_neigh_enabled); + } + + netdev_dbg(bond->dev, "Setting broadcast_neighbor to %s (%llu)\n", + newval->string, newval->value); + return 0; +} diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 18687ccf0638..022b122a9fb6 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -77,6 +77,7 @@ enum { BOND_OPT_NS_TARGETS, BOND_OPT_PRIO, 
BOND_OPT_COUPLED_CONTROL, + BOND_OPT_BROADCAST_NEIGH, BOND_OPT_LAST }; diff --git a/include/net/bonding.h b/include/net/bonding.h index 95f67b308c19..e06f0d63b2c1 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -115,6 +115,8 @@ static inline int is_netpoll_tx_blocked(struct net_device *dev) #define is_netpoll_tx_blocked(dev) (0) #endif +DECLARE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); + struct bond_params { int mode; int xmit_policy; @@ -149,6 +151,7 @@ struct bond_params { struct in6_addr ns_targets[BOND_MAX_NS_TARGETS]; #endif int coupled_control; + int broadcast_neighbor; /* 2 bytes of padding : see ether_addr_equal_64bits() */ u8 ad_actor_system[ETH_ALEN + 2]; -- cgit v1.2.3 From 269936db5eb3962fe290b1dc4dbf1859cd5a04dd Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 2 Jul 2025 14:20:03 +0800 Subject: net: mctp: separate routing database from routing operations This change adds a struct mctp_dst, representing the result of a routing lookup. This decouples the struct mctp_route from the actual implementation of a routing operation. This will allow for future routing changes which may require more involved lookup logic, such as gateway routing - which may require multiple traversals of the routing table. Since we only use the struct mctp_route at lookup time, we no longer hold routes over a routing operation, as we only need it to populate the dst. However, we do hold the dev while the dst is active. This requires some changes to the route test infrastructure, as we no longer have a mock route to handle the route output operation, and transient dsts are created by the routing code, so we can't override them as easily. Instead, we use kunit->priv to stash a packet queue, and a custom dst_output function queues into that packet queue, which we can use for later expectations. 
Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20250702-dev-forwarding-v5-3-1468191da8a4@codeconstruct.com.au Signed-off-by: Paolo Abeni --- include/net/mctp.h | 35 +++++-- net/mctp/af_mctp.c | 62 ++++-------- net/mctp/route.c | 210 ++++++++++++++++++++------------------ net/mctp/test/route-test.c | 245 ++++++++++++++++++++++++++++----------------- 4 files changed, 313 insertions(+), 239 deletions(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index 07d458990113..6c9c5c48f59a 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -222,6 +222,8 @@ struct mctp_flow { struct mctp_sk_key *key; }; +struct mctp_dst; + /* Route definition. * * These are held in the pernet->mctp.routes list, with RCU protection for @@ -229,8 +231,7 @@ struct mctp_flow { * dropped on NETDEV_UNREGISTER events. * * Updates to the route table are performed under rtnl; all reads under RCU, - * so routes cannot be referenced over a RCU grace period. Specifically: A - * caller cannot block between mctp_route_lookup and mctp_route_release() + * so routes cannot be referenced over a RCU grace period. */ struct mctp_route { mctp_eid_t min, max; @@ -238,7 +239,7 @@ struct mctp_route { unsigned char type; unsigned int mtu; struct mctp_dev *dev; - int (*output)(struct mctp_route *route, + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); struct list_head list; @@ -246,12 +247,34 @@ struct mctp_route { struct rcu_head rcu; }; +/* Route lookup result: dst. Represents the results of a routing decision, + * but is only held over the individual routing operation. + * + * Will typically be stored on the caller stack, and must be released after + * usage. 
+ */ +struct mctp_dst { + struct mctp_dev *dev; + unsigned int mtu; + + /* set for direct addressing */ + unsigned char halen; + unsigned char haddr[MAX_ADDR_LEN]; + + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); +}; + +int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, + unsigned char halen, const unsigned char *haddr); + /* route interfaces */ -struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, - mctp_eid_t daddr); +int mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr, struct mctp_dst *dst); + +void mctp_dst_release(struct mctp_dst *dst); /* always takes ownership of skb */ -int mctp_local_output(struct sock *sk, struct mctp_route *rt, +int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag); void mctp_key_unref(struct mctp_sk_key *key); diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 9b12ca97f412..e2570d9755ea 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -97,8 +97,8 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) struct sock *sk = sock->sk; struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb; - struct mctp_route *rt; struct sk_buff *skb = NULL; + struct mctp_dst dst; int hlen; if (addr) { @@ -133,34 +133,30 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) if (msk->addr_ext && addrlen >= sizeof(struct sockaddr_mctp_ext)) { DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, extaddr, msg->msg_name); - struct net_device *dev; - - rc = -EINVAL; - rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), extaddr->smctp_ifindex); - /* check for correct halen */ - if (dev && extaddr->smctp_halen == dev->addr_len) { - hlen = LL_RESERVED_SPACE(dev) + sizeof(struct mctp_hdr); - rc = 0; - } - rcu_read_unlock(); + + if (!mctp_sockaddr_ext_is_ok(extaddr)) + return -EINVAL; + + rc = mctp_dst_from_extaddr(&dst, 
sock_net(sk), + extaddr->smctp_ifindex, + extaddr->smctp_halen, + extaddr->smctp_haddr); if (rc) - goto err_free; - rt = NULL; + return rc; + } else { - rt = mctp_route_lookup(sock_net(sk), addr->smctp_network, - addr->smctp_addr.s_addr); - if (!rt) { - rc = -EHOSTUNREACH; - goto err_free; - } - hlen = LL_RESERVED_SPACE(rt->dev->dev) + sizeof(struct mctp_hdr); + rc = mctp_route_lookup(sock_net(sk), addr->smctp_network, + addr->smctp_addr.s_addr, &dst); + if (rc) + return rc; } + hlen = LL_RESERVED_SPACE(dst.dev->dev) + sizeof(struct mctp_hdr); + skb = sock_alloc_send_skb(sk, hlen + 1 + len, msg->msg_flags & MSG_DONTWAIT, &rc); if (!skb) - return rc; + goto err_release_dst; skb_reserve(skb, hlen); @@ -175,30 +171,16 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) cb = __mctp_cb(skb); cb->net = addr->smctp_network; - if (!rt) { - /* fill extended address in cb */ - DECLARE_SOCKADDR(struct sockaddr_mctp_ext *, - extaddr, msg->msg_name); - - if (!mctp_sockaddr_ext_is_ok(extaddr) || - extaddr->smctp_halen > sizeof(cb->haddr)) { - rc = -EINVAL; - goto err_free; - } - - cb->ifindex = extaddr->smctp_ifindex; - /* smctp_halen is checked above */ - cb->halen = extaddr->smctp_halen; - memcpy(cb->haddr, extaddr->smctp_haddr, cb->halen); - } - - rc = mctp_local_output(sk, rt, skb, addr->smctp_addr.s_addr, + rc = mctp_local_output(sk, &dst, skb, addr->smctp_addr.s_addr, addr->smctp_tag); + mctp_dst_release(&dst); return rc ? 
: len; err_free: kfree_skb(skb); +err_release_dst: + mctp_dst_release(&dst); return rc; } diff --git a/net/mctp/route.c b/net/mctp/route.c index 128ac46dda5e..3985388a6035 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -32,7 +32,7 @@ static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ; static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev); /* route output callbacks */ -static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb) +static int mctp_dst_discard(struct mctp_dst *dst, struct sk_buff *skb) { kfree_skb(skb); return 0; @@ -368,7 +368,7 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) return 0; } -static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb) +static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) { struct mctp_sk_key *key, *any_key = NULL; struct net *net = dev_net(skb->dev); @@ -559,24 +559,17 @@ out: return rc; } -static unsigned int mctp_route_mtu(struct mctp_route *rt) -{ - return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu); -} - -static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) +static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { struct mctp_skb_cb *cb = mctp_cb(skb); struct mctp_hdr *hdr = mctp_hdr(skb); char daddr_buf[MAX_ADDR_LEN]; char *daddr = NULL; - unsigned int mtu; int rc; skb->protocol = htons(ETH_P_MCTP); - mtu = READ_ONCE(skb->dev->mtu); - if (skb->len > mtu) { + if (skb->len > dst->mtu) { kfree_skb(skb); return -EMSGSIZE; } @@ -598,7 +591,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) } else { skb->pkt_type = PACKET_OUTGOING; /* If lookup fails let the device handle daddr==NULL */ - if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0) + if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0) daddr = daddr_buf; } @@ -609,7 +602,7 @@ static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb) return 
-EHOSTUNREACH; } - mctp_flow_prepare_output(skb, route->dev); + mctp_flow_prepare_output(skb, dst->dev); rc = dev_queue_xmit(skb); if (rc) @@ -638,7 +631,7 @@ static struct mctp_route *mctp_route_alloc(void) INIT_LIST_HEAD(&rt->list); refcount_set(&rt->refs, 1); - rt->output = mctp_route_discard; + rt->output = mctp_dst_discard; return rt; } @@ -828,49 +821,106 @@ static bool mctp_rt_compare_exact(struct mctp_route *rt1, rt1->max == rt2->max; } -struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet, - mctp_eid_t daddr) +static void mctp_dst_from_route(struct mctp_dst *dst, struct mctp_route *route) +{ + mctp_dev_hold(route->dev); + dst->dev = route->dev; + dst->mtu = route->mtu ?: READ_ONCE(dst->dev->dev->mtu); + dst->halen = 0; + dst->output = route->output; +} + +int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, + unsigned char halen, const unsigned char *haddr) { - struct mctp_route *tmp, *rt = NULL; + struct net_device *netdev; + struct mctp_dev *dev; + int rc = -ENOENT; + + if (halen > sizeof(dst->haddr)) + return -EINVAL; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { + netdev = dev_get_by_index_rcu(net, ifindex); + if (!netdev) + goto out_unlock; + + if (netdev->addr_len != halen) { + rc = -EINVAL; + goto out_unlock; + } + + dev = __mctp_dev_get(netdev); + if (!dev) + goto out_unlock; + + dst->dev = dev; + dst->mtu = READ_ONCE(netdev->mtu); + dst->halen = halen; + dst->output = mctp_dst_output; + memcpy(dst->haddr, haddr, halen); + + rc = 0; + +out_unlock: + rcu_read_unlock(); + return rc; +} + +void mctp_dst_release(struct mctp_dst *dst) +{ + mctp_dev_put(dst->dev); +} + +/* populates *dst on successful lookup, if set */ +int mctp_route_lookup(struct net *net, unsigned int dnet, + mctp_eid_t daddr, struct mctp_dst *dst) +{ + int rc = -EHOSTUNREACH; + struct mctp_route *rt; + + rcu_read_lock(); + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { /* TODO: add metrics */ - if 
(mctp_rt_match_eid(tmp, dnet, daddr)) { - if (refcount_inc_not_zero(&tmp->refs)) { - rt = tmp; - break; - } - } + if (!mctp_rt_match_eid(rt, dnet, daddr)) + continue; + + if (dst) + mctp_dst_from_route(dst, rt); + rc = 0; + break; } rcu_read_unlock(); - return rt; + return rc; } -static struct mctp_route *mctp_route_lookup_null(struct net *net, - struct net_device *dev) +static int mctp_route_lookup_null(struct net *net, struct net_device *dev, + struct mctp_dst *dst) { - struct mctp_route *tmp, *rt = NULL; + int rc = -EHOSTUNREACH; + struct mctp_route *rt; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &net->mctp.routes, list) { - if (tmp->dev->dev == dev && tmp->type == RTN_LOCAL && - refcount_inc_not_zero(&tmp->refs)) { - rt = tmp; - break; - } + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (rt->dev->dev != dev || rt->type != RTN_LOCAL) + continue; + + mctp_dst_from_route(dst, rt); + rc = 0; + break; } rcu_read_unlock(); - return rt; + return rc; } -static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, +static int mctp_do_fragment_route(struct mctp_dst *dst, struct sk_buff *skb, unsigned int mtu, u8 tag) { const unsigned int hlen = sizeof(struct mctp_hdr); @@ -943,7 +993,7 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, skb_ext_copy(skb2, skb); /* do route */ - rc = rt->output(rt, skb2); + rc = dst->output(dst, skb2); if (rc) break; @@ -955,68 +1005,32 @@ static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb, return rc; } -int mctp_local_output(struct sock *sk, struct mctp_route *rt, +int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); struct mctp_skb_cb *cb = mctp_cb(skb); - struct mctp_route tmp_rt = {0}; struct mctp_sk_key *key; struct mctp_hdr *hdr; unsigned long flags; unsigned int netid; unsigned int mtu; mctp_eid_t saddr; - bool ext_rt; 
int rc; u8 tag; rc = -ENODEV; - if (rt) { - ext_rt = false; - if (WARN_ON(!rt->dev)) - goto out_release; - - } else if (cb->ifindex) { - struct net_device *dev; - - ext_rt = true; - rt = &tmp_rt; - - rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex); - if (!dev) { - rcu_read_unlock(); - goto out_free; - } - rt->dev = __mctp_dev_get(dev); - rcu_read_unlock(); - - if (!rt->dev) - goto out_release; - - /* establish temporary route - we set up enough to keep - * mctp_route_output happy - */ - rt->output = mctp_route_output; - rt->mtu = 0; - - } else { - rc = -EINVAL; - goto out_free; - } - - spin_lock_irqsave(&rt->dev->addrs_lock, flags); - if (rt->dev->num_addrs == 0) { + spin_lock_irqsave(&dst->dev->addrs_lock, flags); + if (dst->dev->num_addrs == 0) { rc = -EHOSTUNREACH; } else { /* use the outbound interface's first address as our source */ - saddr = rt->dev->addrs[0]; + saddr = dst->dev->addrs[0]; rc = 0; } - spin_unlock_irqrestore(&rt->dev->addrs_lock, flags); - netid = READ_ONCE(rt->dev->net); + spin_unlock_irqrestore(&dst->dev->addrs_lock, flags); + netid = READ_ONCE(dst->dev->net); if (rc) goto out_release; @@ -1048,7 +1062,7 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, skb_reset_transport_header(skb); skb_push(skb, sizeof(struct mctp_hdr)); skb_reset_network_header(skb); - skb->dev = rt->dev->dev; + skb->dev = dst->dev->dev; /* cb->net will have been set on initial ingress */ cb->src = saddr; @@ -1059,26 +1073,20 @@ int mctp_local_output(struct sock *sk, struct mctp_route *rt, hdr->dest = daddr; hdr->src = saddr; - mtu = mctp_route_mtu(rt); + mtu = dst->mtu; if (skb->len + sizeof(struct mctp_hdr) <= mtu) { hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM | tag; - rc = rt->output(rt, skb); + rc = dst->output(dst, skb); } else { - rc = mctp_do_fragment_route(rt, skb, mtu, tag); + rc = mctp_do_fragment_route(dst, skb, mtu, tag); } /* route output functions consume the skb, even on error */ skb = NULL; 
out_release: - if (!ext_rt) - mctp_route_release(rt); - - mctp_dev_put(tmp_rt.dev); - -out_free: kfree_skb(skb); return rc; } @@ -1088,7 +1096,7 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, unsigned int daddr_extent, unsigned int mtu, unsigned char type) { - int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb); + int (*rtfn)(struct mctp_dst *dst, struct sk_buff *skb); struct net *net = dev_net(mdev->dev); struct mctp_route *rt, *ert; @@ -1100,15 +1108,17 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, switch (type) { case RTN_LOCAL: - rtfn = mctp_route_input; + rtfn = mctp_dst_input; break; case RTN_UNICAST: - rtfn = mctp_route_output; + rtfn = mctp_dst_output; break; default: return -EINVAL; } + ASSERT_RTNL(); + rt = mctp_route_alloc(); if (!rt) return -ENOMEM; @@ -1121,7 +1131,6 @@ static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start, rt->type = type; rt->output = rtfn; - ASSERT_RTNL(); /* Prevent duplicate identical routes. 
*/ list_for_each_entry(ert, &net->mctp.routes, list) { if (mctp_rt_compare_exact(rt, ert)) { @@ -1200,8 +1209,9 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, struct net *net = dev_net(dev); struct mctp_dev *mdev; struct mctp_skb_cb *cb; - struct mctp_route *rt; + struct mctp_dst dst; struct mctp_hdr *mh; + int rc; rcu_read_lock(); mdev = __mctp_dev_get(dev); @@ -1243,17 +1253,17 @@ static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev, cb->net = READ_ONCE(mdev->net); cb->ifindex = dev->ifindex; - rt = mctp_route_lookup(net, cb->net, mh->dest); + rc = mctp_route_lookup(net, cb->net, mh->dest, &dst); /* NULL EID, but addressed to our physical address */ - if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST) - rt = mctp_route_lookup_null(net, dev); + if (rc && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST) + rc = mctp_route_lookup_null(net, dev, &dst); - if (!rt) + if (rc) goto err_drop; - rt->output(rt, skb); - mctp_route_release(rt); + dst.output(&dst, skb); + mctp_dst_release(&dst); mctp_dev_put(mdev); return NET_RX_SUCCESS; diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index 44ebc8e4e30c..7a1eba463fe7 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -2,18 +2,37 @@ #include +/* keep clangd happy when compiled outside of the route.c include */ +#include +#include + #include "utils.h" struct mctp_test_route { struct mctp_route rt; - struct sk_buff_head pkts; }; -static int mctp_test_route_output(struct mctp_route *rt, struct sk_buff *skb) +static const unsigned int test_pktqueue_magic = 0x5f713aef; + +struct mctp_test_pktqueue { + unsigned int magic; + struct sk_buff_head pkts; +}; + +static void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq) +{ + tpq->magic = test_pktqueue_magic; + skb_queue_head_init(&tpq->pkts); +} + +static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { - struct mctp_test_route 
*test_rt = container_of(rt, struct mctp_test_route, rt); + struct kunit *test = current->kunit_test; + struct mctp_test_pktqueue *tpq = test->priv; + + KUNIT_ASSERT_EQ(test, tpq->magic, test_pktqueue_magic); - skb_queue_tail(&test_rt->pkts, skb); + skb_queue_tail(&tpq->pkts, skb); return 0; } @@ -29,9 +48,7 @@ static struct mctp_test_route *mctp_route_test_alloc(void) INIT_LIST_HEAD(&rt->rt.list); refcount_set(&rt->rt.refs, 1); - rt->rt.output = mctp_test_route_output; - - skb_queue_head_init(&rt->pkts); + rt->rt.output = mctp_test_dst_output; return rt; } @@ -60,6 +77,32 @@ static struct mctp_test_route *mctp_test_create_route(struct net *net, return rt; } +/* Convenience function for our test dst; release with mctp_test_dst_release() + */ +static void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, + struct mctp_test_dev *dev, + struct mctp_test_pktqueue *tpq, + unsigned int mtu) +{ + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); + + memset(dst, 0, sizeof(*dst)); + + dst->dev = dev->mdev; + __mctp_dev_get(dst->dev->dev); + dst->mtu = mtu; + dst->output = mctp_test_dst_output; + mctp_test_pktqueue_init(tpq); + test->priv = tpq; +} + +static void mctp_test_dst_release(struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq) +{ + mctp_dst_release(dst); + skb_queue_purge(&tpq->pkts); +} + static void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt) { @@ -69,7 +112,6 @@ static void mctp_test_route_destroy(struct kunit *test, list_del_rcu(&rt->rt.list); rtnl_unlock(); - skb_queue_purge(&rt->pkts); if (rt->rt.dev) mctp_dev_put(rt->rt.dev); @@ -141,8 +183,10 @@ struct mctp_frag_test { static void mctp_test_fragment(struct kunit *test) { const struct mctp_frag_test *params; + struct mctp_test_pktqueue tpq; int rc, i, n, mtu, msgsize; - struct mctp_test_route *rt; + struct mctp_test_dev *dev; + struct mctp_dst dst; struct sk_buff *skb; struct mctp_hdr hdr; u8 seq; @@ -159,13 +203,15 @@ static void mctp_test_fragment(struct kunit *test) skb = 
mctp_test_create_skb(&hdr, msgsize); KUNIT_ASSERT_TRUE(test, skb); - rt = mctp_test_create_route(&init_net, NULL, 10, mtu); - KUNIT_ASSERT_TRUE(test, rt); + dev = mctp_test_create_dev(); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + + mctp_test_dst_setup(test, &dst, dev, &tpq, mtu); - rc = mctp_do_fragment_route(&rt->rt, skb, mtu, MCTP_TAG_OWNER); + rc = mctp_do_fragment_route(&dst, skb, mtu, MCTP_TAG_OWNER); KUNIT_EXPECT_FALSE(test, rc); - n = rt->pkts.qlen; + n = tpq.pkts.qlen; KUNIT_EXPECT_EQ(test, n, params->n_frags); @@ -178,7 +224,7 @@ static void mctp_test_fragment(struct kunit *test) first = i == 0; last = i == (n - 1); - skb2 = skb_dequeue(&rt->pkts); + skb2 = skb_dequeue(&tpq.pkts); if (!skb2) break; @@ -216,7 +262,8 @@ static void mctp_test_fragment(struct kunit *test) kfree_skb(skb2); } - mctp_test_route_destroy(test, rt); + mctp_test_dst_release(&dst, &tpq); + mctp_test_destroy_dev(dev); } static const struct mctp_frag_test mctp_frag_tests[] = { @@ -246,11 +293,13 @@ struct mctp_rx_input_test { static void mctp_test_rx_input(struct kunit *test) { const struct mctp_rx_input_test *params; + struct mctp_test_pktqueue tpq; struct mctp_test_route *rt; struct mctp_test_dev *dev; struct sk_buff *skb; params = test->param_value; + test->priv = &tpq; dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); @@ -261,10 +310,13 @@ static void mctp_test_rx_input(struct kunit *test) skb = mctp_test_create_skb(&params->hdr, 1); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); + mctp_test_pktqueue_init(&tpq); + mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL); - KUNIT_EXPECT_EQ(test, !!rt->pkts.qlen, params->input); + KUNIT_EXPECT_EQ(test, !!tpq.pkts.qlen, params->input); + skb_queue_purge(&tpq.pkts); mctp_test_route_destroy(test, rt); mctp_test_destroy_dev(dev); } @@ -292,12 +344,12 @@ KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests, /* set up a local dev, route on EID 8, and a socket listening on type 0 */ static void __mctp_route_test_init(struct
kunit *test, struct mctp_test_dev **devp, - struct mctp_test_route **rtp, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, struct socket **sockp, unsigned int netid) { struct sockaddr_mctp addr = {0}; - struct mctp_test_route *rt; struct mctp_test_dev *dev; struct socket *sock; int rc; @@ -307,8 +359,7 @@ static void __mctp_route_test_init(struct kunit *test, if (netid != MCTP_NET_ANY) WRITE_ONCE(dev->mdev->net, netid); - rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + mctp_test_dst_setup(test, dst, dev, tpq, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -320,18 +371,18 @@ static void __mctp_route_test_init(struct kunit *test, rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); KUNIT_ASSERT_EQ(test, rc, 0); - *rtp = rt; *devp = dev; *sockp = sock; } static void __mctp_route_test_fini(struct kunit *test, struct mctp_test_dev *dev, - struct mctp_test_route *rt, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, struct socket *sock) { sock_release(sock); - mctp_test_route_destroy(test, rt); + mctp_test_dst_release(dst, tpq); mctp_test_destroy_dev(dev); } @@ -344,22 +395,24 @@ struct mctp_route_input_sk_test { static void mctp_test_route_input_sk(struct kunit *test) { const struct mctp_route_input_sk_test *params; + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct socket *sock; int rc; params = test->param_value; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); skb = mctp_test_create_skb_data(&params->hdr, &params->type); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); mctp_test_skb_set_dev(skb, dev); + mctp_test_pktqueue_init(&tpq); - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); if (params->deliver) { KUNIT_EXPECT_EQ(test, rc, 0); @@ -376,7
+429,7 @@ static void mctp_test_route_input_sk(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } #define FL_S (MCTP_HDR_FLAG_SOM) @@ -413,16 +466,17 @@ struct mctp_route_input_sk_reasm_test { static void mctp_test_route_input_sk_reasm(struct kunit *test) { const struct mctp_route_input_sk_reasm_test *params; + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct socket *sock; int i, rc; u8 c; params = test->param_value; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); for (i = 0; i < params->n_hdrs; i++) { c = i; @@ -431,7 +485,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) mctp_test_skb_set_dev(skb, dev); - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); } skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); @@ -445,7 +499,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } #define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_TO | (f) | ((s) << MCTP_HDR_SEQ_SHIFT)) @@ -547,7 +601,7 @@ struct mctp_route_input_sk_keys_test { static void mctp_test_route_input_sk_keys(struct kunit *test) { const struct mctp_route_input_sk_keys_test *params; - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_sk_key *key; @@ -555,6 +609,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) struct mctp_sock *msk; struct socket *sock; unsigned long flags; + struct mctp_dst dst; unsigned int net; int rc; u8 c; @@ -565,8 +620,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); net 
= READ_ONCE(dev->mdev->net); - rt = mctp_test_create_route(&init_net, dev->mdev, 8, 68); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + mctp_test_dst_setup(test, &dst, dev, &tpq, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -592,7 +646,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) mctp_test_skb_set_dev(skb, dev); - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); /* (potentially) receive message */ skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc); @@ -606,7 +660,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) skb_free_datagram(sock->sk, skb2); mctp_key_unref(key); - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } static const struct mctp_route_input_sk_keys_test mctp_route_input_sk_keys_tests[] = { @@ -681,7 +735,8 @@ KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests, struct test_net { unsigned int netid; struct mctp_test_dev *dev; - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; + struct mctp_dst dst; struct socket *sock; struct sk_buff *skb; struct mctp_sk_key *key; @@ -699,18 +754,20 @@ mctp_test_route_input_multiple_nets_bind_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, + t->netid); t->skb = mctp_test_create_skb_data(&hdr, &t->msg); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->skb); mctp_test_skb_set_dev(t->skb, t->dev); + mctp_test_pktqueue_init(&t->tpq); } static void mctp_test_route_input_multiple_nets_bind_fini(struct kunit *test, struct test_net *t) { - __mctp_route_test_fini(test, t->dev, t->rt, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); } /* Test that skbs from different nets (otherwise identical) get routed to their @@ -731,9 +788,9 @@ static void 
mctp_test_route_input_multiple_nets_bind(struct kunit *test) mctp_test_route_input_multiple_nets_bind_init(test, &t1); mctp_test_route_input_multiple_nets_bind_init(test, &t2); - rc = mctp_route_input(&t1.rt->rt, t1.skb); + rc = mctp_dst_input(&t1.dst, t1.skb); KUNIT_ASSERT_EQ(test, rc, 0); - rc = mctp_route_input(&t2.rt->rt, t2.skb); + rc = mctp_dst_input(&t2.dst, t2.skb); KUNIT_ASSERT_EQ(test, rc, 0); rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc); @@ -767,7 +824,8 @@ mctp_test_route_input_multiple_nets_key_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->rt, &t->sock, t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, + t->netid); msk = container_of(t->sock->sk, struct mctp_sock, sk); @@ -790,7 +848,7 @@ mctp_test_route_input_multiple_nets_key_fini(struct kunit *test, struct test_net *t) { mctp_key_unref(t->key); - __mctp_route_test_fini(test, t->dev, t->rt, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); } /* test that skbs from different nets (otherwise identical) get routed to their @@ -812,9 +870,9 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) mctp_test_route_input_multiple_nets_key_init(test, &t1); mctp_test_route_input_multiple_nets_key_init(test, &t2); - rc = mctp_route_input(&t1.rt->rt, t1.skb); + rc = mctp_dst_input(&t1.dst, t1.skb); KUNIT_ASSERT_EQ(test, rc, 0); - rc = mctp_route_input(&t2.rt->rt, t2.skb); + rc = mctp_dst_input(&t2.dst, t2.skb); KUNIT_ASSERT_EQ(test, rc, 0); rx_skb1 = skb_recv_datagram(t1.sock->sk, MSG_DONTWAIT, &rc); @@ -843,13 +901,14 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) static void mctp_test_route_input_sk_fail_single(struct kunit *test) { const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; + struct mctp_dst dst; struct socket *sock; struct sk_buff 
*skb; int rc; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); /* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. @@ -865,14 +924,14 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) mctp_test_skb_set_dev(skb, dev); /* do route input, which should fail */ - rc = mctp_route_input(&rt->rt, skb); + rc = mctp_dst_input(&dst, skb); KUNIT_EXPECT_NE(test, rc, 0); /* we should hold the only reference to skb */ KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); kfree_skb(skb); - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } /* Input route to socket, using a fragmented message, where sock delivery fails. @@ -880,14 +939,15 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) static void mctp_test_route_input_sk_fail_frag(struct kunit *test) { const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skbs[2]; + struct mctp_dst dst; struct socket *sock; unsigned int i; int rc; - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); lock_sock(sock->sk); WRITE_ONCE(sock->sk->sk_rcvbuf, 0); @@ -904,11 +964,11 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) /* first route input should succeed, we're only queueing to the * frag list */ - rc = mctp_route_input(&rt->rt, skbs[0]); + rc = mctp_dst_input(&dst, skbs[0]); KUNIT_EXPECT_EQ(test, rc, 0); /* final route input should fail to deliver to the socket */ - rc = mctp_route_input(&rt->rt, skbs[1]); + rc = mctp_dst_input(&dst, skbs[1]); KUNIT_EXPECT_NE(test, rc, 0); /* we should hold the only reference to both skbs */ @@ -918,7 +978,7 @@ static void 
mctp_test_route_input_sk_fail_frag(struct kunit *test) KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); kfree_skb(skbs[1]); - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } /* Input route to socket, using a fragmented message created from clones. @@ -936,10 +996,11 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) const size_t data_len = 3; /* arbitrary */ u8 compare[3 * ARRAY_SIZE(hdrs)]; u8 flat[3 * ARRAY_SIZE(hdrs)]; - struct mctp_test_route *rt; + struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skb[5]; struct sk_buff *rx_skb; + struct mctp_dst dst; struct socket *sock; size_t total; void *p; @@ -947,7 +1008,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) total = data_len + sizeof(struct mctp_hdr); - __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); /* Create a single skb initially with concatenated packets */ skb[0] = mctp_test_create_skb(&hdrs[0], 5 * total); @@ -986,7 +1047,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) /* Feed the fragments into MCTP core */ for (int i = 0; i < 5; i++) { - rc = mctp_route_input(&rt->rt, skb[i]); + rc = mctp_dst_input(&dst, skb[i]); KUNIT_EXPECT_EQ(test, rc, 0); } @@ -1024,29 +1085,29 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) kfree_skb(skb[i]); } - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, &dst, &tpq, sock); } #if IS_ENABLED(CONFIG_MCTP_FLOWS) static void mctp_test_flow_init(struct kunit *test, struct mctp_test_dev **devp, - struct mctp_test_route **rtp, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, struct socket **sock, struct sk_buff **skbp, unsigned int len) { - struct mctp_test_route *rt; struct mctp_test_dev *dev; struct sk_buff *skb; /* we have a slightly odd routing setup here; the test route * is for EID 8, 
which is our local EID. We don't do a routing * lookup, so that's fine - all we require is a path through - * mctp_local_output, which will call rt->output on whatever + * mctp_local_output, which will call dst->output on whatever * route we provide */ - __mctp_route_test_init(test, &dev, &rt, sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, dst, tpq, sock, MCTP_NET_ANY); /* Assign a single EID. ->addrs is freed on mctp netdev release */ dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); @@ -1059,42 +1120,41 @@ static void mctp_test_flow_init(struct kunit *test, skb_reserve(skb, sizeof(struct mctp_hdr) + 1); memset(skb_put(skb, len), 0, len); - /* take a ref for the route, we'll decrement in local output */ - refcount_inc(&rt->rt.refs); *devp = dev; - *rtp = rt; *skbp = skb; } static void mctp_test_flow_fini(struct kunit *test, struct mctp_test_dev *dev, - struct mctp_test_route *rt, + struct mctp_dst *dst, + struct mctp_test_pktqueue *tpq, struct socket *sock) { - __mctp_route_test_fini(test, dev, rt, sock); + __mctp_route_test_fini(test, dev, dst, tpq, sock); } /* test that an outgoing skb has the correct MCTP extension data set */ static void mctp_test_packet_flow(struct kunit *test) { + struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct mctp_flow *flow; struct socket *sock; - u8 dst = 8; + u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 30); + mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 30); - rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = rt->pkts.qlen; + n = tpq.pkts.qlen; KUNIT_ASSERT_EQ(test, n, 1); - skb2 = skb_dequeue(&rt->pkts); + skb2 = skb_dequeue(&tpq.pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb2); flow = skb_ext_find(skb2, SKB_EXT_MCTP); @@ -1103,7 +1163,7 @@ static void 
mctp_test_packet_flow(struct kunit *test) KUNIT_ASSERT_PTR_EQ(test, flow->key->sk, sock->sk); kfree_skb(skb2); - mctp_test_flow_fini(test, dev, rt, sock); + mctp_test_flow_fini(test, dev, &dst, &tpq, sock); } /* test that outgoing skbs, after fragmentation, all have the correct MCTP @@ -1111,26 +1171,27 @@ static void mctp_test_packet_flow(struct kunit *test) */ static void mctp_test_fragment_flow(struct kunit *test) { + struct mctp_test_pktqueue tpq; struct mctp_flow *flows[2]; struct sk_buff *tx_skbs[2]; - struct mctp_test_route *rt; struct mctp_test_dev *dev; + struct mctp_dst dst; struct sk_buff *skb; struct socket *sock; - u8 dst = 8; + u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &rt, &sock, &skb, 100); + mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 100); - rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = rt->pkts.qlen; + n = tpq.pkts.qlen; KUNIT_ASSERT_EQ(test, n, 2); /* both resulting packets should have the same flow data */ - tx_skbs[0] = skb_dequeue(&rt->pkts); - tx_skbs[1] = skb_dequeue(&rt->pkts); + tx_skbs[0] = skb_dequeue(&tpq.pkts); + tx_skbs[1] = skb_dequeue(&tpq.pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[0]); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[1]); @@ -1146,7 +1207,7 @@ static void mctp_test_fragment_flow(struct kunit *test) kfree_skb(tx_skbs[0]); kfree_skb(tx_skbs[1]); - mctp_test_flow_fini(test, dev, rt, sock); + mctp_test_flow_fini(test, dev, &dst, &tpq, sock); } #else @@ -1164,15 +1225,16 @@ static void mctp_test_fragment_flow(struct kunit *test) /* Test that outgoing skbs cause a suitable tag to be created */ static void mctp_test_route_output_key_create(struct kunit *test) { + const u8 dst_eid = 26, src_eid = 15; + struct mctp_test_pktqueue tpq; const unsigned int netid = 50; - const u8 dst = 26, src = 15; - struct mctp_test_route *rt; struct mctp_test_dev *dev; struct 
mctp_sk_key *key; struct netns_mctp *mns; unsigned long flags; struct socket *sock; struct sk_buff *skb; + struct mctp_dst dst; bool empty, single; const int len = 2; int rc; @@ -1181,15 +1243,14 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); WRITE_ONCE(dev->mdev->net, netid); - rt = mctp_test_create_route(&init_net, dev->mdev, dst, 68); - KUNIT_ASSERT_NOT_ERR_OR_NULL(test, rt); + mctp_test_dst_setup(test, &dst, dev, &tpq, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); dev->mdev->num_addrs = 1; - dev->mdev->addrs[0] = src; + dev->mdev->addrs[0] = src_eid; skb = alloc_skb(sizeof(struct mctp_hdr) + 1 + len, GFP_KERNEL); KUNIT_ASSERT_TRUE(test, skb); @@ -1197,8 +1258,6 @@ static void mctp_test_route_output_key_create(struct kunit *test) skb_reserve(skb, sizeof(struct mctp_hdr) + 1 + len); memset(skb_put(skb, len), 0, len); - refcount_inc(&rt->rt.refs); - mns = &sock_net(sock->sk)->mctp; /* We assume we're starting from an empty keys list, which requires @@ -1209,7 +1268,7 @@ static void mctp_test_route_output_key_create(struct kunit *test) spin_unlock_irqrestore(&mns->keys_lock, flags); KUNIT_ASSERT_TRUE(test, empty); - rc = mctp_local_output(sock->sk, &rt->rt, skb, dst, MCTP_TAG_OWNER); + rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); key = NULL; @@ -1225,13 +1284,13 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_ASSERT_TRUE(test, single); KUNIT_EXPECT_EQ(test, key->net, netid); - KUNIT_EXPECT_EQ(test, key->local_addr, src); - KUNIT_EXPECT_EQ(test, key->peer_addr, dst); + KUNIT_EXPECT_EQ(test, key->local_addr, src_eid); + KUNIT_EXPECT_EQ(test, key->peer_addr, dst_eid); /* key has incoming tag, so inverse of what we sent */ KUNIT_EXPECT_FALSE(test, key->tag & MCTP_TAG_OWNER); sock_release(sock); - 
mctp_test_route_destroy(test, rt); + mctp_test_dst_release(&dst, &tpq); mctp_test_destroy_dev(dev); } -- cgit v1.2.3 From 3007f90ec0385304ab5794e9585427b73f40e32f Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 2 Jul 2025 14:20:04 +0800 Subject: net: mctp: separate cb from direct-addressing routing Now that we have the dst->haddr populated by sendmsg (when extended addressing is in use), we no longer need to stash the link-layer address in the skb->cb. Instead, only use skb->cb for incoming lladdr data. While we're at it: remove cb->src, as was never used. Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20250702-dev-forwarding-v5-4-1468191da8a4@codeconstruct.com.au Signed-off-by: Paolo Abeni --- include/net/mctp.h | 4 ++-- net/mctp/route.c | 21 +++++---------------- 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index 6c9c5c48f59a..b3af0690f607 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -183,8 +183,8 @@ struct mctp_sk_key { struct mctp_skb_cb { unsigned int magic; unsigned int net; - int ifindex; /* extended/direct addressing if set */ - mctp_eid_t src; + /* fields below provide extended addressing for ingress to recvmsg() */ + int ifindex; unsigned char halen; unsigned char haddr[MAX_ADDR_LEN]; }; diff --git a/net/mctp/route.c b/net/mctp/route.c index 3985388a6035..23f339b43643 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -561,35 +561,28 @@ out: static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { - struct mctp_skb_cb *cb = mctp_cb(skb); struct mctp_hdr *hdr = mctp_hdr(skb); char daddr_buf[MAX_ADDR_LEN]; char *daddr = NULL; int rc; skb->protocol = htons(ETH_P_MCTP); + skb->pkt_type = PACKET_OUTGOING; if (skb->len > dst->mtu) { kfree_skb(skb); return -EMSGSIZE; } - /* If we're forwarding, we don't want to use the input path's cb, - * as it holds the *source* hardware addressing information. 
- * - * We will have a PACKET_HOST skb from the dev, or PACKET_OUTGOING - * from a socket; only use cb in the latter case. - */ - if (skb->pkt_type == PACKET_OUTGOING && cb->ifindex) { - /* direct route; use the hwaddr we stashed in sendmsg */ - if (cb->halen != skb->dev->addr_len) { + /* direct route; use the hwaddr we stashed in sendmsg */ + if (dst->halen) { + if (dst->halen != skb->dev->addr_len) { /* sanity check, sendmsg should have already caught this */ kfree_skb(skb); return -EMSGSIZE; } - daddr = cb->haddr; + daddr = dst->haddr; } else { - skb->pkt_type = PACKET_OUTGOING; /* If lookup fails let the device handle daddr==NULL */ if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0) daddr = daddr_buf; @@ -1009,7 +1002,6 @@ int mctp_local_output(struct sock *sk, struct mctp_dst *dst, struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); - struct mctp_skb_cb *cb = mctp_cb(skb); struct mctp_sk_key *key; struct mctp_hdr *hdr; unsigned long flags; @@ -1064,9 +1056,6 @@ int mctp_local_output(struct sock *sk, struct mctp_dst *dst, skb_reset_network_header(skb); skb->dev = dst->dev->dev; - /* cb->net will have been set on initial ingress */ - cb->src = saddr; - /* set up common header fields */ hdr = mctp_hdr(skb); hdr->ver = 1; -- cgit v1.2.3 From ad39c12fcee34b8980a80ad5c803bca9906fbd4e Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Wed, 2 Jul 2025 14:20:13 +0800 Subject: net: mctp: add gateway routing support This change allows for gateway routing, where a route table entry may reference a routable endpoint (by network and EID), instead of routing directly to a netdevice. 
We add support for a RTM_GATEWAY attribute for netlink route updates, with an attribute format of: struct mctp_fq_addr { unsigned int net; mctp_eid_t eid; } - we need the net here to uniquely identify the target EID, as we no longer have the device reference directly (which would provide the net id in the case of direct routes). This makes route lookups recursive, as a route lookup that returns a gateway route must be resolved into a direct route (ie, to a device) eventually. We provide a limit to the route lookups, to prevent infinite loop routing. The route lookup populates a new 'nexthop' field in the dst structure, which now specifies the key for the neighbour table lookup on device output, rather than using the packet destination address directly. Signed-off-by: Jeremy Kerr Link: https://patch.msgid.link/20250702-dev-forwarding-v5-13-1468191da8a4@codeconstruct.com.au Signed-off-by: Paolo Abeni --- include/net/mctp.h | 13 ++- include/uapi/linux/mctp.h | 8 ++ net/mctp/route.c | 206 +++++++++++++++++++++++++++++++++------------- net/mctp/test/utils.c | 3 +- 4 files changed, 173 insertions(+), 57 deletions(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index b3af0690f607..ac4f4ecdfc24 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -237,8 +237,18 @@ struct mctp_route { mctp_eid_t min, max; unsigned char type; + unsigned int mtu; - struct mctp_dev *dev; + + enum { + MCTP_ROUTE_DIRECT, + MCTP_ROUTE_GATEWAY, + } dst_type; + union { + struct mctp_dev *dev; + struct mctp_fq_addr gateway; + }; + int (*output)(struct mctp_dst *dst, struct sk_buff *skb); @@ -256,6 +266,7 @@ struct mctp_route { struct mctp_dst { struct mctp_dev *dev; unsigned int mtu; + mctp_eid_t nexthop; /* set for direct addressing */ unsigned char halen; diff --git a/include/uapi/linux/mctp.h b/include/uapi/linux/mctp.h index e1db65df9359..19ad12a0cd4b 100644 --- a/include/uapi/linux/mctp.h +++ b/include/uapi/linux/mctp.h @@ -37,6 +37,14 @@ struct 
sockaddr_mctp_ext { __u8 smctp_haddr[MAX_ADDR_LEN]; }; +/* A "fully qualified" MCTP address, which includes the system-local network ID, + * required to uniquely resolve a routable EID. + */ +struct mctp_fq_addr { + unsigned int net; + mctp_eid_t eid; +}; + #define MCTP_NET_ANY 0x0 #define MCTP_ADDR_NULL 0x00 diff --git a/net/mctp/route.c b/net/mctp/route.c index 5eca3ce0ba3c..a20d6b11d418 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -563,7 +563,6 @@ out: static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { - struct mctp_hdr *hdr = mctp_hdr(skb); char daddr_buf[MAX_ADDR_LEN]; char *daddr = NULL; int rc; @@ -586,7 +585,7 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) daddr = dst->haddr; } else { /* If lookup fails let the device handle daddr==NULL */ - if (mctp_neigh_lookup(dst->dev, hdr->dest, daddr_buf) == 0) + if (mctp_neigh_lookup(dst->dev, dst->nexthop, daddr_buf) == 0) daddr = daddr_buf; } @@ -610,7 +609,8 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) static void mctp_route_release(struct mctp_route *rt) { if (refcount_dec_and_test(&rt->refs)) { - mctp_dev_put(rt->dev); + if (rt->dst_type == MCTP_ROUTE_DIRECT) + mctp_dev_put(rt->dev); kfree_rcu(rt, rcu); } } @@ -799,10 +799,16 @@ static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk, } /* routing lookups */ +static unsigned int mctp_route_netid(struct mctp_route *rt) +{ + return rt->dst_type == MCTP_ROUTE_DIRECT ? 
+ READ_ONCE(rt->dev->net) : rt->gateway.net; +} + static bool mctp_rt_match_eid(struct mctp_route *rt, unsigned int net, mctp_eid_t eid) { - return READ_ONCE(rt->dev->net) == net && + return mctp_route_netid(rt) == net && rt->min <= eid && rt->max >= eid; } @@ -811,16 +817,21 @@ static bool mctp_rt_compare_exact(struct mctp_route *rt1, struct mctp_route *rt2) { ASSERT_RTNL(); - return rt1->dev->net == rt2->dev->net && + return mctp_route_netid(rt1) == mctp_route_netid(rt2) && rt1->min == rt2->min && rt1->max == rt2->max; } -static void mctp_dst_from_route(struct mctp_dst *dst, struct mctp_route *route) +/* must only be called on a direct route, as the final output hop */ +static void mctp_dst_from_route(struct mctp_dst *dst, mctp_eid_t eid, + unsigned int mtu, struct mctp_route *route) { mctp_dev_hold(route->dev); + dst->nexthop = eid; dst->dev = route->dev; - dst->mtu = route->mtu ?: READ_ONCE(dst->dev->dev->mtu); + dst->mtu = READ_ONCE(dst->dev->dev->mtu); + if (mtu) + dst->mtu = min(dst->mtu, mtu); dst->halen = 0; dst->output = route->output; } @@ -854,6 +865,7 @@ int mctp_dst_from_extaddr(struct mctp_dst *dst, struct net *net, int ifindex, dst->mtu = READ_ONCE(netdev->mtu); dst->halen = halen; dst->output = mctp_dst_output; + dst->nexthop = 0; memcpy(dst->haddr, haddr, halen); rc = 0; @@ -868,24 +880,54 @@ void mctp_dst_release(struct mctp_dst *dst) mctp_dev_put(dst->dev); } +static struct mctp_route *mctp_route_lookup_single(struct net *net, + unsigned int dnet, + mctp_eid_t daddr) +{ + struct mctp_route *rt; + + list_for_each_entry_rcu(rt, &net->mctp.routes, list) { + if (mctp_rt_match_eid(rt, dnet, daddr)) + return rt; + } + + return NULL; +} + /* populates *dst on successful lookup, if set */ int mctp_route_lookup(struct net *net, unsigned int dnet, mctp_eid_t daddr, struct mctp_dst *dst) { + const unsigned int max_depth = 32; + unsigned int depth, mtu = 0; int rc = -EHOSTUNREACH; - struct mctp_route *rt; rcu_read_lock(); - list_for_each_entry_rcu(rt, 
&net->mctp.routes, list) { - /* TODO: add metrics */ - if (!mctp_rt_match_eid(rt, dnet, daddr)) - continue; + for (depth = 0; depth < max_depth; depth++) { + struct mctp_route *rt; - if (dst) - mctp_dst_from_route(dst, rt); - rc = 0; - break; + rt = mctp_route_lookup_single(net, dnet, daddr); + if (!rt) + break; + + /* clamp mtu to the smallest in the path, allowing 0 + * to specify no restrictions + */ + if (mtu && rt->mtu) + mtu = min(mtu, rt->mtu); + else + mtu = mtu ?: rt->mtu; + + if (rt->dst_type == MCTP_ROUTE_DIRECT) { + if (dst) + mctp_dst_from_route(dst, daddr, mtu, rt); + rc = 0; + break; + + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { + daddr = rt->gateway.eid; + } } rcu_read_unlock(); @@ -902,10 +944,13 @@ static int mctp_route_lookup_null(struct net *net, struct net_device *dev, rcu_read_lock(); list_for_each_entry_rcu(rt, &net->mctp.routes, list) { - if (rt->dev->dev != dev || rt->type != RTN_LOCAL) + if (rt->dst_type != MCTP_ROUTE_DIRECT || rt->type != RTN_LOCAL) + continue; + + if (rt->dev->dev != dev) continue; - mctp_dst_from_route(dst, rt); + mctp_dst_from_route(dst, 0, 0, rt); rc = 0; break; } @@ -1085,11 +1130,6 @@ out_release: return rc; } -static unsigned int mctp_route_netid(struct mctp_route *rt) -{ - return rt->dev->net; -} - /* route management */ /* mctp_route_add(): Add the provided route, previously allocated via @@ -1097,9 +1137,9 @@ static unsigned int mctp_route_netid(struct mctp_route *rt) * hold on rt->dev for usage in the route table. On failure a caller will want * to mctp_route_release(). * - * We expect that the caller has set rt->type, rt->min, rt->max, rt->dev and - * rt->mtu, and that the route holds a reference to rt->dev (via mctp_dev_hold). - * Other fields will be populated. + * We expect that the caller has set rt->type, rt->dst_type, rt->min, rt->max, + * rt->mtu and either rt->dev (with a reference held appropriately) or + * rt->gateway. Other fields will be populated. 
*/ static int mctp_route_add(struct net *net, struct mctp_route *rt) { @@ -1108,7 +1148,10 @@ static int mctp_route_add(struct net *net, struct mctp_route *rt) if (!mctp_address_unicast(rt->min) || !mctp_address_unicast(rt->max)) return -EINVAL; - if (!rt->dev) + if (rt->dst_type == MCTP_ROUTE_DIRECT && !rt->dev) + return -EINVAL; + + if (rt->dst_type == MCTP_ROUTE_GATEWAY && !rt->gateway.eid) return -EINVAL; switch (rt->type) { @@ -1177,6 +1220,7 @@ int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr) rt->min = addr; rt->max = addr; + rt->dst_type = MCTP_ROUTE_DIRECT; rt->dev = mdev; rt->type = RTN_LOCAL; @@ -1203,7 +1247,7 @@ void mctp_route_remove_dev(struct mctp_dev *mdev) ASSERT_RTNL(); list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) { - if (rt->dev == mdev) { + if (rt->dst_type == MCTP_ROUTE_DIRECT && rt->dev == mdev) { list_del_rcu(&rt->list); /* TODO: immediate RTM_DELROUTE */ mctp_route_release(rt); @@ -1296,21 +1340,28 @@ static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = { [RTA_DST] = { .type = NLA_U8 }, [RTA_METRICS] = { .type = NLA_NESTED }, [RTA_OIF] = { .type = NLA_U32 }, + [RTA_GATEWAY] = NLA_POLICY_EXACT_LEN(sizeof(struct mctp_fq_addr)), }; static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = { [RTAX_MTU] = { .type = NLA_U32 }, }; -/* base parsing; common to both _lookup and _populate variants */ +/* base parsing; common to both _lookup and _populate variants. + * + * For gateway routes (which have a RTA_GATEWAY, and no RTA_OIF), we populate + * *gatewayp. For direct routes (RTA_OIF, no RTA_GATEWAY), we populate *mdev.
+ */ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, struct nlattr **tb, struct rtmsg **rtm, struct mctp_dev **mdev, + struct mctp_fq_addr *gatewayp, mctp_eid_t *daddr_start) { + struct mctp_fq_addr *gateway = NULL; + unsigned int ifindex = 0; struct net_device *dev; - unsigned int ifindex; int rc; rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX, @@ -1326,11 +1377,44 @@ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, } *daddr_start = nla_get_u8(tb[RTA_DST]); - if (!tb[RTA_OIF]) { - NL_SET_ERR_MSG(extack, "ifindex missing"); + if (tb[RTA_OIF]) + ifindex = nla_get_u32(tb[RTA_OIF]); + + if (tb[RTA_GATEWAY]) + gateway = nla_data(tb[RTA_GATEWAY]); + + if (ifindex && gateway) { + NL_SET_ERR_MSG(extack, + "cannot specify both ifindex and gateway"); + return -EINVAL; + + } else if (ifindex) { + dev = __dev_get_by_index(net, ifindex); + if (!dev) { + NL_SET_ERR_MSG(extack, "bad ifindex"); + return -ENODEV; + } + *mdev = mctp_dev_get_rtnl(dev); + if (!*mdev) + return -ENODEV; + gatewayp->eid = 0; + + } else if (gateway) { + if (!mctp_address_unicast(gateway->eid)) { + NL_SET_ERR_MSG(extack, "bad gateway"); + return -EINVAL; + } + + gatewayp->eid = gateway->eid; + gatewayp->net = gateway->net != MCTP_NET_ANY ? 
+ gateway->net : + READ_ONCE(net->mctp.default_net); + *mdev = NULL; + + } else { + NL_SET_ERR_MSG(extack, "no route output provided"); return -EINVAL; } - ifindex = nla_get_u32(tb[RTA_OIF]); *rtm = nlmsg_data(nlh); if ((*rtm)->rtm_family != AF_MCTP) { @@ -1343,16 +1427,6 @@ static int mctp_route_nlparse_common(struct net *net, struct nlmsghdr *nlh, return -EINVAL; } - dev = __dev_get_by_index(net, ifindex); - if (!dev) { - NL_SET_ERR_MSG(extack, "bad ifindex"); - return -ENODEV; - } - - *mdev = mctp_dev_get_rtnl(dev); - if (!*mdev) - return -ENODEV; - return 0; } @@ -1366,24 +1440,34 @@ static int mctp_route_nlparse_lookup(struct net *net, struct nlmsghdr *nlh, unsigned int *daddr_extent) { struct nlattr *tb[RTA_MAX + 1]; + struct mctp_fq_addr gw; struct mctp_dev *mdev; struct rtmsg *rtm; int rc; rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, - &mdev, daddr_start); + &mdev, &gw, daddr_start); if (rc) return rc; - *netid = mdev->net; + if (mdev) { + *netid = mdev->net; + } else if (gw.eid) { + *netid = gw.net; + } else { + /* bug: _nlparse_common should not allow this */ + return -1; + } + *type = rtm->rtm_type; *daddr_extent = rtm->rtm_dst_len; return 0; } -/* Full route parse for RTM_NEWROUTE: populate @rt. On success, the route will - * hold a reference to the dev. +/* Full route parse for RTM_NEWROUTE: populate @rt. On success, + * MCTP_ROUTE_DIRECT routes (ie, those with a direct dev) will hold a reference + * to that dev. 
*/ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, @@ -1392,6 +1476,7 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, struct nlattr *tbx[RTAX_MAX + 1]; struct nlattr *tb[RTA_MAX + 1]; unsigned int daddr_extent; + struct mctp_fq_addr gw; mctp_eid_t daddr_start; struct mctp_dev *dev; struct rtmsg *rtm; @@ -1399,7 +1484,7 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, int rc; rc = mctp_route_nlparse_common(net, nlh, extack, tb, &rtm, - &dev, &daddr_start); + &dev, &gw, &daddr_start); if (rc) return rc; @@ -1425,8 +1510,15 @@ static int mctp_route_nlparse_populate(struct net *net, struct nlmsghdr *nlh, rt->min = daddr_start; rt->max = daddr_start + daddr_extent; rt->mtu = mtu; - rt->dev = dev; - mctp_dev_hold(rt->dev); + if (gw.eid) { + rt->dst_type = MCTP_ROUTE_GATEWAY; + rt->gateway.eid = gw.eid; + rt->gateway.net = gw.net; + } else { + rt->dst_type = MCTP_ROUTE_DIRECT; + rt->dev = dev; + mctp_dev_hold(rt->dev); + } return 0; } @@ -1446,7 +1538,8 @@ static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (rc < 0) goto err_free; - if (rt->dev->dev->flags & IFF_LOOPBACK) { + if (rt->dst_type == MCTP_ROUTE_DIRECT && + rt->dev->dev->flags & IFF_LOOPBACK) { NL_SET_ERR_MSG(extack, "no routes to loopback"); rc = -EINVAL; goto err_free; @@ -1505,7 +1598,6 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, hdr->rtm_tos = 0; hdr->rtm_table = RT_TABLE_DEFAULT; hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */ - hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? 
*/ hdr->rtm_type = rt->type; if (nla_put_u8(skb, RTA_DST, rt->min)) @@ -1522,13 +1614,17 @@ static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt, nla_nest_end(skb, metrics); - if (rt->dev) { + if (rt->dst_type == MCTP_ROUTE_DIRECT) { + hdr->rtm_scope = RT_SCOPE_LINK; if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex)) goto cancel; + } else if (rt->dst_type == MCTP_ROUTE_GATEWAY) { + hdr->rtm_scope = RT_SCOPE_UNIVERSE; + if (nla_put(skb, RTA_GATEWAY, + sizeof(rt->gateway), &rt->gateway)) + goto cancel; } - /* TODO: conditional neighbour physaddr? */ - nlmsg_end(skb, nlh); return 0; diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index 6b4dc40d882c..97b05e340586 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -134,6 +134,7 @@ struct mctp_test_route *mctp_test_create_route(struct net *net, rt->rt.max = eid; rt->rt.mtu = mtu; rt->rt.type = RTN_UNSPEC; + rt->rt.dst_type = MCTP_ROUTE_DIRECT; if (dev) mctp_dev_hold(dev); rt->rt.dev = dev; @@ -176,7 +177,7 @@ void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt) list_del_rcu(&rt->rt.list); rtnl_unlock(); - if (rt->rt.dev) + if (rt->rt.dst_type == MCTP_ROUTE_DIRECT && rt->rt.dev) mctp_dev_put(rt->rt.dev); refs = refcount_read(&rt->rt.refs); -- cgit v1.2.3 From 84a7d6797e6a03705e6b48c613fa424662049d87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Jul 2025 07:12:30 +0000 Subject: net/sched: acp_api: no longer acquire RTNL in tc_action_net_exit() tc_action_net_exit() got an rtnl exclusion in commit a159d3c4b829 ("net_sched: acquire RTNL in tc_action_net_exit()") Since then, commit 16af6067392c ("net: sched: implement reference counted action release") made this RTNL exclusion obsolete for most cases. Only tcf_action_offload_del() might still require it. Move the rtnl locking into tcf_idrinfo_destroy() when an offload action is found. 
Most netns do not have actions, yet deleting them is adding a lot of pressure on RTNL, which is for many the most contended mutex in the kernel. We are moving to a per-netns 'rtnl', so tc_action_net_exit() will not be able to grab 'rtnl' a single time for a batch of netns. Before the patch: perf probe -a rtnl_lock perf record -e probe:rtnl_lock -a /bin/bash -c 'unshare -n "/bin/true"; sleep 1' [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.305 MB perf.data (25 samples) ] After the patch: perf record -e probe:rtnl_lock -a /bin/bash -c 'unshare -n "/bin/true"; sleep 1' [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.304 MB perf.data (9 samples) ] Signed-off-by: Eric Dumazet Cc: Vlad Buslov Cc: Jiri Pirko Cc: Marcelo Ricardo Leitner Link: https://patch.msgid.link/20250702071230.1892674-1-edumazet@google.com Signed-off-by: Paolo Abeni --- include/net/act_api.h | 2 -- net/sched/act_api.c | 9 ++++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 404df8557f6a..04781c92b43d 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -170,14 +170,12 @@ static inline void tc_action_net_exit(struct list_head *net_list, { struct net *net; - rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { struct tc_action_net *tn = net_generic(net, id); tcf_idrinfo_destroy(tn->ops, tn->idrinfo); kfree(tn->idrinfo); } - rtnl_unlock(); } int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 057e20cef375..9e468e463467 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -933,18 +933,25 @@ void tcf_idrinfo_destroy(const struct tc_action_ops *ops, struct tcf_idrinfo *idrinfo) { struct idr *idr = &idrinfo->action_idr; + bool mutex_taken = false; struct tc_action *p; - int ret; unsigned long id = 1; unsigned long tmp; + int ret; 
idr_for_each_entry_ul(idr, p, tmp, id) { + if (tc_act_in_hw(p) && !mutex_taken) { + rtnl_lock(); + mutex_taken = true; + } ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) module_put(ops->owner); else if (ret < 0) return; } + if (mutex_taken) + rtnl_unlock(); idr_destroy(&idrinfo->action_idr); } EXPORT_SYMBOL(tcf_idrinfo_destroy); -- cgit v1.2.3 From b441cf3f8c4b8576639d20c8eb4aa32917602ecd Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 4 Jul 2025 16:54:33 +0200 Subject: xfrm: delete x->tunnel as we delete x The ipcomp fallback tunnels currently get deleted (from the various lists and hashtables) as the last user state that needed that fallback is destroyed (not deleted). If a reference to that user state still exists, the fallback state will remain on the hashtables/lists, triggering the WARN in xfrm_state_fini. Because of those remaining references, the fix in commit f75a2804da39 ("xfrm: destroy xfrm_state synchronously on net exit path") is not complete. We recently fixed one such situation in TCP due to deferred freeing of skbs (commit 9b6412e6979f ("tcp: drop secpath at the same time as we currently drop dst")). This can also happen due to IP reassembly: skbs with a secpath remain on the reassembly queue until netns destruction. If we can't guarantee that the queues are flushed by the time xfrm_state_fini runs, there may still be references to a (user) xfrm_state, preventing the timely deletion of the corresponding fallback state. Instead of chasing each instance of skbs holding a secpath one by one, this patch fixes the issue directly within xfrm, by deleting the fallback state as soon as the last user state depending on it has been deleted. Destruction will still happen when the final reference is dropped. A separate lockdep class for the fallback state is required since we're going to lock x->tunnel while x is locked.
Fixes: 9d4139c76905 ("netns xfrm: per-netns xfrm_state_all list") Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 1 - net/ipv4/ipcomp.c | 2 ++ net/ipv6/ipcomp6.c | 2 ++ net/ipv6/xfrm6_tunnel.c | 2 +- net/xfrm/xfrm_ipcomp.c | 1 - net/xfrm/xfrm_state.c | 19 ++++++++----------- 6 files changed, 13 insertions(+), 14 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e45a275fca26..91d52a380e37 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -441,7 +441,6 @@ int xfrm_input_register_afinfo(const struct xfrm_input_afinfo *afinfo); int xfrm_input_unregister_afinfo(const struct xfrm_input_afinfo *afinfo); void xfrm_flush_gc(void); -void xfrm_state_delete_tunnel(struct xfrm_state *x); struct xfrm_type { struct module *owner; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 5a4fb2539b08..9a45aed508d1 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -54,6 +54,7 @@ static int ipcomp4_err(struct sk_buff *skb, u32 info) } /* We always hold one tunnel user reference to indicate a tunnel */ +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -62,6 +63,7 @@ static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) t = xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPIP; t->id.spi = x->props.saddr.a4; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 72d4858dec18..8607569de34f 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -71,6 +71,7 @@ static int ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } +static struct lock_class_key xfrm_state_lock_key; static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -79,6 +80,7 @@ static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) t = 
xfrm_state_alloc(net); if (!t) goto out; + lockdep_set_class(&t->lock, &xfrm_state_lock_key); t->id.proto = IPPROTO_IPV6; t->id.spi = xfrm6_tunnel_alloc_spi(net, (xfrm_address_t *)&x->props.saddr); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index bf140ef781c1..7fd8bc08e6eb 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,8 +334,8 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_flush_gc(); xfrm_state_flush(net, 0, false, true); + xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) WARN_ON_ONCE(!hlist_empty(&xfrm6_tn->spi_byaddr[i])); diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index a38545413b80..43fdc6ed8dd1 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -313,7 +313,6 @@ void ipcomp_destroy(struct xfrm_state *x) struct ipcomp_data *ipcd = x->data; if (!ipcd) return; - xfrm_state_delete_tunnel(x); ipcomp_free_data(ipcd); kfree(ipcd); } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c7e6472c623d..f7110a658897 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -811,6 +811,7 @@ void __xfrm_state_destroy(struct xfrm_state *x, bool sync) } EXPORT_SYMBOL(__xfrm_state_destroy); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -838,6 +839,8 @@ int __xfrm_state_delete(struct xfrm_state *x) xfrm_dev_state_delete(x); + xfrm_state_delete_tunnel(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. @@ -941,10 +944,7 @@ restart: err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 
0 : 1, task_valid); - if (sync) - xfrm_state_put_sync(x); - else - xfrm_state_put(x); + xfrm_state_put(x); if (!err) cnt++; @@ -3068,20 +3068,17 @@ void xfrm_flush_gc(void) } EXPORT_SYMBOL(xfrm_flush_gc); -/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -void xfrm_state_delete_tunnel(struct xfrm_state *x) +static void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; - if (atomic_read(&t->tunnel_users) == 2) + if (atomic_dec_return(&t->tunnel_users) == 1) xfrm_state_delete(t); - atomic_dec(&t->tunnel_users); - xfrm_state_put_sync(t); + xfrm_state_put(t); x->tunnel = NULL; } } -EXPORT_SYMBOL(xfrm_state_delete_tunnel); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { @@ -3286,8 +3283,8 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - flush_work(&xfrm_state_gc_work); xfrm_state_flush(net, 0, false, true); + flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); -- cgit v1.2.3 From 2a198bbec6913ae1c90ec963750003c6213668c7 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 4 Jul 2025 16:54:34 +0200 Subject: Revert "xfrm: destroy xfrm_state synchronously on net exit path" This reverts commit f75a2804da391571563c4b6b29e7797787332673. With all states (whether user or kern) removed from the hashtables during deletion, there's no need for synchronous destruction of states. xfrm6_tunnel states still need to have been destroyed (which will be the case when its last user is deleted (not destroyed)) so that xfrm6_tunnel_free_spi removes it from the per-netns hashtable before the netns is destroyed. This has the benefit of skipping one synchronize_rcu per state (in __xfrm_state_destroy(sync=true)) when we exit a netns. 
Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 12 +++--------- net/ipv6/xfrm6_tunnel.c | 2 +- net/key/af_key.c | 2 +- net/xfrm/xfrm_state.c | 23 +++++++++-------------- net/xfrm/xfrm_user.c | 2 +- 5 files changed, 15 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 91d52a380e37..f3014e4f54fc 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -915,7 +915,7 @@ static inline void xfrm_pols_put(struct xfrm_policy **pols, int npols) xfrm_pol_put(pols[i]); } -void __xfrm_state_destroy(struct xfrm_state *, bool); +void __xfrm_state_destroy(struct xfrm_state *); static inline void __xfrm_state_put(struct xfrm_state *x) { @@ -925,13 +925,7 @@ static inline void __xfrm_state_put(struct xfrm_state *x) static inline void xfrm_state_put(struct xfrm_state *x) { if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, false); -} - -static inline void xfrm_state_put_sync(struct xfrm_state *x) -{ - if (refcount_dec_and_test(&x->refcnt)) - __xfrm_state_destroy(x, true); + __xfrm_state_destroy(x); } static inline void xfrm_state_hold(struct xfrm_state *x) @@ -1769,7 +1763,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); int xfrm_state_delete(struct xfrm_state *x); -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, bool task_valid); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 7fd8bc08e6eb..5120a763da0d 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - 
xfrm_state_flush(net, 0, false, true); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) diff --git a/net/key/af_key.c b/net/key/af_key.c index efc2a91f4c48..b5d761700776 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1766,7 +1766,7 @@ static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_m if (proto == 0) return -EINVAL; - err = xfrm_state_flush(net, proto, true, false); + err = xfrm_state_flush(net, proto, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f7110a658897..327a1a6f892c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -592,7 +592,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); -static void ___xfrm_state_destroy(struct xfrm_state *x) +static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) x->mode_cbs->destroy_state(x); @@ -631,7 +631,7 @@ static void xfrm_state_gc_task(struct work_struct *work) synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) - ___xfrm_state_destroy(x); + xfrm_state_gc_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) @@ -795,19 +795,14 @@ void xfrm_dev_state_free(struct xfrm_state *x) } #endif -void __xfrm_state_destroy(struct xfrm_state *x, bool sync) +void __xfrm_state_destroy(struct xfrm_state *x) { WARN_ON(x->km.state != XFRM_STATE_DEAD); - if (sync) { - synchronize_rcu(); - ___xfrm_state_destroy(x); - } else { - spin_lock_bh(&xfrm_state_gc_lock); - hlist_add_head(&x->gclist, &xfrm_state_gc_list); - spin_unlock_bh(&xfrm_state_gc_lock); - schedule_work(&xfrm_state_gc_work); - } + spin_lock_bh(&xfrm_state_gc_lock); + hlist_add_head(&x->gclist, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&xfrm_state_gc_work); } 
EXPORT_SYMBOL(__xfrm_state_destroy); @@ -922,7 +917,7 @@ xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool } #endif -int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) +int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) { int i, err = 0, cnt = 0; @@ -3283,7 +3278,7 @@ void xfrm_state_fini(struct net *net) unsigned int sz; flush_work(&net->xfrm.state_hash_work); - xfrm_state_flush(net, 0, false, true); + xfrm_state_flush(net, IPSEC_PROTO_ANY, false); flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 1db18f470f42..684239018bec 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2635,7 +2635,7 @@ static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_usersa_flush *p = nlmsg_data(nlh); int err; - err = xfrm_state_flush(net, p->proto, true, false); + err = xfrm_state_flush(net, p->proto, true); if (err) { if (err == -ESRCH) /* empty table */ return 0; -- cgit v1.2.3 From f4e1fb04c12384fb1b69a95c33527b515a652a74 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:16 +0000 Subject: af_unix: Use cached value for SOCK_STREAM in unix_inq_len(). Compared to TCP, ioctl(SIOCINQ) for AF_UNIX SOCK_STREAM socket is more expensive, as unix_inq_len() requires iterating through the receive queue and accumulating skb->len. Let's cache the value for SOCK_STREAM to a new field during sendmsg() and recvmsg(). The field is protected by the receive queue lock. Note that ioctl(SIOCINQ) for SOCK_DGRAM returns the length of the first skb in the queue. SOCK_SEQPACKET still requires iterating through the queue because we do not touch functions shared with unix_dgram_ops. But, if really needed, we can support it by switching __skb_try_recv_datagram() to a custom version. 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 29 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 1af1841b7601..603f8cd026e5 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -47,6 +47,7 @@ struct unix_sock { #define peer_wait peer_wq.wait wait_queue_entry_t peer_wake; struct scm_stat scm_stat; + int inq_len; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) struct sk_buff *oob_skb; #endif diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 94596d6c37e9..d9e604295a71 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2297,6 +2297,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, spin_lock(&other->sk_receive_queue.lock); WRITE_ONCE(ousk->oob_skb, skb); + WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); @@ -2319,6 +2320,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sock *other = NULL; + struct unix_sock *otheru; struct scm_cookie scm; bool fds_sent = false; int err, sent = 0; @@ -2342,14 +2344,16 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) { err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? 
-EISCONN : -EOPNOTSUPP; goto out_err; - } else { - other = unix_peer(sk); - if (!other) { - err = -ENOTCONN; - goto out_err; - } } + other = unix_peer(sk); + if (!other) { + err = -ENOTCONN; + goto out_err; + } + + otheru = unix_sk(other); + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto out_pipe; @@ -2417,7 +2421,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); - skb_queue_tail(&other->sk_receive_queue, skb); + + spin_lock(&other->sk_receive_queue.lock); + WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); + unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2704,6 +2713,7 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); + WRITE_ONCE(u->inq_len, u->inq_len - 1); if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && !unix_skb_len(oob_skb->prev)) { @@ -2808,6 +2818,8 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) return -EAGAIN; } + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb == u->oob_skb) { WRITE_ONCE(u->oob_skb, NULL); @@ -2988,7 +3000,11 @@ unlock: if (unix_skb_len(skb)) break; - skb_unlink(skb, &sk->sk_receive_queue); + spin_lock(&sk->sk_receive_queue.lock); + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + __skb_unlink(skb, &sk->sk_receive_queue); + spin_unlock(&sk->sk_receive_queue.lock); + consume_skb(skb); if (scm.fp) @@ -3159,9 +3175,11 @@ long unix_inq_len(struct sock *sk) if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; + if (sk->sk_type == SOCK_STREAM) + return READ_ONCE(unix_sk(sk)->inq_len); + spin_lock(&sk->sk_receive_queue.lock); - if (sk->sk_type == SOCK_STREAM || - sk->sk_type == SOCK_SEQPACKET) { + if (sk->sk_type == SOCK_SEQPACKET) { 
skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { -- cgit v1.2.3 From df30285b3670bf52e1e5512e4d4482bec5e93c16 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:18 +0000 Subject: af_unix: Introduce SO_INQ. We have an application that uses almost the same code for TCP and AF_UNIX (SOCK_STREAM). TCP can use TCP_INQ, but AF_UNIX doesn't have it and requires an extra syscall, ioctl(SIOCINQ) or getsockopt(SO_MEMINFO) as an alternative. Let's introduce the generic version of TCP_INQ. If SO_INQ is enabled, recvmsg() will put a cmsg of SCM_INQ that contains the exact value of ioctl(SIOCINQ). The cmsg is also included when msg->msg_get_inq is non-zero to make sockets io_uring-friendly. Note that SOCK_CUSTOM_SOCKOPT is flagged only for SOCK_STREAM to override setsockopt() for SOL_SOCKET. By having the flag in struct unix_sock, instead of struct sock, we can later add SO_INQ support for TCP and reuse tcp_sk(sk)->recvmsg_inq. Note also that supporting custom getsockopt() for SOL_SOCKET will need preparation for other SOCK_CUSTOM_SOCKOPT users (UDP, vsock, MPTCP). 
Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- arch/alpha/include/uapi/asm/socket.h | 3 ++ arch/mips/include/uapi/asm/socket.h | 3 ++ arch/parisc/include/uapi/asm/socket.h | 3 ++ arch/sparc/include/uapi/asm/socket.h | 3 ++ include/net/af_unix.h | 1 + include/uapi/asm-generic/socket.h | 3 ++ net/unix/af_unix.c | 62 +++++++++++++++++++++++++++++++++-- 7 files changed, 76 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 8f1f18adcdb5..5ef57f88df6b 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -152,6 +152,9 @@ #define SO_PASSRIGHTS 83 +#define SO_INQ 84 +#define SCM_INQ SO_INQ + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 31ac655b7837..72fb1b006da9 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -163,6 +163,9 @@ #define SO_PASSRIGHTS 83 +#define SO_INQ 84 +#define SCM_INQ SO_INQ + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 1f2d5b7a7f5d..c16ec36dfee6 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -144,6 +144,9 @@ #define SO_PASSRIGHTS 0x4051 +#define SO_INQ 0x4052 +#define SCM_INQ SO_INQ + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index adcba7329386..71befa109e1c 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -145,6 +145,9 @@ #define SO_PASSRIGHTS 0x005c +#define SO_INQ 0x005d +#define SCM_INQ SO_INQ + #if !defined(__KERNEL__) diff --git 
a/include/net/af_unix.h b/include/net/af_unix.h index 603f8cd026e5..34f53dde65ce 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -48,6 +48,7 @@ struct unix_sock { wait_queue_entry_t peer_wake; struct scm_stat scm_stat; int inq_len; + bool recvmsg_inq; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) struct sk_buff *oob_skb; #endif diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index f333a0ac4ee4..53b5a8c002b1 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -147,6 +147,9 @@ #define SO_PASSRIGHTS 83 +#define SO_INQ 84 +#define SCM_INQ SO_INQ + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__)) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c3dd41596d89..7a92733706fe 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -934,6 +934,52 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) #define unix_show_fdinfo NULL #endif +static bool unix_custom_sockopt(int optname) +{ + switch (optname) { + case SO_INQ: + return true; + default: + return false; + } +} + +static int unix_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct unix_sock *u = unix_sk(sock->sk); + struct sock *sk = sock->sk; + int val; + + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + if (!unix_custom_sockopt(optname)) + return sock_setsockopt(sock, level, optname, optval, optlen); + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + switch (optname) { + case SO_INQ: + if (sk->sk_type != SOCK_STREAM) + return -EINVAL; + + if (val > 1 || val < 0) + return -EINVAL; + + WRITE_ONCE(u->recvmsg_inq, val); + break; + default: + return -ENOPROTOOPT; + } + + return 0; +} + static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, @@ -950,6 +996,7 @@ static const struct proto_ops 
unix_stream_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, + .setsockopt = unix_setsockopt, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, @@ -1116,6 +1163,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, switch (sock->type) { case SOCK_STREAM: + set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); sock->ops = &unix_stream_ops; break; /* @@ -1847,6 +1895,9 @@ static int unix_accept(struct socket *sock, struct socket *newsock, skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); + if (tsk->sk_type == SOCK_STREAM) + set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + /* attach accepted sock to socket */ unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); @@ -3034,10 +3085,17 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (msg) + if (msg) { scm_recv_unix(sock, msg, &scm, flags); - else + + if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) { + msg->msg_inq = READ_ONCE(u->inq_len); + put_cmsg(msg, SOL_SOCKET, SCM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); + } + } else { scm_destroy(&scm); + } out: return copied ? : err; } -- cgit v1.2.3 From be1ba9ed221ffb95a8bb15f4c83d0694225ba808 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 9 Jun 2025 21:35:13 +0300 Subject: wifi: mac80211: avoid weird state in error path If we get to the error path of ieee80211_prep_connection, for example because of a FW issue, then ieee80211_vif_set_links is called with 0. But the call to drv_change_vif_links from ieee80211_vif_update_links will probably fail as well, for the same reason. In this case, the valid_links and active_links bitmaps will be reverted to the value of the failing connection. Then, in the next connection, due to the logic of ieee80211_set_vif_links_bitmaps, valid_links will be set to the ID of the new connection assoc link, but the active_links will remain with the ID of the old connection's assoc link. 
If those IDs are different, we get into a weird state of valid_links and active_links being different. One of the consequences of this state is to call drv_change_vif_links with new_links as 0, since the & operation between the bitmaps will be 0. Since a removal of a link should always succeed, ignore the return value of drv_change_vif_links if it was called to only remove links, which is the case for the ieee80211_prep_connection's error path. That way, the bitmaps will not be reverted to have the value from the failing connection and will have 0, so the next connection will have a good state. Signed-off-by: Miri Korenblit Reviewed-by: Johannes Berg Link: https://patch.msgid.link/20250609213231.ba2011fb435f.Id87ff6dab5e1cf757b54094ac2d714c656165059@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/link.c | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index dcd5969bb559..a61ffdbf99be 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4477,6 +4477,8 @@ struct ieee80211_prep_tx_info { * new links bitmaps may be 0 if going from/to a non-MLO situation. * The @old array contains pointers to the old bss_conf structures * that were already removed, in case they're needed. + * Note that removal of link should always succeed, so the return value + * will be ignored in a removal only case. * This callback can sleep. * @change_sta_links: Change the valid links of a station, similar to * @change_vif_links. This callback can sleep. 
diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 4f7b7d0f64f2..d71eabe5abf8 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -2,7 +2,7 @@ /* * MLO link handling * - * Copyright (C) 2022-2024 Intel Corporation + * Copyright (C) 2022-2025 Intel Corporation */ #include #include @@ -368,6 +368,13 @@ static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata, ieee80211_update_apvlan_links(sdata); } + /* + * Ignore errors if we are only removing links as removal should + * always succeed + */ + if (!new_links) + ret = 0; + if (ret) { /* restore config */ memcpy(sdata->link, old_data, sizeof(old_data)); -- cgit v1.2.3 From f0df91b6a7120d85c873f5e77bc183fb6eccda16 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 9 Jun 2025 21:35:19 +0300 Subject: wifi: cfg80211: hide scan internals Hide the internal scan fields from mac80211 and drivers, the 'notified' variable is for internal tracking, and the 'info' is output that's passed to cfg80211_scan_done() and stored only for delayed userspace notification. 
Signed-off-by: Johannes Berg Reviewed-by: Benjamin Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250609213231.6a62e41858e2.I004f66e9c087cc6e6ae4a24951cf470961ee9466@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 -- net/wireless/core.c | 4 +- net/wireless/core.h | 11 ++- net/wireless/nl80211.c | 97 +++++++++++++------------ net/wireless/rdev-ops.h | 6 +- net/wireless/scan.c | 188 +++++++++++++++++++++++++----------------------- net/wireless/sme.c | 40 +++++------ net/wireless/trace.h | 23 +++--- 8 files changed, 196 insertions(+), 179 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 4a092da3a9de..5d5ad7926877 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2748,8 +2748,6 @@ struct cfg80211_scan_6ghz_params { * @wiphy: the wiphy this was for * @scan_start: time (in jiffies) when the scan started * @wdev: the wireless device to scan for - * @info: (internal) information about completed scan - * @notified: (internal) scan request was notified as done or aborted * @no_cck: used to send probe requests at non CCK rate in 2GHz band * @mac_addr: MAC address used with randomisation * @mac_addr_mask: MAC address mask used with randomisation, bits that @@ -2780,12 +2778,8 @@ struct cfg80211_scan_request { u8 mac_addr[ETH_ALEN] __aligned(2); u8 mac_addr_mask[ETH_ALEN] __aligned(2); u8 bssid[ETH_ALEN] __aligned(2); - - /* internal */ struct wiphy *wiphy; unsigned long scan_start; - struct cfg80211_scan_info info; - bool notified; bool no_cck; bool scan_6ghz; u32 n_6ghz_params; diff --git a/net/wireless/core.c b/net/wireless/core.c index f3cd70757ef2..a7e2931ffb2e 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -239,7 +239,7 @@ void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, rdev->opencount--; - if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) { if 
(WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) @@ -1574,7 +1574,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, case NETDEV_DOWN: wiphy_lock(&rdev->wiphy); cfg80211_update_iface_num(rdev, wdev->iftype, -1); - if (rdev->scan_req && rdev->scan_req->wdev == wdev) { + if (rdev->scan_req && rdev->scan_req->req.wdev == wdev) { if (WARN_ON(!rdev->scan_req->notified && (!rdev->int_scan_req || !rdev->int_scan_req->notified))) diff --git a/net/wireless/core.h b/net/wireless/core.h index c56a35040caa..b6bd7f4d6385 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -21,6 +21,13 @@ #define WIPHY_IDX_INVALID -1 +struct cfg80211_scan_request_int { + struct cfg80211_scan_info info; + bool notified; + /* must be last - variable members */ + struct cfg80211_scan_request req; +}; + struct cfg80211_registered_device { const struct cfg80211_ops *ops; struct list_head list; @@ -70,8 +77,8 @@ struct cfg80211_registered_device { struct rb_root bss_tree; u32 bss_generation; u32 bss_entries; - struct cfg80211_scan_request *scan_req; /* protected by RTNL */ - struct cfg80211_scan_request *int_scan_req; + struct cfg80211_scan_request_int *scan_req; /* protected by RTNL */ + struct cfg80211_scan_request_int *int_scan_req; struct sk_buff *scan_msg; struct list_head sched_scan_req_list; time64_t suspend_at; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 70ca74a75f22..18f27f193772 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9831,12 +9831,12 @@ nl80211_check_scan_flags(struct wiphy *wiphy, struct wireless_dev *wdev, mac_addr = req->mac_addr; mac_addr_mask = req->mac_addr_mask; } else { - struct cfg80211_scan_request *req = request; + struct cfg80211_scan_request_int *req = request; randomness_flag = NL80211_FEATURE_SCAN_RANDOM_MAC_ADDR; - flags = &req->flags; - mac_addr = req->mac_addr; - mac_addr_mask = req->mac_addr_mask; + flags = &req->req.flags; + mac_addr = 
req->req.mac_addr; + mac_addr_mask = req->req.mac_addr_mask; } *flags = nla_get_u32(attrs[NL80211_ATTR_SCAN_FLAGS]); @@ -9891,7 +9891,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; - struct cfg80211_scan_request *request; + struct cfg80211_scan_request_int *request; struct nlattr *scan_freqs = NULL; bool scan_freqs_khz = false; struct nlattr *attr; @@ -9943,21 +9943,21 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (ie_len > wiphy->max_scan_ie_len) return -EINVAL; - size = struct_size(request, channels, n_channels); + size = struct_size(request, req.channels, n_channels); ssids_offset = size; - size = size_add(size, array_size(sizeof(*request->ssids), n_ssids)); + size = size_add(size, array_size(sizeof(*request->req.ssids), n_ssids)); ie_offset = size; size = size_add(size, ie_len); request = kzalloc(size, GFP_KERNEL); if (!request) return -ENOMEM; - request->n_channels = n_channels; + request->req.n_channels = n_channels; if (n_ssids) - request->ssids = (void *)request + ssids_offset; - request->n_ssids = n_ssids; + request->req.ssids = (void *)request + ssids_offset; + request->req.n_ssids = n_ssids; if (ie_len) - request->ie = (void *)request + ie_offset; + request->req.ie = (void *)request + ie_offset; i = 0; if (scan_freqs) { @@ -9980,7 +9980,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) !cfg80211_wdev_channel_allowed(wdev, chan)) continue; - request->channels[i] = chan; + request->req.channels[i] = chan; i++; } } else { @@ -10001,7 +10001,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) !cfg80211_wdev_channel_allowed(wdev, chan)) continue; - request->channels[i] = chan; + request->req.channels[i] = chan; i++; } } @@ -10012,10 +10012,10 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info 
*info) goto out_free; } - request->n_channels = i; + request->req.n_channels = i; - for (i = 0; i < request->n_channels; i++) { - struct ieee80211_channel *chan = request->channels[i]; + for (i = 0; i < request->req.n_channels; i++) { + struct ieee80211_channel *chan = request->req.channels[i]; /* if we can go off-channel to the target channel we're good */ if (cfg80211_off_channel_oper_allowed(wdev, chan)) @@ -10034,22 +10034,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) err = -EINVAL; goto out_free; } - request->ssids[i].ssid_len = nla_len(attr); - memcpy(request->ssids[i].ssid, nla_data(attr), nla_len(attr)); + request->req.ssids[i].ssid_len = nla_len(attr); + memcpy(request->req.ssids[i].ssid, + nla_data(attr), nla_len(attr)); i++; } } if (info->attrs[NL80211_ATTR_IE]) { - request->ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); - memcpy((void *)request->ie, + request->req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); + memcpy((void *)request->req.ie, nla_data(info->attrs[NL80211_ATTR_IE]), - request->ie_len); + request->req.ie_len); } for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) - request->rates[i] = + request->req.rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; if (info->attrs[NL80211_ATTR_SCAN_SUPP_RATES]) { @@ -10069,16 +10070,16 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) err = ieee80211_get_ratemask(wiphy->bands[band], nla_data(attr), nla_len(attr), - &request->rates[band]); + &request->req.rates[band]); if (err) goto out_free; } } if (info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]) { - request->duration = + request->req.duration = nla_get_u16(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION]); - request->duration_mandatory = + request->req.duration_mandatory = nla_get_flag(info->attrs[NL80211_ATTR_MEASUREMENT_DURATION_MANDATORY]); } @@ -10087,7 +10088,7 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (err) goto out_free; - 
request->no_cck = + request->req.no_cck = nla_get_flag(info->attrs[NL80211_ATTR_TX_NO_CCK_RATE]); /* Initial implementation used NL80211_ATTR_MAC to set the specific @@ -10100,19 +10101,21 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) * (NL80211_ATTR_SCAN_FLAGS is used to enable random MAC address use). */ if (info->attrs[NL80211_ATTR_BSSID]) - memcpy(request->bssid, + memcpy(request->req.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN); - else if (!(request->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) && + else if (!(request->req.flags & NL80211_SCAN_FLAG_RANDOM_ADDR) && info->attrs[NL80211_ATTR_MAC]) - memcpy(request->bssid, nla_data(info->attrs[NL80211_ATTR_MAC]), + memcpy(request->req.bssid, + nla_data(info->attrs[NL80211_ATTR_MAC]), ETH_ALEN); else - eth_broadcast_addr(request->bssid); + eth_broadcast_addr(request->req.bssid); - request->tsf_report_link_id = nl80211_link_id_or_invalid(info->attrs); - request->wdev = wdev; - request->wiphy = &rdev->wiphy; - request->scan_start = jiffies; + request->req.tsf_report_link_id = + nl80211_link_id_or_invalid(info->attrs); + request->req.wdev = wdev; + request->req.wiphy = &rdev->wiphy; + request->req.scan_start = jiffies; rdev->scan_req = request; err = cfg80211_scan(rdev); @@ -18414,7 +18417,7 @@ void nl80211_notify_iface(struct cfg80211_registered_device *rdev, static int nl80211_add_scan_req(struct sk_buff *msg, struct cfg80211_registered_device *rdev) { - struct cfg80211_scan_request *req = rdev->scan_req; + struct cfg80211_scan_request_int *req = rdev->scan_req; struct nlattr *nest; int i; struct cfg80211_scan_info *info; @@ -18425,19 +18428,20 @@ static int nl80211_add_scan_req(struct sk_buff *msg, nest = nla_nest_start_noflag(msg, NL80211_ATTR_SCAN_SSIDS); if (!nest) goto nla_put_failure; - for (i = 0; i < req->n_ssids; i++) { - if (nla_put(msg, i, req->ssids[i].ssid_len, req->ssids[i].ssid)) + for (i = 0; i < req->req.n_ssids; i++) { + if (nla_put(msg, i, 
req->req.ssids[i].ssid_len, + req->req.ssids[i].ssid)) goto nla_put_failure; } nla_nest_end(msg, nest); - if (req->flags & NL80211_SCAN_FLAG_FREQ_KHZ) { + if (req->req.flags & NL80211_SCAN_FLAG_FREQ_KHZ) { nest = nla_nest_start(msg, NL80211_ATTR_SCAN_FREQ_KHZ); if (!nest) goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { + for (i = 0; i < req->req.n_channels; i++) { if (nla_put_u32(msg, i, - ieee80211_channel_to_khz(req->channels[i]))) + ieee80211_channel_to_khz(req->req.channels[i]))) goto nla_put_failure; } nla_nest_end(msg, nest); @@ -18446,19 +18450,20 @@ static int nl80211_add_scan_req(struct sk_buff *msg, NL80211_ATTR_SCAN_FREQUENCIES); if (!nest) goto nla_put_failure; - for (i = 0; i < req->n_channels; i++) { - if (nla_put_u32(msg, i, req->channels[i]->center_freq)) + for (i = 0; i < req->req.n_channels; i++) { + if (nla_put_u32(msg, i, + req->req.channels[i]->center_freq)) goto nla_put_failure; } nla_nest_end(msg, nest); } - if (req->ie && - nla_put(msg, NL80211_ATTR_IE, req->ie_len, req->ie)) + if (req->req.ie && + nla_put(msg, NL80211_ATTR_IE, req->req.ie_len, req->req.ie)) goto nla_put_failure; - if (req->flags && - nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->flags)) + if (req->req.flags && + nla_put_u32(msg, NL80211_ATTR_SCAN_FLAGS, req->req.flags)) goto nla_put_failure; info = rdev->int_scan_req ? 
&rdev->int_scan_req->info : diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 803b39c26587..ac6884bacf3f 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -456,15 +456,15 @@ rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, } static inline int rdev_scan(struct cfg80211_registered_device *rdev, - struct cfg80211_scan_request *request) + struct cfg80211_scan_request_int *request) { int ret; - if (WARN_ON_ONCE(!request->n_ssids && request->ssids)) + if (WARN_ON_ONCE(!request->req.n_ssids && request->req.ssids)) return -EINVAL; trace_rdev_scan(&rdev->wiphy, request); - ret = rdev->ops->scan(&rdev->wiphy, request); + ret = rdev->ops->scan(&rdev->wiphy, &request->req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index e8a4fe44ec2d..a75cecc47d78 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -782,9 +782,9 @@ cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, } EXPORT_SYMBOL_IF_CFG80211_KUNIT(cfg80211_parse_colocated_ap); -static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, - struct ieee80211_channel *chan, - bool add_to_6ghz) +static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, + struct ieee80211_channel *chan, + bool add_to_6ghz) { int i; u32 n_channels = request->n_channels; @@ -843,25 +843,25 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) u8 i; struct cfg80211_colocated_ap *ap; int n_channels, count = 0, err; - struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; + struct cfg80211_scan_request_int *request, *rdev_req = rdev->scan_req; LIST_HEAD(coloc_ap_list); bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; size_t size, offs_ssids, offs_6ghz_params, offs_ies; - rdev_req->scan_6ghz = true; + rdev_req->req.scan_6ghz = true; if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) return -EOPNOTSUPP; iftd = 
ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ], - rdev_req->wdev->iftype); + rdev_req->req.wdev->iftype); if (!iftd || !iftd->he_cap.has_he) return -EOPNOTSUPP; n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels; - if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { + if (rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { struct cfg80211_internal_bss *intbss; spin_lock_bh(&rdev->bss_lock); @@ -883,8 +883,8 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) * This is relevant for ML probe requests when the lower * band APs have not been discovered. */ - if (is_broadcast_ether_addr(rdev_req->bssid) || - !ether_addr_equal(rdev_req->bssid, res->bssid) || + if (is_broadcast_ether_addr(rdev_req->req.bssid) || + !ether_addr_equal(rdev_req->req.bssid, res->bssid) || res->channel->band != NL80211_BAND_6GHZ) continue; @@ -911,13 +911,13 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) spin_unlock_bh(&rdev->bss_lock); } - size = struct_size(request, channels, n_channels); + size = struct_size(request, req.channels, n_channels); offs_ssids = size; - size += sizeof(*request->ssids) * rdev_req->n_ssids; + size += sizeof(*request->req.ssids) * rdev_req->req.n_ssids; offs_6ghz_params = size; - size += sizeof(*request->scan_6ghz_params) * count; + size += sizeof(*request->req.scan_6ghz_params) * count; offs_ies = size; - size += rdev_req->ie_len; + size += rdev_req->req.ie_len; request = kzalloc(size, GFP_KERNEL); if (!request) { @@ -926,26 +926,26 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) } *request = *rdev_req; - request->n_channels = 0; - request->n_6ghz_params = 0; - if (rdev_req->n_ssids) { + request->req.n_channels = 0; + request->req.n_6ghz_params = 0; + if (rdev_req->req.n_ssids) { /* * Add the ssids from the parent scan request to the new * scan request, so the driver would be able to use them * in its probe requests to discover hidden APs on PSC * 
channels. */ - request->ssids = (void *)request + offs_ssids; - memcpy(request->ssids, rdev_req->ssids, - sizeof(*request->ssids) * request->n_ssids); + request->req.ssids = (void *)request + offs_ssids; + memcpy(request->req.ssids, rdev_req->req.ssids, + sizeof(*request->req.ssids) * request->req.n_ssids); } - request->scan_6ghz_params = (void *)request + offs_6ghz_params; + request->req.scan_6ghz_params = (void *)request + offs_6ghz_params; - if (rdev_req->ie_len) { + if (rdev_req->req.ie_len) { void *ie = (void *)request + offs_ies; - memcpy(ie, rdev_req->ie, rdev_req->ie_len); - request->ie = ie; + memcpy(ie, rdev_req->req.ie, rdev_req->req.ie_len); + request->req.ie = ie; } /* @@ -953,10 +953,12 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) * and at least one of the reported co-located APs with same SSID * indicating that all APs in the same ESS are co-located */ - if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) { + if (count && + request->req.n_ssids == 1 && + request->req.ssids[0].ssid_len) { list_for_each_entry(ap, &coloc_ap_list, list) { if (ap->colocated_ess && - cfg80211_find_ssid_match(ap, request)) { + cfg80211_find_ssid_match(ap, &request->req)) { need_scan_psc = false; break; } @@ -968,51 +970,52 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) * regardless of the collocated APs (PSC channels or all channels * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set) */ - for (i = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ && + for (i = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i]->band == NL80211_BAND_6GHZ && ((need_scan_psc && - cfg80211_channel_is_psc(rdev_req->channels[i])) || - !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { - cfg80211_scan_req_add_chan(request, - rdev_req->channels[i], + cfg80211_channel_is_psc(rdev_req->req.channels[i])) || + !(rdev_req->req.flags & 
NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { + cfg80211_scan_req_add_chan(&request->req, + rdev_req->req.channels[i], false); } } - if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) + if (!(rdev_req->req.flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) goto skip; list_for_each_entry(ap, &coloc_ap_list, list) { bool found = false; struct cfg80211_scan_6ghz_params *scan_6ghz_params = - &request->scan_6ghz_params[request->n_6ghz_params]; + &request->req.scan_6ghz_params[request->req.n_6ghz_params]; struct ieee80211_channel *chan = ieee80211_get_channel(&rdev->wiphy, ap->center_freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED || - !cfg80211_wdev_channel_allowed(rdev_req->wdev, chan)) + !cfg80211_wdev_channel_allowed(rdev_req->req.wdev, chan)) continue; - for (i = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i] == chan) + for (i = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i] == chan) found = true; } if (!found) continue; - if (request->n_ssids > 0 && - !cfg80211_find_ssid_match(ap, request)) + if (request->req.n_ssids > 0 && + !cfg80211_find_ssid_match(ap, &request->req)) continue; - if (!is_broadcast_ether_addr(request->bssid) && - !ether_addr_equal(request->bssid, ap->bssid)) + if (!is_broadcast_ether_addr(request->req.bssid) && + !ether_addr_equal(request->req.bssid, ap->bssid)) continue; - if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid) + if (!request->req.n_ssids && ap->multi_bss && + !ap->transmitted_bssid) continue; - cfg80211_scan_req_add_chan(request, chan, true); + cfg80211_scan_req_add_chan(&request->req, chan, true); memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); scan_6ghz_params->short_ssid = ap->short_ssid; scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid; @@ -1028,14 +1031,14 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) if (cfg80211_channel_is_psc(chan) && !need_scan_psc) scan_6ghz_params->psc_no_listen = true; - request->n_6ghz_params++; + 
request->req.n_6ghz_params++; } skip: cfg80211_free_coloc_ap_list(&coloc_ap_list); - if (request->n_channels) { - struct cfg80211_scan_request *old = rdev->int_scan_req; + if (request->req.n_channels) { + struct cfg80211_scan_request_int *old = rdev->int_scan_req; rdev->int_scan_req = request; @@ -1063,35 +1066,36 @@ skip: int cfg80211_scan(struct cfg80211_registered_device *rdev) { - struct cfg80211_scan_request *request; - struct cfg80211_scan_request *rdev_req = rdev->scan_req; + struct cfg80211_scan_request_int *request; + struct cfg80211_scan_request_int *rdev_req = rdev->scan_req; u32 n_channels = 0, idx, i; if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) return rdev_scan(rdev, rdev_req); - for (i = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) + for (i = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ) n_channels++; } if (!n_channels) return cfg80211_scan_6ghz(rdev); - request = kzalloc(struct_size(request, channels, n_channels), + request = kzalloc(struct_size(request, req.channels, n_channels), GFP_KERNEL); if (!request) return -ENOMEM; *request = *rdev_req; - request->n_channels = n_channels; + request->req.n_channels = n_channels; - for (i = idx = 0; i < rdev_req->n_channels; i++) { - if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) - request->channels[idx++] = rdev_req->channels[i]; + for (i = idx = 0; i < rdev_req->req.n_channels; i++) { + if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ) + request->req.channels[idx++] = + rdev_req->req.channels[i]; } - rdev_req->scan_6ghz = false; + rdev_req->req.scan_6ghz = false; rdev->int_scan_req = request; return rdev_scan(rdev, request); } @@ -1099,7 +1103,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { - struct cfg80211_scan_request *request, *rdev_req; + struct cfg80211_scan_request_int 
*request, *rdev_req; struct wireless_dev *wdev; struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT @@ -1118,12 +1122,12 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (!rdev_req) return; - wdev = rdev_req->wdev; + wdev = rdev_req->req.wdev; request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req; if (wdev_running(wdev) && (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && - !rdev_req->scan_6ghz && !request->info.aborted && + !rdev_req->req.scan_6ghz && !request->info.aborted && !cfg80211_scan_6ghz(rdev)) return; @@ -1136,10 +1140,10 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, cfg80211_sme_scan_done(wdev->netdev); if (!request->info.aborted && - request->flags & NL80211_SCAN_FLAG_FLUSH) { + request->req.flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); - __cfg80211_bss_expire(rdev, request->scan_start); + __cfg80211_bss_expire(rdev, request->req.scan_start); spin_unlock_bh(&rdev->bss_lock); } @@ -1175,13 +1179,16 @@ void __cfg80211_scan_done(struct wiphy *wiphy, struct wiphy_work *wk) void cfg80211_scan_done(struct cfg80211_scan_request *request, struct cfg80211_scan_info *info) { - struct cfg80211_scan_info old_info = request->info; + struct cfg80211_scan_request_int *intreq = + container_of(request, struct cfg80211_scan_request_int, req); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(request->wiphy); + struct cfg80211_scan_info old_info = intreq->info; - trace_cfg80211_scan_done(request, info); - WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req && - request != wiphy_to_rdev(request->wiphy)->int_scan_req); + trace_cfg80211_scan_done(intreq, info); + WARN_ON(intreq != rdev->scan_req && + intreq != rdev->int_scan_req); - request->info = *info; + intreq->info = *info; /* * In case the scan is split, the scan_start_tsf and tsf_bssid should @@ -1189,14 +1196,13 @@ void cfg80211_scan_done(struct cfg80211_scan_request *request, * be non 
zero. */ if (request->scan_6ghz && old_info.scan_start_tsf) { - request->info.scan_start_tsf = old_info.scan_start_tsf; - memcpy(request->info.tsf_bssid, old_info.tsf_bssid, - sizeof(request->info.tsf_bssid)); + intreq->info.scan_start_tsf = old_info.scan_start_tsf; + memcpy(intreq->info.tsf_bssid, old_info.tsf_bssid, + sizeof(intreq->info.tsf_bssid)); } - request->notified = true; - wiphy_work_queue(request->wiphy, - &wiphy_to_rdev(request->wiphy)->scan_done_wk); + intreq->notified = true; + wiphy_work_queue(request->wiphy, &rdev->scan_done_wk); } EXPORT_SYMBOL(cfg80211_scan_done); @@ -3496,7 +3502,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, struct cfg80211_registered_device *rdev; struct wiphy *wiphy; struct iw_scan_req *wreq = NULL; - struct cfg80211_scan_request *creq; + struct cfg80211_scan_request_int *creq; int i, err, n_channels = 0; enum nl80211_band band; @@ -3526,19 +3532,20 @@ int cfg80211_wext_siwscan(struct net_device *dev, n_channels = ieee80211_get_num_supported_channels(wiphy); } - creq = kzalloc(struct_size(creq, channels, n_channels) + + creq = kzalloc(struct_size(creq, req.channels, n_channels) + sizeof(struct cfg80211_ssid), GFP_ATOMIC); if (!creq) return -ENOMEM; - creq->wiphy = wiphy; - creq->wdev = dev->ieee80211_ptr; + creq->req.wiphy = wiphy; + creq->req.wdev = dev->ieee80211_ptr; /* SSIDs come after channels */ - creq->ssids = (void *)creq + struct_size(creq, channels, n_channels); - creq->n_channels = n_channels; - creq->n_ssids = 1; - creq->scan_start = jiffies; + creq->req.ssids = (void *)creq + + struct_size(creq, req.channels, n_channels); + creq->req.n_channels = n_channels; + creq->req.n_ssids = 1; + creq->req.scan_start = jiffies; /* translate "Scan on frequencies" request */ i = 0; @@ -3554,7 +3561,7 @@ int cfg80211_wext_siwscan(struct net_device *dev, /* ignore disabled channels */ chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || - !cfg80211_wdev_channel_allowed(creq->wdev, chan)) 
+ !cfg80211_wdev_channel_allowed(creq->req.wdev, chan)) continue; /* If we have a wireless request structure and the @@ -3577,7 +3584,8 @@ int cfg80211_wext_siwscan(struct net_device *dev, } wext_freq_found: - creq->channels[i] = &wiphy->bands[band]->channels[j]; + creq->req.channels[i] = + &wiphy->bands[band]->channels[j]; i++; wext_freq_not_found: ; } @@ -3588,28 +3596,30 @@ int cfg80211_wext_siwscan(struct net_device *dev, goto out; } - /* Set real number of channels specified in creq->channels[] */ - creq->n_channels = i; + /* Set real number of channels specified in creq->req.channels[] */ + creq->req.n_channels = i; /* translate "Scan for SSID" request */ if (wreq) { if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) return -EINVAL; - memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); - creq->ssids[0].ssid_len = wreq->essid_len; + memcpy(creq->req.ssids[0].ssid, wreq->essid, + wreq->essid_len); + creq->req.ssids[0].ssid_len = wreq->essid_len; } if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) { - creq->ssids = NULL; - creq->n_ssids = 0; + creq->req.ssids = NULL; + creq->req.n_ssids = 0; } } for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) - creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; + creq->req.rates[i] = + (1 << wiphy->bands[i]->n_bitrates) - 1; - eth_broadcast_addr(creq->bssid); + eth_broadcast_addr(creq->req.bssid); scoped_guard(wiphy, &rdev->wiphy) { rdev->scan_req = creq; diff --git a/net/wireless/sme.c b/net/wireless/sme.c index cf998500a965..6d7a7e7f0fc2 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,7 +5,7 @@ * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg - * Copyright (C) 2009, 2020, 2022-2024 Intel Corporation. All rights reserved. + * Copyright (C) 2009, 2020, 2022-2025 Intel Corporation. All rights reserved. 
* Copyright 2017 Intel Deutschland GmbH */ @@ -64,7 +64,7 @@ static void cfg80211_sme_free(struct wireless_dev *wdev) static int cfg80211_conn_scan(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); - struct cfg80211_scan_request *request; + struct cfg80211_scan_request_int *request; int n_channels, err; lockdep_assert_wiphy(wdev->wiphy); @@ -77,13 +77,13 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) else n_channels = ieee80211_get_num_supported_channels(wdev->wiphy); - request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + - sizeof(request->channels[0]) * n_channels, + request = kzalloc(sizeof(*request) + sizeof(request->req.ssids[0]) + + sizeof(request->req.channels[0]) * n_channels, GFP_KERNEL); if (!request) return -ENOMEM; - request->n_channels = n_channels; + request->req.n_channels = n_channels; if (wdev->conn->params.channel) { enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = @@ -93,8 +93,8 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) kfree(request); return -EINVAL; } - request->channels[0] = wdev->conn->params.channel; - request->rates[band] = (1 << sband->n_bitrates) - 1; + request->req.channels[0] = wdev->conn->params.channel; + request->req.rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; enum nl80211_band band; @@ -109,26 +109,26 @@ static int cfg80211_conn_scan(struct wireless_dev *wdev) channel = &bands->channels[j]; if (channel->flags & IEEE80211_CHAN_DISABLED) continue; - request->channels[i++] = channel; + request->req.channels[i++] = channel; } - request->rates[band] = (1 << bands->n_bitrates) - 1; + request->req.rates[band] = (1 << bands->n_bitrates) - 1; } n_channels = i; } - request->n_channels = n_channels; - request->ssids = (void *)request + - struct_size(request, channels, n_channels); - request->n_ssids = 1; + request->req.n_channels = n_channels; + request->req.ssids = (void *)request 
+ + struct_size(request, req.channels, n_channels); + request->req.n_ssids = 1; - memcpy(request->ssids[0].ssid, wdev->conn->params.ssid, - wdev->conn->params.ssid_len); - request->ssids[0].ssid_len = wdev->conn->params.ssid_len; + memcpy(request->req.ssids[0].ssid, wdev->conn->params.ssid, + wdev->conn->params.ssid_len); + request->req.ssids[0].ssid_len = wdev->conn->params.ssid_len; - eth_broadcast_addr(request->bssid); + eth_broadcast_addr(request->req.bssid); - request->wdev = wdev; - request->wiphy = &rdev->wiphy; - request->scan_start = jiffies; + request->req.wdev = wdev; + request->req.wiphy = &rdev->wiphy; + request->req.scan_start = jiffies; rdev->scan_req = request; diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 7e43ab9de923..a07d88d61bec 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -373,7 +373,8 @@ TRACE_EVENT(rdev_return_int, ); TRACE_EVENT(rdev_scan, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request *request), + TP_PROTO(struct wiphy *wiphy, + struct cfg80211_scan_request_int *request), TP_ARGS(wiphy, request), TP_STRUCT__entry( WIPHY_ENTRY @@ -3716,12 +3717,12 @@ TRACE_EVENT(cfg80211_tdls_oper_request, ); TRACE_EVENT(cfg80211_scan_done, - TP_PROTO(struct cfg80211_scan_request *request, + TP_PROTO(struct cfg80211_scan_request_int *request, struct cfg80211_scan_info *info), TP_ARGS(request, info), TP_STRUCT__entry( __field(u32, n_channels) - __dynamic_array(u8, ie, request ? request->ie_len : 0) + __dynamic_array(u8, ie, request ? request->req.ie_len : 0) __array(u32, rates, NUM_NL80211_BANDS) __field(u32, wdev_id) MAC_ENTRY(wiphy_mac) @@ -3732,16 +3733,16 @@ TRACE_EVENT(cfg80211_scan_done, ), TP_fast_assign( if (request) { - memcpy(__get_dynamic_array(ie), request->ie, - request->ie_len); - memcpy(__entry->rates, request->rates, + memcpy(__get_dynamic_array(ie), request->req.ie, + request->req.ie_len); + memcpy(__entry->rates, request->req.rates, NUM_NL80211_BANDS); - __entry->wdev_id = request->wdev ? 
- request->wdev->identifier : 0; - if (request->wiphy) + __entry->wdev_id = request->req.wdev ? + request->req.wdev->identifier : 0; + if (request->req.wiphy) MAC_ASSIGN(wiphy_mac, - request->wiphy->perm_addr); - __entry->no_cck = request->no_cck; + request->req.wiphy->perm_addr); + __entry->no_cck = request->req.no_cck; } if (info) { __entry->aborted = info->aborted; -- cgit v1.2.3 From 984462751d57047828ff4a799cc7d4670a2cfeb2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 9 Jun 2025 21:35:22 +0300 Subject: wifi: mac80211: remove DISALLOW_PUNCTURING_5GHZ code Since iwlwifi was the only driver using this and no longer does, we can remove all this code. Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250609213231.4dff5fb8890f.Ie531f912b252a0042c18c0734db50c3afe1adfb5@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 3 --- net/mac80211/debugfs.c | 3 +-- net/mac80211/mlme.c | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a61ffdbf99be..14a6bd120f25 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -2852,8 +2852,6 @@ struct ieee80211_txq { * * @IEEE80211_HW_DISALLOW_PUNCTURING: HW requires disabling puncturing in EHT * and connecting with a lower bandwidth instead - * @IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ: HW requires disabling puncturing in - * EHT in 5 GHz and connecting with a lower bandwidth instead * * @IEEE80211_HW_HANDLES_QUIET_CSA: HW/driver handles quieting for CSA, so * no need to stop queues. 
This really should be set by a driver that @@ -2923,7 +2921,6 @@ enum ieee80211_hw_flags { IEEE80211_HW_DETECTS_COLOR_COLLISION, IEEE80211_HW_MLO_MCAST_MULTI_LINK_TX, IEEE80211_HW_DISALLOW_PUNCTURING, - IEEE80211_HW_DISALLOW_PUNCTURING_5GHZ, IEEE80211_HW_HANDLES_QUIET_CSA, IEEE80211_HW_STRICT, diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 69e03630f64c..e8b78ec682da 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -4,7 +4,7 @@ * * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2018 - 2019, 2021-2024 Intel Corporation + * Copyright (C) 2018 - 2019, 2021-2025 Intel Corporation */ #include @@ -490,7 +490,6 @@ static const char *hw_flag_names[] = { FLAG(DETECTS_COLOR_COLLISION), FLAG(MLO_MCAST_MULTI_LINK_TX), FLAG(DISALLOW_PUNCTURING), - FLAG(DISALLOW_PUNCTURING_5GHZ), FLAG(HANDLES_QUIET_CSA), FLAG(STRICT), #undef FLAG diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2c700d12eef6..75dfbb06dff2 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -776,10 +776,6 @@ static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata, ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING)) return false; - if (chandef->punctured && chandef->chan->band == NL80211_BAND_5GHZ && - ieee80211_hw_check(&sdata->local->hw, DISALLOW_PUNCTURING_5GHZ)) - return false; - return true; } -- cgit v1.2.3 From 62c57ebb3107842482bc5e3568a0202295a8db0d Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 9 Jun 2025 21:35:23 +0300 Subject: wifi: cfg80211: add a flag for the first part of a scan When there are no non-6 GHz channels, then the 6 GHz scan is the first part of a split scan. Add a boolean denoting whether the scan is the first part of a scan as it might be useful to drivers for internal bookkeeping. This flag is also set if the scan is not split. 
Signed-off-by: Benjamin Berg Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250609213231.07e5a8a452ec.Ibf18f513e507422078fb31b28947e582a20df87a@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 +++++- net/wireless/scan.c | 15 ++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 5d5ad7926877..6ec9a8865b8b 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2754,7 +2754,10 @@ struct cfg80211_scan_6ghz_params { * are 0 in the mask should be randomised, bits that are 1 should * be taken from the @mac_addr * @scan_6ghz: relevant for split scan request only, - * true if this is the second scan request + * true if this is a 6 GHz scan request + * @first_part: %true if this is the first part of a split scan request or a + * scan that was not split. May be %true for a @scan_6ghz scan if no other + * channels were requested * @n_6ghz_params: number of 6 GHz params * @scan_6ghz_params: 6 GHz params * @bssid: BSSID to scan for (most commonly, the wildcard BSSID) @@ -2782,6 +2785,7 @@ struct cfg80211_scan_request { unsigned long scan_start; bool no_cck; bool scan_6ghz; + bool first_part; u32 n_6ghz_params; struct cfg80211_scan_6ghz_params *scan_6ghz_params; s8 tsf_report_link_id; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a75cecc47d78..b963ca5c606e 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -838,7 +838,8 @@ static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, return false; } -static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) +static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev, + bool first_part) { u8 i; struct cfg80211_colocated_ap *ap; @@ -850,6 +851,7 @@ static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) size_t size, offs_ssids, offs_6ghz_params, offs_ies; rdev_req->req.scan_6ghz 
= true; + rdev_req->req.first_part = first_part; if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) return -EOPNOTSUPP; @@ -1046,7 +1048,7 @@ skip: * If this scan follows a previous scan, save the scan start * info from the first part of the scan */ - if (old) + if (!first_part && !WARN_ON(!old)) rdev->int_scan_req->info = old->info; err = rdev_scan(rdev, request); @@ -1070,8 +1072,10 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) struct cfg80211_scan_request_int *rdev_req = rdev->scan_req; u32 n_channels = 0, idx, i; - if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) + if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) { + rdev_req->req.first_part = true; return rdev_scan(rdev, rdev_req); + } for (i = 0; i < rdev_req->req.n_channels; i++) { if (rdev_req->req.channels[i]->band != NL80211_BAND_6GHZ) @@ -1079,7 +1083,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) } if (!n_channels) - return cfg80211_scan_6ghz(rdev); + return cfg80211_scan_6ghz(rdev, true); request = kzalloc(struct_size(request, req.channels, n_channels), GFP_KERNEL); @@ -1096,6 +1100,7 @@ int cfg80211_scan(struct cfg80211_registered_device *rdev) } rdev_req->req.scan_6ghz = false; + rdev_req->req.first_part = true; rdev->int_scan_req = request; return rdev_scan(rdev, request); } @@ -1128,7 +1133,7 @@ void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, if (wdev_running(wdev) && (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && !rdev_req->req.scan_6ghz && !request->info.aborted && - !cfg80211_scan_6ghz(rdev)) + !cfg80211_scan_6ghz(rdev, false)) return; /* -- cgit v1.2.3 From 6b04716cdcac37bdbacde34def08bc6fdb5fc4e2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 9 Jun 2025 21:35:27 +0300 Subject: wifi: mac80211: don't complete management TX on SAE commit When SAE commit is sent and received in response, there's no ordering for the SAE confirm messages. 
As such, don't call drivers to stop listening on the channel when the confirm message is still expected. This fixes an issue if the local confirm is transmitted later than the AP's confirm, for iwlwifi (and possibly mt76) the AP's confirm would then get lost since the device isn't on the channel at the time the AP transmit the confirm. For iwlwifi at least, this also improves the overall timing of the authentication handshake (by about 15ms according to the report), likely since the session protection won't be aborted and rescheduled. Note that even before this, mgd_complete_tx() wasn't always called for each call to mgd_prepare_tx() (e.g. in the case of WEP key shared authentication), and the current drivers that have the complete callback don't seem to mind. Document this as well though. Reported-by: Jan Hendrik Farr Closes: https://lore.kernel.org/all/aB30Ea2kRG24LINR@archlinux/ Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250609213232.12691580e140.I3f1d3127acabcd58348a110ab11044213cf147d3@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 2 ++ net/mac80211/mlme.c | 9 ++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 14a6bd120f25..577fd6a8c372 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -4310,6 +4310,8 @@ struct ieee80211_prep_tx_info { * @mgd_complete_tx: Notify the driver that the response frame for a previously * transmitted frame announced with @mgd_prepare_tx was received, the data * is filled similarly to @mgd_prepare_tx though the duration is not used. + * Note that this isn't always called for each mgd_prepare_tx() call, for + * example for SAE the 'confirm' messages can be on the air in any order. * * @mgd_protect_tdls_discover: Protect a TDLS discovery session. 
After sending * a TDLS discovery-request, we expect a reply to arrive on the AP's diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index aaff7e9c3eb7..8b9c132cce3d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -4780,6 +4780,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, struct ieee80211_prep_tx_info info = { .subtype = IEEE80211_STYPE_AUTH, }; + bool sae_need_confirm = false; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -4825,6 +4826,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, jiffies + IEEE80211_AUTH_WAIT_SAE_RETRY; ifmgd->auth_data->timeout_started = true; run_again(sdata, ifmgd->auth_data->timeout); + if (auth_transaction == 1) + sae_need_confirm = true; goto notify_driver; } @@ -4867,6 +4870,9 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, ifmgd->auth_data->expected_transaction == 2)) { if (!ieee80211_mark_sta_auth(sdata)) return; /* ignore frame -- wait for timeout */ + } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && + auth_transaction == 1) { + sae_need_confirm = true; } else if (ifmgd->auth_data->algorithm == WLAN_AUTH_SAE && auth_transaction == 2) { sdata_info(sdata, "SAE peer confirmed\n"); @@ -4875,7 +4881,8 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, cfg80211_rx_mlme_mgmt(sdata->dev, (u8 *)mgmt, len); notify_driver: - drv_mgd_complete_tx(sdata->local, sdata, &info); + if (!sae_need_confirm) + drv_mgd_complete_tx(sdata->local, sdata, &info); } #define case_WLAN(type) \ -- cgit v1.2.3 From c0ef1446959101d23fdf1b1bdefc6613a83dba03 Mon Sep 17 00:00:00 2001 From: Ivan Vecera Date: Fri, 4 Jul 2025 20:21:53 +0200 Subject: devlink: Add support for u64 parameters Only 8, 16 and 32-bit integers are supported for numeric devlink parameters. The subsequent patch adds support for DPLL clock ID that is defined as 64-bit number. Add support for u64 parameter type. 
Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250704182202.1641943-4-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 ++ net/devlink/param.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index d0ce5a7e984c..4a5896b846a4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -425,6 +425,7 @@ enum devlink_param_type { DEVLINK_PARAM_TYPE_U8 = DEVLINK_VAR_ATTR_TYPE_U8, DEVLINK_PARAM_TYPE_U16 = DEVLINK_VAR_ATTR_TYPE_U16, DEVLINK_PARAM_TYPE_U32 = DEVLINK_VAR_ATTR_TYPE_U32, + DEVLINK_PARAM_TYPE_U64 = DEVLINK_VAR_ATTR_TYPE_U64, DEVLINK_PARAM_TYPE_STRING = DEVLINK_VAR_ATTR_TYPE_STRING, DEVLINK_PARAM_TYPE_BOOL = DEVLINK_VAR_ATTR_TYPE_FLAG, }; @@ -433,6 +434,7 @@ union devlink_param_value { u8 vu8; u16 vu16; u32 vu32; + u64 vu64; char vstr[__DEVLINK_PARAM_MAX_STRING_VALUE]; bool vbool; }; diff --git a/net/devlink/param.c b/net/devlink/param.c index 396b8a7f6013..9709b41664aa 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -200,6 +200,11 @@ devlink_nl_param_value_fill_one(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) goto value_nest_cancel; break; + case DEVLINK_PARAM_TYPE_U64: + if (devlink_nl_put_u64(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, + val.vu64)) + goto value_nest_cancel; + break; case DEVLINK_PARAM_TYPE_STRING: if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vstr)) @@ -434,6 +439,11 @@ devlink_param_value_get_from_info(const struct devlink_param *param, return -EINVAL; value->vu32 = nla_get_u32(param_data); break; + case DEVLINK_PARAM_TYPE_U64: + if (nla_len(param_data) != sizeof(u64)) + return -EINVAL; + value->vu64 = nla_get_u64(param_data); + break; case DEVLINK_PARAM_TYPE_STRING: len = strnlen(nla_data(param_data), nla_len(param_data)); if (len == nla_len(param_data) || -- cgit v1.2.3 From de9ccf2296ac323a571e442b5730ca9cc259fbf0 Mon Sep 17 
00:00:00 2001 From: Ivan Vecera Date: Fri, 4 Jul 2025 20:21:54 +0200 Subject: devlink: Add new "clock_id" generic device param Add a new device generic parameter to specify clock ID that should be used by the device for registering DPLL devices and pins. Signed-off-by: Ivan Vecera Reviewed-by: Jiri Pirko Link: https://patch.msgid.link/20250704182202.1641943-5-ivecera@redhat.com Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-params.rst | 3 +++ include/net/devlink.h | 4 ++++ net/devlink/param.c | 5 +++++ 3 files changed, 12 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-params.rst b/Documentation/networking/devlink/devlink-params.rst index 3da8f4ef2417..211b58177e12 100644 --- a/Documentation/networking/devlink/devlink-params.rst +++ b/Documentation/networking/devlink/devlink-params.rst @@ -140,3 +140,6 @@ own name. * - ``enable_phc`` - Boolean - Enable PHC (PTP Hardware Clock) functionality in the device. + * - ``clock_id`` + - u64 + - Clock ID used by the device for registering DPLL devices and pins. 
diff --git a/include/net/devlink.h b/include/net/devlink.h index 4a5896b846a4..93640a29427c 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -525,6 +525,7 @@ enum devlink_param_generic_id { DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, DEVLINK_PARAM_GENERIC_ID_ENABLE_PHC, + DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, /* add new param generic ids above here*/ __DEVLINK_PARAM_GENERIC_ID_MAX, @@ -586,6 +587,9 @@ enum devlink_param_generic_id { #define DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME "enable_phc" #define DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE DEVLINK_PARAM_TYPE_BOOL +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME "clock_id" +#define DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE DEVLINK_PARAM_TYPE_U64 + #define DEVLINK_PARAM_GENERIC(_id, _cmodes, _get, _set, _validate) \ { \ .id = DEVLINK_PARAM_GENERIC_ID_##_id, \ diff --git a/net/devlink/param.c b/net/devlink/param.c index 9709b41664aa..41dcc86cfd94 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -97,6 +97,11 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_ENABLE_PHC_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_PHC_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_CLOCK_ID, + .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, + .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) -- cgit v1.2.3 From dd831ac8221e691e9e918585b1003c7071df0379 Mon Sep 17 00:00:00 2001 From: Xiang Mei Date: Sat, 5 Jul 2025 14:21:43 -0700 Subject: net/sched: sch_qfq: Fix null-deref in agg_dequeue To prevent a potential crash in agg_dequeue (net/sched/sch_qfq.c) when cl->qdisc->ops->peek(cl->qdisc) returns NULL, we check the return value before using it, similar to the existing approach in sch_hfsc.c. To avoid code duplication, the following changes are made: 1. Changed qdisc_warn_nonwc(include/net/pkt_sched.h) into a static inline function. 2. 
Moved qdisc_peek_len from net/sched/sch_hfsc.c to include/net/pkt_sched.h so that sch_qfq can reuse it. 3. Applied qdisc_peek_len in agg_dequeue to avoid crashing. Signed-off-by: Xiang Mei Reviewed-by: Cong Wang Link: https://patch.msgid.link/20250705212143.3982664-1-xmei5@asu.edu Signed-off-by: Paolo Abeni --- include/net/pkt_sched.h | 25 ++++++++++++++++++++++++- net/sched/sch_api.c | 10 ---------- net/sched/sch_hfsc.c | 16 ---------------- net/sched/sch_qfq.c | 2 +- 4 files changed, 25 insertions(+), 28 deletions(-) (limited to 'include/net') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index d7b7b6cd4aa1..8a75c73fc555 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -114,7 +114,6 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct netlink_ext_ack *extack); void qdisc_put_rtab(struct qdisc_rate_table *tab); void qdisc_put_stab(struct qdisc_size_table *tab); -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc); bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq, spinlock_t *root_lock, bool validate); @@ -290,4 +289,28 @@ static inline bool tc_qdisc_stats_dump(struct Qdisc *sch, return true; } +static inline void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) +{ + if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { + pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", + txt, qdisc->ops->id, qdisc->handle >> 16); + qdisc->flags |= TCQ_F_WARN_NONWC; + } +} + +static inline unsigned int qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->ops->peek(sch); + if (unlikely(skb == NULL)) { + qdisc_warn_nonwc("qdisc_peek_len", sch); + return 0; + } + len = qdisc_pkt_len(skb); + + return len; +} + #endif diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 241e86cec9c5..d7c767b861a4 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -601,16 +601,6 @@ out: qdisc_skb_cb(skb)->pkt_len = 
pkt_len; } -void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) -{ - if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { - pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", - txt, qdisc->ops->id, qdisc->handle >> 16); - qdisc->flags |= TCQ_F_WARN_NONWC; - } -} -EXPORT_SYMBOL(qdisc_warn_nonwc); - static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 5a7745170e84..d8fd35da32a7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -835,22 +835,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) } } -static unsigned int -qdisc_peek_len(struct Qdisc *sch) -{ - struct sk_buff *skb; - unsigned int len; - - skb = sch->ops->peek(sch); - if (unlikely(skb == NULL)) { - qdisc_warn_nonwc("qdisc_peek_len", sch); - return 0; - } - len = qdisc_pkt_len(skb); - - return len; -} - static void hfsc_adjust_levels(struct hfsc_class *cl) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index bf1282cb22eb..bcce36608871 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -989,7 +989,7 @@ static struct sk_buff *agg_dequeue(struct qfq_aggregate *agg, if (cl->qdisc->q.qlen == 0) /* no more packets, remove from list */ list_del_init(&cl->alist); - else if (cl->deficit < qdisc_pkt_len(cl->qdisc->ops->peek(cl->qdisc))) { + else if (cl->deficit < qdisc_peek_len(cl->qdisc)) { cl->deficit += agg->lmax; list_move_tail(&cl->alist, &agg->active); } -- cgit v1.2.3 From 45e359be1ce88fb22e61fa3aa23b2e450a6cae03 Mon Sep 17 00:00:00 2001 From: Jason Xing Date: Sat, 5 Jul 2025 00:01:38 +0800 Subject: net: xsk: introduce XDP_MAX_TX_SKB_BUDGET setsockopt This patch provides a setsockopt method to let applications leverage to adjust how many descs to be handled at most in one send syscall. It mitigates the situation where the default value (32) that is too small leads to higher frequency of triggering send syscall. 
Considering the prosperity/complexity the applications have, there is no absolutely ideal suggestion fitting all cases. So keep 32 as its default value like before. The patch does the following things: - Add XDP_MAX_TX_SKB_BUDGET socket option. - Set max_tx_budget to 32 by default in the initialization phase as a per-socket granular control. - Set the range of max_tx_budget as [32, xs->tx->nentries]. The idea behind this comes out of real workloads in production. We use a user-level stack with xsk support to accelerate sending packets and minimize triggering syscalls. When the packets are aggregated, it's not hard to hit the upper bound (namely, 32). The moment user-space stack fetches the -EAGAIN error number passed from sendto(), it will loop to try again until all the expected descs from tx ring are sent out to the driver. Enlarging the XDP_MAX_TX_SKB_BUDGET value contributes to less frequency of sendto() and higher throughput/PPS. Here is what I did in production, along with some numbers as follows: For one application I saw lately, I suggested using 128 as max_tx_budget because I saw two limitations without changing any default configuration: 1) XDP_MAX_TX_SKB_BUDGET, 2) socket sndbuf which is 212992 decided by net.core.wmem_default. As to XDP_MAX_TX_SKB_BUDGET, the scenario behind this was I counted how many descs are transmitted to the driver at one time of sendto() based on [1] patch and then I calculated the possibility of hitting the upper bound. Finally I chose 128 as a suitable value because 1) it covers most of the cases, 2) a higher number would not bring evident results. 
After twisting the parameters, a stable improvement of around 4% for both PPS and throughput and less resources consumption were found to be observed by strace -c -p xxx: 1) %time was decreased by 7.8% 2) error counter was decreased from 18367 to 572 [1]: https://lore.kernel.org/all/20250619093641.70700-1-kerneljasonxing@gmail.com/ Signed-off-by: Jason Xing Acked-by: Maciej Fijalkowski Link: https://patch.msgid.link/20250704160138.48677-1-kerneljasonxing@gmail.com Signed-off-by: Paolo Abeni --- Documentation/networking/af_xdp.rst | 9 +++++++++ include/net/xdp_sock.h | 1 + include/uapi/linux/if_xdp.h | 1 + net/xdp/xsk.c | 21 +++++++++++++++++++-- tools/include/uapi/linux/if_xdp.h | 1 + 5 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/af_xdp.rst b/Documentation/networking/af_xdp.rst index d486014bb31d..50d92084a49c 100644 --- a/Documentation/networking/af_xdp.rst +++ b/Documentation/networking/af_xdp.rst @@ -438,6 +438,15 @@ is created by a privileged process and passed to a non-privileged one. Once the option is set, kernel will refuse attempts to bind that socket to a different interface. Updating the value requires CAP_NET_RAW. +XDP_MAX_TX_SKB_BUDGET setsockopt +-------------------------------- + +This setsockopt sets the maximum number of descriptors that can be handled +and passed to the driver at one send syscall. It is applied in the copy +mode to allow application to tune the per-socket maximum iteration for +better throughput and less frequency of send syscall. +Allowed range is [32, xs->tx->nentries]. 
+ XDP_STATISTICS getsockopt ------------------------- diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index e8bd6ddb7b12..ce587a225661 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -84,6 +84,7 @@ struct xdp_sock { struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; + u32 max_tx_budget; /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index 44f2bb93e7e6..23a062781468 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -79,6 +79,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 #define XDP_OPTIONS 8 +#define XDP_MAX_TX_SKB_BUDGET 9 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index bd61b0bc9c24..9c3acecc14b1 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -34,7 +34,7 @@ #include "xsk.h" #define TX_BATCH_SIZE 32 -#define MAX_PER_SOCKET_BUDGET (TX_BATCH_SIZE) +#define MAX_PER_SOCKET_BUDGET 32 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { @@ -783,10 +783,10 @@ free_err: static int __xsk_generic_xmit(struct sock *sk) { struct xdp_sock *xs = xdp_sk(sk); - u32 max_batch = TX_BATCH_SIZE; bool sent_frame = false; struct xdp_desc desc; struct sk_buff *skb; + u32 max_batch; int err = 0; mutex_lock(&xs->mutex); @@ -800,6 +800,7 @@ static int __xsk_generic_xmit(struct sock *sk) if (xs->queue_id >= xs->dev->real_num_tx_queues) goto out; + max_batch = READ_ONCE(xs->max_tx_budget); while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) { if (max_batch-- == 0) { err = -EAGAIN; @@ -1440,6 +1441,21 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, mutex_unlock(&xs->mutex); return err; } + case XDP_MAX_TX_SKB_BUDGET: + { + unsigned int budget; + + if (optlen != sizeof(budget)) + return -EINVAL; + if 
(copy_from_sockptr(&budget, optval, sizeof(budget))) + return -EFAULT; + if (!xs->tx || + budget < TX_BATCH_SIZE || budget > xs->tx->nentries) + return -EACCES; + + WRITE_ONCE(xs->max_tx_budget, budget); + return 0; + } default: break; } @@ -1737,6 +1753,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); xs->state = XSK_READY; + xs->max_tx_budget = TX_BATCH_SIZE; mutex_init(&xs->mutex); INIT_LIST_HEAD(&xs->map_list); diff --git a/tools/include/uapi/linux/if_xdp.h b/tools/include/uapi/linux/if_xdp.h index 44f2bb93e7e6..23a062781468 100644 --- a/tools/include/uapi/linux/if_xdp.h +++ b/tools/include/uapi/linux/if_xdp.h @@ -79,6 +79,7 @@ struct xdp_mmap_offsets { #define XDP_UMEM_COMPLETION_RING 6 #define XDP_STATISTICS 7 #define XDP_OPTIONS 8 +#define XDP_MAX_TX_SKB_BUDGET 9 struct xdp_umem_reg { __u64 addr; /* Start of packet data area */ -- cgit v1.2.3 From 96698d1898bc79c783990ac7d5458b7c8f8e0b69 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 8 Jul 2025 11:33:42 +0800 Subject: net: replace ND_PRINTK with dynamic debug ND_PRINTK with val > 1 only works when the ND_DEBUG was set in compilation phase. Replace it with dynamic debug. Convert ND_PRINTK with val <= 1 to net_{err,warn}_ratelimited, and convert the rest to net_dbg_ratelimited. Suggested-by: Ido Schimmel Signed-off-by: Wang Liang Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/20250708033342.1627636-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- include/net/ndisc.h | 9 --- net/6lowpan/ndisc.c | 16 ++---- net/ipv6/ndisc.c | 157 ++++++++++++++++++++-------------------------------- 3 files changed, 67 insertions(+), 115 deletions(-) (limited to 'include/net') diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 3c88d5bc5eed..d38783a2ce57 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -60,15 +60,6 @@ enum { #include -/* Set to 3 to get tracing... */ -#define ND_DEBUG 1 - -#define ND_PRINTK(val, level, fmt, ...) 
\ -do { \ - if (val <= ND_DEBUG) \ - net_##level##_ratelimited(fmt, ##__VA_ARGS__); \ -} while (0) - struct ctl_table; struct inet6_dev; struct net_device; diff --git a/net/6lowpan/ndisc.c b/net/6lowpan/ndisc.c index c40b98f7743c..868d28583c0a 100644 --- a/net/6lowpan/ndisc.c +++ b/net/6lowpan/ndisc.c @@ -20,9 +20,8 @@ static int lowpan_ndisc_parse_802154_options(const struct net_device *dev, switch (nd_opt->nd_opt_len) { case NDISC_802154_SHORT_ADDR_LENGTH: if (ndopts->nd_802154_opt_array[nd_opt->nd_opt_type]) - ND_PRINTK(2, warn, - "%s: duplicated short addr ND6 option found: type=%d\n", - __func__, nd_opt->nd_opt_type); + net_dbg_ratelimited("%s: duplicated short addr ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); else ndopts->nd_802154_opt_array[nd_opt->nd_opt_type] = nd_opt; return 1; @@ -63,8 +62,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_src_lladdr, IEEE802154_SHORT_ADDR_LEN, 0); if (!lladdr_short) { - ND_PRINTK(2, warn, - "NA: invalid short link-layer address length\n"); + net_dbg_ratelimited("NA: invalid short link-layer address length\n"); return; } } @@ -75,8 +73,7 @@ static void lowpan_ndisc_802154_update(struct neighbour *n, u32 flags, lladdr_short = __ndisc_opt_addr_data(ndopts->nd_802154_opts_tgt_lladdr, IEEE802154_SHORT_ADDR_LEN, 0); if (!lladdr_short) { - ND_PRINTK(2, warn, - "NA: invalid short link-layer address length\n"); + net_dbg_ratelimited("NA: invalid short link-layer address length\n"); return; } } @@ -209,9 +206,8 @@ static void lowpan_ndisc_prefix_rcv_add_addr(struct net *net, sllao, tokenized, valid_lft, prefered_lft); if (err) - ND_PRINTK(2, warn, - "RA: could not add a short address based address for prefix: %pI6c\n", - &pinfo->prefix); + net_dbg_ratelimited("RA: could not add a short address based address for prefix: %pI6c\n", + &pinfo->prefix); } } #endif diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 
28f35cbb6577..d4c5876e1771 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -243,9 +243,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, case ND_OPT_NONCE: case ND_OPT_REDIRECT_HDR: if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { - ND_PRINTK(2, warn, - "%s: duplicated ND6 option found: type=%d\n", - __func__, nd_opt->nd_opt_type); + net_dbg_ratelimited("%s: duplicated ND6 option found: type=%d\n", + __func__, nd_opt->nd_opt_type); } else { ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; } @@ -275,11 +274,8 @@ struct ndisc_options *ndisc_parse_options(const struct net_device *dev, * to accommodate future extension to the * protocol. */ - ND_PRINTK(2, notice, - "%s: ignored unsupported option; type=%d, len=%d\n", - __func__, - nd_opt->nd_opt_type, - nd_opt->nd_opt_len); + net_dbg_ratelimited("%s: ignored unsupported option; type=%d, len=%d\n", + __func__, nd_opt->nd_opt_type, nd_opt->nd_opt_len); } next_opt: opt_len -= l; @@ -754,9 +750,8 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) { - ND_PRINTK(1, dbg, - "%s: trying to ucast probe in NUD_INVALID: %pI6\n", - __func__, target); + net_dbg_ratelimited("%s: trying to ucast probe in NUD_INVALID: %pI6\n", + __func__, target); } ndisc_send_ns(dev, target, target, saddr, 0); } else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) { @@ -814,7 +809,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK(2, warn, "NS: multicast target address\n"); + net_dbg_ratelimited("NS: multicast target address\n"); return reason; } @@ -823,7 +818,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) * DAD has to be destined for solicited node multicast address. 
*/ if (dad && !ipv6_addr_is_solict_mult(daddr)) { - ND_PRINTK(2, warn, "NS: bad DAD packet (wrong destination)\n"); + net_dbg_ratelimited("NS: bad DAD packet (wrong destination)\n"); return reason; } @@ -833,8 +828,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) if (ndopts.nd_opts_src_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); if (!lladdr) { - ND_PRINTK(2, warn, - "NS: invalid link-layer address length\n"); + net_dbg_ratelimited("NS: invalid link-layer address length\n"); return reason; } @@ -844,8 +838,7 @@ static enum skb_drop_reason ndisc_recv_ns(struct sk_buff *skb) * in the message. */ if (dad) { - ND_PRINTK(2, warn, - "NS: bad DAD packet (link-layer address option)\n"); + net_dbg_ratelimited("NS: bad DAD packet (link-layer address option)\n"); return reason; } } @@ -862,10 +855,8 @@ have_ifp: if (nonce != 0 && ifp->dad_nonce == nonce) { u8 *np = (u8 *)&nonce; /* Matching nonce if looped back */ - ND_PRINTK(2, notice, - "%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", - ifp->idev->dev->name, - &ifp->addr, np); + net_dbg_ratelimited("%s: IPv6 DAD loopback for address %pI6c nonce %pM ignored\n", + ifp->idev->dev->name, &ifp->addr, np); goto out; } /* @@ -1016,13 +1007,13 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) return SKB_DROP_REASON_PKT_TOO_SMALL; if (ipv6_addr_is_multicast(&msg->target)) { - ND_PRINTK(2, warn, "NA: target address is multicast\n"); + net_dbg_ratelimited("NA: target address is multicast\n"); return reason; } if (ipv6_addr_is_multicast(daddr) && msg->icmph.icmp6_solicited) { - ND_PRINTK(2, warn, "NA: solicited NA is multicasted\n"); + net_dbg_ratelimited("NA: solicited NA is multicasted\n"); return reason; } @@ -1041,8 +1032,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) if (ndopts.nd_opts_tgt_lladdr) { lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); if (!lladdr) { - ND_PRINTK(2, warn, - "NA: invalid link-layer address 
length\n"); + net_dbg_ratelimited("NA: invalid link-layer address length\n"); return reason; } } @@ -1063,9 +1053,9 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) unsolicited advertisement. */ if (skb->pkt_type != PACKET_LOOPBACK) - ND_PRINTK(1, warn, - "NA: %pM advertised our address %pI6c on %s!\n", - eth_hdr(skb)->h_source, &ifp->addr, ifp->idev->dev->name); + net_warn_ratelimited("NA: %pM advertised our address %pI6c on %s!\n", + eth_hdr(skb)->h_source, &ifp->addr, + ifp->idev->dev->name); in6_ifa_put(ifp); return reason; } @@ -1152,7 +1142,7 @@ static enum skb_drop_reason ndisc_recv_rs(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); if (!idev) { - ND_PRINTK(1, err, "RS: can't find in6 device\n"); + net_err_ratelimited("RS: can't find in6 device\n"); return reason; } @@ -1260,11 +1250,9 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) optlen = (skb_tail_pointer(skb) - skb_transport_header(skb)) - sizeof(struct ra_msg); - ND_PRINTK(2, info, - "RA: %s, dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, dev: %s\n", __func__, skb->dev->name); if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, "RA: source address is not link-local\n"); + net_dbg_ratelimited("RA: source address is not link-local\n"); return reason; } if (optlen < 0) @@ -1272,15 +1260,14 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) #ifdef CONFIG_IPV6_NDISC_NODETYPE if (skb->ndisc_nodetype == NDISC_NODETYPE_HOST) { - ND_PRINTK(2, warn, "RA: from host or unauthorized router\n"); + net_dbg_ratelimited("RA: from host or unauthorized router\n"); return reason; } #endif in6_dev = __in6_dev_get(skb->dev); if (!in6_dev) { - ND_PRINTK(0, err, "RA: can't find inet6 device for %s\n", - skb->dev->name); + net_err_ratelimited("RA: can't find inet6 device for %s\n", skb->dev->name); return reason; } @@ -1288,18 +1275,16 @@ static enum skb_drop_reason 
ndisc_router_discovery(struct sk_buff *skb) return SKB_DROP_REASON_IPV6_NDISC_BAD_OPTIONS; if (!ipv6_accept_ra(in6_dev)) { - ND_PRINTK(2, info, - "RA: %s, did not accept ra for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, did not accept ra for dev: %s\n", __func__, + skb->dev->name); goto skip_linkparms; } #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific parameters from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { - ND_PRINTK(2, info, - "RA: %s, nodetype is NODEFAULT, dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT, dev: %s\n", __func__, + skb->dev->name); goto skip_linkparms; } #endif @@ -1328,18 +1313,16 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) send_ifinfo_notify = true; if (!READ_ONCE(in6_dev->cnf.accept_ra_defrtr)) { - ND_PRINTK(2, info, - "RA: %s, defrtr is false for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, defrtr is false for dev: %s\n", __func__, + skb->dev->name); goto skip_defrtr; } lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); if (lifetime != 0 && lifetime < READ_ONCE(in6_dev->cnf.accept_ra_min_lft)) { - ND_PRINTK(2, info, - "RA: router lifetime (%ds) is too short: %s\n", - lifetime, skb->dev->name); + net_dbg_ratelimited("RA: router lifetime (%ds) is too short: %s\n", lifetime, + skb->dev->name); goto skip_defrtr; } @@ -1349,9 +1332,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) net = dev_net(in6_dev->dev); if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(net, &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { - ND_PRINTK(2, info, - "RA from local address detected on dev: %s: default router ignored\n", - skb->dev->name); + net_dbg_ratelimited("RA from local address detected on dev: %s: default router ignored\n", + skb->dev->name); goto skip_defrtr; } @@ -1369,9 +1351,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff 
*skb) rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { - ND_PRINTK(0, err, - "RA: %s got default router without neighbour\n", - __func__); + net_err_ratelimited("RA: %s got default router without neighbour\n", + __func__); fib6_info_release(rt); return reason; } @@ -1384,10 +1365,10 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) rt = NULL; } - ND_PRINTK(3, info, "RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", - rt, lifetime, defrtr_usr_metric, skb->dev->name); + net_dbg_ratelimited("RA: rt: %p lifetime: %d, metric: %d, for dev: %s\n", rt, lifetime, + defrtr_usr_metric, skb->dev->name); if (!rt && lifetime) { - ND_PRINTK(3, info, "RA: adding default router\n"); + net_dbg_ratelimited("RA: adding default router\n"); if (neigh) neigh_release(neigh); @@ -1396,9 +1377,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) skb->dev, pref, defrtr_usr_metric, lifetime); if (!rt) { - ND_PRINTK(0, err, - "RA: %s failed to add default route\n", - __func__); + net_err_ratelimited("RA: %s failed to add default route\n", __func__); return reason; } @@ -1406,9 +1385,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) rt->fib6_nh->fib_nh_dev, NULL, &ipv6_hdr(skb)->saddr); if (!neigh) { - ND_PRINTK(0, err, - "RA: %s got default router without neighbour\n", - __func__); + net_err_ratelimited("RA: %s got default router without neighbour\n", + __func__); fib6_info_release(rt); return reason; } @@ -1439,7 +1417,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) fib6_metric_set(rt, RTAX_HOPLIMIT, ra_msg->icmph.icmp6_hop_limit); } else { - ND_PRINTK(2, warn, "RA: Got route advertisement with lower hop_limit than minimum\n"); + net_dbg_ratelimited("RA: Got route advertisement with lower hop_limit than minimum\n"); } } @@ -1495,8 +1473,7 @@ skip_linkparms: lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, skb->dev); if (!lladdr) { - ND_PRINTK(2, warn, - "RA: 
invalid link-layer address length\n"); + net_dbg_ratelimited("RA: invalid link-layer address length\n"); goto out; } } @@ -1510,9 +1487,8 @@ skip_linkparms: } if (!ipv6_accept_ra(in6_dev)) { - ND_PRINTK(2, info, - "RA: %s, accept_ra is false for dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, accept_ra is false for dev: %s\n", __func__, + skb->dev->name); goto out; } @@ -1520,9 +1496,8 @@ skip_linkparms: if (!READ_ONCE(in6_dev->cnf.accept_ra_from_local) && ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr, in6_dev->dev, 0)) { - ND_PRINTK(2, info, - "RA from local address detected on dev: %s: router info ignored.\n", - skb->dev->name); + net_dbg_ratelimited("RA from local address detected on dev: %s: router info ignored.\n", + skb->dev->name); goto skip_routeinfo; } @@ -1558,9 +1533,8 @@ skip_routeinfo: #ifdef CONFIG_IPV6_NDISC_NODETYPE /* skip link-specific ndopts from interior routers */ if (skb->ndisc_nodetype == NDISC_NODETYPE_NODEFAULT) { - ND_PRINTK(2, info, - "RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", - __func__, skb->dev->name); + net_dbg_ratelimited("RA: %s, nodetype is NODEFAULT (interior routes), dev: %s\n", + __func__, skb->dev->name); goto out; } #endif @@ -1589,7 +1563,7 @@ skip_routeinfo: } if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { - ND_PRINTK(2, warn, "RA: invalid mtu: %d\n", mtu); + net_dbg_ratelimited("RA: invalid mtu: %d\n", mtu); } else if (READ_ONCE(in6_dev->cnf.mtu6) != mtu) { WRITE_ONCE(in6_dev->cnf.mtu6, mtu); fib6_metric_set(rt, RTAX_MTU, mtu); @@ -1608,7 +1582,7 @@ skip_routeinfo: } if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { - ND_PRINTK(2, warn, "RA: invalid RA options\n"); + net_dbg_ratelimited("RA: invalid RA options\n"); } out: /* Send a notify if RA changed managed/otherconf flags or @@ -1636,15 +1610,13 @@ static enum skb_drop_reason ndisc_redirect_rcv(struct sk_buff *skb) switch (skb->ndisc_nodetype) { case NDISC_NODETYPE_HOST: case NDISC_NODETYPE_NODEFAULT: - 
ND_PRINTK(2, warn, - "Redirect: from host or unauthorized router\n"); + net_dbg_ratelimited("Redirect: from host or unauthorized router\n"); return reason; } #endif if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, - "Redirect: source address is not link-local\n"); + net_dbg_ratelimited("Redirect: source address is not link-local\n"); return reason; } @@ -1705,15 +1677,13 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) } if (ipv6_get_lladdr(dev, &saddr_buf, IFA_F_TENTATIVE)) { - ND_PRINTK(2, warn, "Redirect: no link-local address on %s\n", - dev->name); + net_dbg_ratelimited("Redirect: no link-local address on %s\n", dev->name); return; } if (!ipv6_addr_equal(&ipv6_hdr(skb)->daddr, target) && ipv6_addr_type(target) != (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) { - ND_PRINTK(2, warn, - "Redirect: target address is not link-local unicast\n"); + net_dbg_ratelimited("Redirect: target address is not link-local unicast\n"); return; } @@ -1732,8 +1702,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) rt = dst_rt6_info(dst); if (rt->rt6i_flags & RTF_GATEWAY) { - ND_PRINTK(2, warn, - "Redirect: destination is not a neighbour\n"); + net_dbg_ratelimited("Redirect: destination is not a neighbour\n"); goto release; } @@ -1746,8 +1715,7 @@ void ndisc_send_redirect(struct sk_buff *skb, const struct in6_addr *target) if (dev->addr_len) { struct neighbour *neigh = dst_neigh_lookup(skb_dst(skb), target); if (!neigh) { - ND_PRINTK(2, warn, - "Redirect: no neigh for target address\n"); + net_dbg_ratelimited("Redirect: no neigh for target address\n"); goto release; } @@ -1848,14 +1816,12 @@ enum skb_drop_reason ndisc_rcv(struct sk_buff *skb) __skb_push(skb, skb->data - skb_transport_header(skb)); if (ipv6_hdr(skb)->hop_limit != 255) { - ND_PRINTK(2, warn, "NDISC: invalid hop-limit: %d\n", - ipv6_hdr(skb)->hop_limit); + net_dbg_ratelimited("NDISC: invalid hop-limit: %d\n", 
ipv6_hdr(skb)->hop_limit); return SKB_DROP_REASON_IPV6_NDISC_HOP_LIMIT; } if (msg->icmph.icmp6_code != 0) { - ND_PRINTK(2, warn, "NDISC: invalid ICMPv6 code: %d\n", - msg->icmph.icmp6_code); + net_dbg_ratelimited("NDISC: invalid ICMPv6 code: %d\n", msg->icmph.icmp6_code); return SKB_DROP_REASON_IPV6_NDISC_BAD_CODE; } @@ -2006,9 +1972,8 @@ static int __net_init ndisc_net_init(struct net *net) err = inet_ctl_sock_create(&sk, PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, net); if (err < 0) { - ND_PRINTK(0, err, - "NDISC: Failed to initialize the control socket (err %d)\n", - err); + net_err_ratelimited("NDISC: Failed to initialize the control socket (err %d)\n", + err); return err; } -- cgit v1.2.3 From 18cdb3d982da8976b28d57691eb256ec5688fad2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Jul 2025 12:45:17 +0000 Subject: netfilter: flowtable: account for Ethernet header in nf_flow_pppoe_proto() syzbot found a potential access to uninit-value in nf_flow_pppoe_proto() Blamed commit forgot the Ethernet header. 
BUG: KMSAN: uninit-value in nf_flow_offload_inet_hook+0x7e4/0x940 net/netfilter/nf_flow_table_inet.c:27 nf_flow_offload_inet_hook+0x7e4/0x940 net/netfilter/nf_flow_table_inet.c:27 nf_hook_entry_hookfn include/linux/netfilter.h:157 [inline] nf_hook_slow+0xe1/0x3d0 net/netfilter/core.c:623 nf_hook_ingress include/linux/netfilter_netdev.h:34 [inline] nf_ingress net/core/dev.c:5742 [inline] __netif_receive_skb_core+0x4aff/0x70c0 net/core/dev.c:5837 __netif_receive_skb_one_core net/core/dev.c:5975 [inline] __netif_receive_skb+0xcc/0xac0 net/core/dev.c:6090 netif_receive_skb_internal net/core/dev.c:6176 [inline] netif_receive_skb+0x57/0x630 net/core/dev.c:6235 tun_rx_batched+0x1df/0x980 drivers/net/tun.c:1485 tun_get_user+0x4ee0/0x6b40 drivers/net/tun.c:1938 tun_chr_write_iter+0x3e9/0x5c0 drivers/net/tun.c:1984 new_sync_write fs/read_write.c:593 [inline] vfs_write+0xb4b/0x1580 fs/read_write.c:686 ksys_write fs/read_write.c:738 [inline] __do_sys_write fs/read_write.c:749 [inline] Reported-by: syzbot+bf6ed459397e307c3ad2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/686bc073.a00a0220.c7b3.0086.GAE@google.com/T/#u Fixes: 87b3593bed18 ("netfilter: flowtable: validate pppoe header") Signed-off-by: Eric Dumazet Reviewed-by: Pablo Neira Ayuso Link: https://patch.msgid.link/20250707124517.614489-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/netfilter/nf_flow_table.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index d711642e78b5..c003cd194fa2 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -370,7 +370,7 @@ static inline __be16 __nf_flow_pppoe_proto(const struct sk_buff *skb) static inline bool nf_flow_pppoe_proto(struct sk_buff *skb, __be16 *inner_proto) { - if (!pskb_may_pull(skb, PPPOE_SES_HLEN)) + if (!pskb_may_pull(skb, ETH_HLEN + PPPOE_SES_HLEN)) return false; 
*inner_proto = __nf_flow_pppoe_proto(skb); -- cgit v1.2.3 From 6e816e1c052b453a93aeb8b57ede9acde58c458d Mon Sep 17 00:00:00 2001 From: Tao Chen Date: Thu, 10 Jul 2025 11:20:35 +0800 Subject: bpf: Remove location field in tcx_link Use attach_type in bpf_link to replace the location field, and remove location field in tcx_link. Signed-off-by: Tao Chen Signed-off-by: Andrii Nakryiko Acked-by: Daniel Borkmann Acked-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20250710032038.888700-5-chen.dylane@linux.dev --- include/net/tcx.h | 1 - kernel/bpf/tcx.c | 13 ++++++------- 2 files changed, 6 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/tcx.h b/include/net/tcx.h index 5ce0ce9e0c02..23a61af13547 100644 --- a/include/net/tcx.h +++ b/include/net/tcx.h @@ -20,7 +20,6 @@ struct tcx_entry { struct tcx_link { struct bpf_link link; struct net_device *dev; - u32 location; }; static inline void tcx_set_ingress(struct sk_buff *skb, bool ingress) diff --git a/kernel/bpf/tcx.c b/kernel/bpf/tcx.c index e6a14f408d94..efd987ea6872 100644 --- a/kernel/bpf/tcx.c +++ b/kernel/bpf/tcx.c @@ -142,7 +142,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, u64 revision) { struct tcx_link *tcx = tcx_link(link); - bool created, ingress = tcx->location == BPF_TCX_INGRESS; + bool created, ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev = tcx->dev; int ret; @@ -169,7 +169,7 @@ static int tcx_link_prog_attach(struct bpf_link *link, u32 flags, u32 id_or_fd, static void tcx_link_release(struct bpf_link *link) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -204,7 +204,7 @@ static int tcx_link_update(struct bpf_link *link, struct bpf_prog *nprog, struct bpf_prog
*oprog) { struct tcx_link *tcx = tcx_link(link); - bool ingress = tcx->location == BPF_TCX_INGRESS; + bool ingress = link->attach_type == BPF_TCX_INGRESS; struct bpf_mprog_entry *entry, *entry_new; struct net_device *dev; int ret = 0; @@ -260,8 +260,8 @@ static void tcx_link_fdinfo(const struct bpf_link *link, struct seq_file *seq) seq_printf(seq, "ifindex:\t%u\n", ifindex); seq_printf(seq, "attach_type:\t%u (%s)\n", - tcx->location, - tcx->location == BPF_TCX_INGRESS ? "ingress" : "egress"); + link->attach_type, + link->attach_type == BPF_TCX_INGRESS ? "ingress" : "egress"); } static int tcx_link_fill_info(const struct bpf_link *link, @@ -276,7 +276,7 @@ static int tcx_link_fill_info(const struct bpf_link *link, rtnl_unlock(); info->tcx.ifindex = ifindex; - info->tcx.attach_type = tcx->location; + info->tcx.attach_type = link->attach_type; return 0; } @@ -303,7 +303,6 @@ static int tcx_link_init(struct tcx_link *tcx, { bpf_link_init(&tcx->link, BPF_LINK_TYPE_TCX, &tcx_link_lops, prog, attr->link_create.attach_type); - tcx->location = attr->link_create.attach_type; tcx->dev = dev; return bpf_link_prime(&tcx->link, link_primer); } -- cgit v1.2.3 From 30dbb2d0e16fce445581049ebcd9043837a843ac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:01:53 +0000 Subject: net_sched: act: annotate data-races in tcf_lastuse_update() and tcf_tm_dump() tcf_tm_dump() reads fields that can be changed concurrently, and tcf_lastuse_update() might race against itself. Add READ_ONCE() and WRITE_ONCE() annotations. Fetch jiffies once in tcf_tm_dump(). 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/act_api.h | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 04781c92b43d..2894cfff2da3 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -76,19 +76,24 @@ static inline void tcf_lastuse_update(struct tcf_t *tm) { unsigned long now = jiffies; - if (tm->lastuse != now) - tm->lastuse = now; - if (unlikely(!tm->firstuse)) - tm->firstuse = now; + if (READ_ONCE(tm->lastuse) != now) + WRITE_ONCE(tm->lastuse, now); + if (unlikely(!READ_ONCE(tm->firstuse))) + WRITE_ONCE(tm->firstuse, now); } static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) { - dtm->install = jiffies_to_clock_t(jiffies - stm->install); - dtm->lastuse = jiffies_to_clock_t(jiffies - stm->lastuse); - dtm->firstuse = stm->firstuse ? - jiffies_to_clock_t(jiffies - stm->firstuse) : 0; - dtm->expires = jiffies_to_clock_t(stm->expires); + unsigned long firstuse, now = jiffies; + + dtm->install = jiffies_to_clock_t(now - READ_ONCE(stm->install)); + dtm->lastuse = jiffies_to_clock_t(now - READ_ONCE(stm->lastuse)); + + firstuse = READ_ONCE(stm->firstuse); + dtm->firstuse = firstuse ? + jiffies_to_clock_t(now - firstuse) : 0; + + dtm->expires = jiffies_to_clock_t(READ_ONCE(stm->expires)); } static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) -- cgit v1.2.3 From 0d752877705c0252ef2726e4c63c5573f048951c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:01:54 +0000 Subject: net_sched: act_connmark: use RCU in tcf_connmark_dump() Also storing tcf_action into struct tcf_connmark_parms makes sure there is no discrepancy in tcf_connmark_act(). 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_connmark.h | 1 + net/sched/act_connmark.c | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_connmark.h b/include/net/tc_act/tc_connmark.h index e8dd77a96748..a5ce83f3eea4 100644 --- a/include/net/tc_act/tc_connmark.h +++ b/include/net/tc_act/tc_connmark.h @@ -7,6 +7,7 @@ struct tcf_connmark_parms { struct net *net; u16 zone; + int action; struct rcu_head rcu; }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 0fce631e7c91..3e89927d7116 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -88,7 +88,7 @@ count: /* using overlimits stats to count how many packets marked */ tcf_action_inc_overlimit_qstats(&ca->common); out: - return READ_ONCE(ca->tcf_action); + return parms->action; } static const struct nla_policy connmark_policy[TCA_CONNMARK_MAX + 1] = { @@ -167,6 +167,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (err < 0) goto release_idr; + nparms->action = parm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(ci->parms, nparms, lockdep_is_held(&ci->tcf_lock)); @@ -190,20 +192,20 @@ out_free: static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_connmark_info *ci = to_connmark(a); + const struct tcf_connmark_parms *parms; struct tc_connmark opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - struct tcf_connmark_parms *parms; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - parms = rcu_dereference_protected(ci->parms, 
lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + parms = rcu_dereference(ci->parms); - opt.action = ci->tcf_action; + opt.action = parms->action; opt.zone = parms->zone; if (nla_put(skb, TCA_CONNMARK_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -212,12 +214,12 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_CONNMARK_TM, sizeof(t), &t, TCA_CONNMARK_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From ba9dc9c14038b5f721e193f9e69ab73fd2f7bdd2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:01:55 +0000 Subject: net_sched: act_csum: use RCU in tcf_csum_dump() Also storing tcf_action into struct tcf_csum_params makes sure there is no discrepancy in tcf_csum_act(). Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_csum.h | 1 + net/sched/act_csum.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 2515da0142a6..8d0c7a9f9345 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -8,6 +8,7 @@ struct tcf_csum_params { u32 update_flags; + int action; struct rcu_head rcu; }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 5cc8e407e791..0939e6b2ba4d 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -99,6 +99,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, goto put_chain; } params_new->update_flags = parm->update_flags; + params_new->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -580,7 +581,7 @@ TC_INDIRECT_SCOPE int tcf_csum_act(struct sk_buff 
*skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); + action = params->action; if (unlikely(action == TC_ACT_SHOT)) goto drop; @@ -631,9 +632,9 @@ drop: static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_csum *p = to_tcf_csum(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_csum *p = to_tcf_csum(a); - struct tcf_csum_params *params; + const struct tcf_csum_params *params; struct tc_csum opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, @@ -641,10 +642,9 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, }; struct tcf_t t; - spin_lock_bh(&p->tcf_lock); - params = rcu_dereference_protected(p->params, - lockdep_is_held(&p->tcf_lock)); - opt.action = p->tcf_action; + rcu_read_lock(); + params = rcu_dereference(p->params); + opt.action = params->action; opt.update_flags = params->update_flags; if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt)) @@ -653,12 +653,12 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_CSUM_TM, sizeof(t), &t, TCA_CSUM_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 554e66bad84ce4181ad91a2ae9cc74c7c440e836 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:01:56 +0000 Subject: net_sched: act_ct: use RCU in tcf_ct_dump() Also storing tcf_action into struct tcf_ct_params makes sure there is no discrepancy in tcf_ct_act(). 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_ct.h | 2 +- net/sched/act_ct.c | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index e6b45cb27ebf..8b90c86c0b0d 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -13,7 +13,7 @@ struct tcf_ct_params { struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; - + int action; u32 mark; u32 mark_mask; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index c02f39efc6ef..6749a4a9a9cd 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -977,7 +977,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, p = rcu_dereference_bh(c->params); - retval = READ_ONCE(c->tcf_action); + retval = p->action; commit = p->ct_action & TCA_CT_ACT_COMMIT; clear = p->ct_action & TCA_CT_ACT_CLEAR; tmpl = p->tmpl; @@ -1409,6 +1409,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, if (err) goto cleanup; + params->action = parm->action; spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params = rcu_replace_pointer(c->params, params, @@ -1442,8 +1443,8 @@ static void tcf_ct_cleanup(struct tc_action *a) } static int tcf_ct_dump_key_val(struct sk_buff *skb, - void *val, int val_type, - void *mask, int mask_type, + const void *val, int val_type, + const void *mask, int mask_type, int len) { int err; @@ -1464,9 +1465,9 @@ static int tcf_ct_dump_key_val(struct sk_buff *skb, return 0; } -static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) +static int tcf_ct_dump_nat(struct sk_buff *skb, const struct tcf_ct_params *p) { - struct nf_nat_range2 *range = &p->range; + const struct nf_nat_range2 *range = &p->range; if (!(p->ct_action & TCA_CT_ACT_NAT)) return 0; @@ 
-1504,7 +1505,8 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) return 0; } -static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) +static int tcf_ct_dump_helper(struct sk_buff *skb, + const struct nf_conntrack_helper *helper) { if (!helper) return 0; @@ -1521,9 +1523,8 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_ct *c = to_ct(a); - struct tcf_ct_params *p; - + const struct tcf_ct *c = to_ct(a); + const struct tcf_ct_params *p; struct tc_ct opt = { .index = c->tcf_index, .refcnt = refcount_read(&c->tcf_refcnt) - ref, @@ -1531,10 +1532,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&c->tcf_lock); - p = rcu_dereference_protected(c->params, - lockdep_is_held(&c->tcf_lock)); - opt.action = c->tcf_action; + rcu_read_lock(); + p = rcu_dereference(c->params); + opt.action = p->action; if (tcf_ct_dump_key_val(skb, &p->ct_action, TCA_CT_ACTION, @@ -1579,11 +1579,11 @@ skip_dump: tcf_tm_dump(&t, &c->tcf_tm); if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD)) goto nla_put_failure; - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&c->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From d300335b4e18672913dd792ff9f49e6cccf41d26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:01:57 +0000 Subject: net_sched: act_ctinfo: use atomic64_t for three counters Commit 21c167aa0ba9 ("net/sched: act_ctinfo: use percpu stats") missed that stats_dscp_set, stats_dscp_error and stats_cpmark_set might be written (and read) locklessly. Use atomic64_t for these three fields, I doubt act_ctinfo is used heavily on big SMP hosts anyway. 
Fixes: 24ec483cec98 ("net: sched: Introduce act_ctinfo action") Signed-off-by: Eric Dumazet Cc: Pedro Tammela Link: https://patch.msgid.link/20250709090204.797558-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_ctinfo.h | 6 +++--- net/sched/act_ctinfo.c | 19 +++++++++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h index f071c1d70a25..a04bcac7adf4 100644 --- a/include/net/tc_act/tc_ctinfo.h +++ b/include/net/tc_act/tc_ctinfo.h @@ -18,9 +18,9 @@ struct tcf_ctinfo_params { struct tcf_ctinfo { struct tc_action common; struct tcf_ctinfo_params __rcu *params; - u64 stats_dscp_set; - u64 stats_dscp_error; - u64 stats_cpmark_set; + atomic64_t stats_dscp_set; + atomic64_t stats_dscp_error; + atomic64_t stats_cpmark_set; }; enum { diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 5b1241ddc758..93ab3bcd6d31 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -44,9 +44,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -57,9 +57,9 @@ static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, newdscp); - ca->stats_dscp_set++; + atomic64_inc(&ca->stats_dscp_set); } else { - ca->stats_dscp_error++; + atomic64_inc(&ca->stats_dscp_error); } } break; @@ -72,7 +72,7 @@ static void tcf_ctinfo_cpmark_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, struct sk_buff *skb) { - ca->stats_cpmark_set++; + atomic64_inc(&ca->stats_cpmark_set); skb->mark = READ_ONCE(ct->mark) & cp->cpmarkmask; } @@ -323,15 +323,18 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, } if 
(nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_SET, - ci->stats_dscp_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_set), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_DSCP_ERROR, - ci->stats_dscp_error, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_dscp_error), + TCA_CTINFO_PAD)) goto nla_put_failure; if (nla_put_u64_64bit(skb, TCA_CTINFO_STATS_CPMARK_SET, - ci->stats_cpmark_set, TCA_CTINFO_PAD)) + atomic64_read(&ci->stats_cpmark_set), + TCA_CTINFO_PAD)) goto nla_put_failure; spin_unlock_bh(&ci->tcf_lock); -- cgit v1.2.3 From 799c94178cf9c9e80575b05b7479396de8b42b61 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:01:58 +0000 Subject: net_sched: act_ctinfo: use RCU in tcf_ctinfo_dump() Also storing tcf_action into struct tcf_ctinfo_params makes sure there is no discrepancy in tcf_ctinfo_act(). Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_ctinfo.h | 1 + net/sched/act_ctinfo.c | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_ctinfo.h b/include/net/tc_act/tc_ctinfo.h index a04bcac7adf4..7fe01ab236da 100644 --- a/include/net/tc_act/tc_ctinfo.h +++ b/include/net/tc_act/tc_ctinfo.h @@ -7,6 +7,7 @@ struct tcf_ctinfo_params { struct rcu_head rcu; struct net *net; + int action; u32 dscpmask; u32 dscpstatemask; u32 cpmarkmask; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 93ab3bcd6d31..71efe04d00b5 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -88,13 +88,11 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, struct tcf_ctinfo_params *cp; struct nf_conn *ct; int proto, wlen; - int action; cp = rcu_dereference_bh(ca->params); tcf_lastuse_update(&ca->tcf_tm); tcf_action_update_bstats(&ca->common, skb); - action = READ_ONCE(ca->tcf_action); wlen = 
skb_network_offset(skb); switch (skb_protocol(skb, true)) { @@ -141,7 +139,7 @@ TC_INDIRECT_SCOPE int tcf_ctinfo_act(struct sk_buff *skb, if (thash) nf_ct_put(ct); out: - return action; + return cp->action; } static const struct nla_policy ctinfo_policy[TCA_CTINFO_MAX + 1] = { @@ -258,6 +256,8 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, cp_new->mode |= CTINFO_MODE_CPMARK; } + cp_new->action = actparm->action; + spin_lock_bh(&ci->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, actparm->action, goto_ch); cp_new = rcu_replace_pointer(ci->params, cp_new, @@ -282,25 +282,24 @@ release_idr: static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { - struct tcf_ctinfo *ci = to_ctinfo(a); + const struct tcf_ctinfo *ci = to_ctinfo(a); + unsigned char *b = skb_tail_pointer(skb); + const struct tcf_ctinfo_params *cp; struct tc_ctinfo opt = { .index = ci->tcf_index, .refcnt = refcount_read(&ci->tcf_refcnt) - ref, .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, }; - unsigned char *b = skb_tail_pointer(skb); - struct tcf_ctinfo_params *cp; struct tcf_t t; - spin_lock_bh(&ci->tcf_lock); - cp = rcu_dereference_protected(ci->params, - lockdep_is_held(&ci->tcf_lock)); + rcu_read_lock(); + cp = rcu_dereference(ci->params); tcf_tm_dump(&t, &ci->tcf_tm); if (nla_put_64bit(skb, TCA_CTINFO_TM, sizeof(t), &t, TCA_CTINFO_PAD)) goto nla_put_failure; - opt.action = ci->tcf_action; + opt.action = cp->action; if (nla_put(skb, TCA_CTINFO_ACT, sizeof(opt), &opt)) goto nla_put_failure; @@ -337,11 +336,11 @@ static int tcf_ctinfo_dump(struct sk_buff *skb, struct tc_action *a, TCA_CTINFO_PAD)) goto nla_put_failure; - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&ci->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 8151684e339996ffe6d65968c5eea154366539f4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:01:59 +0000 Subject: 
net_sched: act_mpls: use RCU in tcf_mpls_dump() Also storing tcf_action into struct tcf_mpls_params makes sure there is no discrepancy in tcf_mpls_act(). Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_mpls.h | 1 + net/sched/act_mpls.c | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_mpls.h b/include/net/tc_act/tc_mpls.h index d452e5e94fd0..dd067bd4018d 100644 --- a/include/net/tc_act/tc_mpls.h +++ b/include/net/tc_act/tc_mpls.h @@ -10,6 +10,7 @@ struct tcf_mpls_params { int tcfm_action; u32 tcfm_label; + int action; /* tcf_action */ u8 tcfm_tc; u8 tcfm_ttl; u8 tcfm_bos; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 9f86f4e666d3..6654011dcd2b 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -57,7 +57,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, struct tcf_mpls *m = to_mpls(a); struct tcf_mpls_params *p; __be32 new_lse; - int ret, mac_len; + int mac_len; tcf_lastuse_update(&m->tcf_tm); bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); @@ -72,8 +72,6 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, mac_len = skb_network_offset(skb); } - ret = READ_ONCE(m->tcf_action); - p = rcu_dereference_bh(m->mpls_p); switch (p->tcfm_action) { @@ -122,7 +120,7 @@ TC_INDIRECT_SCOPE int tcf_mpls_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); - return ret; + return p->action; drop: qstats_drop_inc(this_cpu_ptr(m->common.cpu_qstats)); @@ -296,6 +294,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, ACT_MPLS_BOS_NOT_SET); p->tcfm_proto = nla_get_be16_default(tb[TCA_MPLS_PROTO], htons(ETH_P_MPLS_UC)); + p->action = parm->action; spin_lock_bh(&m->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -330,8 +329,8 @@ static int tcf_mpls_dump(struct 
sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_mpls *m = to_mpls(a); - struct tcf_mpls_params *p; + const struct tcf_mpls *m = to_mpls(a); + const struct tcf_mpls_params *p; struct tc_mpls opt = { .index = m->tcf_index, .refcnt = refcount_read(&m->tcf_refcnt) - ref, @@ -339,10 +338,10 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&m->tcf_lock); - opt.action = m->tcf_action; - p = rcu_dereference_protected(m->mpls_p, lockdep_is_held(&m->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(m->mpls_p); opt.m_action = p->tcfm_action; + opt.action = p->action; if (nla_put(skb, TCA_MPLS_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -370,12 +369,12 @@ static int tcf_mpls_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_MPLS_TM, sizeof(t), &t, TCA_MPLS_PAD)) goto nla_put_failure; - spin_unlock_bh(&m->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&m->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -EMSGSIZE; } -- cgit v1.2.3 From 5d28928668a2ef6182401ddca7ab4064bf349e3e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:02:00 +0000 Subject: net_sched: act_nat: use RCU in tcf_nat_dump() Also storing tcf_action into struct tcf_nat_params makes sure there is no discrepancy in tcf_nat_act(). 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_nat.h | 1 + net/sched/act_nat.c | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_nat.h b/include/net/tc_act/tc_nat.h index c869274ac529..ae35f4009445 100644 --- a/include/net/tc_act/tc_nat.h +++ b/include/net/tc_act/tc_nat.h @@ -6,6 +6,7 @@ #include struct tcf_nat_parms { + int action; __be32 old_addr; __be32 new_addr; __be32 mask; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index d541f553805f..26241d80ebe0 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -91,6 +91,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, nparm->new_addr = parm->new_addr; nparm->mask = parm->mask; nparm->flags = parm->flags; + nparm->action = parm->action; p = to_tcf_nat(*a); @@ -130,17 +131,16 @@ TC_INDIRECT_SCOPE int tcf_nat_act(struct sk_buff *skb, tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); - action = READ_ONCE(p->tcf_action); - parms = rcu_dereference_bh(p->parms); + action = parms->action; + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + old_addr = parms->old_addr; new_addr = parms->new_addr; mask = parms->mask; egress = parms->flags & TCA_NAT_FLAG_EGRESS; - if (unlikely(action == TC_ACT_SHOT)) - goto drop; - noff = skb_network_offset(skb); if (!pskb_may_pull(skb, sizeof(*iph) + noff)) goto drop; @@ -268,21 +268,20 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat *p = to_tcf_nat(a); + const struct tcf_nat_parms *parms; struct tc_nat opt = { .index = p->tcf_index, .refcnt = refcount_read(&p->tcf_refcnt) - ref, .bindcnt = atomic_read(&p->tcf_bindcnt) - bind, }; - struct tcf_nat_parms *parms; struct 
tcf_t t; - spin_lock_bh(&p->tcf_lock); - - opt.action = p->tcf_action; + rcu_read_lock(); - parms = rcu_dereference_protected(p->parms, lockdep_is_held(&p->tcf_lock)); + parms = rcu_dereference(p->parms); + opt.action = parms->action; opt.old_addr = parms->old_addr; opt.new_addr = parms->new_addr; opt.mask = parms->mask; @@ -294,12 +293,12 @@ static int tcf_nat_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_NAT_TM, sizeof(t), &t, TCA_NAT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 9d096746572616a50cac4906f528a1959c0ee1c2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:02:01 +0000 Subject: net_sched: act_pedit: use RCU in tcf_pedit_dump() Also storing tcf_action into struct tcf_pedit_params makes sure there is no discrepancy in tcf_pedit_act(). 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_pedit.h | 1 + net/sched/act_pedit.c | 20 ++++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index 83fe39931781..f58ee15cd858 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -14,6 +14,7 @@ struct tcf_pedit_key_ex { struct tcf_pedit_parms { struct tc_pedit_key *tcfp_keys; struct tcf_pedit_key_ex *tcfp_keys_ex; + int action; u32 tcfp_off_max_hint; unsigned char tcfp_nkeys; unsigned char tcfp_flags; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index fc0a35a7b62a..4b65901397a8 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -279,7 +279,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, } p = to_pedit(*a); - + nparms->action = parm->action; spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); @@ -483,7 +483,7 @@ TC_INDIRECT_SCOPE int tcf_pedit_act(struct sk_buff *skb, bad: tcf_action_inc_overlimit_qstats(&p->common); done: - return p->tcf_action; + return parms->action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, @@ -500,19 +500,19 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); - struct tcf_pedit *p = to_pedit(a); - struct tcf_pedit_parms *parms; + const struct tcf_pedit *p = to_pedit(a); + const struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; - spin_lock_bh(&p->tcf_lock); - parms = rcu_dereference_protected(p->parms, 1); + rcu_read_lock(); + parms = rcu_dereference(p->parms); s = struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { - 
spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); return -ENOBUFS; } opt->nkeys = parms->tcfp_nkeys; @@ -521,7 +521,7 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->flags = parms->tcfp_flags; - opt->action = p->tcf_action; + opt->action = parms->action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; @@ -540,13 +540,13 @@ static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); kfree(opt); return skb->len; nla_put_failure: - spin_unlock_bh(&p->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); kfree(opt); return -1; -- cgit v1.2.3 From cec7a5c6c695ba2226b6120dc330e3bea3ea96f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:02:02 +0000 Subject: net_sched: act_police: use RCU in tcf_police_dump() Also storing tcf_action into struct tcf_police_params makes sure there is no discrepancy in tcf_police_act(). 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-11-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_police.h | 3 ++- net/sched/act_police.c | 18 +++++++++--------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_police.h b/include/net/tc_act/tc_police.h index 490d88cb5233..a89fc8e68b1e 100644 --- a/include/net/tc_act/tc_police.h +++ b/include/net/tc_act/tc_police.h @@ -5,10 +5,11 @@ #include struct tcf_police_params { + int action; int tcfp_result; u32 tcfp_ewma_rate; - s64 tcfp_burst; u32 tcfp_mtu; + s64 tcfp_burst; s64 tcfp_mtu_ptoks; s64 tcfp_pkt_burst; struct psched_ratecfg rate; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index a214ed681142..0e1c61183379 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -198,6 +198,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, psched_ppscfg_precompute(&new->ppsrate, pps); } + new->action = parm->action; spin_lock_bh(&police->tcf_lock); spin_lock_bh(&police->tcfp_lock); police->tcfp_t_c = ktime_get_ns(); @@ -254,8 +255,8 @@ TC_INDIRECT_SCOPE int tcf_police_act(struct sk_buff *skb, tcf_lastuse_update(&police->tcf_tm); bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); - ret = READ_ONCE(police->tcf_action); p = rcu_dereference_bh(police->params); + ret = p->action; if (p->tcfp_ewma_rate) { struct gnet_stats_rate_est64 sample; @@ -338,9 +339,9 @@ static void tcf_police_stats_update(struct tc_action *a, static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_police *police = to_police(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_police *police = to_police(a); - struct tcf_police_params *p; + const struct tcf_police_params *p; struct tc_police opt = { .index = police->tcf_index, .refcnt = refcount_read(&police->tcf_refcnt) - ref, @@ -348,10 +349,9 @@ static int 
tcf_police_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&police->tcf_lock); - opt.action = police->tcf_action; - p = rcu_dereference_protected(police->params, - lockdep_is_held(&police->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(police->params); + opt.action = p->action; opt.mtu = p->tcfp_mtu; opt.burst = PSCHED_NS2TICKS(p->tcfp_burst); if (p->rate_present) { @@ -392,12 +392,12 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &police->tcf_tm); if (nla_put_64bit(skb, TCA_POLICE_TM, sizeof(t), &t, TCA_POLICE_PAD)) goto nla_put_failure; - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&police->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 1f376373bd225c90381b745e38fa65a9386f7f8e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 9 Jul 2025 09:02:03 +0000 Subject: net_sched: act_skbedit: use RCU in tcf_skbedit_dump() Also storing tcf_action into struct tcf_skbedit_params makes sure there is no discrepancy in tcf_skbedit_act(). 
Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20250709090204.797558-12-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_skbedit.h | 1 + net/sched/act_skbedit.c | 20 +++++++++----------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 9649600fb3dc..31b2cd0bebb5 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -12,6 +12,7 @@ #include struct tcf_skbedit_params { + int action; u32 flags; u32 priority; u32 mark; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 1f1d9ce3e968..8c1d1554f657 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -43,13 +43,11 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, { struct tcf_skbedit *d = to_skbedit(a); struct tcf_skbedit_params *params; - int action; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); params = rcu_dereference_bh(d->params); - action = READ_ONCE(d->tcf_action); if (params->flags & SKBEDIT_F_PRIORITY) skb->priority = params->priority; @@ -85,7 +83,7 @@ TC_INDIRECT_SCOPE int tcf_skbedit_act(struct sk_buff *skb, } if (params->flags & SKBEDIT_F_PTYPE) skb->pkt_type = params->ptype; - return action; + return params->action; err: qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -262,6 +260,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (flags & SKBEDIT_F_MASK) params_new->mask = *mask; + params_new->action = parm->action; spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(d->params, params_new, @@ -284,9 +283,9 @@ release_idr: static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { + const struct tcf_skbedit *d = to_skbedit(a); unsigned char *b = skb_tail_pointer(skb); - struct tcf_skbedit *d = to_skbedit(a); - 
struct tcf_skbedit_params *params; + const struct tcf_skbedit_params *params; struct tc_skbedit opt = { .index = d->tcf_index, .refcnt = refcount_read(&d->tcf_refcnt) - ref, @@ -295,10 +294,9 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, u64 pure_flags = 0; struct tcf_t t; - spin_lock_bh(&d->tcf_lock); - params = rcu_dereference_protected(d->params, - lockdep_is_held(&d->tcf_lock)); - opt.action = d->tcf_action; + rcu_read_lock(); + params = rcu_dereference(d->params); + opt.action = params->action; if (nla_put(skb, TCA_SKBEDIT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -333,12 +331,12 @@ static int tcf_skbedit_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &d->tcf_tm); if (nla_put_64bit(skb, TCA_SKBEDIT_TM, sizeof(t), &t, TCA_SKBEDIT_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } -- cgit v1.2.3 From 36a686c0784fcccdaa4f38b498a9ef0d42ea7cb8 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 10 Jul 2025 18:43:42 +0200 Subject: Revert "netfilter: nf_tables: Add notifications for hook changes" This reverts commit 465b9ee0ee7bc268d7f261356afd6c4262e48d82. Such notifications fit better into core or nfnetlink_hook code, following the NFNL_MSG_HOOK_GET message format. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 --- include/uapi/linux/netfilter/nf_tables.h | 10 ------ include/uapi/linux/netfilter/nfnetlink.h | 2 -- net/netfilter/nf_tables_api.c | 59 -------------------------------- net/netfilter/nfnetlink.c | 1 - net/netfilter/nft_chain_filter.c | 2 -- 6 files changed, 79 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index e4d8e451e935..5e49619ae49c 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1142,11 +1142,6 @@ int nft_set_catchall_validate(const struct nft_ctx *ctx, struct nft_set *set); int nf_tables_bind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); void nf_tables_unbind_chain(const struct nft_ctx *ctx, struct nft_chain *chain); -struct nft_hook; -void nf_tables_chain_device_notify(const struct nft_chain *chain, - const struct nft_hook *hook, - const struct net_device *dev, int event); - enum nft_chain_types { NFT_CHAIN_T_DEFAULT = 0, NFT_CHAIN_T_ROUTE, diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 518ba144544c..2beb30be2c5f 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -142,8 +142,6 @@ enum nf_tables_msg_types { NFT_MSG_DESTROYOBJ, NFT_MSG_DESTROYFLOWTABLE, NFT_MSG_GETSETELEM_RESET, - NFT_MSG_NEWDEV, - NFT_MSG_DELDEV, NFT_MSG_MAX, }; @@ -1786,18 +1784,10 @@ enum nft_synproxy_attributes { * enum nft_device_attributes - nf_tables device netlink attributes * * @NFTA_DEVICE_NAME: name of this device (NLA_STRING) - * @NFTA_DEVICE_TABLE: table containing the flowtable or chain hooking into the device (NLA_STRING) - * @NFTA_DEVICE_FLOWTABLE: flowtable hooking into the device (NLA_STRING) - * @NFTA_DEVICE_CHAIN: chain hooking into the device (NLA_STRING) - * @NFTA_DEVICE_SPEC: hook spec matching the device (NLA_STRING) */ enum 
nft_devices_attributes { NFTA_DEVICE_UNSPEC, NFTA_DEVICE_NAME, - NFTA_DEVICE_TABLE, - NFTA_DEVICE_FLOWTABLE, - NFTA_DEVICE_CHAIN, - NFTA_DEVICE_SPEC, __NFTA_DEVICE_MAX }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) diff --git a/include/uapi/linux/netfilter/nfnetlink.h b/include/uapi/linux/netfilter/nfnetlink.h index 50d807af2649..6cd58cd2a6f0 100644 --- a/include/uapi/linux/netfilter/nfnetlink.h +++ b/include/uapi/linux/netfilter/nfnetlink.h @@ -25,8 +25,6 @@ enum nfnetlink_groups { #define NFNLGRP_ACCT_QUOTA NFNLGRP_ACCT_QUOTA NFNLGRP_NFTRACE, #define NFNLGRP_NFTRACE NFNLGRP_NFTRACE - NFNLGRP_NFT_DEV, -#define NFNLGRP_NFT_DEV NFNLGRP_NFT_DEV __NFNLGRP_MAX, }; #define NFNLGRP_MAX (__NFNLGRP_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 24c71ecb2179..a7240736f98e 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -9686,64 +9686,6 @@ struct nf_hook_ops *nft_hook_find_ops_rcu(const struct nft_hook *hook, } EXPORT_SYMBOL_GPL(nft_hook_find_ops_rcu); -static void -nf_tables_device_notify(const struct nft_table *table, int attr, - const char *name, const struct nft_hook *hook, - const struct net_device *dev, int event) -{ - struct net *net = dev_net(dev); - struct nlmsghdr *nlh; - struct sk_buff *skb; - u16 flags = 0; - - if (!nfnetlink_has_listeners(net, NFNLGRP_NFT_DEV)) - return; - - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - goto err; - - event = event == NETDEV_REGISTER ? 
NFT_MSG_NEWDEV : NFT_MSG_DELDEV; - event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nfnl_msg_put(skb, 0, 0, event, flags, table->family, - NFNETLINK_V0, nft_base_seq(net)); - if (!nlh) - goto err; - - if (nla_put_string(skb, NFTA_DEVICE_TABLE, table->name) || - nla_put_string(skb, attr, name) || - nla_put(skb, NFTA_DEVICE_SPEC, hook->ifnamelen, hook->ifname) || - nla_put_string(skb, NFTA_DEVICE_NAME, dev->name)) - goto err; - - nlmsg_end(skb, nlh); - nfnetlink_send(skb, net, 0, NFNLGRP_NFT_DEV, - nlmsg_report(nlh), GFP_KERNEL); - return; -err: - if (skb) - kfree_skb(skb); - nfnetlink_set_err(net, 0, NFNLGRP_NFT_DEV, -ENOBUFS); -} - -void -nf_tables_chain_device_notify(const struct nft_chain *chain, - const struct nft_hook *hook, - const struct net_device *dev, int event) -{ - nf_tables_device_notify(chain->table, NFTA_DEVICE_CHAIN, - chain->name, hook, dev, event); -} - -static void -nf_tables_flowtable_device_notify(const struct nft_flowtable *ft, - const struct nft_hook *hook, - const struct net_device *dev, int event) -{ - nf_tables_device_notify(ft->table, NFTA_DEVICE_FLOWTABLE, - ft->name, hook, dev, event); -} - static int nft_flowtable_event(unsigned long event, struct net_device *dev, struct nft_flowtable *flowtable, bool changename) { @@ -9791,7 +9733,6 @@ static int nft_flowtable_event(unsigned long event, struct net_device *dev, list_add_tail_rcu(&ops->list, &hook->ops_list); break; } - nf_tables_flowtable_device_notify(flowtable, hook, dev, event); break; } return 0; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index ac77fc21632d..e598a2a252b0 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -86,7 +86,6 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = { [NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES, [NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT, [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES, - [NFNLGRP_NFT_DEV] = NFNL_SUBSYS_NFTABLES, }; static struct nfnl_net *nfnl_pernet(struct net *net) diff --git 
a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c index 846d48ba8965..b16185e9a6dd 100644 --- a/net/netfilter/nft_chain_filter.c +++ b/net/netfilter/nft_chain_filter.c @@ -363,8 +363,6 @@ static int nft_netdev_event(unsigned long event, struct net_device *dev, list_add_tail_rcu(&ops->list, &hook->ops_list); break; } - nf_tables_chain_device_notify(&basechain->chain, - hook, dev, event); break; } return 0; -- cgit v1.2.3 From 08a305b2a5b8e125120bcf670ffe775c86cf1f59 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 12 Jul 2025 21:57:59 +0100 Subject: net/x25: Remove unused x25_terminate_link() x25_terminate_link() has been unused since the last use was removed in 2020 by: commit 7eed751b3b2a ("net/x25: handle additional netdev events") Remove it. Signed-off-by: Dr. David Alan Gilbert Acked-by: Martin Schiller Link: https://patch.msgid.link/20250712205759.278777-1-linux@treblig.org Signed-off-by: Jakub Kicinski --- include/net/x25.h | 1 - net/x25/x25_dev.c | 22 ---------------------- 2 files changed, 23 deletions(-) (limited to 'include/net') diff --git a/include/net/x25.h b/include/net/x25.h index 5e833cfc864e..414f3fd99345 100644 --- a/include/net/x25.h +++ b/include/net/x25.h @@ -203,7 +203,6 @@ void x25_send_frame(struct sk_buff *, struct x25_neigh *); int x25_lapb_receive_frame(struct sk_buff *, struct net_device *, struct packet_type *, struct net_device *); void x25_establish_link(struct x25_neigh *); -void x25_terminate_link(struct x25_neigh *); /* x25_facilities.c */ int x25_parse_facilities(struct sk_buff *, struct x25_facilities *, diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c index 748d8630ab58..fb8ac1aa5826 100644 --- a/net/x25/x25_dev.c +++ b/net/x25/x25_dev.c @@ -170,28 +170,6 @@ void x25_establish_link(struct x25_neigh *nb) dev_queue_xmit(skb); } -void x25_terminate_link(struct x25_neigh *nb) -{ - struct sk_buff *skb; - unsigned char *ptr; - - if (nb->dev->type != ARPHRD_X25) - return; - - skb = alloc_skb(1, 
GFP_ATOMIC); - if (!skb) { - pr_err("x25_dev: out of memory\n"); - return; - } - - ptr = skb_put(skb, 1); - *ptr = X25_IFACE_DISCONNECT; - - skb->protocol = htons(ETH_P_X25); - skb->dev = nb->dev; - dev_queue_xmit(skb); -} - void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) { unsigned char *dptr; -- cgit v1.2.3 From 9ca48d616ed76b284f946667a3cb7961205c8ee3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Jul 2025 11:39:59 +0000 Subject: tcp: do not accept packets beyond window Currently, TCP accepts incoming packets which might go beyond the offered RWIN. Add to tcp_sequence() the validation of packet end sequence. Add the corresponding check in the fast path. We relax this new constraint if the receive queue is empty, to not freeze flows from buggy peers. Add a new drop reason : SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE. Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250711114006.480026-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 8 +++++++- net/ipv4/tcp_input.c | 22 +++++++++++++++++----- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index b9e78290269e..beb134d55747 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -45,6 +45,7 @@ FN(TCP_LISTEN_OVERFLOW) \ FN(TCP_OLD_SEQUENCE) \ FN(TCP_INVALID_SEQUENCE) \ + FN(TCP_INVALID_END_SEQUENCE) \ FN(TCP_INVALID_ACK_SEQUENCE) \ FN(TCP_RESET) \ FN(TCP_INVALID_SYN) \ @@ -303,8 +304,13 @@ enum skb_drop_reason { SKB_DROP_REASON_TCP_LISTEN_OVERFLOW, /** @SKB_DROP_REASON_TCP_OLD_SEQUENCE: Old SEQ field (duplicate packet) */ SKB_DROP_REASON_TCP_OLD_SEQUENCE, - /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field */ + /** @SKB_DROP_REASON_TCP_INVALID_SEQUENCE: Not acceptable SEQ field. 
*/ SKB_DROP_REASON_TCP_INVALID_SEQUENCE, + /** + * @SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE: + * Not acceptable END_SEQ field. + */ + SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE, /** * @SKB_DROP_REASON_TCP_INVALID_ACK_SEQUENCE: Not acceptable ACK SEQ * field because ack sequence is not in the window between snd_una diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9b03c44c12b8..f0f9c78654b4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4391,14 +4391,22 @@ static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk, * (borrowed from freebsd) */ -static enum skb_drop_reason tcp_sequence(const struct tcp_sock *tp, +static enum skb_drop_reason tcp_sequence(const struct sock *sk, u32 seq, u32 end_seq) { + const struct tcp_sock *tp = tcp_sk(sk); + if (before(end_seq, tp->rcv_wup)) return SKB_DROP_REASON_TCP_OLD_SEQUENCE; - if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) - return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + if (after(end_seq, tp->rcv_nxt + tcp_receive_window(tp))) { + if (after(seq, tp->rcv_nxt + tcp_receive_window(tp))) + return SKB_DROP_REASON_TCP_INVALID_SEQUENCE; + + /* Only accept this packet if receive queue is empty. */ + if (skb_queue_len(&sk->sk_receive_queue)) + return SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE; + } return SKB_NOT_DROPPED_YET; } @@ -5881,7 +5889,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, step1: /* Step 1: check sequence number */ - reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + reason = tcp_sequence(sk, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); if (reason) { /* RFC793, page 37: "In all states except SYN-SENT, all reset * (RST) segments are validated by checking their SEQ-fields." 
@@ -6110,6 +6118,10 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) if (tcp_checksum_complete(skb)) goto csum_error; + if (after(TCP_SKB_CB(skb)->end_seq, + tp->rcv_nxt + tcp_receive_window(tp))) + goto validate; + if ((int)skb->truesize > sk->sk_forward_alloc) goto step5; @@ -6165,7 +6177,7 @@ slow_path: /* * Standard slow path. */ - +validate: if (!tcp_validate_incoming(sk, skb, th, 1)) return; -- cgit v1.2.3 From 6c758062c64dfbd61862801fbde4e0702f4f3a23 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Jul 2025 11:40:00 +0000 Subject: tcp: add LINUX_MIB_BEYOND_WINDOW Add a new SNMP MIB : LINUX_MIB_BEYOND_WINDOW Incremented when an incoming packet is received beyond the receiver window. nstat -az | grep TcpExtBeyondWindow Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250711114006.480026-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/net_cachelines/snmp.rst | 1 + include/net/dropreason-core.h | 1 + include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 1 + 5 files changed, 5 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/net_cachelines/snmp.rst b/Documentation/networking/net_cachelines/snmp.rst index bd44b3eebbef..bce4eb35ec48 100644 --- a/Documentation/networking/net_cachelines/snmp.rst +++ b/Documentation/networking/net_cachelines/snmp.rst @@ -36,6 +36,7 @@ unsigned_long LINUX_MIB_TIMEWAITRECYCLED unsigned_long LINUX_MIB_TIMEWAITKILLED unsigned_long LINUX_MIB_PAWSACTIVEREJECTED unsigned_long LINUX_MIB_PAWSESTABREJECTED +unsigned_long LINUX_MIB_BEYOND_WINDOW unsigned_long LINUX_MIB_TSECR_REJECTED unsigned_long LINUX_MIB_PAWS_OLD_ACK unsigned_long LINUX_MIB_PAWS_TW_REJECTED diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index beb134d55747..229bb1826f2a 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -309,6 +309,7 @@ enum skb_drop_reason { /** * 
@SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE: * Not acceptable END_SEQ field. + * Corresponds to LINUX_MIB_BEYOND_WINDOW. */ SKB_DROP_REASON_TCP_INVALID_END_SEQUENCE, /** diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 1d234d7e1892..49f5640092a0 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -186,6 +186,7 @@ enum LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ + LINUX_MIB_BEYOND_WINDOW, /* BeyondWindow */ LINUX_MIB_TSECRREJECTED, /* TSEcrRejected */ LINUX_MIB_PAWS_OLD_ACK, /* PAWSOldAck */ LINUX_MIB_PAWS_TW_REJECTED, /* PAWSTimewait */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index ea2f01584379..65b0d0ab0084 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("BeyondWindow", LINUX_MIB_BEYOND_WINDOW), SNMP_MIB_ITEM("TSEcrRejected", LINUX_MIB_TSECRREJECTED), SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK), SNMP_MIB_ITEM("PAWSTimewait", LINUX_MIB_PAWS_TW_REJECTED), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f0f9c78654b4..5e2d82c273e2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5900,6 +5900,7 @@ step1: if (!th->rst) { if (th->syn) goto syn_challenge; + NET_INC_STATS(sock_net(sk), LINUX_MIB_BEYOND_WINDOW); if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSEQ, &tp->last_oow_ack_time)) -- cgit v1.2.3 From 75dff0584cce79203ee9968c66c7589150fed591 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 11 Jul 2025 11:40:04 +0000 Subject: tcp: add const to tcp_try_rmem_schedule() and sk_rmem_schedule() skb These functions do not modify the skb, add a const qualifier.
Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250711114006.480026-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 +- net/ipv4/tcp_input.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 0f2443d4ec58..c8a4b283df6f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1553,7 +1553,7 @@ __sk_rmem_schedule(struct sock *sk, int size, bool pfmemalloc) } static inline bool -sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) +sk_rmem_schedule(struct sock *sk, const struct sk_buff *skb, int size) { return __sk_rmem_schedule(sk, size, skb_pfmemalloc(skb)); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 78da05933078..39de55ff898e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4888,7 +4888,7 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk, const struct sk_buff *in_skb); static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); -static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, +static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || -- cgit v1.2.3 From 444020f4bf06fb86805ee7e7ceec0375485fd94d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 14 Jul 2025 14:21:30 +0200 Subject: wifi: cfg80211: remove scan request n_channels counted_by This reverts commit e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by"). This really has been a completely failed experiment. There were no actual bugs found, and yet at this point we already have four "fixes" to it, with nothing to show for but code churn, and it never even made the code any safer. 
In all of the cases that ended up getting "fixed", the structure is also internally inconsistent after the n_channels setting as the channel list isn't actually filled yet. You cannot scan with such a structure, that's just wrong. In mac80211, the struct is also reused multiple times, so initializing it once is no good. Some previous "fixes" (e.g. one in brcm80211) are also just setting n_channels before accessing the array, under the assumption that the code is correct and the array can be accessed, further showing that the whole thing is just pointless when the allocation count and use count are not separate. If we really wanted to fix it, we'd need to separately track the number of channels allocated and the number of channels currently used, but given that no bugs were found despite the numerous syzbot reports, that'd just be a waste of time. Remove the __counted_by() annotation. We really should also remove a number of the n_channels settings that are setting up a structure that's inconsistent, but that can wait. 
Reported-by: syzbot+e834e757bd9b3d3e1251@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=e834e757bd9b3d3e1251 Fixes: e3eac9f32ec0 ("wifi: cfg80211: Annotate struct cfg80211_scan_request with __counted_by") Link: https://patch.msgid.link/20250714142130.9b0bbb7e1f07.I09112ccde72d445e11348fc2bef68942cb2ffc94@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index d1848dc8ec99..10248d527616 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2690,7 +2690,7 @@ struct cfg80211_scan_request { s8 tsf_report_link_id; /* keep last */ - struct ieee80211_channel *channels[] __counted_by(n_channels); + struct ieee80211_channel *channels[]; }; static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) -- cgit v1.2.3 From 14450be2332a49445106403492a367412b8c23f4 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Wed, 9 Jul 2025 23:37:55 +0300 Subject: wifi: cfg80211: Fix interface type validation Fix a condition that verified valid values of interface types. 
Signed-off-by: Ilan Peer Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250709233537.7ad199ca5939.I0ac1ff74798bf59a87a57f2e18f2153c308b119b@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6ec9a8865b8b..f67424ec1085 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -633,7 +633,7 @@ ieee80211_get_sband_iftype_data(const struct ieee80211_supported_band *sband, const struct ieee80211_sband_iftype_data *data; int i; - if (WARN_ON(iftype >= NL80211_IFTYPE_MAX)) + if (WARN_ON(iftype >= NUM_NL80211_IFTYPES)) return NULL; if (iftype == NL80211_IFTYPE_AP_VLAN) -- cgit v1.2.3 From c932be7262323011ae8caa050811300b85347050 Mon Sep 17 00:00:00 2001 From: Yuvarani V Date: Thu, 10 Jul 2025 11:04:27 +0530 Subject: wifi: cfg80211: parse attribute to update unsolicited probe response template At present, the updated unsolicited broadcast probe response template is not processed during userspace commands such as channel switch or color change. This leads to an issue where older incorrect unsolicited probe response is still used during these events. Add support to parse the netlink attribute and store it so that mac80211/drivers can use it to set the BSS_CHANGED_UNSOL_BCAST_PROBE_RESP flag in order to send the updated unsolicited broadcast probe response templates during these events. 
Signed-off-by: Yuvarani V Signed-off-by: Aditya Kumar Singh Link: https://patch.msgid.link/20250710-update_unsol_bcast_probe_resp-v2-1-31aca39d3b30@oss.qualcomm.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ net/wireless/nl80211.c | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index f67424ec1085..77bc17d6e96d 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1526,6 +1526,7 @@ struct cfg80211_ap_update { * @n_counter_offsets_beacon: number of csa counters the beacon (tail) * @n_counter_offsets_presp: number of csa counters in the probe response * @beacon_after: beacon data to be used on the new channel + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @radar_required: whether radar detection is required on the new channel * @block_tx: whether transmissions should be blocked while changing * @count: number of beacons until switch @@ -1540,6 +1541,7 @@ struct cfg80211_csa_settings { unsigned int n_counter_offsets_beacon; unsigned int n_counter_offsets_presp; struct cfg80211_beacon_data beacon_after; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; bool radar_required; bool block_tx; u8 count; @@ -1555,6 +1557,7 @@ struct cfg80211_csa_settings { * @counter_offset_beacon: offsets of the counters within the beacon (tail) * @counter_offset_presp: offsets of the counters within the probe response * @beacon_next: beacon data to be used after the color change + * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @count: number of beacons until the color change * @color: the color used after the change * @link_id: defines the link on which color change is expected during MLO. 
@@ -1565,6 +1568,7 @@ struct cfg80211_color_change_settings { u16 counter_offset_beacon; u16 counter_offset_presp; struct cfg80211_beacon_data beacon_next; + struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; u8 count; u8 color; u8 link_id; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4e6c0a4e2a82..1ee14592828d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -10995,6 +10995,16 @@ skip_beacons: if (info->attrs[NL80211_ATTR_CH_SWITCH_BLOCK_TX]) params.block_tx = true; + if ((wdev->iftype == NL80211_IFTYPE_AP || + wdev->iftype == NL80211_IFTYPE_P2P_GO) && + info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { + err = nl80211_parse_unsol_bcast_probe_resp( + rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], + &params.unsol_bcast_probe_resp); + if (err) + goto free; + } + params.link_id = link_id; err = rdev_channel_switch(rdev, dev, &params); @@ -16797,6 +16807,14 @@ static int nl80211_color_change(struct sk_buff *skb, struct genl_info *info) params.counter_offset_presp = offset; } + if (info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP]) { + err = nl80211_parse_unsol_bcast_probe_resp( + rdev, info->attrs[NL80211_ATTR_UNSOL_BCAST_PROBE_RESP], + &params.unsol_bcast_probe_resp); + if (err) + goto out; + } + params.link_id = nl80211_link_id(info->attrs); err = rdev_color_change(rdev, dev, &params); -- cgit v1.2.3 From 1aeed732f4f885ad36280ca4afb331fa42bf7263 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 10 Jul 2025 16:55:58 +0800 Subject: net: mctp: Use hashtable for binds Ensure that a specific EID (remote or local) bind will match in preference to a MCTP_ADDR_ANY bind. This adds infrastructure for binding a socket to receive messages from a specific remote peer address, a future commit will expose an API for this.
Signed-off-by: Matt Johnston Link: https://patch.msgid.link/20250710-mctp-bind-v4-5-8ec2f6460c56@codeconstruct.com.au Signed-off-by: Paolo Abeni --- include/net/netns/mctp.h | 20 ++++++++++--- net/mctp/af_mctp.c | 11 ++++--- net/mctp/route.c | 75 +++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 84 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/mctp.h b/include/net/netns/mctp.h index 1db8f9aaddb4..89555f90b97b 100644 --- a/include/net/netns/mctp.h +++ b/include/net/netns/mctp.h @@ -6,19 +6,25 @@ #ifndef __NETNS_MCTP_H__ #define __NETNS_MCTP_H__ +#include +#include #include #include +#define MCTP_BINDS_BITS 7 + struct netns_mctp { /* Only updated under RTNL, entries freed via RCU */ struct list_head routes; - /* Bound sockets: list of sockets bound by type. - * This list is updated from non-atomic contexts (under bind_lock), - * and read (under rcu) in packet rx + /* Bound sockets: hash table of sockets, keyed by + * (type, src_eid, dest_eid). + * Specific src_eid/dest_eid entries also have an entry for + * MCTP_ADDR_ANY. This list is updated from non-atomic contexts + * (under bind_lock), and read (under rcu) in packet rx. */ struct mutex bind_lock; - struct hlist_head binds; + DECLARE_HASHTABLE(binds, MCTP_BINDS_BITS); /* tag allocations. 
This list is read and updated from atomic contexts, * but elements are free()ed after a RCU grace-period @@ -34,4 +40,10 @@ struct netns_mctp { struct list_head neighbours; }; +static inline u32 mctp_bind_hash(u8 type, u8 local_addr, u8 peer_addr) +{ + return hash_32(type | (u32)local_addr << 8 | (u32)peer_addr << 16, + MCTP_BINDS_BITS); +} + #endif /* __NETNS_MCTP_H__ */ diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 20edaf840a60..16341de5cf28 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -626,17 +626,17 @@ static int mctp_sk_hash(struct sock *sk) struct net *net = sock_net(sk); struct sock *existing; struct mctp_sock *msk; + u32 hash; int rc; msk = container_of(sk, struct mctp_sock, sk); - /* Bind lookup runs under RCU, remain live during that. */ - sock_set_flag(sk, SOCK_RCU_FREE); + hash = mctp_bind_hash(msk->bind_type, msk->bind_addr, MCTP_ADDR_ANY); mutex_lock(&net->mctp.bind_lock); /* Prevent duplicate binds. */ - sk_for_each(existing, &net->mctp.binds) { + sk_for_each(existing, &net->mctp.binds[hash]) { struct mctp_sock *mex = container_of(existing, struct mctp_sock, sk); @@ -648,7 +648,10 @@ static int mctp_sk_hash(struct sock *sk) } } - sk_add_node_rcu(sk, &net->mctp.binds); + /* Bind lookup runs under RCU, remain live during that. 
*/ + sock_set_flag(sk, SOCK_RCU_FREE); + + sk_add_node_rcu(sk, &net->mctp.binds[hash]); rc = 0; out: diff --git a/net/mctp/route.c b/net/mctp/route.c index a20d6b11d418..69cfb0e6c545 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -40,33 +40,32 @@ static int mctp_dst_discard(struct mctp_dst *dst, struct sk_buff *skb) return 0; } -static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) +static struct mctp_sock *mctp_lookup_bind_details(struct net *net, + struct sk_buff *skb, + u8 type, u8 dest, + u8 src, bool allow_net_any) { struct mctp_skb_cb *cb = mctp_cb(skb); - struct mctp_hdr *mh; struct sock *sk; - u8 type; - - WARN_ON(!rcu_read_lock_held()); - - /* TODO: look up in skb->cb? */ - mh = mctp_hdr(skb); + u8 hash; - if (!skb_headlen(skb)) - return NULL; + WARN_ON_ONCE(!rcu_read_lock_held()); - type = (*(u8 *)skb->data) & 0x7f; + hash = mctp_bind_hash(type, dest, src); - sk_for_each_rcu(sk, &net->mctp.binds) { + sk_for_each_rcu(sk, &net->mctp.binds[hash]) { struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + if (!allow_net_any && msk->bind_net == MCTP_NET_ANY) + continue; + if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net) continue; if (msk->bind_type != type) continue; - if (!mctp_address_matches(msk->bind_addr, mh->dest)) + if (!mctp_address_matches(msk->bind_addr, dest)) continue; return msk; @@ -75,6 +74,54 @@ static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) return NULL; } +static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb) +{ + struct mctp_sock *msk; + struct mctp_hdr *mh; + u8 type; + + /* TODO: look up in skb->cb? */ + mh = mctp_hdr(skb); + + if (!skb_headlen(skb)) + return NULL; + + type = (*(u8 *)skb->data) & 0x7f; + + /* Look for binds in order of widening scope. A given destination or + * source address also implies matching on a particular network. 
+ * + * - Matching destination and source + * - Matching destination + * - Matching source + * - Matching network, any address + * - Any network or address + */ + + msk = mctp_lookup_bind_details(net, skb, type, mh->dest, mh->src, + false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, mh->src, + false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, mh->dest, MCTP_ADDR_ANY, + false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, + MCTP_ADDR_ANY, false); + if (msk) + return msk; + msk = mctp_lookup_bind_details(net, skb, type, MCTP_ADDR_ANY, + MCTP_ADDR_ANY, true); + if (msk) + return msk; + + return NULL; +} + /* A note on the key allocations. * * struct net->mctp.keys contains our set of currently-allocated keys for @@ -1671,7 +1718,7 @@ static int __net_init mctp_routes_net_init(struct net *net) struct netns_mctp *ns = &net->mctp; INIT_LIST_HEAD(&ns->routes); - INIT_HLIST_HEAD(&ns->binds); + hash_init(ns->binds); mutex_init(&ns->bind_lock); INIT_HLIST_HEAD(&ns->keys); spin_lock_init(&ns->keys_lock); -- cgit v1.2.3 From 3549eb08e5505823857838b5cf5f08567702d054 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Thu, 10 Jul 2025 16:55:59 +0800 Subject: net: mctp: Allow limiting binds to a peer address Prior to calling bind() a program may call connect() on a socket to restrict to a remote peer address. Using connect() is the normal mechanism to specify a remote network peer, so we use that here. In MCTP connect() is only used for bound sockets - send() is not available for MCTP since a tag must be provided for each message. The smctp_type must match between connect() and bind() calls. 
Signed-off-by: Matt Johnston Link: https://patch.msgid.link/20250710-mctp-bind-v4-6-8ec2f6460c56@codeconstruct.com.au Signed-off-by: Paolo Abeni --- include/net/mctp.h | 5 ++- net/mctp/af_mctp.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++--- net/mctp/route.c | 6 ++- 3 files changed, 108 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/mctp.h b/include/net/mctp.h index ac4f4ecdfc24..c3207ce98f07 100644 --- a/include/net/mctp.h +++ b/include/net/mctp.h @@ -69,7 +69,10 @@ struct mctp_sock { /* bind() params */ unsigned int bind_net; - mctp_eid_t bind_addr; + mctp_eid_t bind_local_addr; + mctp_eid_t bind_peer_addr; + unsigned int bind_peer_net; + bool bind_peer_set; __u8 bind_type; /* sendmsg()/recvmsg() uses struct sockaddr_mctp_ext */ diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 16341de5cf28..df4e8cf33899 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -79,7 +79,7 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) goto out_release; } - msk->bind_addr = smctp->smctp_addr.s_addr; + msk->bind_local_addr = smctp->smctp_addr.s_addr; /* MCTP_NET_ANY with a specific EID is resolved to the default net * at bind() time. @@ -87,13 +87,35 @@ static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) * lookup time. 
*/ if (smctp->smctp_network == MCTP_NET_ANY && - msk->bind_addr != MCTP_ADDR_ANY) { + msk->bind_local_addr != MCTP_ADDR_ANY) { msk->bind_net = mctp_default_net(net); } else { msk->bind_net = smctp->smctp_network; } - msk->bind_type = smctp->smctp_type & 0x7f; /* ignore the IC bit */ + /* ignore the IC bit */ + smctp->smctp_type &= 0x7f; + + if (msk->bind_peer_set) { + if (msk->bind_type != smctp->smctp_type) { + /* Prior connect() had a different type */ + rc = -EINVAL; + goto out_release; + } + + if (msk->bind_net == MCTP_NET_ANY) { + /* Restrict to the network passed to connect() */ + msk->bind_net = msk->bind_peer_net; + } + + if (msk->bind_net != msk->bind_peer_net) { + /* connect() had a different net to bind() */ + rc = -EINVAL; + goto out_release; + } + } else { + msk->bind_type = smctp->smctp_type; + } rc = sk->sk_prot->hash(sk); @@ -103,6 +125,67 @@ out_release: return rc; } +/* Used to set a specific peer prior to bind. Not used for outbound + * connections (Tag Owner set) since MCTP is a datagram protocol. 
+ */ +static int mctp_connect(struct socket *sock, struct sockaddr *addr, + int addrlen, int flags) +{ + struct sock *sk = sock->sk; + struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); + struct net *net = sock_net(&msk->sk); + struct sockaddr_mctp *smctp; + int rc; + + if (addrlen != sizeof(*smctp)) + return -EINVAL; + + if (addr->sa_family != AF_MCTP) + return -EAFNOSUPPORT; + + /* It's a valid sockaddr for MCTP, cast and do protocol checks */ + smctp = (struct sockaddr_mctp *)addr; + + if (!mctp_sockaddr_is_ok(smctp)) + return -EINVAL; + + /* Can't bind by tag */ + if (smctp->smctp_tag) + return -EINVAL; + + /* IC bit must be unset */ + if (smctp->smctp_type & 0x80) + return -EINVAL; + + lock_sock(sk); + + if (sk_hashed(sk)) { + /* bind() already */ + rc = -EADDRINUSE; + goto out_release; + } + + if (msk->bind_peer_set) { + /* connect() already */ + rc = -EADDRINUSE; + goto out_release; + } + + msk->bind_peer_set = true; + msk->bind_peer_addr = smctp->smctp_addr.s_addr; + msk->bind_type = smctp->smctp_type; + if (smctp->smctp_network == MCTP_NET_ANY) + msk->bind_peer_net = mctp_default_net(net); + else + msk->bind_peer_net = smctp->smctp_network; + + rc = 0; + +out_release: + release_sock(sk); + return rc; +} + static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_mctp *, addr, msg->msg_name); @@ -546,7 +629,7 @@ static const struct proto_ops mctp_dgram_ops = { .family = PF_MCTP, .release = mctp_release, .bind = mctp_bind, - .connect = sock_no_connect, + .connect = mctp_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, @@ -613,6 +696,7 @@ static int mctp_sk_init(struct sock *sk) INIT_HLIST_HEAD(&msk->keys); timer_setup(&msk->key_expiry, mctp_sk_expire_keys, 0); + msk->bind_peer_set = false; return 0; } @@ -626,12 +710,17 @@ static int mctp_sk_hash(struct sock *sk) struct net *net = sock_net(sk); struct sock *existing; struct mctp_sock *msk; + 
mctp_eid_t remote; u32 hash; int rc; msk = container_of(sk, struct mctp_sock, sk); - hash = mctp_bind_hash(msk->bind_type, msk->bind_addr, MCTP_ADDR_ANY); + if (msk->bind_peer_set) + remote = msk->bind_peer_addr; + else + remote = MCTP_ADDR_ANY; + hash = mctp_bind_hash(msk->bind_type, msk->bind_local_addr, remote); mutex_lock(&net->mctp.bind_lock); @@ -640,8 +729,12 @@ static int mctp_sk_hash(struct sock *sk) struct mctp_sock *mex = container_of(existing, struct mctp_sock, sk); + bool same_peer = (mex->bind_peer_set && msk->bind_peer_set && + mex->bind_peer_addr == msk->bind_peer_addr) || + (!mex->bind_peer_set && !msk->bind_peer_set); + if (mex->bind_type == msk->bind_type && - mex->bind_addr == msk->bind_addr && + mex->bind_local_addr == msk->bind_local_addr && same_peer && mex->bind_net == msk->bind_net) { rc = -EADDRINUSE; goto out; diff --git a/net/mctp/route.c b/net/mctp/route.c index 69cfb0e6c545..2b2b958ef6a3 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -65,7 +65,11 @@ static struct mctp_sock *mctp_lookup_bind_details(struct net *net, if (msk->bind_type != type) continue; - if (!mctp_address_matches(msk->bind_addr, dest)) + if (msk->bind_peer_set && + !mctp_address_matches(msk->bind_peer_addr, src)) + continue; + + if (!mctp_address_matches(msk->bind_local_addr, dest)) continue; return msk; -- cgit v1.2.3 From dfef8d87a031ac1a46dde3de804e0fcf3c3a6afd Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 14 Jul 2025 22:27:43 +0200 Subject: Bluetooth: hci_core: fix typos in macros The provided macro parameter is named 'dev' (rather than 'hdev', which may be a variable on the stack where the macro is used). 
Fixes: a9a830a676a9 ("Bluetooth: hci_event: Fix sending HCI_OP_READ_ENC_KEY_SIZE") Fixes: 6126ffabba6b ("Bluetooth: Introduce HCI_CONN_FLAG_DEVICE_PRIVACY device flag") Signed-off-by: Christian Eggers Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 0da011fc8146..052c91613bb9 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1940,11 +1940,11 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define ll_privacy_capable(dev) ((dev)->le_features[0] & HCI_LE_LL_PRIVACY) #define privacy_mode_capable(dev) (ll_privacy_capable(dev) && \ - (hdev->commands[39] & 0x04)) + ((dev)->commands[39] & 0x04)) #define read_key_size_capable(dev) \ ((dev)->commands[20] & 0x10 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks)) + !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &(dev)->quirks)) #define read_voice_setting_capable(dev) \ ((dev)->commands[9] & 0x04 && \ -- cgit v1.2.3 From cdee6a4416b2a57c89082929cc60e2275bb32a3a Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 14 Jul 2025 22:27:44 +0200 Subject: Bluetooth: hci_core: add missing braces when using macro parameters Macro parameters should always be put into braces when accessing it. 
Fixes: 4fc9857ab8c6 ("Bluetooth: hci_sync: Add check simultaneous roles support") Signed-off-by: Christian Eggers Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 052c91613bb9..367ca43f45d1 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -829,20 +829,20 @@ extern struct mutex hci_cb_list_lock; #define hci_dev_test_and_clear_flag(hdev, nr) test_and_clear_bit((nr), (hdev)->dev_flags) #define hci_dev_test_and_change_flag(hdev, nr) test_and_change_bit((nr), (hdev)->dev_flags) -#define hci_dev_clear_volatile_flags(hdev) \ - do { \ - hci_dev_clear_flag(hdev, HCI_LE_SCAN); \ - hci_dev_clear_flag(hdev, HCI_LE_ADV); \ - hci_dev_clear_flag(hdev, HCI_LL_RPA_RESOLUTION);\ - hci_dev_clear_flag(hdev, HCI_PERIODIC_INQ); \ - hci_dev_clear_flag(hdev, HCI_QUALITY_REPORT); \ +#define hci_dev_clear_volatile_flags(hdev) \ + do { \ + hci_dev_clear_flag((hdev), HCI_LE_SCAN); \ + hci_dev_clear_flag((hdev), HCI_LE_ADV); \ + hci_dev_clear_flag((hdev), HCI_LL_RPA_RESOLUTION); \ + hci_dev_clear_flag((hdev), HCI_PERIODIC_INQ); \ + hci_dev_clear_flag((hdev), HCI_QUALITY_REPORT); \ } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) && \ - (hdev->le_states[4] & 0x08) && /* Central */ \ - (hdev->le_states[4] & 0x40) && /* Peripheral */ \ - (hdev->le_states[3] & 0x10)) /* Simultaneous */ + (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &(hdev)->quirks) && \ + ((hdev)->le_states[4] & 0x08) && /* Central */ \ + ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \ + ((hdev)->le_states[3] & 0x10)) /* Simultaneous */ /* ----- HCI interface to upper protocols ----- */ int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr); -- cgit v1.2.3 From 6851a0c228fc040dce8e4c393004209e7372e0a3 Mon Sep 
17 00:00:00 2001 From: Christian Eggers Date: Mon, 14 Jul 2025 22:27:45 +0200 Subject: Bluetooth: hci_dev: replace 'quirks' integer by 'quirk_flags' bitmap The 'quirks' member already ran out of bits on some platforms some time ago. Replace the integer member by a bitmap in order to have enough bits in future. Replace raw bit operations by accessor macros. Fixes: ff26b2dd6568 ("Bluetooth: Add quirk for broken READ_VOICE_SETTING") Fixes: 127881334eaa ("Bluetooth: Add quirk for broken READ_PAGE_SCAN_TYPE") Suggested-by: Pauli Virtanen Tested-by: Ivan Pravdin Signed-off-by: Kiran K Signed-off-by: Christian Eggers Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/bfusb.c | 2 +- drivers/bluetooth/bpa10x.c | 2 +- drivers/bluetooth/btbcm.c | 8 ++--- drivers/bluetooth/btintel.c | 28 ++++++++-------- drivers/bluetooth/btintel_pcie.c | 8 ++--- drivers/bluetooth/btmtksdio.c | 4 +-- drivers/bluetooth/btmtkuart.c | 2 +- drivers/bluetooth/btnxpuart.c | 2 +- drivers/bluetooth/btqca.c | 2 +- drivers/bluetooth/btqcomsmd.c | 2 +- drivers/bluetooth/btrtl.c | 10 +++--- drivers/bluetooth/btsdio.c | 2 +- drivers/bluetooth/btusb.c | 70 ++++++++++++++++++++-------------------- drivers/bluetooth/hci_aml.c | 2 +- drivers/bluetooth/hci_bcm.c | 4 +-- drivers/bluetooth/hci_bcm4377.c | 10 +++--- drivers/bluetooth/hci_intel.c | 2 +- drivers/bluetooth/hci_ldisc.c | 6 ++-- drivers/bluetooth/hci_ll.c | 4 +-- drivers/bluetooth/hci_nokia.c | 2 +- drivers/bluetooth/hci_qca.c | 14 ++++---- drivers/bluetooth/hci_serdev.c | 8 ++--- drivers/bluetooth/hci_vhci.c | 8 ++--- drivers/bluetooth/virtio_bt.c | 10 +++--- include/net/bluetooth/hci.h | 2 ++ include/net/bluetooth/hci_core.h | 28 +++++++++------- net/bluetooth/hci_core.c | 4 +-- net/bluetooth/hci_debugfs.c | 8 ++--- net/bluetooth/hci_event.c | 19 ++++++----- net/bluetooth/hci_sync.c | 59 +++++++++++++++++---------------- net/bluetooth/mgmt.c | 38 +++++++++++----------- net/bluetooth/msft.c | 2 +- 32 files changed, 187 insertions(+), 185 
deletions(-) (limited to 'include/net') diff --git a/drivers/bluetooth/bfusb.c b/drivers/bluetooth/bfusb.c index 0d6ad50da046..8df310983bf6 100644 --- a/drivers/bluetooth/bfusb.c +++ b/drivers/bluetooth/bfusb.c @@ -670,7 +670,7 @@ static int bfusb_probe(struct usb_interface *intf, const struct usb_device_id *i hdev->flush = bfusb_flush; hdev->send = bfusb_send_frame; - set_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS); if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c index 1fa58c059cbf..8b43dfc755de 100644 --- a/drivers/bluetooth/bpa10x.c +++ b/drivers/bluetooth/bpa10x.c @@ -398,7 +398,7 @@ static int bpa10x_probe(struct usb_interface *intf, hdev->send = bpa10x_send_frame; hdev->set_diag = bpa10x_set_diag; - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE); err = hci_register_dev(hdev); if (err < 0) { diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index 0a60660fc8ce..3a3a56ddbb06 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -135,7 +135,7 @@ int btbcm_check_bdaddr(struct hci_dev *hdev) if (btbcm_set_bdaddr_from_efi(hdev) != 0) { bt_dev_info(hdev, "BCM: Using default device address (%pMR)", &bda->bdaddr); - set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR); } } @@ -467,7 +467,7 @@ static int btbcm_print_controller_features(struct hci_dev *hdev) /* Read DMI and disable broken Read LE Min/Max Tx Power */ if (dmi_first_match(disable_broken_read_transmit_power)) - set_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER); return 0; } @@ -706,7 +706,7 @@ int btbcm_finalize(struct hci_dev *hdev, bool *fw_load_done, bool use_autobaud_m btbcm_check_bdaddr(hdev); - set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, 
&hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER); return 0; } @@ -769,7 +769,7 @@ int btbcm_setup_apple(struct hci_dev *hdev) kfree_skb(skb); } - set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER); return 0; } diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index 06575a0b9aee..06016ac3965c 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -88,7 +88,7 @@ int btintel_check_bdaddr(struct hci_dev *hdev) if (!bacmp(&bda->bdaddr, BDADDR_INTEL)) { bt_dev_err(hdev, "Found Intel default device address (%pMR)", &bda->bdaddr); - set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR); } kfree_skb(skb); @@ -2027,7 +2027,7 @@ static int btintel_download_fw(struct hci_dev *hdev, */ if (!bacmp(&params->otp_bdaddr, BDADDR_ANY)) { bt_dev_info(hdev, "No device address configured"); - set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR); } download: @@ -2295,7 +2295,7 @@ static int btintel_prepare_fw_download_tlv(struct hci_dev *hdev, */ if (!bacmp(&ver->otp_bd_addr, BDADDR_ANY)) { bt_dev_info(hdev, "No device address configured"); - set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR); } } @@ -3435,9 +3435,9 @@ static int btintel_setup_combined(struct hci_dev *hdev) } /* Apply the common HCI quirks for Intel device */ - set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); - set_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); + hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG); /* Set up the quality report callback for Intel devices */ hdev->set_quality_report = btintel_set_quality_report; @@ -3475,8 +3475,8 @@ static int
btintel_setup_combined(struct hci_dev *hdev) */ if (!btintel_test_flag(hdev, INTEL_ROM_LEGACY_NO_WBS_SUPPORT)) - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, - &hdev->quirks); + hci_set_quirk(hdev, + HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); err = btintel_legacy_rom_setup(hdev, &ver); break; @@ -3491,11 +3491,11 @@ static int btintel_setup_combined(struct hci_dev *hdev) * * All Legacy bootloader devices support WBS */ - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, - &hdev->quirks); + hci_set_quirk(hdev, + HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); /* These variants don't seem to support LE Coded PHY */ - set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_CODED); /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, ver.hw_variant); @@ -3571,10 +3571,10 @@ static int btintel_setup_combined(struct hci_dev *hdev) * * All Legacy bootloader devices support WBS */ - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); /* These variants don't seem to support LE Coded PHY */ - set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_CODED); /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, ver.hw_variant); @@ -3600,7 +3600,7 @@ static int btintel_setup_combined(struct hci_dev *hdev) * * All TLV based devices support WBS */ - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, diff --git a/drivers/bluetooth/btintel_pcie.c b/drivers/bluetooth/btintel_pcie.c index e1c688dd2d45..f4e3fb54fe76 100644 --- a/drivers/bluetooth/btintel_pcie.c +++ b/drivers/bluetooth/btintel_pcie.c @@ -2081,9 +2081,9 @@ static int btintel_pcie_setup_internal(struct hci_dev *hdev) } /* Apply the common HCI quirks for Intel device */ - set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); - 
set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); - set_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); + hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG); /* Set up the quality report callback for Intel devices */ hdev->set_quality_report = btintel_set_quality_report; @@ -2123,7 +2123,7 @@ static int btintel_pcie_setup_internal(struct hci_dev *hdev) * * All TLV based devices support WBS */ - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); /* Setup MSFT Extension support */ btintel_set_msft_opcode(hdev, diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c index c16a3518b8ff..4fc673640bfc 100644 --- a/drivers/bluetooth/btmtksdio.c +++ b/drivers/bluetooth/btmtksdio.c @@ -1141,7 +1141,7 @@ static int btmtksdio_setup(struct hci_dev *hdev) } /* Enable WBS with mSBC codec */ - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); /* Enable GPIO reset mechanism */ if (bdev->reset) { @@ -1384,7 +1384,7 @@ static int btmtksdio_probe(struct sdio_func *func, SET_HCIDEV_DEV(hdev, &func->dev); hdev->manufacturer = 70; - set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP); sdio_set_drvdata(func, bdev); diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c index c97e260fcb0c..51400a891f6e 100644 --- a/drivers/bluetooth/btmtkuart.c +++ b/drivers/bluetooth/btmtkuart.c @@ -872,7 +872,7 @@ static int btmtkuart_probe(struct serdev_device *serdev) SET_HCIDEV_DEV(hdev, &serdev->dev); hdev->manufacturer = 70; - set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP); if (btmtkuart_is_standalone(bdev)) { err = clk_prepare_enable(bdev->osc); diff --git 
a/drivers/bluetooth/btnxpuart.c b/drivers/bluetooth/btnxpuart.c index 1088db6056a4..24f9b52605a1 100644 --- a/drivers/bluetooth/btnxpuart.c +++ b/drivers/bluetooth/btnxpuart.c @@ -1807,7 +1807,7 @@ static int nxp_serdev_probe(struct serdev_device *serdev) "local-bd-address", (u8 *)&ba, sizeof(ba)); if (bacmp(&ba, BDADDR_ANY)) - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); if (hci_register_dev(hdev) < 0) { dev_err(&serdev->dev, "Can't register HCI device\n"); diff --git a/drivers/bluetooth/btqca.c b/drivers/bluetooth/btqca.c index edefb9dc76aa..7c958d6065be 100644 --- a/drivers/bluetooth/btqca.c +++ b/drivers/bluetooth/btqca.c @@ -739,7 +739,7 @@ static int qca_check_bdaddr(struct hci_dev *hdev, const struct qca_fw_config *co bda = (struct hci_rp_read_bd_addr *)skb->data; if (!bacmp(&bda->bdaddr, &config->bdaddr)) - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); kfree_skb(skb); diff --git a/drivers/bluetooth/btqcomsmd.c b/drivers/bluetooth/btqcomsmd.c index c0eb71d6ffd3..d2e13fcb6bab 100644 --- a/drivers/bluetooth/btqcomsmd.c +++ b/drivers/bluetooth/btqcomsmd.c @@ -117,7 +117,7 @@ static int btqcomsmd_setup(struct hci_dev *hdev) /* Devices do not have persistent storage for BD address. Retrieve * it from the firmware node property. */ - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); return 0; } diff --git a/drivers/bluetooth/btrtl.c b/drivers/bluetooth/btrtl.c index 7838c89e529e..4d182cf6e037 100644 --- a/drivers/bluetooth/btrtl.c +++ b/drivers/bluetooth/btrtl.c @@ -1287,7 +1287,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) /* Enable controller to do both LE scan and BR/EDR inquiry * simultaneously. 
*/ - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); /* Enable central-peripheral role (able to create new connections with * an existing connection in slave role). @@ -1301,7 +1301,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) case CHIP_ID_8851B: case CHIP_ID_8922A: case CHIP_ID_8852BT: - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); /* RTL8852C needs to transmit mSBC data continuously without * the zero length of USB packets for the ALT 6 supported chips @@ -1312,7 +1312,8 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) if (btrtl_dev->project_id == CHIP_ID_8852A || btrtl_dev->project_id == CHIP_ID_8852B || btrtl_dev->project_id == CHIP_ID_8852C) - set_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks); + hci_set_quirk(hdev, + HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER); hci_set_aosp_capable(hdev); break; @@ -1331,8 +1332,7 @@ void btrtl_set_quirks(struct hci_dev *hdev, struct btrtl_device_info *btrtl_dev) * but it doesn't support any features from page 2 - * it either responds with garbage or with error status */ - set_bit(HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2, - &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2); break; default: break; diff --git a/drivers/bluetooth/btsdio.c b/drivers/bluetooth/btsdio.c index a69feb08486a..8325655ce6aa 100644 --- a/drivers/bluetooth/btsdio.c +++ b/drivers/bluetooth/btsdio.c @@ -327,7 +327,7 @@ static int btsdio_probe(struct sdio_func *func, hdev->send = btsdio_send_frame; if (func->vendor == 0x0104 && func->device == 0x00c5) - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE); err = hci_register_dev(hdev); if (err < 0) { diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 9ab661d2d1e6..64509f5bfc99 100644 --- 
a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2472,18 +2472,18 @@ static int btusb_setup_csr(struct hci_dev *hdev) * Probably will need to be expanded in the future; * without these the controller will lock up. */ - set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks); - set_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL); + hci_set_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_VOICE_SETTING); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE); /* Clear the reset quirk since this is not an actual * early Bluetooth 1.1 device from CSR. */ - clear_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); - clear_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + hci_clear_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE); + hci_clear_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); /* * Special workaround for these BT 4.0 chip clones, and potentially more: @@ -3494,7 +3494,7 @@ static int btusb_setup_qca(struct hci_dev *hdev) /* Mark HCI_OP_ENHANCED_SETUP_SYNC_CONN as broken as it doesn't seem to * work with the likes of HSP/HFP mSBC. 
*/ - set_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN); return 0; } @@ -4008,10 +4008,10 @@ static int btusb_probe(struct usb_interface *intf, } #endif if (id->driver_info & BTUSB_CW6622) - set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY); if (id->driver_info & BTUSB_BCM2045) - set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY); if (id->driver_info & BTUSB_BCM92035) hdev->setup = btusb_setup_bcm92035; @@ -4068,8 +4068,8 @@ static int btusb_probe(struct usb_interface *intf, hdev->reset = btmtk_reset_sync; hdev->set_bdaddr = btmtk_set_bdaddr; hdev->send = btusb_send_frame_mtk; - set_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &hdev->quirks); - set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN); + hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP); data->recv_acl = btmtk_usb_recv_acl; data->suspend = btmtk_usb_suspend; data->resume = btmtk_usb_resume; @@ -4077,20 +4077,20 @@ static int btusb_probe(struct usb_interface *intf, } if (id->driver_info & BTUSB_SWAVE) { - set_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS); } if (id->driver_info & BTUSB_INTEL_BOOT) { hdev->manufacturer = 2; - set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE); } if (id->driver_info & BTUSB_ATH3012) { data->setup_on_usb = btusb_setup_qca; hdev->set_bdaddr = btusb_set_bdaddr_ath3012; - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); - set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); + hci_set_quirk(hdev, 
HCI_QUIRK_STRICT_DUPLICATE_FILTER); } if (id->driver_info & BTUSB_QCA_ROME) { @@ -4098,7 +4098,7 @@ static int btusb_probe(struct usb_interface *intf, hdev->shutdown = btusb_shutdown_qca; hdev->set_bdaddr = btusb_set_bdaddr_ath3012; hdev->reset = btusb_qca_reset; - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); btusb_check_needs_reset_resume(intf); } @@ -4112,7 +4112,7 @@ static int btusb_probe(struct usb_interface *intf, hdev->shutdown = btusb_shutdown_qca; hdev->set_bdaddr = btusb_set_bdaddr_wcn6855; hdev->reset = btusb_qca_reset; - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); hci_set_msft_opcode(hdev, 0xFD70); } @@ -4140,35 +4140,35 @@ static int btusb_probe(struct usb_interface *intf, if (id->driver_info & BTUSB_ACTIONS_SEMI) { /* Support is advertised, but not implemented */ - set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &hdev->quirks); - set_bit(HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_EXT_SCAN); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_EXT_CREATE_CONN); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT); } if (!reset) - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE); if (force_scofix || id->driver_info & BTUSB_WRONG_SCO_MTU) { if (!disable_scofix) - 
set_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_FIXUP_BUFFER_SIZE); } if (id->driver_info & BTUSB_BROKEN_ISOC) data->isoc = NULL; if (id->driver_info & BTUSB_WIDEBAND_SPEECH) - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); if (id->driver_info & BTUSB_INVALID_LE_STATES) - set_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES); if (id->driver_info & BTUSB_DIGIANSWER) { data->cmdreq_type = USB_TYPE_VENDOR; - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE); } if (id->driver_info & BTUSB_CSR) { @@ -4177,10 +4177,10 @@ static int btusb_probe(struct usb_interface *intf, /* Old firmware would otherwise execute USB reset */ if (bcdDevice < 0x117) - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE); /* This must be set first in case we disable it for fakes */ - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); /* Fake CSR devices with broken commands */ if (le16_to_cpu(udev->descriptor.idVendor) == 0x0a12 && @@ -4193,7 +4193,7 @@ static int btusb_probe(struct usb_interface *intf, /* New sniffer firmware has crippled HCI interface */ if (le16_to_cpu(udev->descriptor.bcdDevice) > 0x997) - set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE); } if (id->driver_info & BTUSB_INTEL_BOOT) { diff --git a/drivers/bluetooth/hci_aml.c b/drivers/bluetooth/hci_aml.c index 1394c575aa6d..707e90f80130 100644 --- a/drivers/bluetooth/hci_aml.c +++ b/drivers/bluetooth/hci_aml.c @@ -424,7 +424,7 @@ static int aml_check_bdaddr(struct hci_dev *hdev) if (!bacmp(&paddr->bdaddr, AML_BDADDR_DEFAULT)) { bt_dev_info(hdev, "amlbt using default bdaddr (%pM)", &paddr->bdaddr); - set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); + hci_set_quirk(hdev, 
HCI_QUIRK_INVALID_BDADDR); } exit: diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 9684eb16059b..f96617b85d87 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -643,8 +643,8 @@ static int bcm_setup(struct hci_uart *hu) * Allow the bootloader to set a valid address through the * device tree. */ - if (test_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks)) - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hu->hdev->quirks); + if (hci_test_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR)) + hci_set_quirk(hu->hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); if (!bcm_request_irq(bcm)) err = bcm_setup_sleep(hu); diff --git a/drivers/bluetooth/hci_bcm4377.c b/drivers/bluetooth/hci_bcm4377.c index 9bce53e49cfa..8a9aa33776b0 100644 --- a/drivers/bluetooth/hci_bcm4377.c +++ b/drivers/bluetooth/hci_bcm4377.c @@ -1435,7 +1435,7 @@ static int bcm4377_check_bdaddr(struct bcm4377_data *bcm4377) bda = (struct hci_rp_read_bd_addr *)skb->data; if (!bcm4377_is_valid_bdaddr(bcm4377, &bda->bdaddr)) - set_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &bcm4377->hdev->quirks); + hci_set_quirk(bcm4377->hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); kfree_skb(skb); return 0; @@ -2389,13 +2389,13 @@ static int bcm4377_probe(struct pci_dev *pdev, const struct pci_device_id *id) hdev->setup = bcm4377_hci_setup; if (bcm4377->hw->broken_mws_transport_config) - set_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG); if (bcm4377->hw->broken_ext_scan) - set_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_EXT_SCAN); if (bcm4377->hw->broken_le_coded) - set_bit(HCI_QUIRK_BROKEN_LE_CODED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_CODED); if (bcm4377->hw->broken_le_ext_adv_report_phy) - set_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY); pci_set_drvdata(pdev, bcm4377); hci_set_drvdata(hdev, bcm4377); 
diff --git a/drivers/bluetooth/hci_intel.c b/drivers/bluetooth/hci_intel.c index 811f33701f84..d22fbb7f9fc5 100644 --- a/drivers/bluetooth/hci_intel.c +++ b/drivers/bluetooth/hci_intel.c @@ -660,7 +660,7 @@ static int intel_setup(struct hci_uart *hu) */ if (!bacmp(&params.otp_bdaddr, BDADDR_ANY)) { bt_dev_info(hdev, "No device address configured"); - set_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_INVALID_BDADDR); } /* With this Intel bootloader only the hardware variant and device diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c index acba83156de9..d0adae3267b4 100644 --- a/drivers/bluetooth/hci_ldisc.c +++ b/drivers/bluetooth/hci_ldisc.c @@ -667,13 +667,13 @@ static int hci_uart_register_dev(struct hci_uart *hu) SET_HCIDEV_DEV(hdev, hu->tty->dev); if (test_bit(HCI_UART_RAW_DEVICE, &hu->hdev_flags)) - set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE); if (test_bit(HCI_UART_EXT_CONFIG, &hu->hdev_flags)) - set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG); if (!test_bit(HCI_UART_RESET_ON_INIT, &hu->hdev_flags)) - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE); /* Only call open() for the protocol after hdev is fully initialized as * open() (or a timer/workqueue it starts) may attempt to reference it. diff --git a/drivers/bluetooth/hci_ll.c b/drivers/bluetooth/hci_ll.c index e19e9bd49555..7044c86325ce 100644 --- a/drivers/bluetooth/hci_ll.c +++ b/drivers/bluetooth/hci_ll.c @@ -649,11 +649,11 @@ static int ll_setup(struct hci_uart *hu) /* This means that there was an error getting the BD address * during probe, so mark the device as having a bad address.
*/ - set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks); + hci_set_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR); } else if (bacmp(&lldev->bdaddr, BDADDR_ANY)) { err = ll_set_bdaddr(hu->hdev, &lldev->bdaddr); if (err) - set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks); + hci_set_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR); } /* Operational speed if any */ diff --git a/drivers/bluetooth/hci_nokia.c b/drivers/bluetooth/hci_nokia.c index 9fc10a16fd96..cd7575c20f65 100644 --- a/drivers/bluetooth/hci_nokia.c +++ b/drivers/bluetooth/hci_nokia.c @@ -439,7 +439,7 @@ static int nokia_setup(struct hci_uart *hu) if (btdev->man_id == NOKIA_ID_BCM2048) { hu->hdev->set_bdaddr = btbcm_set_bdaddr; - set_bit(HCI_QUIRK_INVALID_BDADDR, &hu->hdev->quirks); + hci_set_quirk(hu->hdev, HCI_QUIRK_INVALID_BDADDR); dev_dbg(dev, "bcm2048 has invalid bluetooth address!"); } diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c index 3ec0be496820..33c43503714b 100644 --- a/drivers/bluetooth/hci_qca.c +++ b/drivers/bluetooth/hci_qca.c @@ -1892,7 +1892,7 @@ static int qca_setup(struct hci_uart *hu) /* Enable controller to do both LE scan and BR/EDR inquiry * simultaneously. */ - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); switch (soc_type) { case QCA_QCA2066: @@ -1944,7 +1944,7 @@ retry: case QCA_WCN7850: qcadev = serdev_device_get_drvdata(hu->serdev); if (qcadev->bdaddr_property_broken) - set_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN); hci_set_aosp_capable(hdev); @@ -2487,7 +2487,7 @@ static int qca_serdev_probe(struct serdev_device *serdev) hdev = qcadev->serdev_hu.hdev; if (power_ctrl_enabled) { - set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP); hdev->shutdown = qca_power_off; } @@ -2496,11 +2496,11 @@ static int qca_serdev_probe(struct serdev_device *serdev) * be queried via hci. 
Same with the valid le states quirk. */ if (data->capabilities & QCA_CAP_WIDEBAND_SPEECH) - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, - &hdev->quirks); + hci_set_quirk(hdev, + HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); if (!(data->capabilities & QCA_CAP_VALID_LE_STATES)) - set_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES); } return 0; @@ -2550,7 +2550,7 @@ static void qca_serdev_shutdown(struct device *dev) * invoked and the SOC is already in the initial state, so * don't also need to send the VSC. */ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks) || + if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP) || hci_dev_test_flag(hdev, HCI_SETUP)) return; diff --git a/drivers/bluetooth/hci_serdev.c b/drivers/bluetooth/hci_serdev.c index 89a22e9b3253..593d9cefbbf9 100644 --- a/drivers/bluetooth/hci_serdev.c +++ b/drivers/bluetooth/hci_serdev.c @@ -152,7 +152,7 @@ static int hci_uart_close(struct hci_dev *hdev) * BT SOC is completely powered OFF during BT OFF, holding port * open may drain the battery. 
*/ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP)) { clear_bit(HCI_UART_PROTO_READY, &hu->flags); serdev_device_close(hu->serdev); } @@ -358,13 +358,13 @@ int hci_uart_register_device_priv(struct hci_uart *hu, SET_HCIDEV_DEV(hdev, &hu->serdev->dev); if (test_bit(HCI_UART_NO_SUSPEND_NOTIFIER, &hu->flags)) - set_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER); if (test_bit(HCI_UART_RAW_DEVICE, &hu->hdev_flags)) - set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE); if (test_bit(HCI_UART_EXT_CONFIG, &hu->hdev_flags)) - set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG); if (test_bit(HCI_UART_INIT_PENDING, &hu->hdev_flags)) return 0; diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index 59f4d7bdffdc..f7d8c3c00655 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -415,16 +415,16 @@ static int __vhci_create_device(struct vhci_data *data, __u8 opcode) hdev->get_codec_config_data = vhci_get_codec_config_data; hdev->wakeup = vhci_wakeup; hdev->setup = vhci_setup; - set_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks); - set_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP); + hci_set_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED); /* bit 6 is for external configuration */ if (opcode & 0x40) - set_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG); /* bit 7 is for raw device */ if (opcode & 0x80) - set_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_RAW_DEVICE); if (hci_register_dev(hdev) < 0) { BT_ERR("Can't register HCI device"); diff --git a/drivers/bluetooth/virtio_bt.c b/drivers/bluetooth/virtio_bt.c index 756f292df9e8..6f1a37e85c6a 100644 --- 
a/drivers/bluetooth/virtio_bt.c +++ b/drivers/bluetooth/virtio_bt.c @@ -327,17 +327,17 @@ static int virtbt_probe(struct virtio_device *vdev) hdev->setup = virtbt_setup_intel; hdev->shutdown = virtbt_shutdown_generic; hdev->set_bdaddr = virtbt_set_bdaddr_intel; - set_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks); - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); break; case VIRTIO_BT_CONFIG_VENDOR_REALTEK: hdev->manufacturer = 93; hdev->setup = virtbt_setup_realtek; hdev->shutdown = virtbt_shutdown_generic; - set_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks); - set_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks); + hci_set_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY); + hci_set_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED); break; } } diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 82cbd54443ac..c79901f2dc2a 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -377,6 +377,8 @@ enum { * This quirk must be set before hci_register_dev is called. 
*/ HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, + + __HCI_NUM_QUIRKS, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 367ca43f45d1..f79f59e67114 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -464,7 +464,7 @@ struct hci_dev { unsigned int auto_accept_delay; - unsigned long quirks; + DECLARE_BITMAP(quirk_flags, __HCI_NUM_QUIRKS); atomic_t cmd_cnt; unsigned int acl_cnt; @@ -656,6 +656,10 @@ struct hci_dev { u8 (*classify_pkt_type)(struct hci_dev *hdev, struct sk_buff *skb); }; +#define hci_set_quirk(hdev, nr) set_bit((nr), (hdev)->quirk_flags) +#define hci_clear_quirk(hdev, nr) clear_bit((nr), (hdev)->quirk_flags) +#define hci_test_quirk(hdev, nr) test_bit((nr), (hdev)->quirk_flags) + #define HCI_PHY_HANDLE(handle) (handle & 0xff) enum conn_reasons { @@ -839,7 +843,7 @@ extern struct mutex hci_cb_list_lock; } while (0) #define hci_dev_le_state_simultaneous(hdev) \ - (!test_bit(HCI_QUIRK_BROKEN_LE_STATES, &(hdev)->quirks) && \ + (!hci_test_quirk((hdev), HCI_QUIRK_BROKEN_LE_STATES) && \ ((hdev)->le_states[4] & 0x08) && /* Central */ \ ((hdev)->le_states[4] & 0x40) && /* Peripheral */ \ ((hdev)->le_states[3] & 0x10)) /* Simultaneous */ @@ -1931,8 +1935,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_2M)) #define le_coded_capable(dev) (((dev)->le_features[1] & HCI_LE_PHY_CODED) && \ - !test_bit(HCI_QUIRK_BROKEN_LE_CODED, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_LE_CODED)) #define scan_coded(dev) (((dev)->le_tx_def_phys & HCI_LE_SET_PHY_CODED) || \ ((dev)->le_rx_def_phys & HCI_LE_SET_PHY_CODED)) @@ -1944,27 +1948,27 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define read_key_size_capable(dev) \ ((dev)->commands[20] & 0x10 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_ENC_KEY_SIZE)) #define read_voice_setting_capable(dev) \ 
((dev)->commands[9] & 0x04 && \ - !test_bit(HCI_QUIRK_BROKEN_READ_VOICE_SETTING, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_READ_VOICE_SETTING)) /* Use enhanced synchronous connection if command is supported and its quirk * has not been set. */ #define enhanced_sync_conn_capable(dev) \ (((dev)->commands[29] & 0x08) && \ - !test_bit(HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN)) /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ ((dev)->commands[37] & 0x40) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_SCAN)) /* Use ext create connection if command is supported */ #define use_ext_conn(dev) (((dev)->commands[37] & 0x80) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, &(dev)->quirks)) + !hci_test_quirk((dev), HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Extended advertising support */ #define ext_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_EXT_ADV)) @@ -1979,8 +1983,8 @@ void hci_conn_del_sysfs(struct hci_conn *conn); */ #define use_enhanced_conn_complete(dev) ((ll_privacy_capable(dev) || \ ext_adv_capable(dev)) && \ - !test_bit(HCI_QUIRK_BROKEN_EXT_CREATE_CONN, \ - &(dev)->quirks)) + !hci_test_quirk((dev), \ + HCI_QUIRK_BROKEN_EXT_CREATE_CONN)) /* Periodic advertising support */ #define per_adv_capable(dev) (((dev)->le_features[1] & HCI_LE_PERIODIC_ADV)) @@ -1997,7 +2001,7 @@ void hci_conn_del_sysfs(struct hci_conn *conn); #define sync_recv_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_SYNC_RECEIVER) #define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ - (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + (!hci_test_quirk((dev), HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG))) /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_core.c 
b/net/bluetooth/hci_core.c index 14d7221b8ac0..441cb1700f99 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2654,7 +2654,7 @@ int hci_register_dev(struct hci_dev *hdev) /* Devices that are marked for raw-only usage are unconfigured * and should not be included in normal operation. */ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* Mark Remote Wakeup connection flag as supported if driver has wakeup @@ -2784,7 +2784,7 @@ int hci_register_suspend_notifier(struct hci_dev *hdev) int ret = 0; if (!hdev->suspend_notifier.notifier_call && - !test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { + !hci_test_quirk(hdev, HCI_QUIRK_NO_SUSPEND_NOTIFIER)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index f625074d1f00..99e2e9fc70e8 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -38,7 +38,7 @@ static ssize_t __name ## _read(struct file *file, \ struct hci_dev *hdev = file->private_data; \ char buf[3]; \ \ - buf[0] = test_bit(__quirk, &hdev->quirks) ? 'Y' : 'N'; \ + buf[0] = test_bit(__quirk, hdev->quirk_flags) ? 'Y' : 'N'; \ buf[1] = '\n'; \ buf[2] = '\0'; \ return simple_read_from_buffer(user_buf, count, ppos, buf, 2); \ @@ -59,10 +59,10 @@ static ssize_t __name ## _write(struct file *file, \ if (err) \ return err; \ \ - if (enable == test_bit(__quirk, &hdev->quirks)) \ + if (enable == test_bit(__quirk, hdev->quirk_flags)) \ return -EALREADY; \ \ - change_bit(__quirk, &hdev->quirks); \ + change_bit(__quirk, hdev->quirk_flags); \ \ return count; \ } \ @@ -1356,7 +1356,7 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf, * for the vendor callback. 
Instead just store the desired value and * the setting will be programmed when the controller gets powered on. */ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) && (!test_bit(HCI_RUNNING, &hdev->flags) || hci_dev_test_flag(hdev, HCI_USER_CHANNEL))) goto done; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 992131f88a45..cf4b30ac9e0e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -908,8 +908,8 @@ static u8 hci_cc_read_local_ext_features(struct hci_dev *hdev, void *data, return rp->status; if (hdev->max_page < rp->max_page) { - if (test_bit(HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2, - &hdev->quirks)) + if (hci_test_quirk(hdev, + HCI_QUIRK_BROKEN_LOCAL_EXT_FEATURES_PAGE_2)) bt_dev_warn(hdev, "broken local ext features page 2"); else hdev->max_page = rp->max_page; @@ -936,7 +936,7 @@ static u8 hci_cc_read_buffer_size(struct hci_dev *hdev, void *data, hdev->acl_pkts = __le16_to_cpu(rp->acl_max_pkt); hdev->sco_pkts = __le16_to_cpu(rp->sco_max_pkt); - if (test_bit(HCI_QUIRK_FIXUP_BUFFER_SIZE, &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_FIXUP_BUFFER_SIZE)) { hdev->sco_mtu = 64; hdev->sco_pkts = 8; } @@ -2971,7 +2971,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data, * state to indicate completion. */ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || - !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); goto unlock; } @@ -2990,7 +2990,7 @@ static void hci_inquiry_complete_evt(struct hci_dev *hdev, void *data, * state to indicate completion. 
*/ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN) || - !test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); } @@ -3614,8 +3614,7 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, /* We skip the WRITE_AUTH_PAYLOAD_TIMEOUT for ATS2851 based controllers * to avoid unexpected SMP command errors when pairing. */ - if (test_bit(HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT, - &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_WRITE_AUTH_PAYLOAD_TIMEOUT)) goto notify; /* Set the default Authenticated Payload Timeout after @@ -5914,7 +5913,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, * while we have an existing one in peripheral role. */ if (hdev->conn_hash.le_num_peripheral > 0 && - (test_bit(HCI_QUIRK_BROKEN_LE_STATES, &hdev->quirks) || + (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LE_STATES) || !(hdev->le_states[3] & 0x10))) return NULL; @@ -6310,8 +6309,8 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); - if (test_bit(HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY, - &hdev->quirks)) { + if (hci_test_quirk(hdev, + HCI_QUIRK_FIXUP_LE_EXT_ADV_REPORT_PHY)) { info->primary_phy &= 0x1f; info->secondary_phy &= 0x1f; } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index acbf06aa3dd7..7938c004071c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -393,7 +393,7 @@ static void le_scan_disable(struct work_struct *work) if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) goto _return; - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) { if (!test_bit(HCI_INQUIRY, &hdev->flags) && hdev->discovery.state != DISCOVERY_RESOLVING) goto discov_stopped; @@ -3587,7 +3587,7 @@ 
static void hci_dev_get_bd_addr_from_property(struct hci_dev *hdev) if (ret < 0 || !bacmp(&ba, BDADDR_ANY)) return; - if (test_bit(HCI_QUIRK_BDADDR_PROPERTY_BROKEN, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BDADDR_PROPERTY_BROKEN)) baswap(&hdev->public_addr, &ba); else bacpy(&hdev->public_addr, &ba); @@ -3662,7 +3662,7 @@ static int hci_init0_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); /* Reset */ - if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { + if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) { err = hci_reset_sync(hdev); if (err) return err; @@ -3675,7 +3675,7 @@ static int hci_unconf_init_sync(struct hci_dev *hdev) { int err; - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) return 0; err = hci_init0_sync(hdev); @@ -3718,7 +3718,7 @@ static int hci_read_local_cmds_sync(struct hci_dev *hdev) * supported commands. */ if (hdev->hci_ver > BLUETOOTH_VER_1_1 && - !test_bit(HCI_QUIRK_BROKEN_LOCAL_COMMANDS, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_BROKEN_LOCAL_COMMANDS)) return __hci_cmd_sync_status(hdev, HCI_OP_READ_LOCAL_COMMANDS, 0, NULL, HCI_CMD_TIMEOUT); @@ -3732,7 +3732,7 @@ static int hci_init1_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, ""); /* Reset */ - if (!test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks)) { + if (!hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE)) { err = hci_reset_sync(hdev); if (err) return err; @@ -3795,7 +3795,7 @@ static int hci_set_event_filter_sync(struct hci_dev *hdev, u8 flt_type, if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) return 0; - if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; memset(&cp, 0, sizeof(cp)); @@ -3822,7 +3822,7 @@ static int hci_clear_event_filter_sync(struct hci_dev *hdev) * a hci_set_event_filter_sync() call succeeds, but we do * the check both for parity and as a future reminder. 
*/ - if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; return hci_set_event_filter_sync(hdev, HCI_FLT_CLEAR_ALL, 0x00, @@ -3846,7 +3846,7 @@ static int hci_write_sync_flowctl_sync(struct hci_dev *hdev) /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */ if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) || - !test_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED)) return 0; memset(&cp, 0, sizeof(cp)); @@ -3921,7 +3921,7 @@ static int hci_write_inquiry_mode_sync(struct hci_dev *hdev) u8 mode; if (!lmp_inq_rssi_capable(hdev) && - !test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE)) return 0; /* If Extended Inquiry Result events are supported, then @@ -4111,7 +4111,7 @@ static int hci_set_event_mask_sync(struct hci_dev *hdev) } if (lmp_inq_rssi_capable(hdev) || - test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_FIXUP_INQUIRY_MODE)) events[4] |= 0x02; /* Inquiry Result with RSSI */ if (lmp_ext_feat_capable(hdev)) @@ -4163,7 +4163,7 @@ static int hci_read_stored_link_key_sync(struct hci_dev *hdev) struct hci_cp_read_stored_link_key cp; if (!(hdev->commands[6] & 0x20) || - test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY)) return 0; memset(&cp, 0, sizeof(cp)); @@ -4212,7 +4212,7 @@ static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev) { if (!(hdev->commands[18] & 0x04) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || - test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING, @@ -4226,7 +4226,7 @@ static int hci_read_page_scan_type_sync(struct hci_dev *hdev) * this 
command in the bit mask of supported commands. */ if (!(hdev->commands[13] & 0x01) || - test_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE, @@ -4421,7 +4421,7 @@ static int hci_le_read_adv_tx_power_sync(struct hci_dev *hdev) static int hci_le_read_tx_power_sync(struct hci_dev *hdev) { if (!(hdev->commands[38] & 0x80) || - test_bit(HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_READ_TRANSMIT_POWER)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_TRANSMIT_POWER, @@ -4464,7 +4464,7 @@ static int hci_le_set_rpa_timeout_sync(struct hci_dev *hdev) __le16 timeout = cpu_to_le16(hdev->rpa_timeout); if (!(hdev->commands[35] & 0x04) || - test_bit(HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_SET_RPA_TIMEOUT)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_RPA_TIMEOUT, @@ -4609,7 +4609,7 @@ static int hci_delete_stored_link_key_sync(struct hci_dev *hdev) * just disable this command. 
*/ if (!(hdev->commands[6] & 0x80) || - test_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_STORED_LINK_KEY)) return 0; memset(&cp, 0, sizeof(cp)); @@ -4735,7 +4735,7 @@ static int hci_set_err_data_report_sync(struct hci_dev *hdev) if (!(hdev->commands[18] & 0x08) || !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || - test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) + hci_test_quirk(hdev, HCI_QUIRK_BROKEN_ERR_DATA_REPORTING)) return 0; if (enabled == hdev->err_data_reporting) @@ -4948,7 +4948,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) size_t i; if (!hci_dev_test_flag(hdev, HCI_SETUP) && - !test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) + !hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_SETUP)) return 0; bt_dev_dbg(hdev, ""); @@ -4959,7 +4959,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) ret = hdev->setup(hdev); for (i = 0; i < ARRAY_SIZE(hci_broken_table); i++) { - if (test_bit(hci_broken_table[i].quirk, &hdev->quirks)) + if (hci_test_quirk(hdev, hci_broken_table[i].quirk)) bt_dev_warn(hdev, "%s", hci_broken_table[i].desc); } @@ -4967,10 +4967,10 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) * BD_ADDR invalid before creating the HCI device or in * its setup callback. */ - invalid_bdaddr = test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || - test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks); + invalid_bdaddr = hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) || + hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY); if (!ret) { - if (test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY) && !bacmp(&hdev->public_addr, BDADDR_ANY)) hci_dev_get_bd_addr_from_property(hdev); @@ -4992,7 +4992,7 @@ static int hci_dev_setup_sync(struct hci_dev *hdev) * In case any of them is set, the controller has to * start up as unconfigured. 
*/ - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || invalid_bdaddr) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); @@ -5052,7 +5052,7 @@ static int hci_dev_init_sync(struct hci_dev *hdev) * then they need to be reprogrammed after the init procedure * completed. */ - if (test_bit(HCI_QUIRK_NON_PERSISTENT_DIAG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_NON_PERSISTENT_DIAG) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) && hdev->set_diag) ret = hdev->set_diag(hdev, true); @@ -5309,7 +5309,7 @@ int hci_dev_close_sync(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_RESET_ON_CLOSE) && !auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { set_bit(HCI_INIT, &hdev->flags); hci_reset_sync(hdev); @@ -5959,7 +5959,7 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) own_addr_type = ADDR_LE_DEV_PUBLIC; if (hci_is_adv_monitoring(hdev) || - (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) && + (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER) && hdev->discovery.result_filtering)) { /* Duplicate filter should be disabled when some advertisement * monitor is activated, otherwise AdvMon can only receive one @@ -6022,8 +6022,7 @@ int hci_start_discovery_sync(struct hci_dev *hdev) * and LE scanning are done sequentially with separate * timeouts. */ - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, - &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_SIMULTANEOUS_DISCOVERY)) { timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT); /* During simultaneous discovery, we double LE scan * interval. 
We must leave some time for the controller @@ -6100,7 +6099,7 @@ static int hci_update_event_filter_sync(struct hci_dev *hdev) /* Some fake CSR controllers lock up after setting this type of * filter, so avoid sending the request altogether. */ - if (test_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL)) return 0; /* Always clear event filter when starting */ diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1485b455ade4..63dba0503653 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -464,7 +464,7 @@ static int read_index_list(struct sock *sk, struct hci_dev *hdev, void *data, /* Devices marked as raw-only are neither configured * nor unconfigured controllers. */ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE)) continue; if (!hci_dev_test_flag(d, HCI_UNCONFIGURED)) { @@ -522,7 +522,7 @@ static int read_unconf_index_list(struct sock *sk, struct hci_dev *hdev, /* Devices marked as raw-only are neither configured * nor unconfigured controllers. */ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE)) continue; if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) { @@ -576,7 +576,7 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, /* Devices marked as raw-only are neither configured * nor unconfigured controllers. 
*/ - if (test_bit(HCI_QUIRK_RAW_DEVICE, &d->quirks)) + if (hci_test_quirk(d, HCI_QUIRK_RAW_DEVICE)) continue; if (hci_dev_test_flag(d, HCI_UNCONFIGURED)) @@ -612,12 +612,12 @@ static int read_ext_index_list(struct sock *sk, struct hci_dev *hdev, static bool is_configured(struct hci_dev *hdev) { - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) && !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) return false; - if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || - test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) && + if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) || + hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) && !bacmp(&hdev->public_addr, BDADDR_ANY)) return false; @@ -628,12 +628,12 @@ static __le32 get_missing_options(struct hci_dev *hdev) { u32 options = 0; - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) && + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) && !hci_dev_test_flag(hdev, HCI_EXT_CONFIGURED)) options |= MGMT_OPTION_EXTERNAL_CONFIG; - if ((test_bit(HCI_QUIRK_INVALID_BDADDR, &hdev->quirks) || - test_bit(HCI_QUIRK_USE_BDADDR_PROPERTY, &hdev->quirks)) && + if ((hci_test_quirk(hdev, HCI_QUIRK_INVALID_BDADDR) || + hci_test_quirk(hdev, HCI_QUIRK_USE_BDADDR_PROPERTY)) && !bacmp(&hdev->public_addr, BDADDR_ANY)) options |= MGMT_OPTION_PUBLIC_ADDRESS; @@ -669,7 +669,7 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev, memset(&rp, 0, sizeof(rp)); rp.manufacturer = cpu_to_le16(hdev->manufacturer); - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG)) options |= MGMT_OPTION_EXTERNAL_CONFIG; if (hdev->set_bdaddr) @@ -828,8 +828,7 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (lmp_sc_capable(hdev)) settings |= MGMT_SETTING_SECURE_CONN; - if (test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, - &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED)) 
settings |= MGMT_SETTING_WIDEBAND_SPEECH; } @@ -841,8 +840,7 @@ static u32 get_supported_settings(struct hci_dev *hdev) settings |= MGMT_SETTING_ADVERTISING; } - if (test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks) || - hdev->set_bdaddr) + if (hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG) || hdev->set_bdaddr) settings |= MGMT_SETTING_CONFIGURATION; if (cis_central_capable(hdev)) @@ -4307,7 +4305,7 @@ static int set_wideband_speech(struct sock *sk, struct hci_dev *hdev, bt_dev_dbg(hdev, "sock %p", sk); - if (!test_bit(HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED, &hdev->quirks)) + if (!hci_test_quirk(hdev, HCI_QUIRK_WIDEBAND_SPEECH_SUPPORTED)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_WIDEBAND_SPEECH, MGMT_STATUS_NOT_SUPPORTED); @@ -7935,7 +7933,7 @@ static int set_external_config(struct sock *sk, struct hci_dev *hdev, return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_STATUS_INVALID_PARAMS); - if (!test_bit(HCI_QUIRK_EXTERNAL_CONFIG, &hdev->quirks)) + if (!hci_test_quirk(hdev, HCI_QUIRK_EXTERNAL_CONFIG)) return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_EXTERNAL_CONFIG, MGMT_STATUS_NOT_SUPPORTED); @@ -9338,7 +9336,7 @@ void mgmt_index_added(struct hci_dev *hdev) { struct mgmt_ev_ext_index ev; - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) return; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { @@ -9362,7 +9360,7 @@ void mgmt_index_removed(struct hci_dev *hdev) struct mgmt_ev_ext_index ev; struct cmd_lookup match = { NULL, hdev, MGMT_STATUS_INVALID_INDEX }; - if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + if (hci_test_quirk(hdev, HCI_QUIRK_RAW_DEVICE)) return; mgmt_pending_foreach(0, hdev, true, cmd_complete_rsp, &match); @@ -10089,7 +10087,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, if (hdev->discovery.rssi != HCI_RSSI_INVALID && (rssi == HCI_RSSI_INVALID || (rssi < hdev->discovery.rssi && - !test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)))) 
+ !hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER)))) return false; if (hdev->discovery.uuid_count != 0) { @@ -10107,7 +10105,7 @@ static bool is_filter_match(struct hci_dev *hdev, s8 rssi, u8 *eir, /* If duplicate filtering does not report RSSI changes, then restart * scanning to ensure updated result with updated RSSI values. */ - if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks)) { + if (hci_test_quirk(hdev, HCI_QUIRK_STRICT_DUPLICATE_FILTER)) { /* Validate RSSI value against the RSSI threshold once more. */ if (hdev->discovery.rssi != HCI_RSSI_INVALID && rssi < hdev->discovery.rssi) diff --git a/net/bluetooth/msft.c b/net/bluetooth/msft.c index 5a8ccc491b14..c560d8467669 100644 --- a/net/bluetooth/msft.c +++ b/net/bluetooth/msft.c @@ -989,7 +989,7 @@ static void msft_monitor_device_evt(struct hci_dev *hdev, struct sk_buff *skb) handle_data = msft_find_handle_data(hdev, ev->monitor_handle, false); - if (!test_bit(HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER, &hdev->quirks)) { + if (!hci_test_quirk(hdev, HCI_QUIRK_USE_MSFT_EXT_ADDRESS_FILTER)) { if (!handle_data) return; mgmt_handle = handle_data->mgmt_handle; -- cgit v1.2.3 From 2d72afb340657f03f7261e9243b44457a9228ac7 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 16 Jul 2025 20:39:14 +0200 Subject: netfilter: nf_conntrack: fix crash due to removal of uninitialised entry A crash in conntrack was reported while trying to unlink the conntrack entry from the hash bucket list: [exception RIP: __nf_ct_delete_from_lists+172] [..] #7 [ff539b5a2b043aa0] nf_ct_delete at ffffffffc124d421 [nf_conntrack] #8 [ff539b5a2b043ad0] nf_ct_gc_expired at ffffffffc124d999 [nf_conntrack] #9 [ff539b5a2b043ae0] __nf_conntrack_find_get at ffffffffc124efbc [nf_conntrack] [..] The nf_conn struct is marked as allocated from slab but appears to be in a partially initialised state: ct hlist pointer is garbage; looks like the ct hash value (hence crash). 
ct->status is equal to IPS_CONFIRMED|IPS_DYING, which is expected; ct->timeout is 30000 (=30s), which is unexpected. Everything else looks like a normal udp conntrack entry. If we ignore ct->status and pretend it's 0, the entry matches those that are newly allocated but not yet inserted into the hash: - ct hlist pointers are overloaded and store/cache the raw tuple hash - ct->timeout matches the relative time expected for a new udp flow rather than the absolute 'jiffies' value. If it were not for the presence of IPS_CONFIRMED, __nf_conntrack_find_get() would have skipped the entry. Theory is that we hit the following race (three CPUs: x, y, z): [cpu x] found entry E; E is expired; nf_ct_delete(); return E to rcu slab. [cpu y] found entry E. [cpu z] init_conntrack: E is re-inited, ct->status set to 0, reply tuplehash hnnode.pprev stores hash value. cpu y found E right before it was deleted on cpu x. E is now re-inited on cpu z. cpu y was preempted before checking for expiry and/or confirm bit. [cpu z] ->refcnt set to 1; E now owned by skb; ->timeout set to 30000. If cpu y were to resume now, it would observe E as expired but would skip E due to missing CONFIRMED bit. [cpu z] nf_conntrack_confirm gets called, sets: ct->status |= CONFIRMED. This is wrong: E is not yet added to hashtable. cpu y resumes, it observes E as expired but CONFIRMED: [cpu y] nf_ct_expired() -> yes (ct->timeout is 30s); confirmed bit set. cpu y will try to delete E from the hashtable: [cpu y] nf_ct_delete() -> set DYING bit; __nf_ct_delete_from_lists. Even this scenario doesn't guarantee a crash: cpu z still holds the table bucket lock(s) so y blocks: [cpu y] wait for spinlock held by z. [cpu z] CONFIRMED is set but there is no guarantee ct will be added to hash: "chaintoolong" or "clash resolution" logic both skip the insert step. reply hnnode.pprev still stores the hash value. [cpu z] unlocks spinlock; return NF_DROP. In case CPU z does insert the entry into the hashtable, cpu y will unlink E again right away but no crash occurs.
Without 'cpu y' race, 'garbage' hlist is of no consequence: ct refcnt remains at 1, eventually skb will be free'd and E gets destroyed via: nf_conntrack_put -> nf_conntrack_destroy -> nf_ct_destroy. To resolve this, move the IPS_CONFIRMED assignment after the table insertion but before the unlock. Pablo points out that the confirm-bit-store could be reordered to happen before hlist add resp. the timeout fixup, so switch to set_bit and before_atomic memory barrier to prevent this. It doesn't matter if other CPUs can observe a newly inserted entry right before the CONFIRMED bit was set: Such event cannot be distinguished from above "E is the old incarnation" case: the entry will be skipped. Also change nf_ct_should_gc() to first check the confirmed bit. The gc sequence is: 1. Check if entry has expired, if not skip to next entry 2. Obtain a reference to the expired entry. 3. Call nf_ct_should_gc() to double-check step 1. nf_ct_should_gc() is thus called only for entries that already failed an expiry check. After this patch, once the confirmed bit check passes ct->timeout has been altered to reflect the absolute 'best before' date instead of a relative time. Step 3 will therefore not remove the entry. Without this change to nf_ct_should_gc() we could still get this sequence: 1. Check if entry has expired. 2. Obtain a reference. 3. Call nf_ct_should_gc() to double-check step 1: 4 - entry is still observed as expired 5 - meanwhile, ct->timeout is corrected to absolute value on other CPU and confirm bit gets set 6 - confirm bit is seen 7 - valid entry is removed again First do check 6), then 4) so the gc expiry check always picks up either confirmed bit unset (entry gets skipped) or expiry re-check failure for re-inited conntrack objects. This change cannot be backported to releases before 5.19. Without commit 8a75a2c17410 ("netfilter: conntrack: remove unconfirmed list") |= IPS_CONFIRMED line cannot be moved without further changes. 
Cc: Razvan Cojocaru Link: https://lore.kernel.org/netfilter-devel/20250627142758.25664-1-fw@strlen.de/ Link: https://lore.kernel.org/netfilter-devel/4239da15-83ff-4ca4-939d-faef283471bb@gmail.com/ Fixes: 1397af5bfd7d ("netfilter: conntrack: remove the percpu dying list") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 15 +++++++++++++-- net/netfilter/nf_conntrack_core.c | 26 ++++++++++++++++++++------ 2 files changed, 33 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 3f02a45773e8..ca26274196b9 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -306,8 +306,19 @@ static inline bool nf_ct_is_expired(const struct nf_conn *ct) /* use after obtaining a reference count */ static inline bool nf_ct_should_gc(const struct nf_conn *ct) { - return nf_ct_is_expired(ct) && nf_ct_is_confirmed(ct) && - !nf_ct_is_dying(ct); + if (!nf_ct_is_confirmed(ct)) + return false; + + /* load ct->timeout after is_confirmed() test. + * Pairs with __nf_conntrack_confirm() which: + * 1. Increases ct->timeout value + * 2. Inserts ct into rcu hlist + * 3. Sets the confirmed bit + * 4. Unlocks the hlist lock + */ + smp_acquire__after_ctrl_dep(); + + return nf_ct_is_expired(ct) && !nf_ct_is_dying(ct); } #define NF_CT_DAY (86400 * HZ) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 201d3c4ec623..e51f0b441109 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1124,6 +1124,12 @@ static int nf_ct_resolve_clash_harder(struct sk_buff *skb, u32 repl_idx) hlist_nulls_add_head_rcu(&loser_ct->tuplehash[IP_CT_DIR_REPLY].hnnode, &nf_conntrack_hash[repl_idx]); + /* confirmed bit must be set after hlist add, not before: + * loser_ct can still be visible to other cpu due to + * SLAB_TYPESAFE_BY_RCU. 
+ */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &loser_ct->status); NF_CT_STAT_INC(net, clash_resolve); return NF_ACCEPT; @@ -1260,8 +1266,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) * user context, else we insert an already 'dead' hash, blocking * further use of that particular connection -JM. */ - ct->status |= IPS_CONFIRMED; - if (unlikely(nf_ct_is_dying(ct))) { NF_CT_STAT_INC(net, insert_failed); goto dying; @@ -1293,7 +1297,7 @@ chaintoolong: } } - /* Timer relative to confirmation time, not original + /* Timeout is relative to confirmation time, not original setting time, otherwise we'd get timer wrap in weird delay cases. */ ct->timeout += nfct_time_stamp; @@ -1301,11 +1305,21 @@ chaintoolong: __nf_conntrack_insert_prepare(ct); /* Since the lookup is lockless, hash insertion must be done after - * starting the timer and setting the CONFIRMED bit. The RCU barriers - * guarantee that no other CPU can find the conntrack before the above - * stores are visible. + * setting ct->timeout. The RCU barriers guarantee that no other CPU + * can find the conntrack before the above stores are visible. */ __nf_conntrack_hash_insert(ct, hash, reply_hash); + + /* IPS_CONFIRMED unset means 'ct not (yet) in hash', conntrack lookups + * skip entries that lack this bit. This happens when a CPU is looking + * at a stale entry that is being recycled due to SLAB_TYPESAFE_BY_RCU + * or when another CPU encounters this entry right after the insertion + * but before the set-confirm-bit below. This bit must not be set until + * after __nf_conntrack_hash_insert(). + */ + smp_mb__before_atomic(); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); -- cgit v1.2.3 From e804bd83c1fd7e1f03899c948812ebc207ac5a7e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:10 +0000 Subject: neighbour: Split pneigh_lookup(). 
pneigh_lookup() has ASSERT_RTNL() in the middle of the function, which is confusing. When called with the last argument, creat, 0, pneigh_lookup() literally looks up a proxy neighbour entry. This is the case of the reader path as the fast path and RTM_GETNEIGH. pneigh_lookup(), however, creates a pneigh_entry when called with creat 1 from RTM_NEWNEIGH and SIOCSARP, which require RTNL. Let's split pneigh_lookup() into two functions. We will convert all the reader paths to RCU, and read_lock_bh(&tbl->lock) in the new pneigh_lookup() will be dropped. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 39 +++++++++++++++++++++++++++++---------- net/ipv4/arp.c | 4 ++-- net/ipv6/ip6_output.c | 2 +- net/ipv6/ndisc.c | 2 +- 5 files changed, 36 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7e865b14749d..7f3d57da5689 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -376,10 +376,11 @@ unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev, - int creat); + const void *key, struct net_device *dev); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, + const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ffb8d80328ed..d0e303360b2c 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -28,6 +28,7 @@ #include #include #include 
+#include #include #include #include @@ -746,24 +747,44 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, } EXPORT_SYMBOL_GPL(__pneigh_lookup); -struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev, int creat) +struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); + + read_lock_bh(&tbl->lock); + n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + net, pkey, key_len, dev); + read_unlock_bh(&tbl->lock); + + return n; +} +EXPORT_IPV6_MOD(pneigh_lookup); + +struct pneigh_entry *pneigh_create(struct neigh_table *tbl, + struct net *net, const void *pkey, + struct net_device *dev) { struct pneigh_entry *n; unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); + ASSERT_RTNL(); + read_lock_bh(&tbl->lock); n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); - if (n || !creat) + if (n) goto out; - ASSERT_RTNL(); - n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); if (!n) goto out; @@ -787,8 +808,6 @@ struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, out: return n; } -EXPORT_SYMBOL(pneigh_lookup); - int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) @@ -2007,7 +2026,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, } err = -ENOBUFS; - pn = pneigh_lookup(tbl, net, dst, dev, 1); + pn = pneigh_create(tbl, net, dst, dev); if (pn) { pn->flags = ndm_flags; pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); @@ -3036,7 +3055,7 @@ static int neigh_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, if (ndm->ndm_flags & NTF_PROXY) { struct pneigh_entry *pn; - pn = pneigh_lookup(tbl, net, dst, dev, 0); + pn = pneigh_lookup(tbl, net, dst, dev); 
if (!pn) { NL_SET_ERR_MSG(extack, "Proxy neighbour entry not found"); err = -ENOENT; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index c0440d61cf2f..d93b5735b0ba 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -864,7 +864,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && - pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { + pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -1089,7 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_lookup(&arp_tbl, net, &ip, dev, 1)) + if (!pneigh_create(&arp_tbl, net, &ip, dev)) return -ENOBUFS; return 0; } diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index fcc20c7250eb..0412f8544695 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -563,7 +563,7 @@ int ip6_forward(struct sk_buff *skb) /* XXX: idev->cnf.proxy_ndp? 
*/ if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { + pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { int proxied = ip6_forward_proxy_check(skb); if (proxied > 0) { /* It's tempting to decrease the hop limit diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d4c5876e1771..a3ac26c1df6d 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1100,7 +1100,7 @@ static enum skb_drop_reason ndisc_recv_na(struct sk_buff *skb) if (lladdr && !memcmp(lladdr, dev->dev_addr, dev->addr_len) && READ_ONCE(net->ipv6.devconf_all->forwarding) && READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && - pneigh_lookup(&nd_tbl, net, &msg->target, dev, 0)) { + pneigh_lookup(&nd_tbl, net, &msg->target, dev)) { /* XXX: idev->cnf.proxy_ndp */ goto out; } -- cgit v1.2.3 From d63382aea70aa4ecb516126e00930bc8ab5e55ef Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:11 +0000 Subject: neighbour: Annotate neigh_table.phash_buckets and pneigh_entry.next with __rcu. The next patch will free pneigh_entry with call_rcu(). Then, we need to annotate neigh_table.phash_buckets[] and pneigh_entry.next with __rcu. To make the next patch cleaner, let's annotate the fields in advance. Currently, all accesses to the fields are under the neigh table lock, so rcu_dereference_protected() is used with 1 for now, but most of them (except in pneigh_delete() and pneigh_ifdown_and_unlock()) will be replaced with rcu_dereference() and rcu_dereference_check(). Note that pneigh_ifdown_and_unlock() changes pneigh_entry.next to a local list, which is illegal because the RCU iterator could be moved to another list. This part will be fixed in the next patch. 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 4 ++-- net/core/neighbour.c | 52 +++++++++++++++++++++++++++++-------------------- 2 files changed, 33 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 7f3d57da5689..1ddc44a04200 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -176,7 +176,7 @@ struct neigh_ops { }; struct pneigh_entry { - struct pneigh_entry *next; + struct pneigh_entry __rcu *next; possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; @@ -236,7 +236,7 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; - struct pneigh_entry **phash_buckets; + struct pneigh_entry __rcu **phash_buckets; }; static inline int neigh_parms_family(struct neigh_parms *p) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d0e303360b2c..7fcb0a8d655f 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -731,7 +731,8 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, net_eq(pneigh_net(n), net) && (n->dev == dev || !n->dev)) return n; - n = n->next; + + n = rcu_dereference_protected(n->next, 1); } return NULL; } @@ -742,7 +743,7 @@ struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, unsigned int key_len = tbl->key_len; u32 hash_val = pneigh_hash(pkey, key_len); - return __pneigh_lookup_1(tbl->phash_buckets[hash_val], + return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); } EXPORT_SYMBOL_GPL(__pneigh_lookup); @@ -759,7 +760,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, hash_val = pneigh_hash(pkey, key_len); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + n = 
__pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); @@ -778,7 +779,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, ASSERT_RTNL(); read_lock_bh(&tbl->lock); - n = __pneigh_lookup_1(tbl->phash_buckets[hash_val], + n = __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), net, pkey, key_len, dev); read_unlock_bh(&tbl->lock); @@ -803,7 +804,7 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; - tbl->phash_buckets[hash_val] = n; + rcu_assign_pointer(tbl->phash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); out: return n; @@ -812,16 +813,20 @@ out: int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { - struct pneigh_entry *n, **np; - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); + struct pneigh_entry *n, __rcu **np; + unsigned int key_len; + u32 hash_val; + + key_len = tbl->key_len; + hash_val = pneigh_hash(pkey, key_len); write_lock_bh(&tbl->lock); - for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + for (np = &tbl->phash_buckets[hash_val]; + (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { - *np = n->next; + rcu_assign_pointer(*np, n->next); write_unlock_bh(&tbl->lock); if (tbl->pdestructor) tbl->pdestructor(n); @@ -838,17 +843,17 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - struct pneigh_entry *n, **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np, *freelist = NULL; u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, 1)) != NULL) { if (skip_perm && n->permanent) goto skip; if (!dev || n->dev == dev) { 
- *np = n->next; - n->next = freelist; + rcu_assign_pointer(*np, n->next); + rcu_assign_pointer(n->next, freelist); freelist = n; continue; } @@ -858,7 +863,7 @@ skip: } write_unlock_bh(&tbl->lock); while ((n = freelist)) { - freelist = n->next; + freelist = rcu_dereference_protected(n->next, 1); n->next = NULL; if (tbl->pdestructor) tbl->pdestructor(n); @@ -2794,7 +2799,9 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, for (h = s_h; h <= PNEIGH_HASHMASK; h++) { if (h > s_h) s_idx = 0; - for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference_protected(tbl->phash_buckets[h], 1), idx = 0; + n; + n = rcu_dereference_protected(n->next, 1)) { if (idx < s_idx || pneigh_net(n) != net) goto next; if (neigh_ifindex_filtered(n->dev, filter->dev_idx) || @@ -3288,9 +3295,10 @@ static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) state->flags |= NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { - pn = tbl->phash_buckets[bucket]; + pn = rcu_dereference_protected(tbl->phash_buckets[bucket], 1); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); if (pn) break; } @@ -3308,15 +3316,17 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, struct neigh_table *tbl = state->tbl; do { - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); } while (pn && !net_eq(pneigh_net(pn), net)); while (!pn) { if (++state->bucket > PNEIGH_HASHMASK) break; - pn = tbl->phash_buckets[state->bucket]; + + pn = rcu_dereference_protected(tbl->phash_buckets[state->bucket], 1); + while (pn && !net_eq(pneigh_net(pn), net)) - pn = pn->next; + pn = rcu_dereference_protected(pn->next, 1); if (pn) break; } -- cgit v1.2.3 From d539d8fbd8fcf64a1492c51f5ee99aaa8a8dc9ab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:12 +0000 Subject: neighbour: Free pneigh_entry after RCU grace period. 
We will convert RTM_GETNEIGH to RCU. neigh_get() looks up pneigh_entry by pneigh_lookup() and passes it to pneigh_fill_info(). Then, we must ensure that the entry is alive till pneigh_fill_info() completes, but read_lock_bh(&tbl->lock) in pneigh_lookup() does not guarantee that. Also, we will convert all readers of tbl->phash_buckets[] to RCU. Let's use call_rcu() to free pneigh_entry and update phash_buckets[] and ->next by rcu_assign_pointer(). pneigh_ifdown_and_unlock() uses list_head to avoid overwriting ->next and moving RCU iterators to another list. pndisc_destructor() (only IPv6 ndisc uses this) uses a mutex, so it is not delayed to call_rcu(), where we cannot sleep. This is fine because the mcast code works with RCU and ipv6_dev_mc_dec() frees mcast objects after RCU grace period. While at it, we change the return type of pneigh_ifdown_and_unlock() to void. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-8-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 4 ++++ net/core/neighbour.c | 45 ++++++++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 1ddc44a04200..6d7f9aa53a7a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -180,6 +180,10 @@ struct pneigh_entry { possible_net_t net; struct net_device *dev; netdevice_tracker dev_tracker; + union { + struct list_head free_node; + struct rcu_head rcu; + }; u32 flags; u8 protocol; bool permanent; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7fcb0a8d655f..fa2e60a479ef 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,9 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static int 
pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -810,6 +810,14 @@ out: return n; } +static void pneigh_destroy(struct rcu_head *rcu) +{ + struct pneigh_entry *n = container_of(rcu, struct pneigh_entry, rcu); + + netdev_put(n->dev, &n->dev_tracker); + kfree(n); +} + int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) { @@ -828,10 +836,11 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); return 0; } } @@ -839,11 +848,12 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return -ENOENT; } -static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, + struct net_device *dev, + bool skip_perm) { - struct pneigh_entry *n, __rcu **np, *freelist = NULL; + struct pneigh_entry *n, __rcu **np; + LIST_HEAD(head); u32 h; for (h = 0; h <= PNEIGH_HASHMASK; h++) { @@ -853,24 +863,25 @@ static int pneigh_ifdown_and_unlock(struct neigh_table *tbl, goto skip; if (!dev || n->dev == dev) { rcu_assign_pointer(*np, n->next); - rcu_assign_pointer(n->next, freelist); - freelist = n; + list_add(&n->free_node, &head); continue; } skip: np = &n->next; } } + write_unlock_bh(&tbl->lock); - while ((n = freelist)) { - freelist = rcu_dereference_protected(n->next, 1); - n->next = NULL; + + while (!list_empty(&head)) { + n = list_first_entry(&head, typeof(*n), free_node); + list_del(&n->free_node); + if (tbl->pdestructor) 
tbl->pdestructor(n); - netdev_put(n->dev, &n->dev_tracker); - kfree(n); + + call_rcu(&n->rcu, pneigh_destroy); } - return -ENOENT; } static inline void neigh_parms_put(struct neigh_parms *parms) -- cgit v1.2.3 From dd103c9a53752d3754a3182ec8dd97885680cfe2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:17 +0000 Subject: neighbour: Remove __pneigh_lookup(). __pneigh_lookup() is the lockless version of pneigh_lookup(), but its only caller pndisc_is_router() holds the table lock and reads pneigh_entry.flags. This is because accessing pneigh_entry after pneigh_lookup() was illegal unless the caller holds RTNL or the table lock. Now, pneigh_entry is guaranteed to be alive during the RCU critical section. Let's call pneigh_lookup() and use READ_ONCE() for n->flags in pndisc_is_router() and remove __pneigh_lookup(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-13-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 2 -- net/core/neighbour.c | 11 ----------- net/ipv6/ndisc.c | 6 ++---- 3 files changed, 2 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 6d7f9aa53a7a..f8c7261cd4eb 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -381,8 +381,6 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b76ff416b9a7..e7bd8111f97f 100644 ---
a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -737,17 +737,6 @@ static struct pneigh_entry *__pneigh_lookup_1(struct pneigh_entry *n, return NULL; } -struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, - struct net *net, const void *pkey, struct net_device *dev) -{ - unsigned int key_len = tbl->key_len; - u32 hash_val = pneigh_hash(pkey, key_len); - - return __pneigh_lookup_1(rcu_dereference_protected(tbl->phash_buckets[hash_val], 1), - net, pkey, key_len, dev); -} -EXPORT_SYMBOL_GPL(__pneigh_lookup); - struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *pkey, struct net_device *dev) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a3ac26c1df6d..7d5abb3158ec 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -768,11 +768,9 @@ static int pndisc_is_router(const void *pkey, struct pneigh_entry *n; int ret = -1; - read_lock_bh(&nd_tbl.lock); - n = __pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); + n = pneigh_lookup(&nd_tbl, dev_net(dev), pkey, dev); if (n) - ret = !!(n->flags & NTF_ROUTER); - read_unlock_bh(&nd_tbl.lock); + ret = !!(READ_ONCE(n->flags) & NTF_ROUTER); return ret; } -- cgit v1.2.3 From 13a936bb99fb6385dc8620d24d7111e514448371 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:19 +0000 Subject: neighbour: Protect tbl->phash_buckets[] with a dedicated mutex. tbl->phash_buckets[] is only modified in the slow path by pneigh_create() and pneigh_delete() under the table lock. Both of them are called under RTNL, so no extra lock is needed, but we will remove RTNL from the paths. pneigh_create() looks up a pneigh_entry, and this part can be lockless, but it would complicate the logic like 1. lookup 2. allocate pneigh_entry for GFP_KERNEL 3. lookup again but under lock 4. if found, return it after freeing the allocated memory 5. else, return the new one Instead, let's add a per-table mutex and run lookup and allocation under it.
Note that updating pneigh_entry part in neigh_add() is still protected by RTNL and will be moved to pneigh_create() in the next patch. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-15-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 1 + net/core/neighbour.c | 39 +++++++++++++++++++++------------------ 2 files changed, 22 insertions(+), 18 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f8c7261cd4eb..f333f9ebc425 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -240,6 +240,7 @@ struct neigh_table { unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; + struct mutex phash_lock; struct pneigh_entry __rcu **phash_buckets; }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 38f0067068c5..d312b6323ff2 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -54,9 +54,8 @@ static void neigh_timer_handler(struct timer_list *t); static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm); +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm); #ifdef CONFIG_PROC_FS static const struct seq_operations neigh_stat_seq_ops; @@ -437,7 +436,9 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, { write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); - pneigh_ifdown_and_unlock(tbl, dev, skip_perm); + write_unlock_bh(&tbl->lock); + + pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL, tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) @@ -731,7 +732,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); n = rcu_dereference_check(tbl->phash_buckets[hash_val], - lockdep_is_held(&tbl->lock)); + lockdep_is_held(&tbl->phash_lock)); while (n) { if (!memcmp(n->key, pkey, key_len) && @@ -739,7 +740,7 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, (n->dev == dev || !n->dev)) return n; - n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->lock)); + n = rcu_dereference_check(n->next, lockdep_is_held(&tbl->phash_lock)); } return NULL; @@ -754,11 +755,9 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, unsigned int key_len; u32 hash_val; - ASSERT_RTNL(); + mutex_lock(&tbl->phash_lock); - read_lock_bh(&tbl->lock); n = pneigh_lookup(tbl, net, pkey, dev); - read_unlock_bh(&tbl->lock); if (n) goto out; @@ -780,11 +779,10 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, } hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); - write_unlock_bh(&tbl->lock); out: + mutex_unlock(&tbl->phash_lock); return n; } @@ -806,14 +804,16 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, key_len = tbl->key_len; hash_val = pneigh_hash(pkey, key_len); - write_lock_bh(&tbl->lock); + mutex_lock(&tbl->phash_lock); + for (np = &tbl->phash_buckets[hash_val]; (n = rcu_dereference_protected(*np, 1)) != NULL; np = &n->next) { if (!memcmp(n->key, pkey, key_len) && n->dev == dev && net_eq(pneigh_net(n), net)) { rcu_assign_pointer(*np, n->next); - write_unlock_bh(&tbl->lock); + + mutex_unlock(&tbl->phash_lock); if (tbl->pdestructor) tbl->pdestructor(n); @@ -822,18 +822,20 @@ int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *pkey, return 0; } } - write_unlock_bh(&tbl->lock); + + 
mutex_unlock(&tbl->phash_lock); return -ENOENT; } -static void pneigh_ifdown_and_unlock(struct neigh_table *tbl, - struct net_device *dev, - bool skip_perm) +static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, + bool skip_perm) { struct pneigh_entry *n, __rcu **np; LIST_HEAD(head); u32 h; + mutex_lock(&tbl->phash_lock); + for (h = 0; h <= PNEIGH_HASHMASK; h++) { np = &tbl->phash_buckets[h]; while ((n = rcu_dereference_protected(*np, 1)) != NULL) { @@ -849,7 +851,7 @@ skip: } } - write_unlock_bh(&tbl->lock); + mutex_unlock(&tbl->phash_lock); while (!list_empty(&head)) { n = list_first_entry(&head, typeof(*n), free_node); @@ -1796,6 +1798,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); rwlock_init(&tbl->lock); + mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, -- cgit v1.2.3 From dc2a27e524ac13e7a599bc693934ed81f868dc2d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 16 Jul 2025 22:08:20 +0000 Subject: neighbour: Update pneigh_entry in pneigh_create(). neigh_add() updates pneigh_entry() found or created by pneigh_create(). This update is serialised by RTNL, but we will remove it. Let's move the update part to pneigh_create() and make it return errno instead of a pointer of pneigh_entry. Now, the pneigh code is RTNL free. 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250716221221.442239-16-kuniyu@google.com Signed-off-by: Jakub Kicinski --- include/net/neighbour.h | 5 +++-- net/core/neighbour.c | 34 ++++++++++++++++------------------ net/ipv4/arp.c | 4 +--- 3 files changed, 20 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index f333f9ebc425..4a30bd458c5a 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -382,8 +382,9 @@ void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, struct net *net, - const void *key, struct net_device *dev); +int pneigh_create(struct neigh_table *tbl, struct net *net, const void *key, + struct net_device *dev, u32 flags, u8 protocol, + bool permanent); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d312b6323ff2..4316ca3d9872 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -747,24 +747,27 @@ struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, } EXPORT_IPV6_MOD(pneigh_lookup); -struct pneigh_entry *pneigh_create(struct neigh_table *tbl, - struct net *net, const void *pkey, - struct net_device *dev) +int pneigh_create(struct neigh_table *tbl, struct net *net, + const void *pkey, struct net_device *dev, + u32 flags, u8 protocol, bool permanent) { struct pneigh_entry *n; unsigned int key_len; u32 hash_val; + int err = 0; mutex_lock(&tbl->phash_lock); n = pneigh_lookup(tbl, net, pkey, dev); if (n) - goto out; + goto update; key_len = tbl->key_len; n = kzalloc(sizeof(*n) + key_len, GFP_KERNEL); - if (!n) + if (!n) { + err = -ENOBUFS; goto out; + } write_pnet(&n->net, net); memcpy(n->key, pkey, 
key_len); @@ -774,16 +777,20 @@ struct pneigh_entry *pneigh_create(struct neigh_table *tbl, if (tbl->pconstructor && tbl->pconstructor(n)) { netdev_put(dev, &n->dev_tracker); kfree(n); - n = NULL; + err = -ENOBUFS; goto out; } hash_val = pneigh_hash(pkey, key_len); n->next = tbl->phash_buckets[hash_val]; rcu_assign_pointer(tbl->phash_buckets[hash_val], n); +update: + WRITE_ONCE(n->flags, flags); + n->permanent = permanent; + WRITE_ONCE(n->protocol, protocol); out: mutex_unlock(&tbl->phash_lock); - return n; + return err; } static void pneigh_destroy(struct rcu_head *rcu) @@ -2015,22 +2022,13 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[NDA_PROTOCOL]) protocol = nla_get_u8(tb[NDA_PROTOCOL]); if (ndm_flags & NTF_PROXY) { - struct pneigh_entry *pn; - if (ndm_flags & (NTF_MANAGED | NTF_EXT_VALIDATED)) { NL_SET_ERR_MSG(extack, "Invalid NTF_* flag combination"); goto out; } - err = -ENOBUFS; - pn = pneigh_create(tbl, net, dst, dev); - if (pn) { - WRITE_ONCE(pn->flags, ndm_flags); - pn->permanent = !!(ndm->ndm_state & NUD_PERMANENT); - if (protocol) - WRITE_ONCE(pn->protocol, protocol); - err = 0; - } + err = pneigh_create(tbl, net, dst, dev, ndm_flags, protocol, + !!(ndm->ndm_state & NUD_PERMANENT)); goto out; } diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d93b5735b0ba..5cfc1c939673 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -1089,9 +1089,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; - if (!pneigh_create(&arp_tbl, net, &ip, dev)) - return -ENOBUFS; - return 0; + return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); -- cgit v1.2.3 From 6624a0af82a6e3a4d3609264ef591a8fa3467139 Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 17 Jul 2025 17:42:02 +1000 Subject: wifi: cfg80211: support configuring an S1G short beaconing BSS S1G short beacons are an optional frame type used in an 
S1G BSS that contain a limited set of elements. While they are optional, they are a fundamental part of S1G that enables significant power saving. Expose 2 additional netlink attributes, NL80211_ATTR_S1G_LONG_BEACON_PERIOD which denotes the number of beacon intervals between each long beacon and NL80211_ATTR_S1G_SHORT_BEACON which is a nested attribute containing the short beacon tail and head. We split them as the long beacon period cannot be updated, and is only used when initialising the interface, whereas the short beacon data can be used to both initialise and update the templates. This follows how things such as the beacon interval and DTIM period currently operate. During the initialisation path, we ensure we have the long beacon period if the short beacon data is being passed down, whereas the update path will simply update the template if it's sent down. The short beacon data is validated using the same routines for regular beacons as they support correctly parsing the short beacon format while ensuring the frame is well-formed. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250717074205.312577-2-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 23 ++++++++++++++ include/uapi/linux/nl80211.h | 39 ++++++++++++++++++++++++ net/wireless/nl80211.c | 72 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 17f2a665dce6..44a1055a81ba 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1423,6 +1423,23 @@ struct cfg80211_unsol_bcast_probe_resp { const u8 *tmpl; }; +/** + * struct cfg80211_s1g_short_beacon - S1G short beacon data. + * + * @update: Set to true if the feature configuration should be updated. + * @short_head: Short beacon head. + * @short_tail: Short beacon tail. + * @short_head_len: Short beacon head len. + * @short_tail_len: Short beacon tail len.
+ */ +struct cfg80211_s1g_short_beacon { + bool update; + const u8 *short_head; + const u8 *short_tail; + size_t short_head_len; + size_t short_tail_len; +}; + /** * struct cfg80211_ap_settings - AP configuration * @@ -1463,6 +1480,8 @@ struct cfg80211_unsol_bcast_probe_resp { * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters * @mbssid_config: AP settings for multiple bssid + * @s1g_long_beacon_period: S1G long beacon period + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -1496,6 +1515,8 @@ struct cfg80211_ap_settings { struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; struct cfg80211_mbssid_config mbssid_config; + u8 s1g_long_beacon_period; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; @@ -1507,11 +1528,13 @@ struct cfg80211_ap_settings { * @beacon: beacon data * @fils_discovery: FILS discovery transmission parameters * @unsol_bcast_probe_resp: Unsolicited broadcast probe response parameters + * @s1g_short_beacon: S1G short beacon data */ struct cfg80211_ap_update { struct cfg80211_beacon_data beacon; struct cfg80211_fils_discovery fils_discovery; struct cfg80211_unsol_bcast_probe_resp unsol_bcast_probe_resp; + struct cfg80211_s1g_short_beacon s1g_short_beacon; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 39460334dafb..d1a14f2892d9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2915,6 +2915,19 @@ enum nl80211_commands { * applicable to that specific radio only. If the radio id is greater * thank the number of radios, error denoting invalid value is returned. * + * @NL80211_ATTR_S1G_LONG_BEACON_PERIOD: (u8) Integer attribute that represents + * the number of beacon intervals between each long beacon transmission + * for an S1G BSS with short beaconing enabled. 
This is a required + * attribute for initialising an S1G short beaconing BSS. When updating + * the short beacon data, this is not required. It has a minimum value of + * 2 (i.e 2 beacon intervals). + * + * @NL80211_ATTR_S1G_SHORT_BEACON: Nested attribute containing the short beacon + * head and tail used to set or update the short beacon templates. When + * bringing up a new interface, %NL80211_ATTR_S1G_LONG_BEACON_PERIOD is + * required alongside this attribute. Refer to + * @enum nl80211_s1g_short_beacon_attrs for the attribute definitions. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3474,6 +3487,9 @@ enum nl80211_attrs { NL80211_ATTR_WIPHY_RADIO_INDEX, + NL80211_ATTR_S1G_LONG_BEACON_PERIOD, + NL80211_ATTR_S1G_SHORT_BEACON, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -8148,4 +8164,27 @@ enum nl80211_wiphy_radio_freq_range { NL80211_WIPHY_RADIO_FREQ_ATTR_MAX = __NL80211_WIPHY_RADIO_FREQ_ATTR_LAST - 1, }; +/** + * enum nl80211_s1g_short_beacon_attrs - S1G short beacon data + * + * @__NL80211_S1G_SHORT_BEACON_ATTR_INVALID: Invalid + * + * @NL80211_S1G_SHORT_BEACON_ATTR_HEAD: Short beacon head (binary). + * @NL80211_S1G_SHORT_BEACON_ATTR_TAIL: Short beacon tail (binary). 
+ * + * @__NL80211_S1G_SHORT_BEACON_ATTR_LAST: Internal + * @NL80211_S1G_SHORT_BEACON_ATTR_MAX: Highest attribute + */ +enum nl80211_s1g_short_beacon_attrs { + __NL80211_S1G_SHORT_BEACON_ATTR_INVALID, + + NL80211_S1G_SHORT_BEACON_ATTR_HEAD, + NL80211_S1G_SHORT_BEACON_ATTR_TAIL, + + /* keep last */ + __NL80211_S1G_SHORT_BEACON_ATTR_LAST, + NL80211_S1G_SHORT_BEACON_ATTR_MAX = + __NL80211_S1G_SHORT_BEACON_ATTR_LAST - 1 +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 20bc0f052c16..1c808b08b747 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -482,6 +482,16 @@ nl80211_sta_wme_policy[NL80211_STA_WME_MAX + 1] = { [NL80211_STA_WME_MAX_SP] = { .type = NLA_U8 }, }; +static const struct nla_policy +nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { + [NL80211_S1G_SHORT_BEACON_ATTR_HEAD] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_beacon_head, + IEEE80211_MAX_DATA_LEN), + [NL80211_S1G_SHORT_BEACON_ATTR_TAIL] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -858,6 +868,9 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_EPCS] = { .type = NLA_FLAG }, [NL80211_ATTR_ASSOC_MLD_EXT_CAPA_OPS] = { .type = NLA_U16 }, [NL80211_ATTR_WIPHY_RADIO_INDEX] = { .type = NLA_U8 }, + [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), + [NL80211_ATTR_S1G_SHORT_BEACON] = + NLA_POLICY_NESTED(nl80211_s1g_short_beacon), }; /* policy for the key attributes */ @@ -6202,6 +6215,41 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params return 0; } +static int +nl80211_parse_s1g_short_beacon(struct cfg80211_registered_device *rdev, + struct nlattr *attrs, + struct cfg80211_s1g_short_beacon *sb) +{ + struct nlattr *tb[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1]; + int ret; + + if 
(!rdev->wiphy.bands[NL80211_BAND_S1GHZ]) + return -EINVAL; + + ret = nla_parse_nested(tb, NL80211_S1G_SHORT_BEACON_ATTR_MAX, attrs, + NULL, NULL); + if (ret) + return ret; + + /* Short beacon tail is optional (i.e might only include the TIM) */ + if (!tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]) + return -EINVAL; + + sb->short_head = nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]); + sb->short_head_len = nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_HEAD]); + sb->short_tail_len = 0; + + if (tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]) { + sb->short_tail = + nla_data(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]); + sb->short_tail_len = + nla_len(tb[NL80211_S1G_SHORT_BEACON_ATTR_TAIL]); + } + + sb->update = true; + return 0; +} + static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6442,6 +6490,22 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) goto out; } + if (info->attrs[NL80211_ATTR_S1G_SHORT_BEACON]) { + if (!info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]) { + err = -EINVAL; + goto out; + } + + params->s1g_long_beacon_period = nla_get_u8( + info->attrs[NL80211_ATTR_S1G_LONG_BEACON_PERIOD]); + + err = nl80211_parse_s1g_short_beacon( + rdev, info->attrs[NL80211_ATTR_S1G_SHORT_BEACON], + &params->s1g_short_beacon); + if (err) + goto out; + } + err = nl80211_calculate_ap_params(params); if (err) goto out; @@ -6550,6 +6614,14 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) goto out; } + attr = info->attrs[NL80211_ATTR_S1G_SHORT_BEACON]; + if (attr) { + err = nl80211_parse_s1g_short_beacon(rdev, attr, + &params->s1g_short_beacon); + if (err) + goto out; + } + err = rdev_change_beacon(rdev, dev, params); out: -- cgit v1.2.3 From bbf93a06d73505591db3a93797f44b9c44555d9b Mon Sep 17 00:00:00 2001 From: Lachlan Hodges Date: Thu, 17 Jul 2025 17:42:03 +1000 Subject: wifi: mac80211: support initialising an S1G short beaconing BSS Introduce the ability to
parse the short beacon data and long beacon period. The long beacon period represents the number of beacon intervals between each long beacon transmission. Additionally, as a BSS cannot change its configuration such that short beaconing is dynamically disabled/enabled without tearing down the interface - we ensure we have an existing short beacon before performing the update. Signed-off-by: Lachlan Hodges Link: https://patch.msgid.link/20250717074205.312577-3-lachlan.hodges@morsemicro.com Signed-off-by: Johannes Berg --- include/net/mac80211.h | 4 +++ net/mac80211/cfg.c | 66 ++++++++++++++++++++++++++++++++++++++++++++-- net/mac80211/ieee80211_i.h | 9 +++++++ 3 files changed, 77 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 577fd6a8c372..a2dbaad2f6d3 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -758,6 +758,8 @@ struct ieee80211_parsed_tpe { * be updated to 1, even if bss_param_ch_cnt didn't change. This allows * the link to know that it heard the latest value from its own beacon * (as opposed to hearing its value from another link's beacon). + * @s1g_long_beacon_period: number of beacon intervals between each long + * beacon transmission. 
*/ struct ieee80211_bss_conf { struct ieee80211_vif *vif; @@ -857,6 +859,8 @@ struct ieee80211_bss_conf { u8 bss_param_ch_cnt; u8 bss_param_ch_cnt_link_id; + + u8 s1g_long_beacon_period; }; /** diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index b99e39cb808b..2f97e2d5bb8b 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1071,6 +1071,47 @@ ieee80211_set_unsol_bcast_probe_resp(struct ieee80211_sub_if_data *sdata, return 0; } +static int +ieee80211_set_s1g_short_beacon(struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_data *link, + struct cfg80211_s1g_short_beacon *params) +{ + struct s1g_short_beacon_data *new; + struct s1g_short_beacon_data *old = + sdata_dereference(link->u.ap.s1g_short_beacon, sdata); + size_t new_len = + sizeof(*new) + params->short_head_len + params->short_tail_len; + + if (!params->update) + return 0; + + if (!params->short_head) + return -EINVAL; + + new = kzalloc(new_len, GFP_KERNEL); + if (!new) + return -ENOMEM; + + /* Memory layout: | struct | head | tail | */ + new->short_head = (u8 *)new + sizeof(*new); + new->short_head_len = params->short_head_len; + memcpy(new->short_head, params->short_head, params->short_head_len); + + if (params->short_tail) { + new->short_tail = new->short_head + params->short_head_len; + new->short_tail_len = params->short_tail_len; + memcpy(new->short_tail, params->short_tail, + params->short_tail_len); + } + + rcu_assign_pointer(link->u.ap.s1g_short_beacon, new); + + if (old) + kfree_rcu(old, rcu_head); + + return 0; +} + static int ieee80211_set_ftm_responder_params( struct ieee80211_sub_if_data *sdata, const u8 *lci, size_t lci_len, @@ -1493,8 +1534,8 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->twt_responder = params->twt_responder; link_conf->he_obss_pd = params->he_obss_pd; link_conf->he_bss_color = params->beacon.he_bss_color; - sdata->vif.cfg.s1g = params->chandef.chan->band == - NL80211_BAND_S1GHZ; + 
link_conf->s1g_long_beacon_period = params->s1g_long_beacon_period; + sdata->vif.cfg.s1g = params->chandef.chan->band == NL80211_BAND_S1GHZ; sdata->vif.cfg.ssid_len = params->ssid_len; if (params->ssid_len) @@ -1541,6 +1582,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, if (err < 0) goto error; + if (sdata->vif.cfg.s1g) { + err = ieee80211_set_s1g_short_beacon(sdata, link, + &params->s1g_short_beacon); + if (err < 0) + goto error; + } + err = drv_start_ap(sdata->local, sdata, link_conf); if (err) { old = sdata_dereference(link->u.ap.beacon, sdata); @@ -1619,6 +1667,13 @@ static int ieee80211_change_beacon(struct wiphy *wiphy, struct net_device *dev, if (err < 0) return err; + if (link->u.ap.s1g_short_beacon) { + err = ieee80211_set_s1g_short_beacon(sdata, link, + &params->s1g_short_beacon); + if (err < 0) + return err; + } + if (beacon->he_bss_color_valid && beacon->he_bss_color.enabled != link_conf->he_bss_color.enabled) { link_conf->he_bss_color.enabled = beacon->he_bss_color.enabled; @@ -1650,6 +1705,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, struct probe_resp *old_probe_resp; struct fils_discovery_data *old_fils_discovery; struct unsol_bcast_probe_resp_data *old_unsol_bcast_probe_resp; + struct s1g_short_beacon_data *old_s1g_short_beacon; struct cfg80211_chan_def chandef; struct ieee80211_link_data *link = sdata_dereference(sdata->link[link_id], sdata); @@ -1668,6 +1724,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, old_unsol_bcast_probe_resp = sdata_dereference(link->u.ap.unsol_bcast_probe_resp, sdata); + old_s1g_short_beacon = + sdata_dereference(link->u.ap.s1g_short_beacon, sdata); /* abort any running channel switch or color change */ link_conf->csa_active = false; @@ -1690,6 +1748,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, RCU_INIT_POINTER(link->u.ap.probe_resp, NULL); RCU_INIT_POINTER(link->u.ap.fils_discovery, NULL); 
RCU_INIT_POINTER(link->u.ap.unsol_bcast_probe_resp, NULL); + RCU_INIT_POINTER(link->u.ap.s1g_short_beacon, NULL); kfree_rcu(old_beacon, rcu_head); if (old_probe_resp) kfree_rcu(old_probe_resp, rcu_head); @@ -1697,6 +1756,8 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, kfree_rcu(old_fils_discovery, rcu_head); if (old_unsol_bcast_probe_resp) kfree_rcu(old_unsol_bcast_probe_resp, rcu_head); + if (old_s1g_short_beacon) + kfree_rcu(old_s1g_short_beacon, rcu_head); kfree(link_conf->ftmr_params); link_conf->ftmr_params = NULL; @@ -1720,6 +1781,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->enable_beacon = false; sdata->beacon_rate_set = false; sdata->vif.cfg.ssid_len = 0; + sdata->vif.cfg.s1g = false; clear_bit(SDATA_STATE_OFFCHANNEL_BEACON_STOPPED, &sdata->state); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_BEACON_ENABLED); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9c0603eb580f..61cd1cc098ac 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -296,6 +296,14 @@ struct unsol_bcast_probe_resp_data { u8 data[]; }; +struct s1g_short_beacon_data { + struct rcu_head rcu_head; + u8 *short_head; + u8 *short_tail; + int short_head_len; + int short_tail_len; +}; + struct ps_data { /* yes, this looks ugly, but guarantees that we can later use * bitmap_empty :) @@ -1042,6 +1050,7 @@ struct ieee80211_link_data_ap { struct probe_resp __rcu *probe_resp; struct fils_discovery_data __rcu *fils_discovery; struct unsol_bcast_probe_resp_data __rcu *unsol_bcast_probe_resp; + struct s1g_short_beacon_data __rcu *s1g_short_beacon; /* to be used after channel switch. 
*/ struct cfg80211_beacon_data *next_beacon; -- cgit v1.2.3 From a6f190630d070173897a7e98a30188b7638ba0a1 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Wed, 16 Jul 2025 18:26:53 +0200 Subject: net: track pfmemalloc drops via SKB_DROP_REASON_PFMEMALLOC Add a new SKB drop reason (SKB_DROP_REASON_PFMEMALLOC) to track packets dropped due to memory pressure. In production environments, we've observed memory exhaustion reported by memory layer stack traces, but these drops were not properly tracked in the SKB drop reason infrastructure. While most network code paths now properly report pfmemalloc drops, some protocol-specific socket implementations still use sk_filter() without drop reason tracking: - Bluetooth L2CAP sockets - CAIF sockets - IUCV sockets - Netlink sockets - SCTP sockets - Unix domain sockets These remaining cases represent less common paths and could be converted in a follow-up patch if needed. The current implementation provides significantly improved observability into memory pressure events in the network stack, especially for key protocols like TCP and UDP, helping to diagnose problems in production environments. 
Reported-by: Matt Fleming Signed-off-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/175268316579.2407873.11634752355644843509.stgit@firesoul Signed-off-by: Jakub Kicinski --- drivers/net/tun.c | 6 ++---- include/linux/filter.h | 14 ++++++++++++-- include/net/dropreason-core.h | 6 ++++++ include/net/tcp.h | 2 +- net/core/dev.c | 8 ++++++-- net/core/filter.c | 15 ++++++++++++--- net/core/sock.c | 20 +++++++++++++------- net/ipv4/tcp_ipv4.c | 26 +++++++++++++++----------- net/ipv4/udp.c | 6 ++---- net/ipv6/tcp_ipv6.c | 9 +++------ net/ipv6/udp.c | 4 +--- net/rose/rose_in.c | 3 ++- 12 files changed, 75 insertions(+), 44 deletions(-) (limited to 'include/net') diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 49bcd12a4ac8..e65228ba3fae 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1002,8 +1002,8 @@ static unsigned int run_ebpf_filter(struct tun_struct *tun, /* Net device start xmit */ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct tun_struct *tun = netdev_priv(dev); - enum skb_drop_reason drop_reason; int txq = skb->queue_mapping; struct netdev_queue *queue; struct tun_file *tfile; @@ -1032,10 +1032,8 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) } if (tfile->socket.sk->sk_filter && - sk_filter(tfile->socket.sk, skb)) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + sk_filter_reason(tfile->socket.sk, skb, &drop_reason)) goto drop; - } len = run_ebpf_filter(tun, skb, len); if (len == 0) { diff --git a/include/linux/filter.h b/include/linux/filter.h index f5cf4d35d83e..4e82332afe03 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1073,10 +1073,20 @@ bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) return set_memory_rox((unsigned long)hdr, hdr->size >> PAGE_SHIFT); } -int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); +int sk_filter_trim_cap(struct 
sock *sk, struct sk_buff *skb, unsigned int cap, + enum skb_drop_reason *reason); + static inline int sk_filter(struct sock *sk, struct sk_buff *skb) { - return sk_filter_trim_cap(sk, skb, 1); + enum skb_drop_reason ignore_reason; + + return sk_filter_trim_cap(sk, skb, 1, &ignore_reason); +} + +static inline int sk_filter_reason(struct sock *sk, struct sk_buff *skb, + enum skb_drop_reason *reason) +{ + return sk_filter_trim_cap(sk, skb, 1, reason); } struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index 229bb1826f2a..e19184dd1b0f 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -125,6 +125,7 @@ FN(CAN_RX_INVALID_FRAME) \ FN(CANFD_RX_INVALID_FRAME) \ FN(CANXL_RX_INVALID_FRAME) \ + FN(PFMEMALLOC) \ FNe(MAX) /** @@ -598,6 +599,11 @@ enum skb_drop_reason { * non conform CAN-XL frame (or device is unable to receive CAN frames) */ SKB_DROP_REASON_CANXL_RX_INVALID_FRAME, + /** + * @SKB_DROP_REASON_PFMEMALLOC: packet allocated from memory reserve + * reached a path or socket not eligible for use of memory reserves + */ + SKB_DROP_REASON_PFMEMALLOC, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/include/net/tcp.h b/include/net/tcp.h index bc08de49805c..b3815d104340 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1559,7 +1559,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); -int tcp_filter(struct sock *sk, struct sk_buff *skb); +int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); diff --git a/net/core/dev.c b/net/core/dev.c index 621a639aeba1..59a9089117de 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5749,6 +5749,7 @@ static inline 
int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc, struct packet_type **ppt_prev) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_UNHANDLED_PROTO; struct packet_type *ptype, *pt_prev; rx_handler_func_t *rx_handler; struct sk_buff *skb = *pskb; @@ -5840,8 +5841,10 @@ skip_taps: #endif skb_reset_redirect(skb); skip_classify: - if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) + if (pfmemalloc && !skb_pfmemalloc_protocol(skb)) { + drop_reason = SKB_DROP_REASON_PFMEMALLOC; goto drop; + } if (skb_vlan_tag_present(skb)) { if (pt_prev) { @@ -5946,7 +5949,8 @@ drop: dev_core_stats_rx_dropped_inc(skb->dev); else dev_core_stats_rx_nohandler_inc(skb->dev); - kfree_skb_reason(skb, SKB_DROP_REASON_UNHANDLED_PROTO); + + kfree_skb_reason(skb, drop_reason); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) */ diff --git a/net/core/filter.c b/net/core/filter.c index 7a72f766aacf..2eb8947d8097 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -122,6 +122,7 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * @sk: sock associated with &sk_buff * @skb: buffer to filter * @cap: limit on how short the eBPF program may trim the packet + * @reason: record drop reason on errors (negative return value) * * Run the eBPF program and then cut skb->data to correct size returned by * the program. If pkt_len is 0 we toss packet. If skb->len is smaller @@ -130,7 +131,8 @@ EXPORT_SYMBOL_GPL(copy_bpf_fprog_from_user); * be accepted or -EPERM if the packet should be tossed. 
* */ -int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) +int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, + unsigned int cap, enum skb_drop_reason *reason) { int err; struct sk_filter *filter; @@ -142,15 +144,20 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + *reason = SKB_DROP_REASON_PFMEMALLOC; return -ENOMEM; } err = BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SOCKET_FILTER; return err; + } err = security_sock_rcv_skb(sk, skb); - if (err) + if (err) { + *reason = SKB_DROP_REASON_SECURITY_HOOK; return err; + } rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); @@ -162,6 +169,8 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) pkt_len = bpf_prog_run_save_cb(filter->prog, skb); skb->sk = save_sk; err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; + if (err) + *reason = SKB_DROP_REASON_SOCKET_FILTER; } rcu_read_unlock(); diff --git a/net/core/sock.c b/net/core/sock.c index 8b7623c7d547..7c26ec8dce63 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -526,11 +526,10 @@ int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason drop_reason; int err; - err = sk_filter(sk, skb); - if (err) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + err = sk_filter_reason(sk, skb, &drop_reason); + if (err) goto out; - } + err = __sock_queue_rcv_skb(sk, skb); switch (err) { case -ENOMEM: @@ -553,15 +552,18 @@ EXPORT_SYMBOL(sock_queue_rcv_skb_reason); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted) { + enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; int rc = NET_RX_SUCCESS; + int err; - if (sk_filter_trim_cap(sk, skb, trim_cap)) + if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) goto discard_and_relse; skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { atomic_inc(&sk->sk_drops); + reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } if (nested) @@ -577,8 +579,12 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, rc = sk_backlog_rcv(sk, skb); mutex_release(&sk->sk_lock.dep_map, _RET_IP_); - } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) { + } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { bh_unlock_sock(sk); + if (err == -ENOMEM) + reason = SKB_DROP_REASON_PFMEMALLOC; + if (err == -ENOBUFS) + reason = SKB_DROP_REASON_SOCKET_BACKLOG; atomic_inc(&sk->sk_drops); goto discard_and_relse; } @@ -589,7 +595,7 @@ out: sock_put(sk); return rc; discard_and_relse: - kfree_skb(skb); + sk_skb_reason_drop(sk, skb, reason); goto out; } EXPORT_SYMBOL(__sk_receive_skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 16bf6fdff96b..84d3d556ed80 100644 --- a/net/ipv4/tcp_ipv4.c +++ 
b/net/ipv4/tcp_ipv4.c @@ -2026,6 +2026,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, u32 gso_size; u64 limit; int delta; + int err; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), * we can fix skb->truesize to its real value to avoid future drops. @@ -2136,21 +2137,27 @@ no_coalesce: limit = min_t(u64, limit, UINT_MAX); - if (unlikely(sk_add_backlog(sk, skb, limit))) { + err = sk_add_backlog(sk, skb, limit); + if (unlikely(err)) { bh_unlock_sock(sk); - *reason = SKB_DROP_REASON_SOCKET_BACKLOG; - __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + if (err == -ENOMEM) { + *reason = SKB_DROP_REASON_PFMEMALLOC; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_PFMEMALLOCDROP); + } else { + *reason = SKB_DROP_REASON_SOCKET_BACKLOG; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP); + } return true; } return false; } EXPORT_IPV6_MOD(tcp_add_backlog); -int tcp_filter(struct sock *sk, struct sk_buff *skb) +int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) { struct tcphdr *th = (struct tcphdr *)skb->data; - return sk_filter_trim_cap(sk, skb, th->doff * 4); + return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); } EXPORT_IPV6_MOD(tcp_filter); @@ -2277,14 +2284,12 @@ lookup: } refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) { + if (!tcp_filter(sk, skb, &drop_reason)) { th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); - } else { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); @@ -2340,10 +2345,9 @@ process: nf_reset_ct(skb); - if (tcp_filter(sk, skb)) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (tcp_filter(sk, skb, &drop_reason)) goto discard_and_relse; - } + th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); tcp_v4_fill_cb(skb, iph, th); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 49f43c54cfb0..cc3ce0f762ec 100644 --- 
a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2347,7 +2347,7 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) */ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { - int drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; + enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); @@ -2436,10 +2436,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr), &drop_reason)) goto drop; - } udp_csum_pull_header(skb); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8f2c3cba1f1f..7577e7eb2c97 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1834,14 +1834,12 @@ lookup: } refcounted = true; nsk = NULL; - if (!tcp_filter(sk, skb)) { + if (!tcp_filter(sk, skb, &drop_reason)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen, &drop_reason); - } else { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); @@ -1897,10 +1895,9 @@ process: nf_reset_ct(skb); - if (tcp_filter(sk, skb)) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (tcp_filter(sk, skb, &drop_reason)) goto discard_and_relse; - } + th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6bbdadbd5fec..6a68f77da44b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -894,10 +894,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { - drop_reason = SKB_DROP_REASON_SOCKET_FILTER; + if (sk_filter_trim_cap(sk, skb, sizeof(struct 
udphdr), &drop_reason)) goto drop; - } udp_csum_pull_header(skb); diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 4d67f36dce1b..3e99181e759f 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -101,6 +101,7 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety */ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) { + enum skb_drop_reason dr; /* ignored */ struct rose_sock *rose = rose_sk(sk); int queued = 0; @@ -162,7 +163,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety rose_frames_acked(sk, nr); if (ns == rose->vr) { rose_start_idletimer(sk); - if (sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN) == 0 && + if (!sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN, &dr) && __sock_queue_rcv_skb(sk, skb) == 0) { rose->vr = (rose->vr + 1) % ROSE_MODULUS; queued = 1; -- cgit v1.2.3 From 460114eae8284155b51f6e72ed26f627ee338a30 Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 21 Jul 2025 09:20:03 +0300 Subject: wifi: mac80211: remove ieee80211_remove_key It is no longer used, remove it. 
Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250721091956.e964ceacd85c.Idecab8ef161fa58e000b3969bc936399284b79f0@changeid Signed-off-by: Johannes Berg --- include/net/mac80211.h | 16 +--------------- net/mac80211/key.c | 27 +-------------------------- 2 files changed, 2 insertions(+), 41 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a2dbaad2f6d3..a0cf976a9117 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6032,18 +6032,6 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf, void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, int tid, struct ieee80211_key_seq *seq); -/** - * ieee80211_remove_key - remove the given key - * @keyconf: the parameter passed with the set key - * - * Context: Must be called with the wiphy mutex held. - * - * Remove the given key. If the key was uploaded to the hardware at the - * time this function is called, it is not deleted in the hardware but - * instead assumed to have been removed already. - */ -void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); - /** * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on @@ -6070,9 +6058,7 @@ void ieee80211_remove_key(struct ieee80211_key_conf *keyconf); * for the new key for each TID to set up sequence counters properly. * * IMPORTANT: If this replaces a key that is present in the hardware, - * then it will attempt to remove it during this call. In many cases - * this isn't what you want, so call ieee80211_remove_key() first for - * the key that's being replaced. + * then it will attempt to remove it during this call. 
*/ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 997892da8886..9d65013ddac7 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -6,7 +6,7 @@ * Copyright 2007-2008 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright 2018-2020, 2022-2024 Intel Corporation + * Copyright 2018-2020, 2022-2025 Intel Corporation */ #include @@ -1354,31 +1354,6 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, } EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq); -void ieee80211_remove_key(struct ieee80211_key_conf *keyconf) -{ - struct ieee80211_key *key; - - key = container_of(keyconf, struct ieee80211_key, conf); - - lockdep_assert_wiphy(key->local->hw.wiphy); - - /* - * if key was uploaded, we assume the driver will/has remove(d) - * it, so adjust bookkeeping accordingly - */ - if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) { - key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; - - if (!(key->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | - IEEE80211_KEY_FLAG_PUT_MIC_SPACE | - IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) - increment_tailroom_need_count(key->sdata); - } - - ieee80211_key_free(key, false); -} -EXPORT_SYMBOL_GPL(ieee80211_remove_key); - struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, struct ieee80211_key_conf *keyconf, -- cgit v1.2.3 From 84b62b72b4c759b51568e44b0e8dc80f4cb8a2b9 Mon Sep 17 00:00:00 2001 From: Michael-CY Lee Date: Mon, 21 Jul 2025 14:51:59 +0800 Subject: wifi: cfg80211/mac80211: report link ID for unexpected frames The upper layer may require the link ID to properly handle unexpected frames. For instance, if hostapd, operating as an AP MLD, receives a data frame from a non-associated STA, it must send deauthentication to the link on which the STA is operating. 
Signed-off-by: Michael-CY Lee Reviewed-by: Money Wang Link: https://patch.msgid.link/20250721065159.1740992-1-michael-cy.lee@mediatek.com [edit commit message] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 10 ++++++---- net/mac80211/rx.c | 10 +++++----- net/wireless/nl80211.c | 22 ++++++++++++---------- net/wireless/trace.h | 17 ++++++++++------- 4 files changed, 33 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 44a1055a81ba..406626ff6cc8 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -9048,6 +9048,7 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, /** * cfg80211_rx_spurious_frame - inform userspace about a spurious frame * @dev: The device the frame matched to + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @addr: the transmitter address * @gfp: context flags * @@ -9057,13 +9058,14 @@ void cfg80211_pmksa_candidate_notify(struct net_device *dev, int index, * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) */ -bool cfg80211_rx_spurious_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_rx_unexpected_4addr_frame - inform about unexpected WDS frame * @dev: The device the frame matched to * @addr: the transmitter address + * @link_id: the link the frame was received on, -1 if not applicable or unknown * @gfp: context flags * * This function is used in AP mode (only!) to inform userspace that @@ -9073,8 +9075,8 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev, * Return: %true if the frame was passed to userspace (or this failed * for a reason other than not having a subscription.) 
*/ -bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp); +bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp); /** * cfg80211_probe_status - notify userspace about probe status diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 576e399fc99c..4d4ff4d4917a 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1532,9 +1532,8 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) } if (rx->sdata->vif.type == NL80211_IFTYPE_AP && - cfg80211_rx_spurious_frame(rx->sdata->dev, - hdr->addr2, - GFP_ATOMIC)) + cfg80211_rx_spurious_frame(rx->sdata->dev, hdr->addr2, + rx->link_id, GFP_ATOMIC)) return RX_DROP_U_SPURIOUS; return RX_DROP; @@ -1872,7 +1871,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) if (!test_and_set_sta_flag(sta, WLAN_STA_4ADDR_EVENT)) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, sta->sta.addr, - GFP_ATOMIC); + rx->link_id, GFP_ATOMIC); return RX_DROP_U_UNEXPECTED_4ADDR_FRAME; } /* @@ -3191,7 +3190,8 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) if (rx->sta && !test_and_set_sta_flag(rx->sta, WLAN_STA_4ADDR_EVENT)) cfg80211_rx_unexpected_4addr_frame( - rx->sdata->dev, rx->sta->sta.addr, GFP_ATOMIC); + rx->sdata->dev, rx->sta->sta.addr, rx->link_id, + GFP_ATOMIC); return RX_DROP; } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1c808b08b747..b4bf44768dc8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -19755,7 +19755,7 @@ void cfg80211_conn_failed(struct net_device *dev, const u8 *mac_addr, EXPORT_SYMBOL(cfg80211_conn_failed); static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, - const u8 *addr, gfp_t gfp) + const u8 *addr, int link_id, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -19778,7 +19778,9 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, if 
(nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) || - nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr)) + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) || + (link_id >= 0 && + nla_put_u8(msg, NL80211_ATTR_MLO_LINK_ID, link_id))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -19790,13 +19792,13 @@ static bool __nl80211_unexpected_frame(struct net_device *dev, u8 cmd, return true; } -bool cfg80211_rx_spurious_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp) +bool cfg80211_rx_spurious_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; bool ret; - trace_cfg80211_rx_spurious_frame(dev, addr); + trace_cfg80211_rx_spurious_frame(dev, addr, link_id); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO)) { @@ -19804,19 +19806,19 @@ bool cfg80211_rx_spurious_frame(struct net_device *dev, return false; } ret = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_FRAME, - addr, gfp); + addr, link_id, gfp); trace_cfg80211_return_bool(ret); return ret; } EXPORT_SYMBOL(cfg80211_rx_spurious_frame); -bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, - const u8 *addr, gfp_t gfp) +bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, const u8 *addr, + int link_id, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; bool ret; - trace_cfg80211_rx_unexpected_4addr_frame(dev, addr); + trace_cfg80211_rx_unexpected_4addr_frame(dev, addr, link_id); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO && @@ -19826,7 +19828,7 @@ bool cfg80211_rx_unexpected_4addr_frame(struct net_device *dev, } ret = __nl80211_unexpected_frame(dev, NL80211_CMD_UNEXPECTED_4ADDR_FRAME, - addr, gfp); + addr, link_id, gfp); trace_cfg80211_return_bool(ret); return ret; } diff --git a/net/wireless/trace.h b/net/wireless/trace.h index a07d88d61bec..34c584a215e5 100644 --- 
a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3570,27 +3570,30 @@ TRACE_EVENT(cfg80211_cac_event, ); DECLARE_EVENT_CLASS(cfg80211_rx_evt, - TP_PROTO(struct net_device *netdev, const u8 *addr), - TP_ARGS(netdev, addr), + TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), + TP_ARGS(netdev, addr, link_id), TP_STRUCT__entry( NETDEV_ENTRY MAC_ENTRY(addr) + __field(int, link_id) ), TP_fast_assign( NETDEV_ASSIGN; MAC_ASSIGN(addr, addr); + __entry->link_id = link_id; ), - TP_printk(NETDEV_PR_FMT ", %pM", NETDEV_PR_ARG, __entry->addr) + TP_printk(NETDEV_PR_FMT ", %pM, link_id:%d", NETDEV_PR_ARG, + __entry->addr, __entry->link_id) ); DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_spurious_frame, - TP_PROTO(struct net_device *netdev, const u8 *addr), - TP_ARGS(netdev, addr) + TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), + TP_ARGS(netdev, addr, link_id) ); DEFINE_EVENT(cfg80211_rx_evt, cfg80211_rx_unexpected_4addr_frame, - TP_PROTO(struct net_device *netdev, const u8 *addr), - TP_ARGS(netdev, addr) + TP_PROTO(struct net_device *netdev, const u8 *addr, int link_id), + TP_ARGS(netdev, addr, link_id) ); TRACE_EVENT(cfg80211_ibss_joined, -- cgit v1.2.3 From 69fdb084355d6c0b353536024cc51aa5f7ffb62c Mon Sep 17 00:00:00 2001 From: Miri Korenblit Date: Mon, 21 Jul 2025 21:50:49 +0300 Subject: wifi: mac80211: don't require cipher and keylen in gtk rekey ieee80211_add_gtk_rekey receives a keyconf as an argument, and the cipher and keylen are taken from there to the new allocated key. But in rekey, both the cipher and the keylen should be the same as of the old key, so let ieee80211_add_gtk_rekey find those, so drivers won't have to fill it in. 
Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250721214922.3c5c023bfae9.Ie6594ae2b4b6d5b3d536e642b349046ebfce7a5d@changeid Signed-off-by: Johannes Berg --- drivers/net/wireless/intel/iwlwifi/mld/d3.c | 7 +++++- drivers/net/wireless/intel/iwlwifi/mvm/d3.c | 11 +++++++-- drivers/net/wireless/realtek/rtw89/wow.c | 7 ++++-- include/net/mac80211.h | 7 ++++-- net/mac80211/key.c | 36 ++++++++++++++++++++++++++--- 5 files changed, 58 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/intel/iwlwifi/mld/d3.c b/drivers/net/wireless/intel/iwlwifi/mld/d3.c index 26255246a320..ed0a0f76f1c5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/d3.c @@ -762,6 +762,7 @@ iwl_mld_add_mcast_rekey(struct ieee80211_vif *vif, .conf.keyidx = key_data->id, }; int link_id = vif->active_links ? __ffs(vif->active_links) : -1; + u8 key[WOWLAN_KEY_MAX_SIZE]; BUILD_BUG_ON(WLAN_KEY_LEN_CCMP != WLAN_KEY_LEN_GCMP); BUILD_BUG_ON(sizeof(conf.key) < WLAN_KEY_LEN_CCMP); @@ -803,7 +804,11 @@ iwl_mld_add_mcast_rekey(struct ieee80211_vif *vif, } memcpy(conf.conf.key, key_data->key, conf.conf.keylen); - key_config = ieee80211_gtk_rekey_add(vif, &conf.conf, link_id); + + memcpy(key, key_data->key, sizeof(key_data->key)); + + key_config = ieee80211_gtk_rekey_add(vif, key_data->id, key, + sizeof(key), link_id); if (IS_ERR(key_config)) return; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index ef9bab042902..997cdd76b13c 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -1954,6 +1954,7 @@ static bool iwl_mvm_gtk_rekey(struct iwl_wowlan_status_data *status, DEFINE_RAW_FLEX(struct ieee80211_key_conf, conf, key, WOWLAN_KEY_MAX_SIZE); int link_id = vif->active_links ? 
__ffs(vif->active_links) : -1; + u8 key_data[WOWLAN_KEY_MAX_SIZE]; conf->cipher = gtk_cipher; @@ -1988,8 +1989,10 @@ static bool iwl_mvm_gtk_rekey(struct iwl_wowlan_status_data *status, conf->cipher, conf->keyidx); memcpy(conf->key, status->gtk[i].key, sizeof(status->gtk[i].key)); + memcpy(key_data, status->gtk[i].key, sizeof(status->gtk[i].key)); - key = ieee80211_gtk_rekey_add(vif, conf, link_id); + key = ieee80211_gtk_rekey_add(vif, status->gtk[i].id, key_data, + sizeof(key_data), link_id); if (IS_ERR(key)) { /* FW may send also the old keys */ if (PTR_ERR(key) == -EALREADY) @@ -2021,6 +2024,7 @@ iwl_mvm_d3_igtk_bigtk_rekey_add(struct iwl_wowlan_status_data *status, struct ieee80211_key_conf *key_config; struct ieee80211_key_seq seq; int link_id = vif->active_links ? __ffs(vif->active_links) : -1; + u8 key[WOWLAN_KEY_MAX_SIZE]; s8 keyidx = key_data->id; conf->cipher = cipher; @@ -2050,7 +2054,10 @@ iwl_mvm_d3_igtk_bigtk_rekey_add(struct iwl_wowlan_status_data *status, BUILD_BUG_ON(WOWLAN_KEY_MAX_SIZE < sizeof(key_data->key)); memcpy(conf->key, key_data->key, conf->keylen); - key_config = ieee80211_gtk_rekey_add(vif, conf, link_id); + memcpy(key, key_data->key, sizeof(key_data->key)); + + key_config = ieee80211_gtk_rekey_add(vif, keyidx, key, sizeof(key), + link_id); if (IS_ERR(key_config)) { /* FW may send also the old keys */ return PTR_ERR(key_config) == -EALREADY; diff --git a/drivers/net/wireless/realtek/rtw89/wow.c b/drivers/net/wireless/realtek/rtw89/wow.c index 071c7577df52..5bb7c1a42f1d 100644 --- a/drivers/net/wireless/realtek/rtw89/wow.c +++ b/drivers/net/wireless/realtek/rtw89/wow.c @@ -619,9 +619,12 @@ static struct ieee80211_key_conf *rtw89_wow_gtk_rekey(struct rtw89_dev *rtwdev, flex_array_size(rekey_conf, key, cipher_info->len)); if (ieee80211_vif_is_mld(wow_vif)) - key = ieee80211_gtk_rekey_add(wow_vif, rekey_conf, rtwvif_link->link_id); + key = ieee80211_gtk_rekey_add(wow_vif, keyidx, gtk, + cipher_info->len, + rtwvif_link->link_id); else - key 
= ieee80211_gtk_rekey_add(wow_vif, rekey_conf, -1); + key = ieee80211_gtk_rekey_add(wow_vif, keyidx, gtk, + cipher_info->len, -1); kfree(rekey_conf); if (IS_ERR(key)) { diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a0cf976a9117..a45e4bee65d4 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -6035,7 +6035,10 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, /** * ieee80211_gtk_rekey_add - add a GTK key from rekeying during WoWLAN * @vif: the virtual interface to add the key on - * @keyconf: new key data + * @idx: the keyidx of the key + * @key_data: the key data + * @key_len: the key data. Might be bigger than the actual key length, + * but not smaller (for the driver convinence) * @link_id: the link id of the key or -1 for non-MLO * * When GTK rekeying was done while the system was suspended, (a) new @@ -6062,7 +6065,7 @@ void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf, */ struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf, + u8 idx, u8 *key_data, u8 key_len, int link_id); /** diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 9d65013ddac7..b14e9cd9713f 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -1356,11 +1356,12 @@ EXPORT_SYMBOL_GPL(ieee80211_set_key_rx_seq); struct ieee80211_key_conf * ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, - struct ieee80211_key_conf *keyconf, + u8 idx, u8 *key_data, u8 key_len, int link_id) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; + struct ieee80211_key *prev_key; struct ieee80211_key *key; int err; struct ieee80211_link_data *link_data = @@ -1376,8 +1377,37 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif, if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) return ERR_PTR(-EINVAL); - key = ieee80211_key_alloc(keyconf->cipher, keyconf->keyidx, - keyconf->keylen, keyconf->key, + if (WARN_ON(idx >= 
NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + + NUM_DEFAULT_BEACON_KEYS)) + return ERR_PTR(-EINVAL); + + prev_key = wiphy_dereference(local->hw.wiphy, + link_data->gtk[idx]); + if (!prev_key) { + if (idx < NUM_DEFAULT_KEYS) { + for (int i = 0; i < NUM_DEFAULT_KEYS; i++) { + if (i == idx) + continue; + prev_key = wiphy_dereference(local->hw.wiphy, + link_data->gtk[i]); + if (prev_key) + break; + } + } else { + /* For IGTK we have 4 and 5 and for BIGTK - 6 and 7 */ + prev_key = wiphy_dereference(local->hw.wiphy, + link_data->gtk[idx ^ 1]); + } + } + + if (WARN_ON(!prev_key)) + return ERR_PTR(-EINVAL); + + if (WARN_ON(key_len < prev_key->conf.keylen)) + return ERR_PTR(-EINVAL); + + key = ieee80211_key_alloc(prev_key->conf.cipher, idx, + prev_key->conf.keylen, key_data, 0, NULL); if (IS_ERR(key)) return ERR_CAST(key); -- cgit v1.2.3 From 70c672f933337fc1de2df8628567ee0a8146562b Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Fri, 20 Jun 2025 15:03:45 +0800 Subject: Bluetooth: Remove hci_conn_hash_lookup_state() Since commit 4aa42119d971 ("Bluetooth: Remove pending ACL connection attempts") this function is unused. 
Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index f79f59e67114..69f491399dac 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1420,26 +1420,6 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) return NULL; } -static inline struct hci_conn *hci_conn_hash_lookup_state(struct hci_dev *hdev, - __u8 type, __u16 state) -{ - struct hci_conn_hash *h = &hdev->conn_hash; - struct hci_conn *c; - - rcu_read_lock(); - - list_for_each_entry_rcu(c, &h->list, list) { - if (c->type == type && c->state == state) { - rcu_read_unlock(); - return c; - } - } - - rcu_read_unlock(); - - return NULL; -} - typedef void (*hci_conn_func_t)(struct hci_conn *conn, void *data); static inline void hci_conn_hash_list_state(struct hci_dev *hdev, hci_conn_func_t func, __u8 type, -- cgit v1.2.3 From b2a5f2e1c127cb431df22e114998ff72eb4578c8 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 1 Jul 2025 15:56:22 +0800 Subject: Bluetooth: hci_event: Add support for handling LE BIG Sync Lost event When the BIS source stops, the controller sends an LE BIG Sync Lost event (subevent 0x1E). Currently, this event is not handled, causing the BIS stream to remain active in BlueZ and preventing recovery. 
Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 6 ++++++ include/net/bluetooth/hci_core.h | 5 +++-- net/bluetooth/hci_conn.c | 3 ++- net/bluetooth/hci_event.c | 39 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 49 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index c79901f2dc2a..6213012610d7 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2851,6 +2851,12 @@ struct hci_evt_le_big_sync_estabilished { __le16 bis[]; } __packed; +#define HCI_EVT_LE_BIG_SYNC_LOST 0x1e +struct hci_evt_le_big_sync_lost { + __u8 handle; + __u8 reason; +} __packed; + #define HCI_EVT_LE_BIG_INFO_ADV_REPORT 0x22 struct hci_evt_le_big_info_adv_report { __le16 sync_handle; diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 69f491399dac..1ef9279cfd6f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1346,7 +1346,8 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, } static inline struct hci_conn * -hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) +hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state, + __u8 role) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; @@ -1354,7 +1355,7 @@ hci_conn_hash_lookup_big_state(struct hci_dev *hdev, __u8 handle, __u16 state) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != BIS_LINK || c->state != state) + if (c->type != BIS_LINK || c->state != state || c->role != role) continue; if (handle == c->iso_qos.bcast.big) { diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 4f379184df5b..f5cd935490ad 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -2146,7 +2146,8 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct hci_link *link; /* Look for 
any BIS that is open for rebinding */ - conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN); + conn = hci_conn_hash_lookup_big_state(hdev, qos->bcast.big, BT_OPEN, + HCI_ROLE_MASTER); if (conn) { memcpy(qos, &conn->iso_qos, sizeof(*qos)); conn->state = BT_CONNECTED; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index f668bde007d4..fca58984ee4e 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6876,7 +6876,8 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, /* Connect all BISes that are bound to the BIG */ while ((conn = hci_conn_hash_lookup_big_state(hdev, ev->handle, - BT_BOUND))) { + BT_BOUND, + HCI_ROLE_MASTER))) { if (ev->status) { hci_connect_cfm(conn, ev->status); hci_conn_del(conn); @@ -6992,6 +6993,37 @@ unlock: hci_dev_unlock(hdev); } +static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_evt_le_big_sync_lost *ev = data; + struct hci_conn *bis, *conn; + + bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); + + hci_dev_lock(hdev); + + /* Delete the pa sync connection */ + bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle); + if (bis) { + conn = hci_conn_hash_lookup_pa_sync_handle(hdev, + bis->sync_handle); + if (conn) + hci_conn_del(conn); + } + + /* Delete each bis connection */ + while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, + BT_CONNECTED, + HCI_ROLE_SLAVE))) { + clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); + hci_disconn_cfm(bis, ev->reason); + hci_conn_del(bis); + } + + hci_dev_unlock(hdev); +} + static void hci_le_big_info_adv_report_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -7115,6 +7147,11 @@ static const struct hci_le_ev { hci_le_big_sync_established_evt, sizeof(struct hci_evt_le_big_sync_estabilished), HCI_MAX_EVENT_SIZE), + /* [0x1e = HCI_EVT_LE_BIG_SYNC_LOST] */ + HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_LOST, + hci_le_big_sync_lost_evt, + sizeof(struct 
hci_evt_le_big_sync_lost), + HCI_MAX_EVENT_SIZE), /* [0x22 = HCI_EVT_LE_BIG_INFO_ADV_REPORT] */ HCI_LE_EV_VL(HCI_EVT_LE_BIG_INFO_ADV_REPORT, hci_le_big_info_adv_report_evt, -- cgit v1.2.3 From be31d11ec9144f7f8f7fcbf84ba6971b664683f3 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Tue, 1 Jul 2025 16:47:26 +0800 Subject: Bluetooth: Fix spelling mistakes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct the misspelling of “estabilished” in the code. Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 2 +- net/bluetooth/hci_event.c | 16 ++++++++-------- net/bluetooth/iso.c | 8 ++++---- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 6213012610d7..94f365b75166 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2837,7 +2837,7 @@ struct hci_evt_le_create_big_complete { } __packed; #define HCI_EVT_LE_BIG_SYNC_ESTABLISHED 0x1d -struct hci_evt_le_big_sync_estabilished { +struct hci_evt_le_big_sync_established { __u8 status; __u8 handle; __u8 latency[3]; diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fca58984ee4e..d44463e65194 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6350,8 +6350,8 @@ static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle) return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp); } -static void hci_le_pa_sync_estabilished_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) +static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) { struct hci_ev_le_pa_sync_established *ev = data; int mask = hdev->link_mode; @@ -6681,8 +6681,8 @@ unlock: hci_dev_unlock(hdev); } -static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) +static void hci_le_cis_established_evt(struct 
hci_dev *hdev, void *data, + struct sk_buff *skb) { struct hci_evt_le_cis_established *ev = data; struct hci_conn *conn; @@ -6910,7 +6910,7 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { - struct hci_evt_le_big_sync_estabilished *ev = data; + struct hci_evt_le_big_sync_established *ev = data; struct hci_conn *bis, *conn; int i; @@ -7121,7 +7121,7 @@ static const struct hci_le_ev { HCI_MAX_EVENT_SIZE), /* [0x0e = HCI_EV_LE_PA_SYNC_ESTABLISHED] */ HCI_LE_EV(HCI_EV_LE_PA_SYNC_ESTABLISHED, - hci_le_pa_sync_estabilished_evt, + hci_le_pa_sync_established_evt, sizeof(struct hci_ev_le_pa_sync_established)), /* [0x0f = HCI_EV_LE_PER_ADV_REPORT] */ HCI_LE_EV_VL(HCI_EV_LE_PER_ADV_REPORT, @@ -7132,7 +7132,7 @@ static const struct hci_le_ev { HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), /* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */ - HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_estabilished_evt, + HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt, sizeof(struct hci_evt_le_cis_established)), /* [0x1a = HCI_EVT_LE_CIS_REQ] */ HCI_LE_EV(HCI_EVT_LE_CIS_REQ, hci_le_cis_req_evt, @@ -7145,7 +7145,7 @@ static const struct hci_le_ev { /* [0x1d = HCI_EV_LE_BIG_SYNC_ESTABLISHED] */ HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_ESTABLISHED, hci_le_big_sync_established_evt, - sizeof(struct hci_evt_le_big_sync_estabilished), + sizeof(struct hci_evt_le_big_sync_established), HCI_MAX_EVENT_SIZE), /* [0x1e = HCI_EVT_LE_BIG_SYNC_LOST] */ HCI_LE_EV_VL(HCI_EVT_LE_BIG_SYNC_LOST, diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 34e89bb5f384..5e752950e266 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1891,7 +1891,7 @@ static void iso_sock_ready(struct sock *sk) static bool iso_match_big(struct sock *sk, void *data) { - struct hci_evt_le_big_sync_estabilished *ev = data; + 
struct hci_evt_le_big_sync_established *ev = data; return ev->handle == iso_pi(sk)->qos.bcast.big; } @@ -1912,7 +1912,7 @@ static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; struct sock *sk = conn->sk; - struct hci_ev_le_big_sync_estabilished *ev = NULL; + struct hci_ev_le_big_sync_established *ev = NULL; struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; @@ -2023,7 +2023,7 @@ static void iso_conn_ready(struct iso_conn *conn) hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); - if ((ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) || + if ((ev && ((struct hci_evt_le_big_sync_established *)ev)->status) || (ev2 && ev2->status)) { /* Trigger error signal on child socket */ sk->sk_err = ECONNREFUSED; @@ -2082,7 +2082,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) * proceed to establishing a BIG sync: * * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific - * SID to listen to and once sync is estabilished its handle needs to + * SID to listen to and once sync is established its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a -- cgit v1.2.3 From 7565bc56598c3d135318f1bd76a0178dd3ea918f Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 14 Jul 2025 19:40:37 +0300 Subject: Bluetooth: ISO: add socket option to report packet seqnum via CMSG User applications need a way to track which ISO interval a given SDU belongs to, to properly detect packet loss. All controllers do not set timestamps, and it's not guaranteed user application receives all packet reports (small socket buffer, or controller doesn't send all reports like Intel AX210 is doing). Add socket option BT_PKT_SEQNUM that enables reporting of received packet ISO sequence number in BT_SCM_PKT_SEQNUM CMSG. 
Use BT_PKT_SEQNUM == 22 for the socket option, as 21 was used earlier for a removed experimental feature that never got into mainline. Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/bluetooth.h | 11 ++++++++++- net/bluetooth/af_bluetooth.c | 7 +++++++ net/bluetooth/iso.c | 21 ++++++++++++++++++--- 3 files changed, 35 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h index 114299bd8b98..ada5b56a4413 100644 --- a/include/net/bluetooth/bluetooth.h +++ b/include/net/bluetooth/bluetooth.h @@ -244,6 +244,12 @@ struct bt_codecs { #define BT_ISO_BASE 20 +/* Socket option value 21 reserved */ + +#define BT_PKT_SEQNUM 22 + +#define BT_SCM_PKT_SEQNUM 0x05 + __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) @@ -391,7 +397,8 @@ struct bt_sock { enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, - BT_SK_PKT_STATUS + BT_SK_PKT_STATUS, + BT_SK_PKT_SEQNUM, }; struct bt_sock_list { @@ -475,6 +482,7 @@ struct bt_skb_cb { u8 pkt_type; u8 force_active; u16 expect; + u16 pkt_seqnum; u8 incoming:1; u8 pkt_status:2; union { @@ -488,6 +496,7 @@ struct bt_skb_cb { #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type #define hci_skb_pkt_status(skb) bt_cb((skb))->pkt_status +#define hci_skb_pkt_seqnum(skb) bt_cb((skb))->pkt_seqnum #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode #define hci_skb_event(skb) bt_cb((skb))->hci.req_event diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ee9bf84c88a7..2b94e2077203 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -364,6 +364,13 @@ int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_STATUS, sizeof(pkt_status), &pkt_status); } + + if (test_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags)) { + u16 pkt_seqnum = hci_skb_pkt_seqnum(skb); + + 
put_cmsg(msg, SOL_BLUETOOTH, BT_SCM_PKT_SEQNUM, + sizeof(pkt_seqnum), &pkt_seqnum); + } } skb_free_datagram(sk, skb); diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 5e752950e266..2f229f2077e8 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1687,6 +1687,17 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; + case BT_PKT_SEQNUM: + err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) + break; + + if (opt) + set_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags); + else + clear_bit(BT_SK_PKT_SEQNUM, &bt_sk(sk)->flags); + break; + case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2 && @@ -2278,7 +2289,7 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct iso_conn *conn = hcon->iso_data; - __u16 pb, ts, len; + __u16 pb, ts, len, sn; if (!conn) goto drop; @@ -2308,6 +2319,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; } + sn = __le16_to_cpu(hdr->sn); len = __le16_to_cpu(hdr->slen); } else { struct hci_iso_data_hdr *hdr; @@ -2318,18 +2330,20 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; } + sn = __le16_to_cpu(hdr->sn); len = __le16_to_cpu(hdr->slen); } flags = hci_iso_data_flags(len); len = hci_iso_data_len(len); - BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x", len, - skb->len, flags); + BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x sn %d", + len, skb->len, flags, sn); if (len == skb->len) { /* Complete frame received */ hci_skb_pkt_status(skb) = flags & 0x03; + hci_skb_pkt_seqnum(skb) = sn; iso_recv_frame(conn, skb); return; } @@ -2352,6 +2366,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) goto drop; hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; + hci_skb_pkt_seqnum(conn->rx_skb) = sn; 
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; -- cgit v1.2.3 From 2935e556850e9c94d7a00adf14d3cd7fe406ac03 Mon Sep 17 00:00:00 2001 From: Arseniy Krasnov Date: Wed, 16 Jul 2025 22:23:58 +0300 Subject: Bluetooth: hci_sync: fix double free in 'hci_discovery_filter_clear()' Function 'hci_discovery_filter_clear()' frees 'uuids' array and then sets it to NULL. There is a tiny chance of the following race: 'hci_cmd_sync_work()' 'update_passive_scan_sync()' 'hci_update_passive_scan_sync()' 'hci_discovery_filter_clear()' kfree(uuids); <-------------------------preempted--------------------------------> 'start_service_discovery()' 'hci_discovery_filter_clear()' kfree(uuids); // DOUBLE FREE <-------------------------preempted--------------------------------> uuids = NULL; To fix it let's add locking around 'kfree()' call and NULL pointer assignment. Otherwise the following backtrace fires: [ ] ------------[ cut here ]------------ [ ] kernel BUG at mm/slub.c:547! [ ] Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP [ ] CPU: 3 UID: 0 PID: 246 Comm: bluetoothd Tainted: G O 6.12.19-kernel #1 [ ] Tainted: [O]=OOT_MODULE [ ] pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ ] pc : __slab_free+0xf8/0x348 [ ] lr : __slab_free+0x48/0x348 ... 
[ ] Call trace: [ ] __slab_free+0xf8/0x348 [ ] kfree+0x164/0x27c [ ] start_service_discovery+0x1d0/0x2c0 [ ] hci_sock_sendmsg+0x518/0x924 [ ] __sock_sendmsg+0x54/0x60 [ ] sock_write_iter+0x98/0xf8 [ ] do_iter_readv_writev+0xe4/0x1c8 [ ] vfs_writev+0x128/0x2b0 [ ] do_writev+0xfc/0x118 [ ] __arm64_sys_writev+0x20/0x2c [ ] invoke_syscall+0x68/0xf0 [ ] el0_svc_common.constprop.0+0x40/0xe0 [ ] do_el0_svc+0x1c/0x28 [ ] el0_svc+0x30/0xd0 [ ] el0t_64_sync_handler+0x100/0x12c [ ] el0t_64_sync+0x194/0x198 [ ] Code: 8b0002e6 eb17031f 54fffbe1 d503201f (d4210000) [ ] ---[ end trace 0000000000000000 ]--- Fixes: ad383c2c65a5 ("Bluetooth: hci_sync: Enable advertising when LL privacy is enabled") Signed-off-by: Arseniy Krasnov Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1ef9279cfd6f..3728495f0819 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -94,6 +95,7 @@ struct discovery_state { u16 uuid_count; u8 (*uuids)[16]; unsigned long name_resolve_timeout; + spinlock_t lock; }; #define SUSPEND_NOTIFIER_TIMEOUT msecs_to_jiffies(2000) /* 2 seconds */ @@ -889,6 +891,7 @@ static inline void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, static inline void discovery_init(struct hci_dev *hdev) { + spin_lock_init(&hdev->discovery.lock); hdev->discovery.state = DISCOVERY_STOPPED; INIT_LIST_HEAD(&hdev->discovery.all); INIT_LIST_HEAD(&hdev->discovery.unknown); @@ -903,8 +906,11 @@ static inline void hci_discovery_filter_clear(struct hci_dev *hdev) hdev->discovery.report_invalid_rssi = true; hdev->discovery.rssi = HCI_RSSI_INVALID; hdev->discovery.uuid_count = 0; + + spin_lock(&hdev->discovery.lock); kfree(hdev->discovery.uuids); hdev->discovery.uuids = NULL; + spin_unlock(&hdev->discovery.lock); 
} bool hci_discovery_active(struct hci_dev *hdev); -- cgit v1.2.3 From 0cadf8534f2a727bc3a01e8c583b085d25963ee0 Mon Sep 17 00:00:00 2001 From: Chris Down Date: Mon, 21 Jul 2025 16:30:23 +0100 Subject: Bluetooth: hci_event: Mask data status from LE ext adv reports The Event_Type field in an LE Extended Advertising Report uses bits 5 and 6 for data status (e.g. truncation or fragmentation), not the PDU type itself. The ext_evt_type_to_legacy() function fails to mask these status bits before evaluation. This causes valid advertisements with status bits set (e.g. a truncated non-connectable advertisement, which ends up showing as PDU type 0x40) to be misclassified as unknown and subsequently dropped. This is okay for most checks which use bitwise AND on the relevant event type bits, but it doesn't work for non-connectable types, which are checked with '== LE_EXT_ADV_NON_CONN_IND' (that is, zero). In terms of behaviour, first the device sends a truncated report: > HCI Event: LE Meta Event (0x3e) plen 26 LE Extended Advertising Report (0x0d) Entry 0 Event type: 0x0040 Data status: Incomplete, data truncated, no more to come Address type: Random (0x01) Address: 1D:12:46:FA:F8:6E (Non-Resolvable) SID: 0x03 RSSI: -98 dBm (0x9e) Data length: 0x00 Then, a few seconds later, it sends the subsequent complete report: > HCI Event: LE Meta Event (0x3e) plen 122 LE Extended Advertising Report (0x0d) Entry 0 Event type: 0x0000 Data status: Complete Address type: Random (0x01) Address: 1D:12:46:FA:F8:6E (Non-Resolvable) SID: 0x03 RSSI: -97 dBm (0x9f) Data length: 0x60 Service Data: Google (0xfef3) Data[92]: ... These devices often send multiple truncated reports per second. This patch introduces a PDU type mask to ensure only the relevant bits are evaluated, allowing for the correct translation of all valid extended advertising packets. 
Fixes: b2cc9761f144 ("Bluetooth: Handle extended ADV PDU types") Signed-off-by: Chris Down Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 8 ++++++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 94f365b75166..e90a7b753926 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2634,6 +2634,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_DATA_STATUS_MASK 0x0060 #define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index c0eb03e5cbf8..b7b473473b70 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6239,6 +6239,11 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type) { + u16 pdu_type = evt_type & ~LE_EXT_ADV_DATA_STATUS_MASK; + + if (!pdu_type) + return LE_ADV_NONCONN_IND; + if (evt_type & LE_EXT_ADV_LEGACY_PDU) { switch (evt_type) { case LE_LEGACY_ADV_IND: @@ -6270,8 +6275,7 @@ static u8 ext_evt_type_to_legacy(struct hci_dev *hdev, u16 evt_type) if (evt_type & LE_EXT_ADV_SCAN_IND) return LE_ADV_SCAN_IND; - if (evt_type == LE_EXT_ADV_NON_CONN_IND || - evt_type & LE_EXT_ADV_DIRECT_IND) + if (evt_type & LE_EXT_ADV_DIRECT_IND) return LE_ADV_NONCONN_IND; invalid: -- cgit v1.2.3 From a7bcffc673de219af2698fbb90627016233de67b Mon Sep 17 00:00:00 2001 From: Yang Li Date: Thu, 10 Jul 2025 18:52:47 +0800 Subject: Bluetooth: Add PA_LINK to distinguish BIG sync and PA sync connections Currently, BIS_LINK is used for both BIG sync and PA sync connections, which makes it impossible to distinguish them when searching for a PA sync connection. 
Adding PA_LINK will make the distinction clearer and simplify future extensions for PA-related features. Signed-off-by: Yang Li Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + include/net/bluetooth/hci_core.h | 10 +++++++--- net/bluetooth/hci_conn.c | 14 +++++++++----- net/bluetooth/hci_core.c | 27 +++++++++++++++------------ net/bluetooth/hci_event.c | 7 ++++--- net/bluetooth/hci_sync.c | 10 +++++----- net/bluetooth/iso.c | 6 ++++-- net/bluetooth/mgmt.c | 1 + 8 files changed, 46 insertions(+), 30 deletions(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index e90a7b753926..df1847b74e55 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -562,6 +562,7 @@ enum { #define LE_LINK 0x80 #define CIS_LINK 0x82 #define BIS_LINK 0x83 +#define PA_LINK 0x84 #define INVALID_LINK 0xff /* LMP features */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 3728495f0819..4dc11c66f7b8 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1015,6 +1015,7 @@ static inline void hci_conn_hash_add(struct hci_dev *hdev, struct hci_conn *c) break; case CIS_LINK: case BIS_LINK: + case PA_LINK: h->iso_num++; break; } @@ -1042,6 +1043,7 @@ static inline void hci_conn_hash_del(struct hci_dev *hdev, struct hci_conn *c) break; case CIS_LINK: case BIS_LINK: + case PA_LINK: h->iso_num--; break; } @@ -1060,6 +1062,7 @@ static inline unsigned int hci_conn_num(struct hci_dev *hdev, __u8 type) return h->sco_num; case CIS_LINK: case BIS_LINK: + case PA_LINK: return h->iso_num; default: return 0; @@ -1142,7 +1145,7 @@ hci_conn_hash_lookup_create_pa_sync(struct hci_dev *hdev) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != BIS_LINK) + if (c->type != PA_LINK) continue; if (!test_bit(HCI_CONN_CREATE_PA_SYNC, &c->flags)) @@ -1337,7 +1340,7 @@ hci_conn_hash_lookup_big_sync_pend(struct hci_dev *hdev, 
rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != BIS_LINK) + if (c->type != PA_LINK) continue; if (handle == c->iso_qos.bcast.big && num_bis == c->num_bis) { @@ -1407,7 +1410,7 @@ hci_conn_hash_lookup_pa_sync_handle(struct hci_dev *hdev, __u16 sync_handle) rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if (c->type != BIS_LINK) + if (c->type != PA_LINK) continue; /* Ignore the listen hcon, we are looking @@ -2006,6 +2009,7 @@ static inline int hci_proto_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, case CIS_LINK: case BIS_LINK: + case PA_LINK: return iso_connect_ind(hdev, bdaddr, flags); default: diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index b2e09e7f70e1..7d1e79f69cd1 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -785,7 +785,7 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->sync_handle = conn->sync_handle; if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { - hci_conn_hash_list_flag(hdev, find_bis, BIS_LINK, + hci_conn_hash_list_flag(hdev, find_bis, PA_LINK, HCI_CONN_PA_SYNC, d); if (!d->count) @@ -914,6 +914,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t break; case CIS_LINK: case BIS_LINK: + case PA_LINK: if (hdev->iso_mtu) /* Dedicated ISO Buffer exists */ break; @@ -979,6 +980,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t break; case CIS_LINK: case BIS_LINK: + case PA_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); @@ -1033,7 +1035,6 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t } hci_conn_init_sysfs(conn); - return conn; } @@ -1077,6 +1078,7 @@ static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) break; case CIS_LINK: case BIS_LINK: + case PA_LINK: if ((conn->state != BT_CONNECTED && 
!test_bit(HCI_CONN_CREATE_CIS, &conn->flags)) || test_bit(HCI_CONN_BIG_CREATED, &conn->flags)) @@ -1152,7 +1154,8 @@ void hci_conn_del(struct hci_conn *conn) } else { /* Unacked ISO frames */ if (conn->type == CIS_LINK || - conn->type == BIS_LINK) { + conn->type == BIS_LINK || + conn->type == PA_LINK) { if (hdev->iso_pkts) hdev->iso_cnt += conn->sent; else if (hdev->le_pkts) @@ -2081,7 +2084,7 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid); - conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_SLAVE); + conn = hci_conn_add_unset(hdev, PA_LINK, dst, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; @@ -2246,7 +2249,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ @@ -2980,6 +2983,7 @@ void hci_conn_tx_queue(struct hci_conn *conn, struct sk_buff *skb) switch (conn->type) { case CIS_LINK: case BIS_LINK: + case PA_LINK: case ACL_LINK: case LE_LINK: break; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f2fbe9c8e1be..55e0722fd066 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2936,12 +2936,14 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ if (hci_conn_num(hdev, CIS_LINK) || - hci_conn_num(hdev, BIS_LINK)) { + hci_conn_num(hdev, BIS_LINK) || + hci_conn_num(hdev, PA_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); - if (type == CIS_LINK || type == BIS_LINK) + if (type == CIS_LINK || type == BIS_LINK || + type == PA_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; 
@@ -3396,6 +3398,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) break; case CIS_LINK: case BIS_LINK: + case PA_LINK: cnt = hdev->iso_mtu ? hdev->iso_cnt : hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; @@ -3409,7 +3412,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, - __u8 type2, int *quote) + int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; @@ -3421,7 +3424,7 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { - if ((c->type != type && c->type != type2) || + if (c->type != type || skb_queue_empty(&c->data_q)) continue; @@ -3625,7 +3628,7 @@ static void hci_sched_sco(struct hci_dev *hdev, __u8 type) else cnt = &hdev->sco_cnt; - while (*cnt && (conn = hci_low_sent(hdev, type, type, &quote))) { + while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); @@ -3744,8 +3747,8 @@ static void hci_sched_le(struct hci_dev *hdev) hci_prio_recalculate(hdev, LE_LINK); } -/* Schedule CIS */ -static void hci_sched_iso(struct hci_dev *hdev) +/* Schedule iso */ +static void hci_sched_iso(struct hci_dev *hdev, __u8 type) { struct hci_conn *conn; struct sk_buff *skb; @@ -3753,14 +3756,12 @@ static void hci_sched_iso(struct hci_dev *hdev) BT_DBG("%s", hdev->name); - if (!hci_conn_num(hdev, CIS_LINK) && - !hci_conn_num(hdev, BIS_LINK)) + if (!hci_conn_num(hdev, type)) return; cnt = hdev->iso_pkts ? &hdev->iso_cnt : hdev->le_pkts ?
&hdev->le_cnt : &hdev->acl_cnt; - while (*cnt && (conn = hci_low_sent(hdev, CIS_LINK, BIS_LINK, - &quote))) { + while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_conn_frame(hdev, conn, skb); @@ -3785,7 +3786,9 @@ static void hci_tx_work(struct work_struct *work) /* Schedule queues and send stuff to HCI driver */ hci_sched_sco(hdev, SCO_LINK); hci_sched_sco(hdev, ESCO_LINK); - hci_sched_iso(hdev); + hci_sched_iso(hdev, CIS_LINK); + hci_sched_iso(hdev, BIS_LINK); + hci_sched_iso(hdev, PA_LINK); hci_sched_acl(hdev); hci_sched_le(hdev); } diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index b7b473473b70..8aa5039b975a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4432,6 +4432,7 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, case CIS_LINK: case BIS_LINK: + case PA_LINK: if (hdev->iso_pkts) { hdev->iso_cnt += count; if (hdev->iso_cnt > hdev->iso_pkts) @@ -6381,7 +6382,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, conn->sync_handle = le16_to_cpu(ev->handle); conn->sid = HCI_SID_INVALID; - mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, BIS_LINK, + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK, &flags); if (!(mask & HCI_LM_ACCEPT)) { hci_le_pa_term_sync(hdev, ev->handle); @@ -6392,7 +6393,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, goto unlock; /* Add connection to indicate PA sync event */ - pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -6423,7 +6424,7 @@ static void hci_le_per_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); - mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, BIS_LINK, &flags); + mask |= hci_proto_connect_ind(hdev, BDADDR_ANY, PA_LINK, &flags); if (!(mask &
HCI_LM_ACCEPT)) goto unlock; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index e9df6502e58e..2b4f21fbf9c1 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2929,7 +2929,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (sent) { struct hci_conn *conn; - conn = hci_conn_hash_lookup_ba(hdev, BIS_LINK, + conn = hci_conn_hash_lookup_ba(hdev, PA_LINK, &sent->bdaddr); if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; @@ -5493,7 +5493,7 @@ static int hci_disconnect_sync(struct hci_dev *hdev, struct hci_conn *conn, { struct hci_cp_disconnect cp; - if (conn->type == BIS_LINK) { + if (conn->type == BIS_LINK || conn->type == PA_LINK) { /* This is a BIS connection, hci_conn_del will * do the necessary cleanup. */ @@ -5562,7 +5562,7 @@ static int hci_connect_cancel_sync(struct hci_dev *hdev, struct hci_conn *conn, return HCI_ERROR_LOCAL_HOST_TERM; } - if (conn->type == BIS_LINK) { + if (conn->type == BIS_LINK || conn->type == PA_LINK) { /* There is no way to cancel a BIS without terminating the BIG * which is done later on connection cleanup. 
*/ @@ -5627,7 +5627,7 @@ static int hci_reject_conn_sync(struct hci_dev *hdev, struct hci_conn *conn, if (conn->type == CIS_LINK) return hci_le_reject_cis_sync(hdev, conn, reason); - if (conn->type == BIS_LINK) + if (conn->type == BIS_LINK || conn->type == PA_LINK) return -EINVAL; if (conn->type == SCO_LINK || conn->type == ESCO_LINK) @@ -6994,7 +6994,7 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) goto unlock; /* Add connection to indicate PA sync error */ - pa_sync = hci_conn_add_unset(hdev, BIS_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 2f45e46a9b6a..7bd3aa0a6db9 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -2237,7 +2237,8 @@ done: static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { - if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) { + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK && + hcon->type != PA_LINK) { if (hcon->type != LE_LINK) return; @@ -2278,7 +2279,8 @@ static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { - if (hcon->type != CIS_LINK && hcon->type != BIS_LINK) + if (hcon->type != CIS_LINK && hcon->type != BIS_LINK && + hcon->type != PA_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 63dba0503653..1ce682038b51 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -3237,6 +3237,7 @@ static u8 link_to_bdaddr(u8 link_type, u8 addr_type) switch (link_type) { case CIS_LINK: case BIS_LINK: + case PA_LINK: case LE_LINK: switch (addr_type) { case ADDR_LE_DEV_PUBLIC: -- cgit v1.2.3 From f3d85c9ee51036ac7ed129ec16eef5df2192763e Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 21 Jul 2025 11:18:24 +0900 Subject: netmem: introduce struct netmem_desc mirroring struct page MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To simplify struct page, the page pool members of struct page should be moved to other, allowing these members to be removed from struct page. Introduce a network memory descriptor to store the members, struct netmem_desc, and make it union'ed with the existing fields in struct net_iov, allowing to organize the fields of struct net_iov. Signed-off-by: Byungchul Park Reviewed-by: Toke Høiland-Jørgensen Reviewed-by: Pavel Begunkov Reviewed-by: Mina Almasry Reviewed-by: Vlastimil Babka Acked-by: Harry Yoo Link: https://patch.msgid.link/20250721021835.63939-2-byungchul@sk.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 116 +++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 95 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/netmem.h b/include/net/netmem.h index de1d95f04076..535cf17b9134 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -12,6 +12,50 @@ #include #include +/* These fields in struct page are used by the page_pool and net stack: + * + * struct { + * unsigned long pp_magic; + * struct page_pool *pp; + * unsigned long _pp_mapping_pad; + * unsigned long dma_addr; + * atomic_long_t pp_ref_count; + * }; + * + * We mirror the page_pool fields here so the page_pool can access these + * fields without worrying whether the underlying fields belong to a + * page or netmem_desc. + * + * CAUTION: Do not update the fields in netmem_desc without also + * updating the anonymous aliasing union in struct net_iov. 
+ */ +struct netmem_desc { + unsigned long _flags; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + atomic_long_t pp_ref_count; +}; + +#define NETMEM_DESC_ASSERT_OFFSET(pg, desc) \ + static_assert(offsetof(struct page, pg) == \ + offsetof(struct netmem_desc, desc)) +NETMEM_DESC_ASSERT_OFFSET(flags, _flags); +NETMEM_DESC_ASSERT_OFFSET(pp_magic, pp_magic); +NETMEM_DESC_ASSERT_OFFSET(pp, pp); +NETMEM_DESC_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); +NETMEM_DESC_ASSERT_OFFSET(dma_addr, dma_addr); +NETMEM_DESC_ASSERT_OFFSET(pp_ref_count, pp_ref_count); +#undef NETMEM_DESC_ASSERT_OFFSET + +/* + * Since struct netmem_desc uses the space in struct page, the size + * should be checked, until struct netmem_desc has its own instance from + * slab, to avoid conflicting with other members within struct page. + */ +static_assert(sizeof(struct netmem_desc) <= offsetof(struct page, _refcount)); + /* net_iov */ DECLARE_STATIC_KEY_FALSE(page_pool_mem_providers); @@ -30,13 +74,48 @@ enum net_iov_type { NET_IOV_MAX = ULONG_MAX }; +/* A memory descriptor representing abstract networking I/O vectors, + * generally for non-pages memory that doesn't have its corresponding + * struct page and needs to be explicitly allocated through slab. + * + * net_iovs are allocated and used by networking code, and the size of + * the chunk is PAGE_SIZE. + * + * This memory can be any form of non-struct paged memory. Examples + * include imported dmabuf memory and imported io_uring memory. See + * net_iov_type for all the supported types. + * + * @pp_magic: pp field, similar to the one in struct page/struct + * netmem_desc. + * @pp: the pp this net_iov belongs to, if any. + * @dma_addr: the dma addrs of the net_iov. Needed for the network + * card to send/receive this net_iov. + * @pp_ref_count: the pp ref count of this net_iov, exactly the same + * usage as struct page/struct netmem_desc. 
+ * @owner: the net_iov_area this net_iov belongs to, if any. + * @type: the type of the memory. Different types of net_iovs are + * supported. + */ struct net_iov { - enum net_iov_type type; - unsigned long pp_magic; - struct page_pool *pp; + union { + struct netmem_desc desc; + + /* XXX: The following part should be removed once all + * the references to them are converted so as to be + * accessed via netmem_desc e.g. niov->desc.pp instead + * of niov->pp. + */ + struct { + unsigned long _flags; + unsigned long pp_magic; + struct page_pool *pp; + unsigned long _pp_mapping_pad; + unsigned long dma_addr; + atomic_long_t pp_ref_count; + }; + }; struct net_iov_area *owner; - unsigned long dma_addr; - atomic_long_t pp_ref_count; + enum net_iov_type type; }; struct net_iov_area { @@ -48,27 +127,22 @@ struct net_iov_area { unsigned long base_virtual; }; -/* These fields in struct page are used by the page_pool and net stack: +/* net_iov is union'ed with struct netmem_desc mirroring struct page, so + * the page_pool can access these fields without worrying whether the + * underlying fields are accessed via netmem_desc or directly via + * net_iov, until all the references to them are converted so as to be + * accessed via netmem_desc e.g. niov->desc.pp instead of niov->pp. * - * struct { - * unsigned long pp_magic; - * struct page_pool *pp; - * unsigned long _pp_mapping_pad; - * unsigned long dma_addr; - * atomic_long_t pp_ref_count; - * }; - * - * We mirror the page_pool fields here so the page_pool can access these fields - * without worrying whether the underlying fields belong to a page or net_iov. - * - * The non-net stack fields of struct page are private to the mm stack and must - * never be mirrored to net_iov. + * The non-net stack fields of struct page are private to the mm stack + * and must never be mirrored to net_iov. 
*/ -#define NET_IOV_ASSERT_OFFSET(pg, iov) \ - static_assert(offsetof(struct page, pg) == \ +#define NET_IOV_ASSERT_OFFSET(desc, iov) \ + static_assert(offsetof(struct netmem_desc, desc) == \ offsetof(struct net_iov, iov)) +NET_IOV_ASSERT_OFFSET(_flags, _flags); NET_IOV_ASSERT_OFFSET(pp_magic, pp_magic); NET_IOV_ASSERT_OFFSET(pp, pp); +NET_IOV_ASSERT_OFFSET(_pp_mapping_pad, _pp_mapping_pad); NET_IOV_ASSERT_OFFSET(dma_addr, dma_addr); NET_IOV_ASSERT_OFFSET(pp_ref_count, pp_ref_count); #undef NET_IOV_ASSERT_OFFSET -- cgit v1.2.3 From 38a436d4e26487e16ac6c1de17c030b1bef84d83 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 21 Jul 2025 11:18:25 +0900 Subject: netmem: use netmem_desc instead of page to access ->pp in __netmem_get_pp() To eliminate the use of the page pool fields in struct page, the page pool code should use netmem descriptor and APIs instead. However, __netmem_get_pp() still accesses ->pp via struct page. So change it to use struct netmem_desc instead, since ->pp no longer will be available in struct page. While at it, add a helper, __netmem_to_nmdesc(), that can be used to unsafely get pointer to netmem_desc backing the netmem_ref, only when the netmem_ref is always backed by system memory. 
Signed-off-by: Byungchul Park Link: https://patch.msgid.link/20250721021835.63939-3-byungchul@sk.com Signed-off-by: Jakub Kicinski --- include/net/netmem.h | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netmem.h b/include/net/netmem.h index 535cf17b9134..097bc74d9555 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -247,6 +247,24 @@ static inline unsigned long netmem_pfn_trace(netmem_ref netmem) return page_to_pfn(netmem_to_page(netmem)); } +/** + * __netmem_to_nmdesc - unsafely get pointer to the &netmem_desc backing + * @netmem + * @netmem: netmem reference to convert + * + * Unsafe version that can be used only when @netmem is always backed by + * system memory, performs faster and generates smaller object code (no + * check for the LSB, no WARN). When @netmem points to IOV, provokes + * undefined behaviour. + * + * Return: pointer to the &netmem_desc (garbage if @netmem is not backed + * by system memory). + */ +static inline struct netmem_desc *__netmem_to_nmdesc(netmem_ref netmem) +{ + return (__force struct netmem_desc *)netmem; +} + /* __netmem_clear_lsb - convert netmem_ref to struct net_iov * for access to * common fields. * @netmem: netmem reference to extract as net_iov. @@ -280,7 +298,7 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) */ static inline struct page_pool *__netmem_get_pp(netmem_ref netmem) { - return __netmem_to_page(netmem)->pp; + return __netmem_to_nmdesc(netmem)->pp; } static inline struct page_pool *netmem_get_pp(netmem_ref netmem) -- cgit v1.2.3 From 89ade7c7306508f46b811cd43960eaed88e0e1dd Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 21 Jul 2025 11:18:26 +0900 Subject: netmem, mlx4: access ->pp_ref_count through netmem_desc instead of page To eliminate the use of struct page in page pool, the page pool users should use netmem descriptor and APIs instead. 
Make mlx4 access ->pp_ref_count through netmem_desc instead of page. While at it, add a helper, pp_page_to_nmdesc() and __pp_page_to_nmdesc(), that can be used to get netmem_desc from page only if it's a pp page. For now that netmem_desc overlays on page, it can be achieved by just casting, and use macro and _Generic to cover const casting as well. Plus, change page_pool_page_is_pp() to check for 'const struct page *' instead of 'struct page *' since it doesn't modify data and additionally covers const type. Signed-off-by: Byungchul Park Link: https://patch.msgid.link/20250721021835.63939-4-byungchul@sk.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 4 +++- include/linux/mm.h | 4 ++-- include/net/netmem.h | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b33285d755b9..92a16ddb7d86 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -460,9 +460,11 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv, truesize += frag_info->frag_stride; if (frag_info->frag_stride == PAGE_SIZE / 2) { + struct netmem_desc *desc = pp_page_to_nmdesc(page); + frags->page_offset ^= PAGE_SIZE / 2; release = page_count(page) != 1 || - atomic_long_read(&page->pp_ref_count) != 1 || + atomic_long_read(&desc->pp_ref_count) != 1 || page_is_pfmemalloc(page) || page_to_nid(page) != numa_mem_id(); } else if (!priv->rx_headroom) { diff --git a/include/linux/mm.h b/include/linux/mm.h index fa538feaa8d9..ae50c1641bed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4178,12 +4178,12 @@ int arch_lock_shadow_stack_status(struct task_struct *t, unsigned long status); #define PP_MAGIC_MASK ~(PP_DMA_INDEX_MASK | 0x3UL) #ifdef CONFIG_PAGE_POOL -static inline bool page_pool_page_is_pp(struct page *page) +static inline bool 
page_pool_page_is_pp(const struct page *page) { return (page->pp_magic & PP_MAGIC_MASK) == PP_SIGNATURE; } #else -static inline bool page_pool_page_is_pp(struct page *page) +static inline bool page_pool_page_is_pp(const struct page *page) { return false; } diff --git a/include/net/netmem.h b/include/net/netmem.h index 097bc74d9555..f7dacc9e75fd 100644 --- a/include/net/netmem.h +++ b/include/net/netmem.h @@ -285,6 +285,23 @@ static inline struct net_iov *__netmem_clear_lsb(netmem_ref netmem) return (struct net_iov *)((__force unsigned long)netmem & ~NET_IOV); } +/* XXX: How to extract netmem_desc from page must be changed, once + * netmem_desc no longer overlays on page and will be allocated through + * slab. + */ +#define __pp_page_to_nmdesc(p) (_Generic((p), \ + const struct page * : (const struct netmem_desc *)(p), \ + struct page * : (struct netmem_desc *)(p))) + +/* CAUTION: Check if the page is a pp page before calling this helper or + * know it's a pp page. + */ +#define pp_page_to_nmdesc(p) \ +({ \ + DEBUG_NET_WARN_ON_ONCE(!page_pool_page_is_pp(p)); \ + __pp_page_to_nmdesc(p); \ +}) + /** * __netmem_get_pp - unsafely get pointer to the &page_pool backing @netmem * @netmem: netmem reference to get the pointer from -- cgit v1.2.3 From 9dfd871a3e2ed433d5fee519b90b7e619b972043 Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Mon, 21 Jul 2025 11:18:35 +0900 Subject: libeth: xdp: access ->pp through netmem_desc instead of page To eliminate the use of struct page in page pool, the page pool users should use netmem descriptor and APIs instead. Make xdp access ->pp through netmem_desc instead of page. 
Signed-off-by: Byungchul Park Link: https://patch.msgid.link/20250721021835.63939-13-byungchul@sk.com Signed-off-by: Jakub Kicinski --- include/net/libeth/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h index 6ce6aec6884c..f4880b50e804 100644 --- a/include/net/libeth/xdp.h +++ b/include/net/libeth/xdp.h @@ -1292,7 +1292,7 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp, xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq); #endif xdp_prepare_buff(&xdp->base, page_address(page) + fqe->offset, - page->pp->p.offset, len, true); + pp_page_to_nmdesc(page)->pp->p.offset, len, true); } /** -- cgit v1.2.3 From 8f9516daedd67097a0c6e463fcb7a42b5ee9d477 Mon Sep 17 00:00:00 2001 From: Koen De Schepper Date: Tue, 22 Jul 2025 11:59:12 +0200 Subject: sched: Add enqueue/dequeue of dualpi2 qdisc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DualPI2 provides L4S-type low latency & loss to traffic that uses a scalable congestion controller (e.g. TCP-Prague, DCTCP) without degrading the performance of 'classic' traffic (e.g. Reno, Cubic etc.). It is to be the reference implementation of IETF RFC9332 DualQ Coupled AQM (https://datatracker.ietf.org/doc/html/rfc9332). Note that creating two independent queues cannot meet the goal of DualPI2 mentioned in RFC9332: "...to preserve fairness between ECN-capable and non-ECN-capable traffic." Further, it could even lead to starvation of Classic traffic, which is also inconsistent with the requirements in RFC9332: "...although priority MUST be bounded in order not to starve Classic traffic." DualPI2 is designed to maintain approximate per-flow fairness on L-queue and C-queue by forming a single qdisc using the coupling factor and scheduler between two queues. The qdisc provides two queues called low latency and classic. 
It classifies packets based on the ECN field in the IP headers. By default it directs non-ECN and ECT(0) into the classic queue and ECT(1) and CE into the low latency queue, as per the IETF spec. Each queue runs its own AQM: * The classic AQM is called PI2, which is similar to the PIE AQM but more responsive and simpler. Classic traffic requires a decent target queue (default 15ms for Internet deployment) to fully utilize the link and to avoid high drop rates. * The low latency AQM is, by default, a very shallow ECN marking threshold (1ms) similar to that used for DCTCP. The DualQ isolates the low queuing delay of the Low Latency queue from the larger delay of the 'Classic' queue. However, from a bandwidth perspective, flows in either queue will share out the link capacity as if there was just a single queue. This bandwidth pooling effect is achieved by coupling together the drop and ECN-marking probabilities of the two AQMs. The PI2 AQM has two main parameters in addition to its target delay. The integral gain factor alpha is used to slowly correct any persistent standing queue error from the target delay, while the proportional gain factor beta is used to quickly compensate for queue changes (growth or shrinkage). Either alpha and beta are given as a parameter, or they can be calculated by tc from alternative typical and maximum RTT parameters. Internally, the output of a linear Proportional Integral (PI) controller is used for both queues. This output is squared to calculate the drop or ECN-marking probability of the classic queue. This counterbalances the square-root rate equation of Reno/Cubic, which is the trick that balances flow rates across the queues. For the ECN-marking probability of the low latency queue, the output of the base AQM is multiplied by a coupling factor. This determines the balance between the flow rates in each queue. The default setting makes the flow rates roughly equal, which should be generally applicable. 
If DUALPI2 AQM has detected overload (due to excessive non-responsive traffic in either queue), it will switch to signaling congestion solely using drop, irrespective of the ECN field. Alternatively, it can be configured to limit the drop probability and let the queue grow and eventually overflow (like tail-drop). GSO splitting in DUALPI2 is configurable from userspace while the default behavior is to split gso. When running DUALPI2 at unshaped 10gigE with 4 download streams test, splitting gso apart results in halving the latency with no loss in throughput: Summary of tcp_4down run 'no_split_gso': avg median # data pts Ping (ms) ICMP : 0.53 0.30 ms 350 TCP download avg : 2326.86 N/A Mbits/s 350 TCP download sum : 9307.42 N/A Mbits/s 350 TCP download::1 : 2672.99 2568.73 Mbits/s 350 TCP download::2 : 2586.96 2570.51 Mbits/s 350 TCP download::3 : 1786.26 1798.82 Mbits/s 350 TCP download::4 : 2261.21 2309.49 Mbits/s 350 Summary of tcp_4down run 'split_gso': avg median # data pts Ping (ms) ICMP : 0.22 0.23 ms 350 TCP download avg : 2335.02 N/A Mbits/s 350 TCP download sum : 9340.09 N/A Mbits/s 350 TCP download::1 : 2335.30 2334.22 Mbits/s 350 TCP download::2 : 2334.72 2334.20 Mbits/s 350 TCP download::3 : 2335.28 2334.58 Mbits/s 350 TCP download::4 : 2334.79 2334.39 Mbits/s 350 A similar result is observed when running DUALPI2 at unshaped 1gigE with 1 download stream test: Summary of tcp_1down run 'no_split_gso': avg median # data pts Ping (ms) ICMP : 1.13 1.25 ms 350 TCP download : 941.41 941.46 Mbits/s 350 Summary of tcp_1down run 'split_gso': avg median # data pts Ping (ms) ICMP : 0.51 0.55 ms 350 TCP download : 941.41 941.45 Mbits/s 350 Additional details can be found in the draft: https://datatracker.ietf.org/doc/html/rfc9332 Signed-off-by: Koen De Schepper Co-developed-by: Olga Albisser Signed-off-by: Olga Albisser Co-developed-by: Olivier Tilmans Signed-off-by: Olivier Tilmans Co-developed-by: Henrik Steen Signed-off-by: Henrik Steen Co-developed-by: Chia-Yu
Chang Signed-off-by: Chia-Yu Chang Signed-off-by: Bob Briscoe Signed-off-by: Ilpo Järvinen Acked-by: Dave Taht Link: https://patch.msgid.link/20250722095915.24485-4-chia-yu.chang@nokia-bell-labs.com Signed-off-by: Jakub Kicinski --- include/net/dropreason-core.h | 6 + net/sched/Kconfig | 12 ++ net/sched/Makefile | 1 + net/sched/sch_dualpi2.c | 472 +++++++++++++++++++++++++++++++++++++++++- 4 files changed, 487 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason-core.h b/include/net/dropreason-core.h index e19184dd1b0f..d8ff24a33459 100644 --- a/include/net/dropreason-core.h +++ b/include/net/dropreason-core.h @@ -126,6 +126,7 @@ FN(CANFD_RX_INVALID_FRAME) \ FN(CANXL_RX_INVALID_FRAME) \ FN(PFMEMALLOC) \ + FN(DUALPI2_STEP_DROP) \ FNe(MAX) /** @@ -604,6 +605,11 @@ enum skb_drop_reason { * reached a path or socket not eligible for use of memory reserves */ SKB_DROP_REASON_PFMEMALLOC, + /** + * @SKB_DROP_REASON_DUALPI2_STEP_DROP: dropped by the step drop + * threshold of DualPI2 qdisc. + */ + SKB_DROP_REASON_DUALPI2_STEP_DROP, /** * @SKB_DROP_REASON_MAX: the maximum of core drop reasons, which * shouldn't be used as a real 'reason' - only for tracing code gen diff --git a/net/sched/Kconfig b/net/sched/Kconfig index ad914d2b2e22..6ddff028b81a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -415,6 +415,18 @@ config NET_SCH_BPF If unsure, say N. +config NET_SCH_DUALPI2 + tristate "Dual Queue PI Square (DUALPI2) scheduler" + help + Say Y here if you want to use the Dual Queue Proportional Integral + Controller Improved with a Square scheduling algorithm. + For more information, please see https://tools.ietf.org/html/rfc9332 + + To compile this driver as a module, choose M here: the module + will be called sch_dualpi2. + + If unsure, say N. 
+ menuconfig NET_SCH_DEFAULT bool "Allow override default queue discipline" help diff --git a/net/sched/Makefile b/net/sched/Makefile index 904d784902d1..5078ea84e6ad 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -63,6 +63,7 @@ obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_SCH_BPF) += bpf_qdisc.o +obj-$(CONFIG_NET_SCH_DUALPI2) += sch_dualpi2.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 0a96d57c40d1..845375ebd4ea 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -113,8 +113,44 @@ struct dualpi2_sched_data { u32 step_marks; /* ECN mark pkt counter due to step AQM */ u32 memory_used; /* Memory used of both queues */ u32 max_memory_used;/* Maximum used memory */ + + /* Deferred drop statistics */ + u32 deferred_drops_cnt; /* Packets dropped */ + u32 deferred_drops_len; /* Bytes dropped */ +}; + +struct dualpi2_skb_cb { + u64 ts; /* Timestamp at enqueue */ + u8 apply_step:1, /* Can we apply the step threshold */ + classified:2, /* Packet classification results */ + ect:2; /* Packet ECT codepoint */ +}; + +enum dualpi2_classification_results { + DUALPI2_C_CLASSIC = 0, /* C-queue */ + DUALPI2_C_L4S = 1, /* L-queue (scale mark/classic drop) */ + DUALPI2_C_LLLL = 2, /* L-queue (no drops/marks) */ + __DUALPI2_C_MAX /* Keep last*/ }; +static struct dualpi2_skb_cb *dualpi2_skb_cb(struct sk_buff *skb) +{ + qdisc_cb_private_validate(skb, sizeof(struct dualpi2_skb_cb)); + return (struct dualpi2_skb_cb *)qdisc_skb_cb(skb)->data; +} + +static u64 dualpi2_sojourn_time(struct sk_buff *skb, u64 reference) +{ + return reference - dualpi2_skb_cb(skb)->ts; +} + +static u64 head_enqueue_time(struct Qdisc *q) +{ + struct sk_buff *skb = qdisc_peek_head(q); + + return skb ? 
dualpi2_skb_cb(skb)->ts : 0; +} + static u32 dualpi2_scale_alpha_beta(u32 param) { u64 tmp = ((u64)param * MAX_PROB >> ALPHA_BETA_SCALING); @@ -136,6 +172,30 @@ static ktime_t next_pi2_timeout(struct dualpi2_sched_data *q) return ktime_add_ns(ktime_get_ns(), q->pi2_tupdate); } +static bool skb_is_l4s(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified == DUALPI2_C_L4S; +} + +static bool skb_in_l_queue(struct sk_buff *skb) +{ + return dualpi2_skb_cb(skb)->classified != DUALPI2_C_CLASSIC; +} + +static bool skb_apply_step(struct sk_buff *skb, struct dualpi2_sched_data *q) +{ + return skb_is_l4s(skb) && qdisc_qlen(q->l_queue) >= q->min_qlen_step; +} + +static bool dualpi2_mark(struct dualpi2_sched_data *q, struct sk_buff *skb) +{ + if (INET_ECN_set_ce(skb)) { + q->ecn_mark++; + return true; + } + return false; +} + static void dualpi2_reset_c_protection(struct dualpi2_sched_data *q) { q->c_protection_credit = q->c_protection_init; @@ -155,6 +215,408 @@ static void dualpi2_calculate_c_protection(struct Qdisc *sch, dualpi2_reset_c_protection(q); } +static bool dualpi2_roll(u32 prob) +{ + return get_random_u32() <= prob; +} + +/* Packets in the C-queue are subject to a marking probability pC, which is the + * square of the internal PI probability (i.e., have an overall lower mark/drop + * probability). If the qdisc is overloaded, ignore ECT values and only drop. + * + * Note that this marking scheme is also applied to L4S packets during overload. + * Return true if packet dropping is required in C queue + */ +static bool dualpi2_classic_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, u32 prob, + bool overload) +{ + if (dualpi2_roll(prob) && dualpi2_roll(prob)) { + if (overload || dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; + dualpi2_mark(q, skb); + } + return false; +} + +/* Packets in the L-queue are subject to a marking probability pL given by the + * internal PI probability scaled by the coupling factor. 
+ * + * On overload (i.e., @local_l_prob is >= 100%): + * - if the qdisc is configured to trade losses to preserve latency (i.e., + * @q->drop_overload), apply classic drops first before marking. + * - otherwise, preserve the "no loss" property of ECN at the cost of queueing + * delay, eventually resulting in taildrop behavior once sch->limit is + * reached. + * Return true if packet dropping is required in L queue + */ +static bool dualpi2_scalable_marking(struct dualpi2_sched_data *q, + struct sk_buff *skb, + u64 local_l_prob, u32 prob, + bool overload) +{ + if (overload) { + /* Apply classic drop */ + if (!q->drop_overload || + !(dualpi2_roll(prob) && dualpi2_roll(prob))) + goto mark; + return true; + } + + /* We can safely cut the upper 32b as overload==false */ + if (dualpi2_roll(local_l_prob)) { + /* Non-ECT packets could have classified as L4S by filters. */ + if (dualpi2_skb_cb(skb)->ect == INET_ECN_NOT_ECT) + return true; +mark: + dualpi2_mark(q, skb); + } + return false; +} + +/* Decide whether a given packet must be dropped (or marked if ECT), according + * to the PI2 probability. + * + * Never mark/drop if we have a standing queue of less than 2 MTUs. 
+ */ +static bool must_drop(struct Qdisc *sch, struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + u64 local_l_prob; + bool overload; + u32 prob; + + if (sch->qstats.backlog < 2 * psched_mtu(qdisc_dev(sch))) + return false; + + prob = READ_ONCE(q->pi2_prob); + local_l_prob = (u64)prob * q->coupling_factor; + overload = local_l_prob > MAX_PROB; + + switch (dualpi2_skb_cb(skb)->classified) { + case DUALPI2_C_CLASSIC: + return dualpi2_classic_marking(q, skb, prob, overload); + case DUALPI2_C_L4S: + return dualpi2_scalable_marking(q, skb, local_l_prob, prob, + overload); + default: /* DUALPI2_C_LLLL */ + return false; + } +} + +static void dualpi2_read_ect(struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + int wlen = skb_network_offset(skb); + + switch (skb_protocol(skb, true)) { + case htons(ETH_P_IP): + wlen += sizeof(struct iphdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv4_get_dsfield(ip_hdr(skb)) & INET_ECN_MASK; + break; + case htons(ETH_P_IPV6): + wlen += sizeof(struct ipv6hdr); + if (!pskb_may_pull(skb, wlen) || + skb_try_make_writable(skb, wlen)) + goto not_ecn; + + cb->ect = ipv6_get_dsfield(ipv6_hdr(skb)) & INET_ECN_MASK; + break; + default: + goto not_ecn; + } + return; + +not_ecn: + /* Non pullable/writable packets can only be dropped hence are + * classified as not ECT. 
+ */ + cb->ect = INET_ECN_NOT_ECT; +} + +static int dualpi2_skb_classify(struct dualpi2_sched_data *q, + struct sk_buff *skb) +{ + struct dualpi2_skb_cb *cb = dualpi2_skb_cb(skb); + struct tcf_result res; + struct tcf_proto *fl; + int result; + + dualpi2_read_ect(skb); + if (cb->ect & q->ecn_mask) { + cb->classified = DUALPI2_C_L4S; + return NET_XMIT_SUCCESS; + } + + if (TC_H_MAJ(skb->priority) == q->sch->handle && + TC_H_MIN(skb->priority) < __DUALPI2_C_MAX) { + cb->classified = TC_H_MIN(skb->priority); + return NET_XMIT_SUCCESS; + } + + fl = rcu_dereference_bh(q->tcf_filters); + if (!fl) { + cb->classified = DUALPI2_C_CLASSIC; + return NET_XMIT_SUCCESS; + } + + result = tcf_classify(skb, NULL, fl, &res, false); + if (result >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + case TC_ACT_TRAP: + return NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; + case TC_ACT_SHOT: + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } +#endif + cb->classified = TC_H_MIN(res.classid) < __DUALPI2_C_MAX ? 
+ TC_H_MIN(res.classid) : DUALPI2_C_CLASSIC; + } + return NET_XMIT_SUCCESS; +} + +static int dualpi2_enqueue_skb(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct dualpi2_skb_cb *cb; + + if (unlikely(qdisc_qlen(sch) >= sch->limit) || + unlikely((u64)q->memory_used + skb->truesize > q->memory_limit)) { + qdisc_qstats_overlimit(sch); + if (skb_in_l_queue(skb)) + qdisc_qstats_overlimit(q->l_queue); + return qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_OVERLIMIT); + } + + if (q->drop_early && must_drop(sch, q, skb)) { + qdisc_drop_reason(skb, sch, to_free, + SKB_DROP_REASON_QDISC_CONGESTED); + return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; + } + + cb = dualpi2_skb_cb(skb); + cb->ts = ktime_get_ns(); + q->memory_used += skb->truesize; + if (q->memory_used > q->max_memory_used) + q->max_memory_used = q->memory_used; + + if (qdisc_qlen(sch) > q->maxq) + q->maxq = qdisc_qlen(sch); + + if (skb_in_l_queue(skb)) { + /* Apply step thresh if skb is L4S && L-queue len >= min_qlen */ + dualpi2_skb_cb(skb)->apply_step = skb_apply_step(skb, q); + + /* Keep the overall qdisc stats consistent */ + ++sch->q.qlen; + qdisc_qstats_backlog_inc(sch, skb); + ++q->packets_in_l; + if (!q->l_head_ts) + q->l_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, q->l_queue); + } + ++q->packets_in_c; + if (!q->c_head_ts) + q->c_head_ts = cb->ts; + return qdisc_enqueue_tail(skb, sch); +} + +/* By default, dualpi2 will split GSO skbs into independent skbs and enqueue + * each of those individually. This yields the following benefits, at the + * expense of CPU usage: + * - Finer-grained AQM actions as the sub-packets of a burst no longer share the + * same fate (e.g., the random mark/drop probability is applied individually) + * - Improved precision of the starvation protection/WRR scheduler at dequeue, + * as the size of the dequeued packets will be smaller. 
+ */ +static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + int err; + + err = dualpi2_skb_classify(q, skb); + if (err != NET_XMIT_SUCCESS) { + if (err & __NET_XMIT_BYPASS) + qdisc_qstats_drop(sch); + __qdisc_drop(skb, to_free); + return err; + } + + if (q->split_gso && skb_is_gso(skb)) { + netdev_features_t features; + struct sk_buff *nskb, *next; + int cnt, byte_len, orig_len; + int err; + + features = netif_skb_features(skb); + nskb = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR_OR_NULL(nskb)) + return qdisc_drop(skb, sch, to_free); + + cnt = 1; + byte_len = 0; + orig_len = qdisc_pkt_len(skb); + skb_list_walk_safe(nskb, nskb, next) { + skb_mark_not_on_list(nskb); + + /* Iterate through GSO fragments of an skb: + * (1) Set pkt_len from the single GSO fragments + * (2) Copy classified and ect values of an skb + * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb + */ + qdisc_skb_cb(nskb)->pkt_len = nskb->len; + dualpi2_skb_cb(nskb)->classified = + dualpi2_skb_cb(skb)->classified; + dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; + err = dualpi2_enqueue_skb(nskb, sch, to_free); + + if (err == NET_XMIT_SUCCESS) { + /* Compute the backlog adjustment that needs + * to be propagated in the qdisc tree to reflect + * all new skbs successfully enqueued. + */ + ++cnt; + byte_len += nskb->len; + } + } + if (cnt > 1) { + /* The caller will add the original skb stats to its + * backlog, compensate this if any nskb is enqueued. + */ + --cnt; + byte_len -= orig_len; + } + qdisc_tree_reduce_backlog(sch, -cnt, -byte_len); + consume_skb(skb); + return err; + } + return dualpi2_enqueue_skb(skb, sch, to_free); +} + +/* Select the queue from which the next packet can be dequeued, ensuring that + * neither queue can starve the other with a WRR scheduler. 
+ * + * The sign of the WRR credit determines the next queue, while the size of + * the dequeued packet determines the magnitude of the WRR credit change. If + * either queue is empty, the WRR credit is kept unchanged. + * + * As the dequeued packet can be dropped later, the caller has to perform the + * qdisc_bstats_update() calls. + */ +static struct sk_buff *dequeue_packet(struct Qdisc *sch, + struct dualpi2_sched_data *q, + int *credit_change, + u64 now) +{ + struct sk_buff *skb = NULL; + int c_len; + + *credit_change = 0; + c_len = qdisc_qlen(sch) - qdisc_qlen(q->l_queue); + if (qdisc_qlen(q->l_queue) && (!c_len || q->c_protection_credit <= 0)) { + skb = __qdisc_dequeue_head(&q->l_queue->q); + WRITE_ONCE(q->l_head_ts, head_enqueue_time(q->l_queue)); + if (c_len) + *credit_change = q->c_protection_wc; + qdisc_qstats_backlog_dec(q->l_queue, skb); + + /* Keep the global queue size consistent */ + --sch->q.qlen; + q->memory_used -= skb->truesize; + } else if (c_len) { + skb = __qdisc_dequeue_head(&sch->q); + WRITE_ONCE(q->c_head_ts, head_enqueue_time(sch)); + if (qdisc_qlen(q->l_queue)) + *credit_change = ~((s32)q->c_protection_wl) + 1; + q->memory_used -= skb->truesize; + } else { + dualpi2_reset_c_protection(q); + return NULL; + } + *credit_change *= qdisc_pkt_len(skb); + qdisc_qstats_backlog_dec(sch, skb); + return skb; +} + +static int do_step_aqm(struct dualpi2_sched_data *q, struct sk_buff *skb, + u64 now) +{ + u64 qdelay = 0; + + if (q->step_in_packets) + qdelay = qdisc_qlen(q->l_queue); + else + qdelay = dualpi2_sojourn_time(skb, now); + + if (dualpi2_skb_cb(skb)->apply_step && qdelay > q->step_thresh) { + if (!dualpi2_skb_cb(skb)->ect) { + /* Drop this non-ECT packet */ + return 1; + } + + if (dualpi2_mark(q, skb)) + ++q->step_marks; + } + qdisc_bstats_update(q->l_queue, skb); + return 0; +} + +static void drop_and_retry(struct dualpi2_sched_data *q, struct sk_buff *skb, + struct Qdisc *sch, enum skb_drop_reason reason) +{ + ++q->deferred_drops_cnt; + 
q->deferred_drops_len += qdisc_pkt_len(skb); + kfree_skb_reason(skb, reason); + qdisc_qstats_drop(sch); +} + +static struct sk_buff *dualpi2_qdisc_dequeue(struct Qdisc *sch) +{ + struct dualpi2_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + int credit_change; + u64 now; + + now = ktime_get_ns(); + + while ((skb = dequeue_packet(sch, q, &credit_change, now))) { + if (!q->drop_early && must_drop(sch, q, skb)) { + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_QDISC_CONGESTED); + continue; + } + + if (skb_in_l_queue(skb) && do_step_aqm(q, skb, now)) { + qdisc_qstats_drop(q->l_queue); + drop_and_retry(q, skb, sch, + SKB_DROP_REASON_DUALPI2_STEP_DROP); + continue; + } + + q->c_protection_credit += credit_change; + qdisc_bstats_update(sch, skb); + break; + } + + if (q->deferred_drops_cnt) { + qdisc_tree_reduce_backlog(sch, q->deferred_drops_cnt, + q->deferred_drops_len); + q->deferred_drops_cnt = 0; + q->deferred_drops_len = 0; + } + return skb; +} + static s64 __scale_delta(u64 diff) { do_div(diff, 1 << ALPHA_BETA_GRANULARITY); @@ -167,8 +629,8 @@ static void get_queue_delays(struct dualpi2_sched_data *q, u64 *qdelay_c, u64 now, qc, ql; now = ktime_get_ns(); - qc = q->c_head_ts; - ql = q->l_head_ts; + qc = READ_ONCE(q->c_head_ts); + ql = READ_ONCE(q->l_head_ts); *qdelay_c = qc ? now - qc : 0; *qdelay_l = ql ? 
now - ql : 0; @@ -254,7 +716,7 @@ static enum hrtimer_restart dualpi2_timer(struct hrtimer *timer) root_lock = qdisc_lock(qdisc_root_sleeping(sch)); spin_lock(root_lock); - q->pi2_prob = calculate_probability(sch); + WRITE_ONCE(q->pi2_prob, calculate_probability(sch)); hrtimer_set_expires(&q->pi2_timer, next_pi2_timeout(q)); spin_unlock(root_lock); @@ -559,7 +1021,7 @@ static int dualpi2_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct dualpi2_sched_data *q = qdisc_priv(sch); struct tc_dualpi2_xstats st = { - .prob = q->pi2_prob, + .prob = READ_ONCE(q->pi2_prob), .packets_in_c = q->packets_in_c, .packets_in_l = q->packets_in_l, .maxq = q->maxq, @@ -677,6 +1139,8 @@ static struct Qdisc_ops dualpi2_qdisc_ops __read_mostly = { .id = "dualpi2", .cl_ops = &dualpi2_class_ops, .priv_size = sizeof(struct dualpi2_sched_data), + .enqueue = dualpi2_qdisc_enqueue, + .dequeue = dualpi2_qdisc_dequeue, .peek = qdisc_peek_dequeued, .init = dualpi2_init, .destroy = dualpi2_destroy, -- cgit v1.2.3 From 71c52411c51bf4f0869c572294ce8123b26528d5 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 23 Jul 2025 01:30:29 +0000 Subject: net: Create separate gro_flush_normal function Move multiple copies of same code snippet doing `gro_flush` and `gro_normal_list` into separate helper function. 
Signed-off-by: Samiullah Khawaja Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250723013031.2911384-2-skhawaja@google.com Signed-off-by: Jakub Kicinski --- include/net/gro.h | 6 ++++++ kernel/bpf/cpumap.c | 3 +-- net/core/dev.c | 9 +++------ 3 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/gro.h b/include/net/gro.h index 22d3a69e4404..a0fca7ac6e7e 100644 --- a/include/net/gro.h +++ b/include/net/gro.h @@ -534,6 +534,12 @@ static inline void gro_normal_list(struct gro_node *gro) gro->rx_count = 0; } +static inline void gro_flush_normal(struct gro_node *gro, bool flush_old) +{ + gro_flush(gro, flush_old); + gro_normal_list(gro); +} + /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded, * pass the whole batch up to the stack. */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 67e8a2fc1a99..b2b7b8ec2c2a 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -282,8 +282,7 @@ static void cpu_map_gro_flush(struct bpf_cpu_map_entry *rcpu, bool empty) * This is equivalent to how NAPI decides whether to perform a full * flush. */ - gro_flush(&rcpu->gro, !empty && HZ >= 1000); - gro_normal_list(&rcpu->gro); + gro_flush_normal(&rcpu->gro, !empty && HZ >= 1000); } static int cpu_map_kthread_run(void *data) diff --git a/net/core/dev.c b/net/core/dev.c index 354d3453b407..76384b8a7871 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6578,8 +6578,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done) * it, we need to bound somehow the time packets are kept in * the GRO layer. */ - gro_flush(&n->gro, !!timeout); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, !!timeout); if (unlikely(!list_empty(&n->poll_list))) { /* If n->poll_list is not empty, we need to mask irqs */ @@ -6649,8 +6648,7 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) } /* Flush too old packets. 
If HZ < 1000, flush all packets */ - gro_flush(&napi->gro, HZ >= 1000); - gro_normal_list(&napi->gro); + gro_flush_normal(&napi->gro, HZ >= 1000); clear_bit(NAPI_STATE_SCHED, &napi->state); } @@ -7515,8 +7513,7 @@ static int __napi_poll(struct napi_struct *n, bool *repoll) } /* Flush too old packets. If HZ < 1000, flush all packets */ - gro_flush(&n->gro, HZ >= 1000); - gro_normal_list(&n->gro); + gro_flush_normal(&n->gro, HZ >= 1000); /* Some drivers may have called napi_schedule * prior to exhausting their budget. -- cgit v1.2.3 From e89a68046687fe9913ce3bfad82f7ccbb65687e0 Mon Sep 17 00:00:00 2001 From: Lance Yang Date: Mon, 26 May 2025 16:59:02 +0800 Subject: netfilter: load nf_log_syslog on enabling nf_conntrack_log_invalid When no logger is registered, nf_conntrack_log_invalid fails to log invalid packets, leaving users unaware of actual invalid traffic. Improve this by loading nf_log_syslog, similar to how 'iptables -I FORWARD 1 -m conntrack --ctstate INVALID -j LOG' triggers it. Suggested-by: Florian Westphal Signed-off-by: Zi Li Signed-off-by: Lance Yang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 3 +++ net/netfilter/nf_conntrack_standalone.c | 26 +++++++++++++++++++++++++- net/netfilter/nf_log.c | 26 ++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index e55eedc84ed7..00506792a06d 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -59,6 +59,9 @@ extern int sysctl_nf_log_all_netns; int nf_log_register(u_int8_t pf, struct nf_logger *logger); void nf_log_unregister(struct nf_logger *logger); +/* Check if any logger is registered for a given protocol family. 
*/ +bool nf_log_is_registered(u_int8_t pf); + int nf_log_set(struct net *net, u_int8_t pf, const struct nf_logger *logger); void nf_log_unset(struct net *net, const struct nf_logger *logger); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 829f60496008..9b8b10a85233 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -14,6 +14,7 @@ #include #endif +#include #include #include #include @@ -555,6 +556,29 @@ nf_conntrack_hash_sysctl(const struct ctl_table *table, int write, return ret; } +static int +nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, i; + + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); + if (ret < 0 || !write) + return ret; + + if (*(u8 *)table->data == 0) + return ret; + + /* Load nf_log_syslog only if no logger is currently registered */ + for (i = 0; i < NFPROTO_NUMPROTO; i++) { + if (nf_log_is_registered(i)) + return ret; + } + request_module("%s", "nf_log_syslog"); + + return ret; +} + static struct ctl_table_header *nf_ct_netfilter_header; enum nf_ct_sysctl_index { @@ -651,7 +675,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &init_net.ct.sysctl_log_invalid, .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dou8vec_minmax, + .proc_handler = nf_conntrack_log_invalid_sysctl, }, [NF_SYSCTL_CT_EXPECT_MAX] = { .procname = "nf_conntrack_expect_max", diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 6dd0de33eebd..74cef8bf554c 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -125,6 +125,32 @@ void nf_log_unregister(struct nf_logger *logger) } EXPORT_SYMBOL(nf_log_unregister); +/** + * nf_log_is_registered - Check if any logger is registered for a given + * protocol family. + * + * @pf: Protocol family + * + * Returns: true if at least one logger is active for @pf, false otherwise. 
+ */ +bool nf_log_is_registered(u_int8_t pf) +{ + int i; + + if (pf >= NFPROTO_NUMPROTO) { + WARN_ON_ONCE(1); + return false; + } + + for (i = 0; i < NF_LOG_TYPE_MAX; i++) { + if (rcu_access_pointer(loggers[pf][i])) + return true; + } + + return false; +} +EXPORT_SYMBOL(nf_log_is_registered); + int nf_log_bind_pf(struct net *net, u_int8_t pf, const struct nf_logger *logger) { -- cgit v1.2.3 From bf6788742b8d6c73de441e088a71de7154f0d4aa Mon Sep 17 00:00:00 2001 From: Yue Haibing Date: Tue, 24 Jun 2025 09:48:18 +0800 Subject: netfilter: nf_tables: Remove unused nft_reduce_is_readonly() Since commit 9e539c5b6d9c ("netfilter: nf_tables: disable expression reduction infra") this is unused. Signed-off-by: Yue Haibing Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5e49619ae49c..b092e57d3c75 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1939,11 +1939,6 @@ static inline u64 nft_net_tstamp(const struct net *net) #define __NFT_REDUCE_READONLY 1UL #define NFT_REDUCE_READONLY (void *)__NFT_REDUCE_READONLY -static inline bool nft_reduce_is_readonly(const struct nft_expr *expr) -{ - return expr->ops->reduce == NFT_REDUCE_READONLY; -} - void nft_reg_track_update(struct nft_regs_track *track, const struct nft_expr *expr, u8 dreg, u8 len); void nft_reg_track_cancel(struct nft_regs_track *track, u8 dreg, u8 len); -- cgit v1.2.3 From 17a20e09f086f2c574ac87f3cf6e14c4377f65f6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Jul 2025 19:05:13 +0200 Subject: netfilter: nft_set: remove one argument from lookup and update functions Return the extension pointer instead of passing it as a function argument to be filled in by the callee. As-is, whenever false is returned, the extension pointer is not used. 
For all set types, when true is returned, the extension pointer was set to the matching element. Only exception: nft_set_bitmap doesn't support extensions. Return a pointer to a static const empty element extension container. return false -> return NULL return true -> return the elements' extension pointer. This saves one function argument. Signed-off-by: Florian Westphal Reviewed-by: Stefano Brivio Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 10 +++---- include/net/netfilter/nf_tables_core.h | 47 ++++++++++++++++------------- net/netfilter/nft_dynset.c | 5 ++-- net/netfilter/nft_lookup.c | 27 +++++++++-------- net/netfilter/nft_objref.c | 5 ++-- net/netfilter/nft_set_bitmap.c | 11 +++++-- net/netfilter/nft_set_hash.c | 54 +++++++++++++++------------------- net/netfilter/nft_set_pipapo.c | 19 +++++++----- net/netfilter/nft_set_pipapo_avx2.c | 25 +++++++++------- net/netfilter/nft_set_rbtree.c | 40 ++++++++++++------------- 10 files changed, 126 insertions(+), 117 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index b092e57d3c75..5b6725475906 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -459,19 +459,17 @@ struct nft_set_ext; * control plane functions. 
*/ struct nft_set_ops { - bool (*lookup)(const struct net *net, + const struct nft_set_ext * (*lookup)(const struct net *net, const struct nft_set *set, - const u32 *key, - const struct nft_set_ext **ext); - bool (*update)(struct nft_set *set, + const u32 *key); + const struct nft_set_ext * (*update)(struct nft_set *set, const u32 *key, struct nft_elem_priv * (*new)(struct nft_set *, const struct nft_expr *, struct nft_regs *), const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext); + struct nft_regs *regs); bool (*delete)(const struct nft_set *set, const u32 *key); diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 03b6165756fc..6a52fb97b844 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -94,34 +94,41 @@ extern const struct nft_set_type nft_set_pipapo_type; extern const struct nft_set_type nft_set_pipapo_avx2_type; #ifdef CONFIG_MITIGATION_RETPOLINE -bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); -bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct 
nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); #else -static inline bool +static inline const struct nft_set_ext * nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) + const u32 *key) { - return set->ops->lookup(net, set, key, ext); + return set->ops->lookup(net, set, key); } #endif /* called from nft_pipapo_avx2.c */ -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); /* called from nft_set_pipapo.c */ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext); +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key); void nft_counter_init_seqcount(void); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 88922e0e8e83..e24493d9e776 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -91,8 +91,9 @@ void nft_dynset_eval(const struct nft_expr *expr, return; } - if (set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new, - expr, regs, &ext)) { + ext = set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new, + expr, regs); + if (ext) { if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && READ_ONCE(nft_set_ext_timeout(ext)->timeout) != 0) { diff --git a/net/netfilter/nft_lookup.c
b/net/netfilter/nft_lookup.c index 63ef832b8aa7..40c602ffbcba 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -25,32 +25,33 @@ struct nft_lookup { }; #ifdef CONFIG_MITIGATION_RETPOLINE -bool nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { if (set->ops == &nft_set_hash_fast_type.ops) - return nft_hash_lookup_fast(net, set, key, ext); + return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) - return nft_hash_lookup(net, set, key, ext); + return nft_hash_lookup(net, set, key); if (set->ops == &nft_set_rhash_type.ops) - return nft_rhash_lookup(net, set, key, ext); + return nft_rhash_lookup(net, set, key); if (set->ops == &nft_set_bitmap_type.ops) - return nft_bitmap_lookup(net, set, key, ext); + return nft_bitmap_lookup(net, set, key); if (set->ops == &nft_set_pipapo_type.ops) - return nft_pipapo_lookup(net, set, key, ext); + return nft_pipapo_lookup(net, set, key); #if defined(CONFIG_X86_64) && !defined(CONFIG_UML) if (set->ops == &nft_set_pipapo_avx2_type.ops) - return nft_pipapo_avx2_lookup(net, set, key, ext); + return nft_pipapo_avx2_lookup(net, set, key); #endif if (set->ops == &nft_set_rbtree_type.ops) - return nft_rbtree_lookup(net, set, key, ext); + return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); - return set->ops->lookup(net, set, key, ext); + return set->ops->lookup(net, set, key); } EXPORT_SYMBOL_GPL(nft_set_do_lookup); #endif @@ -61,12 +62,12 @@ void nft_lookup_eval(const struct nft_expr *expr, { const struct nft_lookup *priv = nft_expr_priv(expr); const struct nft_set *set = priv->set; - const struct nft_set_ext *ext = NULL; const struct net *net = nft_net(pkt); + const struct nft_set_ext *ext; bool found; - found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext) ^ - priv->invert; + ext =
nft_set_do_lookup(net, set, &regs->data[priv->sreg]); + found = !!ext ^ priv->invert; if (!found) { ext = nft_set_catchall_lookup(net, set); if (!ext) { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 09da7a3f9f96..8ee66a86c3bc 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -111,10 +111,9 @@ void nft_objref_map_eval(const struct nft_expr *expr, struct net *net = nft_net(pkt); const struct nft_set_ext *ext; struct nft_object *obj; - bool found; - found = nft_set_do_lookup(net, set, &regs->data[priv->sreg], &ext); - if (!found) { + ext = nft_set_do_lookup(net, set, &regs->data[priv->sreg]); + if (!ext) { ext = nft_set_catchall_lookup(net, set); if (!ext) { regs->verdict.code = NFT_BREAK; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 12390d2e994f..c24c922f895d 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -75,16 +75,21 @@ nft_bitmap_active(const u8 *bitmap, u32 idx, u32 off, u8 genmask) } INDIRECT_CALLABLE_SCOPE -bool nft_bitmap_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_bitmap_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { const struct nft_bitmap *priv = nft_set_priv(set); + static const struct nft_set_ext found; u8 genmask = nft_genmask_cur(net); u32 idx, off; nft_bitmap_location(set, key, &idx, &off); - return nft_bitmap_active(priv->bitmap, idx, off, genmask); + if (nft_bitmap_active(priv->bitmap, idx, off, genmask)) + return &found; + + return NULL; } static struct nft_bitmap_elem * diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index abb0c8ec6371..9903c737c9f0 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -81,8 +81,9 @@ static const struct rhashtable_params nft_rhash_params = { }; INDIRECT_CALLABLE_SCOPE -bool nft_rhash_lookup(const struct net *net, const
struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_rhash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rhash *priv = nft_set_priv(set); const struct nft_rhash_elem *he; @@ -95,9 +96,9 @@ bool nft_rhash_lookup(const struct net *net, const struct nft_set *set, he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); if (he != NULL) - *ext = &he->ext; + return &he->ext; - return !!he; + return NULL; } static struct nft_elem_priv * @@ -120,14 +121,11 @@ nft_rhash_get(const struct net *net, const struct nft_set *set, return ERR_PTR(-ENOENT); } -static bool nft_rhash_update(struct nft_set *set, const u32 *key, - struct nft_elem_priv * - (*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *regs), - const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_set_ext **ext) +static const struct nft_set_ext * +nft_rhash_update(struct nft_set *set, const u32 *key, + struct nft_elem_priv *(*new)(struct nft_set *, const struct nft_expr *, + struct nft_regs *regs), + const struct nft_expr *expr, struct nft_regs *regs) { struct nft_rhash *priv = nft_set_priv(set); struct nft_rhash_elem *he, *prev; @@ -161,14 +159,13 @@ static bool nft_rhash_update(struct nft_set *set, const u32 *key, } out: - *ext = &he->ext; - return true; + return &he->ext; err2: nft_set_elem_destroy(set, &he->priv, true); atomic_dec(&set->nelems); err1: - return false; + return NULL; } static int nft_rhash_insert(const struct net *net, const struct nft_set *set, @@ -507,8 +504,9 @@ struct nft_hash_elem { }; INDIRECT_CALLABLE_SCOPE -bool nft_hash_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_hash_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -519,12 +517,10 @@ bool 
nft_hash_lookup(const struct net *net, const struct nft_set *set, hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry_rcu(he, &priv->table[hash], node) { if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } static struct nft_elem_priv * @@ -547,9 +543,9 @@ nft_hash_get(const struct net *net, const struct nft_set *set, } INDIRECT_CALLABLE_SCOPE -bool nft_hash_lookup_fast(const struct net *net, - const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_hash_lookup_fast(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); @@ -562,12 +558,10 @@ bool nft_hash_lookup_fast(const struct net *net, hlist_for_each_entry_rcu(he, &priv->table[hash], node) { k2 = *(u32 *)nft_set_ext_key(&he->ext)->data; if (k1 == k2 && - nft_set_elem_active(&he->ext, genmask)) { - *ext = &he->ext; - return true; - } + nft_set_elem_active(&he->ext, genmask)) + return &he->ext; } - return false; + return NULL; } static u32 nft_jhash(const struct nft_set *set, const struct nft_hash *priv, diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 08fb6720673f..36a4de11995b 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -407,8 +407,9 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, * * Return: true on match, false otherwise. 
*/ -bool nft_pipapo_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_pipapo_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; @@ -465,13 +466,15 @@ next_match: scratch->map_index = map_index; local_bh_enable(); - return false; + return NULL; } if (last) { - *ext = &f->mt[b].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) + const struct nft_set_ext *ext; + + ext = &f->mt[b].e->ext; + if (unlikely(nft_set_elem_expired(ext) || + !nft_set_elem_active(ext, genmask))) goto next_match; /* Last field: we're just returning the key without @@ -482,7 +485,7 @@ next_match: scratch->map_index = map_index; local_bh_enable(); - return true; + return ext; } /* Swap bitmap indices: res_map is the initial bitmap for the @@ -497,7 +500,7 @@ next_match: out: local_bh_enable(); - return false; + return NULL; } /** diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index be7c16c79f71..6c441e2dc8af 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1146,8 +1146,9 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns * * Return: true on match, false otherwise. 
*/ -bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; @@ -1155,17 +1156,18 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; const u8 *rp = (const u8 *)key; + const struct nft_set_ext *ext; unsigned long *res, *fill; bool map_index; - int i, ret = 0; + int i; local_bh_disable(); if (unlikely(!irq_fpu_usable())) { - bool fallback_res = nft_pipapo_lookup(net, set, key, ext); + ext = nft_pipapo_lookup(net, set, key); local_bh_enable(); - return fallback_res; + return ext; } m = rcu_dereference(priv->match); @@ -1182,7 +1184,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, if (unlikely(!scratch)) { kernel_fpu_end(); local_bh_enable(); - return false; + return NULL; } map_index = scratch->map_index; @@ -1197,6 +1199,7 @@ bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; + int ret = 0; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ @@ -1244,10 +1247,10 @@ next_match: goto out; if (last) { - *ext = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(*ext) || - !nft_set_elem_active(*ext, genmask))) { - ret = 0; + ext = &f->mt[ret].e->ext; + if (unlikely(nft_set_elem_expired(ext) || + !nft_set_elem_active(ext, genmask))) { + ext = NULL; goto next_match; } @@ -1264,5 +1267,5 @@ out: kernel_fpu_end(); local_bh_enable(); - return ret >= 0; + return ext; } diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 2e8ef16ff191..938a257c069e 100644 --- 
a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -52,9 +52,9 @@ static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) return nft_set_elem_expired(&rbe->ext); } -static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext, - unsigned int seq) +static const struct nft_set_ext * +__nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, unsigned int seq) { struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; @@ -65,7 +65,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(priv->root.rb_node); while (parent != NULL) { if (read_seqcount_retry(&priv->count, seq)) - return false; + return NULL; rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -87,50 +87,48 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set } if (nft_rbtree_elem_expired(rbe)) - return false; + return NULL; if (nft_rbtree_interval_end(rbe)) { if (nft_set_is_anonymous(set)) - return false; + return NULL; parent = rcu_dereference_raw(parent->rb_left); interval = NULL; continue; } - *ext = &rbe->ext; - return true; + return &rbe->ext; } } if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && !nft_rbtree_elem_expired(interval) && - nft_rbtree_interval_start(interval)) { - *ext = &interval->ext; - return true; - } + nft_rbtree_interval_start(interval)) + return &interval->ext; - return false; + return NULL; } INDIRECT_CALLABLE_SCOPE -bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, const struct nft_set_ext **ext) +const struct nft_set_ext * +nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { struct nft_rbtree *priv = nft_set_priv(set); unsigned int seq = read_seqcount_begin(&priv->count); - bool 
ret; + const struct nft_set_ext *ext; - ret = __nft_rbtree_lookup(net, set, key, ext, seq); - if (ret || !read_seqcount_retry(&priv->count, seq)) - return ret; + ext = __nft_rbtree_lookup(net, set, key, seq); + if (ext || !read_seqcount_retry(&priv->count, seq)) + return ext; read_lock_bh(&priv->lock); seq = read_seqcount_begin(&priv->count); - ret = __nft_rbtree_lookup(net, set, key, ext, seq); + ext = __nft_rbtree_lookup(net, set, key, seq); read_unlock_bh(&priv->lock); - return ret; + return ext; } static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, -- cgit v1.2.3 From 531e61312104d991459af73c838396db26aa3550 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Jul 2025 19:05:14 +0200 Subject: netfilter: nft_set: remove indirection from update API call This stems from a time when sets and nft_dynset resided in different kernel modules. We can replace this with a direct call. We could even remove both ->update and ->delete, given it's only supported by rhashtable, but on the off-chance we'll see runtime add/delete for other types or a new set type, keep that as-is for now.
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ---- include/net/netfilter/nf_tables_core.h | 3 +++ net/netfilter/nft_dynset.c | 9 ++++----- net/netfilter/nft_set_hash.c | 4 +--- net/netfilter/nft_set_pipapo_avx2.c | 1 - 5 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5b6725475906..891e43a01bdc 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -464,10 +464,6 @@ struct nft_set_ops { const u32 *key); const struct nft_set_ext * (*update)(struct nft_set *set, const u32 *key, - struct nft_elem_priv * - (*new)(struct nft_set *, - const struct nft_expr *, - struct nft_regs *), const struct nft_expr *expr, struct nft_regs *regs); bool (*delete)(const struct nft_set *set, diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 6a52fb97b844..6c2f483d9828 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -188,4 +188,7 @@ void nft_objref_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_objref_map_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs *regs); #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index e24493d9e776..7807d8129664 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -44,9 +44,9 @@ static int nft_dynset_expr_setup(const struct nft_dynset *priv, return 0; } -static struct nft_elem_priv *nft_dynset_new(struct nft_set *set, - const struct nft_expr *expr, - struct nft_regs *regs) +struct nft_elem_priv *nft_dynset_new(struct nft_set *set, + const struct nft_expr *expr, + struct nft_regs 
*regs) { const struct nft_dynset *priv = nft_expr_priv(expr); struct nft_set_ext *ext; @@ -91,8 +91,7 @@ void nft_dynset_eval(const struct nft_expr *expr, return; } - ext = set->ops->update(set, &regs->data[priv->sreg_key], nft_dynset_new, - expr, regs); + ext = set->ops->update(set, &regs->data[priv->sreg_key], expr, regs); if (ext) { if (priv->op == NFT_DYNSET_OP_UPDATE && nft_set_ext_exists(ext, NFT_SET_EXT_TIMEOUT) && diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 9903c737c9f0..266d0c637225 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -123,8 +123,6 @@ nft_rhash_get(const struct net *net, const struct nft_set *set, static const struct nft_set_ext * nft_rhash_update(struct nft_set *set, const u32 *key, - struct nft_elem_priv *(*new)(struct nft_set *, const struct nft_expr *, - struct nft_regs *regs), const struct nft_expr *expr, struct nft_regs *regs) { struct nft_rhash *priv = nft_set_priv(set); @@ -141,7 +139,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key, if (he != NULL) goto out; - elem_priv = new(set, expr, regs); + elem_priv = nft_dynset_new(set, expr, regs); if (!elem_priv) goto err1; diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index 6c441e2dc8af..db5d367e43c4 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1137,7 +1137,6 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns * @net: Network namespace * @set: nftables API set representation * @key: nftables API element representation containing key data - * @ext: nftables API extension pointer, filled with matching reference * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
* -- cgit v1.2.3 From 511d10b4c2f91fb6aa676006b2bdff4df5d6e270 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 22 Jul 2025 10:18:33 -0700 Subject: sctp: Replace sockaddr with sockaddr_inet in sctp_addr union As part of the removal of the variably-sized sockaddr for kernel internals, replace struct sockaddr with sockaddr_inet in the sctp_addr union. No binary changes; the union size remains unchanged due to sockaddr_inet matching the size of sockaddr_in6. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250722171836.1078436-3-kees@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sctp/structs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 1ad7ce71d0a7..8a540ad9b509 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -51,9 +51,9 @@ * We should wean ourselves off this. */ union sctp_addr { + struct sockaddr_inet sa; /* Large enough for both address families */ struct sockaddr_in v4; struct sockaddr_in6 v6; - struct sockaddr sa; }; /* Forward declarations for data structures. */ -- cgit v1.2.3