Diffstat (limited to 'net')
-rw-r--r--  net/8021q/vlan_dev.c  31
-rw-r--r--  net/appletalk/aarp.c  45
-rw-r--r--  net/atm/lec.c  3
-rw-r--r--  net/atm/mpc.c  2
-rw-r--r--  net/ax25/af_ax25.c  30
-rw-r--r--  net/ax25/ax25_route.c  74
-rw-r--r--  net/batman-adv/bat_iv_ogm.c  3
-rw-r--r--  net/batman-adv/bat_v_ogm.c  3
-rw-r--r--  net/batman-adv/bridge_loop_avoidance.c  33
-rw-r--r--  net/batman-adv/distributed-arp-table.c  20
-rw-r--r--  net/batman-adv/gateway_client.c  18
-rw-r--r--  net/batman-adv/main.c  7
-rw-r--r--  net/batman-adv/main.h  4
-rw-r--r--  net/batman-adv/multicast.c  17
-rw-r--r--  net/batman-adv/netlink.c  146
-rw-r--r--  net/batman-adv/netlink.h  5
-rw-r--r--  net/batman-adv/originator.c  116
-rw-r--r--  net/batman-adv/soft-interface.c  16
-rw-r--r--  net/batman-adv/translation-table.c  92
-rw-r--r--  net/batman-adv/types.h  4
-rw-r--r--  net/bluetooth/6lowpan.c  7
-rw-r--r--  net/bluetooth/hci_core.c  96
-rw-r--r--  net/bluetooth/hci_event.c  67
-rw-r--r--  net/bluetooth/hci_sync.c  106
-rw-r--r--  net/bluetooth/hci_sysfs.c  19
-rw-r--r--  net/bluetooth/hidp/Kconfig  3
-rw-r--r--  net/bluetooth/iso.c  42
-rw-r--r--  net/bluetooth/l2cap_core.c  202
-rw-r--r--  net/bluetooth/l2cap_sock.c  15
-rw-r--r--  net/bluetooth/mgmt.c  135
-rw-r--r--  net/bluetooth/rfcomm/core.c  6
-rw-r--r--  net/bluetooth/sco.c  25
-rw-r--r--  net/bpf/test_run.c  5
-rw-r--r--  net/bridge/br.c  7
-rw-r--r--  net/bridge/br_arp_nd_proxy.c  2
-rw-r--r--  net/bridge/br_fdb.c  3
-rw-r--r--  net/bridge/br_forward.c  16
-rw-r--r--  net/bridge/br_input.c  20
-rw-r--r--  net/bridge/br_ioctl.c  36
-rw-r--r--  net/bridge/br_netfilter_hooks.c  30
-rw-r--r--  net/bridge/br_private.h  14
-rw-r--r--  net/bridge/br_sysfs_br.c  6
-rw-r--r--  net/bridge/br_vlan.c  48
-rw-r--r--  net/can/af_can.c  12
-rw-r--r--  net/can/af_can.h  12
-rw-r--r--  net/can/proc.c  46
-rw-r--r--  net/can/raw.c  2
-rw-r--r--  net/core/bpf_sk_storage.c  24
-rw-r--r--  net/core/dev.c  559
-rw-r--r--  net/core/dev.h  33
-rw-r--r--  net/core/dev_addr_lists.c  7
-rw-r--r--  net/core/dev_ioctl.c  82
-rw-r--r--  net/core/devmem.c  25
-rw-r--r--  net/core/dst.c  8
-rw-r--r--  net/core/fib_rules.c  2
-rw-r--r--  net/core/filter.c  126
-rw-r--r--  net/core/lwtunnel.c  65
-rw-r--r--  net/core/neighbour.c  1
-rw-r--r--  net/core/net-sysfs.c  39
-rw-r--r--  net/core/net_namespace.c  5
-rw-r--r--  net/core/netdev-genl.c  65
-rw-r--r--  net/core/netdev_rx_queue.c  1
-rw-r--r--  net/core/netpoll.c  19
-rw-r--r--  net/core/page_pool.c  154
-rw-r--r--  net/core/page_pool_user.c  2
-rw-r--r--  net/core/pktgen.c  7
-rw-r--r--  net/core/rtnetlink.c  122
-rw-r--r--  net/core/rtnl_net_debug.c  17
-rw-r--r--  net/core/skbuff.c  106
-rw-r--r--  net/core/skmsg.c  7
-rw-r--r--  net/core/sock.c  31
-rw-r--r--  net/core/timestamping.c  52
-rw-r--r--  net/core/xdp.c  327
-rw-r--r--  net/dccp/ipv4.c  3
-rw-r--r--  net/dccp/sysctl.c  4
-rw-r--r--  net/devlink/core.c  2
-rw-r--r--  net/devlink/health.c  67
-rw-r--r--  net/devlink/port.c  11
-rw-r--r--  net/dsa/dsa.c  61
-rw-r--r--  net/dsa/port.c  16
-rw-r--r--  net/dsa/tag_8021q.c  2
-rw-r--r--  net/dsa/tag_ksz.c  2
-rw-r--r--  net/dsa/tag_ocelot_8021q.c  2
-rw-r--r--  net/dsa/tag_sja1105.c  2
-rw-r--r--  net/dsa/user.c  34
-rw-r--r--  net/ethtool/Makefile  2
-rw-r--r--  net/ethtool/cmis.h  1
-rw-r--r--  net/ethtool/cmis_cdb.c  20
-rw-r--r--  net/ethtool/common.c  172
-rw-r--r--  net/ethtool/common.h  21
-rw-r--r--  net/ethtool/ioctl.c  4
-rw-r--r--  net/ethtool/netlink.c  62
-rw-r--r--  net/ethtool/netlink.h  11
-rw-r--r--  net/ethtool/pse-pd.c  8
-rw-r--r--  net/ethtool/rings.c  62
-rw-r--r--  net/ethtool/stats.c  39
-rw-r--r--  net/ethtool/strset.c  10
-rw-r--r--  net/ethtool/ts.h  20
-rw-r--r--  net/ethtool/tsconfig.c  457
-rw-r--r--  net/ethtool/tsinfo.c  361
-rw-r--r--  net/hsr/hsr_debugfs.c  9
-rw-r--r--  net/hsr/hsr_device.c  13
-rw-r--r--  net/hsr/hsr_main.h  10
-rw-r--r--  net/hsr/hsr_slave.c  5
-rw-r--r--  net/ipv4/af_inet.c  14
-rw-r--r--  net/ipv4/datagram.c  11
-rw-r--r--  net/ipv4/esp4.c  5
-rw-r--r--  net/ipv4/fib_rules.c  6
-rw-r--r--  net/ipv4/fib_trie.c  4
-rw-r--r--  net/ipv4/igmp.c  66
-rw-r--r--  net/ipv4/inet_connection_sock.c  11
-rw-r--r--  net/ipv4/inetpeer.c  18
-rw-r--r--  net/ipv4/ip_gre.c  17
-rw-r--r--  net/ipv4/ip_input.c  11
-rw-r--r--  net/ipv4/ip_output.c  33
-rw-r--r--  net/ipv4/ip_sockglue.c  14
-rw-r--r--  net/ipv4/ip_tunnel_core.c  4
-rw-r--r--  net/ipv4/proc.c  1
-rw-r--r--  net/ipv4/raw.c  2
-rw-r--r--  net/ipv4/route.c  9
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c  10
-rw-r--r--  net/ipv4/tcp.c  29
-rw-r--r--  net/ipv4/tcp_bpf.c  36
-rw-r--r--  net/ipv4/tcp_input.c  84
-rw-r--r--  net/ipv4/tcp_ipv4.c  7
-rw-r--r--  net/ipv4/tcp_minisocks.c  7
-rw-r--r--  net/ipv4/udp.c  42
-rw-r--r--  net/ipv6/addrconf.c  324
-rw-r--r--  net/ipv6/anycast.c  35
-rw-r--r--  net/ipv6/calipso.c  21
-rw-r--r--  net/ipv6/esp6.c  5
-rw-r--r--  net/ipv6/fib6_rules.c  57
-rw-r--r--  net/ipv6/ioam6_iptunnel.c  8
-rw-r--r--  net/ipv6/ip6_input.c  14
-rw-r--r--  net/ipv6/ip6_output.c  16
-rw-r--r--  net/ipv6/mcast.c  100
-rw-r--r--  net/ipv6/netfilter/nf_socket_ipv6.c  23
-rw-r--r--  net/ipv6/ping.c  1
-rw-r--r--  net/ipv6/raw.c  3
-rw-r--r--  net/ipv6/route.c  68
-rw-r--r--  net/ipv6/tcpv6_offload.c  21
-rw-r--r--  net/ipv6/udp.c  1
-rw-r--r--  net/ipv6/xfrm6_output.c  4
-rw-r--r--  net/l2tp/l2tp_eth.c  9
-rw-r--r--  net/l2tp/l2tp_ip.c  19
-rw-r--r--  net/llc/sysctl_net_llc.c  4
-rw-r--r--  net/mac80211/cfg.c  54
-rw-r--r--  net/mac80211/chan.c  7
-rw-r--r--  net/mac80211/debug.h  10
-rw-r--r--  net/mac80211/debugfs.c  48
-rw-r--r--  net/mac80211/debugfs_key.c  47
-rw-r--r--  net/mac80211/debugfs_key.h  15
-rw-r--r--  net/mac80211/debugfs_netdev.c  11
-rw-r--r--  net/mac80211/driver-ops.c  10
-rw-r--r--  net/mac80211/driver-ops.h  7
-rw-r--r--  net/mac80211/eht.c  9
-rw-r--r--  net/mac80211/ethtool.c  22
-rw-r--r--  net/mac80211/he.c  119
-rw-r--r--  net/mac80211/ibss.c  3
-rw-r--r--  net/mac80211/ieee80211_i.h  23
-rw-r--r--  net/mac80211/iface.c  48
-rw-r--r--  net/mac80211/key.c  2
-rw-r--r--  net/mac80211/main.c  14
-rw-r--r--  net/mac80211/mesh_hwmp.c  14
-rw-r--r--  net/mac80211/mesh_plink.c  5
-rw-r--r--  net/mac80211/mlme.c  1238
-rw-r--r--  net/mac80211/rx.c  29
-rw-r--r--  net/mac80211/sta_info.c  43
-rw-r--r--  net/mac80211/sta_info.h  12
-rw-r--r--  net/mac80211/tests/Makefile  2
-rw-r--r--  net/mac80211/tests/util.c  313
-rw-r--r--  net/mac80211/tests/util.h  36
-rw-r--r--  net/mac80211/trace.h  130
-rw-r--r--  net/mac80211/util.c  77
-rw-r--r--  net/mac80211/vht.c  33
-rw-r--r--  net/mac802154/ieee802154_i.h  3
-rw-r--r--  net/mac802154/tx.c  13
-rw-r--r--  net/mctp/af_mctp.c  3
-rw-r--r--  net/mctp/device.c  50
-rw-r--r--  net/mctp/route.c  10
-rw-r--r--  net/mctp/test/route-test.c  109
-rw-r--r--  net/mptcp/ctrl.c  21
-rw-r--r--  net/mptcp/options.c  6
-rw-r--r--  net/mptcp/pm_netlink.c  46
-rw-r--r--  net/mptcp/pm_userspace.c  295
-rw-r--r--  net/mptcp/protocol.c  8
-rw-r--r--  net/mptcp/protocol.h  9
-rw-r--r--  net/mptcp/sockopt.c  28
-rw-r--r--  net/mptcp/subflow.c  19
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c  50
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c  8
-rw-r--r--  net/netfilter/nf_conncount.c  6
-rw-r--r--  net/netfilter/nf_conntrack_amanda.c  2
-rw-r--r--  net/netfilter/nf_conntrack_broadcast.c  2
-rw-r--r--  net/netfilter/nf_conntrack_core.c  13
-rw-r--r--  net/netfilter/nf_conntrack_ecache.c  23
-rw-r--r--  net/netfilter/nf_conntrack_h323_main.c  4
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c  45
-rw-r--r--  net/netfilter/nf_conntrack_proto_sctp.c  21
-rw-r--r--  net/netfilter/nf_conntrack_sip.c  4
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c  11
-rw-r--r--  net/netfilter/nf_flow_table_core.c  193
-rw-r--r--  net/netfilter/nf_flow_table_ip.c  6
-rw-r--r--  net/netfilter/nf_tables_api.c  109
-rw-r--r--  net/netfilter/nf_tables_core.c  11
-rw-r--r--  net/netfilter/nfnetlink_queue.c  26
-rw-r--r--  net/netfilter/nft_chain_filter.c  48
-rw-r--r--  net/netfilter/nft_compat.c  8
-rw-r--r--  net/netfilter/nft_ct.c  8
-rw-r--r--  net/netfilter/nft_exthdr.c  10
-rw-r--r--  net/netfilter/nft_set_hash.c  3
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.c  3
-rw-r--r--  net/netfilter/nft_tunnel.c  6
-rw-r--r--  net/netfilter/nft_xfrm.c  3
-rw-r--r--  net/netfilter/xt_hashlimit.c  6
-rw-r--r--  net/netlabel/netlabel_unlabeled.c  44
-rw-r--r--  net/netlabel/netlabel_user.c  10
-rw-r--r--  net/netlink/af_netlink.c  1
-rw-r--r--  net/openvswitch/actions.c  6
-rw-r--r--  net/openvswitch/conntrack.c  30
-rw-r--r--  net/openvswitch/datapath.h  3
-rw-r--r--  net/openvswitch/flow_netlink.c  18
-rw-r--r--  net/packet/af_packet.c  2
-rw-r--r--  net/rxrpc/Makefile  1
-rw-r--r--  net/rxrpc/af_rxrpc.c  4
-rw-r--r--  net/rxrpc/ar-internal.h  342
-rw-r--r--  net/rxrpc/call_accept.c  22
-rw-r--r--  net/rxrpc/call_event.c  385
-rw-r--r--  net/rxrpc/call_object.c  66
-rw-r--r--  net/rxrpc/conn_client.c  26
-rw-r--r--  net/rxrpc/conn_event.c  28
-rw-r--r--  net/rxrpc/conn_object.c  14
-rw-r--r--  net/rxrpc/input.c  710
-rw-r--r--  net/rxrpc/input_rack.c  418
-rw-r--r--  net/rxrpc/insecure.c  5
-rw-r--r--  net/rxrpc/io_thread.c  113
-rw-r--r--  net/rxrpc/local_object.c  3
-rw-r--r--  net/rxrpc/misc.c  4
-rw-r--r--  net/rxrpc/output.c  590
-rw-r--r--  net/rxrpc/peer_event.c  114
-rw-r--r--  net/rxrpc/peer_object.c  33
-rw-r--r--  net/rxrpc/proc.c  61
-rw-r--r--  net/rxrpc/protocol.h  13
-rw-r--r--  net/rxrpc/recvmsg.c  18
-rw-r--r--  net/rxrpc/rtt.c  103
-rw-r--r--  net/rxrpc/rxkad.c  68
-rw-r--r--  net/rxrpc/rxperf.c  2
-rw-r--r--  net/rxrpc/security.c  4
-rw-r--r--  net/rxrpc/sendmsg.c  104
-rw-r--r--  net/rxrpc/sysctl.c  6
-rw-r--r--  net/rxrpc/txbuf.c  160
-rw-r--r--  net/sched/act_tunnel_key.c  2
-rw-r--r--  net/sched/cls_api.c  66
-rw-r--r--  net/sched/cls_flower.c  2
-rw-r--r--  net/sched/sch_api.c  12
-rw-r--r--  net/sched/sch_cake.c  45
-rw-r--r--  net/sched/sch_codel.c  10
-rw-r--r--  net/sched/sch_fq.c  14
-rw-r--r--  net/sched/sch_fq_codel.c  9
-rw-r--r--  net/sched/sch_fq_pie.c  6
-rw-r--r--  net/sched/sch_generic.c  55
-rw-r--r--  net/sched/sch_gred.c  7
-rw-r--r--  net/sched/sch_pie.c  5
-rw-r--r--  net/sched/sch_red.c  4
-rw-r--r--  net/sched/sch_sfb.c  4
-rw-r--r--  net/sched/sch_sfq.c  66
-rw-r--r--  net/sched/sch_skbprio.c  3
-rw-r--r--  net/sctp/protocol.c  10
-rw-r--r--  net/sctp/socket.c  22
-rw-r--r--  net/sctp/stream.c  2
-rw-r--r--  net/sctp/sysctl.c  4
-rw-r--r--  net/sctp/transport.c  2
-rw-r--r--  net/shaper/shaper.c  6
-rw-r--r--  net/smc/af_smc.c  10
-rw-r--r--  net/smc/smc_core.c  7
-rw-r--r--  net/smc/smc_core.h  11
-rw-r--r--  net/smc/smc_ib.c  3
-rw-r--r--  net/smc/smc_llc.c  21
-rw-r--r--  net/smc/smc_rx.c  2
-rw-r--r--  net/smc/smc_wr.c  42
-rw-r--r--  net/socket.c  65
-rw-r--r--  net/strparser/strparser.c  11
-rw-r--r--  net/sunrpc/auth_gss/Makefile  2
-rw-r--r--  net/sunrpc/auth_gss/gss_generic_token.c  231
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_crypto.c  55
-rw-r--r--  net/sunrpc/auth_gss/gss_krb5_internal.h  7
-rw-r--r--  net/sunrpc/auth_gss/gss_mech_switch.c  1
-rw-r--r--  net/sunrpc/cache.c  53
-rw-r--r--  net/sunrpc/clnt.c  29
-rw-r--r--  net/sunrpc/debugfs.c  15
-rw-r--r--  net/sunrpc/rpc_pipe.c  14
-rw-r--r--  net/sunrpc/svc.c  4
-rw-r--r--  net/sunrpc/svc_xprt.c  41
-rw-r--r--  net/sunrpc/xdr.c  6
-rw-r--r--  net/sunrpc/xprtmultipath.c  17
-rw-r--r--  net/sunrpc/xprtrdma/svc_rdma_transport.c  3
-rw-r--r--  net/switchdev/switchdev.c  25
-rw-r--r--  net/tipc/link.c  1
-rw-r--r--  net/tipc/name_table.c  4
-rw-r--r--  net/tipc/name_table.h  2
-rw-r--r--  net/tls/tls.h  3
-rw-r--r--  net/tls/tls_device.c  2
-rw-r--r--  net/tls/tls_main.c  81
-rw-r--r--  net/tls/tls_proc.c  5
-rw-r--r--  net/tls/tls_sw.c  140
-rw-r--r--  net/unix/Kconfig  4
-rw-r--r--  net/unix/af_unix.c  240
-rw-r--r--  net/unix/garbage.c  2
-rw-r--r--  net/vmw_vsock/af_vsock.c  6
-rw-r--r--  net/vmw_vsock/hyperv_transport.c  6
-rw-r--r--  net/wireless/chan.c  374
-rw-r--r--  net/wireless/core.c  60
-rw-r--r--  net/wireless/core.h  4
-rw-r--r--  net/wireless/mlme.c  92
-rw-r--r--  net/wireless/nl80211.c  578
-rw-r--r--  net/wireless/nl80211.h  3
-rw-r--r--  net/wireless/pmsr.c  4
-rw-r--r--  net/wireless/rdev-ops.h  41
-rw-r--r--  net/wireless/reg.c  55
-rw-r--r--  net/wireless/scan.c  80
-rw-r--r--  net/wireless/sme.c  12
-rw-r--r--  net/wireless/trace.h  122
-rw-r--r--  net/wireless/util.c  7
-rw-r--r--  net/wireless/wext-compat.c  317
-rw-r--r--  net/wireless/wext-core.c  4
-rw-r--r--  net/wireless/wext-sme.c  43
-rw-r--r--  net/xdp/xsk.c  8
-rw-r--r--  net/xdp/xsk_buff_pool.c  2
-rw-r--r--  net/xfrm/Kconfig  16
-rw-r--r--  net/xfrm/Makefile  1
-rw-r--r--  net/xfrm/trace_iptfs.h  218
-rw-r--r--  net/xfrm/xfrm_compat.c  10
-rw-r--r--  net/xfrm/xfrm_device.c  17
-rw-r--r--  net/xfrm/xfrm_input.c  27
-rw-r--r--  net/xfrm/xfrm_interface_core.c  2
-rw-r--r--  net/xfrm/xfrm_iptfs.c  2764
-rw-r--r--  net/xfrm/xfrm_output.c  56
-rw-r--r--  net/xfrm/xfrm_policy.c  28
-rw-r--r--  net/xfrm/xfrm_proc.c  2
-rw-r--r--  net/xfrm/xfrm_replay.c  1
-rw-r--r--  net/xfrm/xfrm_state.c  116
-rw-r--r--  net/xfrm/xfrm_user.c  79
342 files changed, 14235 insertions, 5800 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 91d134961357..ee7186e4d353 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -273,17 +273,6 @@ static int vlan_dev_open(struct net_device *dev)
goto out;
}
- if (dev->flags & IFF_ALLMULTI) {
- err = dev_set_allmulti(real_dev, 1);
- if (err < 0)
- goto del_unicast;
- }
- if (dev->flags & IFF_PROMISC) {
- err = dev_set_promiscuity(real_dev, 1);
- if (err < 0)
- goto clear_allmulti;
- }
-
ether_addr_copy(vlan->real_dev_addr, real_dev->dev_addr);
if (vlan->flags & VLAN_FLAG_GVRP)
@@ -297,12 +286,6 @@ static int vlan_dev_open(struct net_device *dev)
netif_carrier_on(dev);
return 0;
-clear_allmulti:
- if (dev->flags & IFF_ALLMULTI)
- dev_set_allmulti(real_dev, -1);
-del_unicast:
- if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
- dev_uc_del(real_dev, dev->dev_addr);
out:
netif_carrier_off(dev);
return err;
@@ -315,10 +298,6 @@ static int vlan_dev_stop(struct net_device *dev)
dev_mc_unsync(real_dev, dev);
dev_uc_unsync(real_dev, dev);
- if (dev->flags & IFF_ALLMULTI)
- dev_set_allmulti(real_dev, -1);
- if (dev->flags & IFF_PROMISC)
- dev_set_promiscuity(real_dev, -1);
if (!ether_addr_equal(dev->dev_addr, real_dev->dev_addr))
dev_uc_del(real_dev, dev->dev_addr);
@@ -490,12 +469,10 @@ static void vlan_dev_change_rx_flags(struct net_device *dev, int change)
{
struct net_device *real_dev = vlan_dev_priv(dev)->real_dev;
- if (dev->flags & IFF_UP) {
- if (change & IFF_ALLMULTI)
- dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
- if (change & IFF_PROMISC)
- dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
- }
+ if (change & IFF_ALLMULTI)
+ dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1);
+ if (change & IFF_PROMISC)
+ dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1);
}
static void vlan_dev_set_rx_mode(struct net_device *vlan_dev)
diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c
index 9fa0b246902b..05cbb3c227c5 100644
--- a/net/appletalk/aarp.c
+++ b/net/appletalk/aarp.c
@@ -432,49 +432,18 @@ static struct atalk_addr *__aarp_proxy_find(struct net_device *dev,
return a ? sa : NULL;
}
-/*
- * Probe a Phase 1 device or a device that requires its Net:Node to
- * be set via an ioctl.
- */
-static void aarp_send_probe_phase1(struct atalk_iface *iface)
-{
- struct ifreq atreq;
- struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr;
- const struct net_device_ops *ops = iface->dev->netdev_ops;
-
- sa->sat_addr.s_node = iface->address.s_node;
- sa->sat_addr.s_net = ntohs(iface->address.s_net);
-
- /* We pass the Net:Node to the drivers/cards by a Device ioctl. */
- if (!(ops->ndo_do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) {
- ops->ndo_do_ioctl(iface->dev, &atreq, SIOCGIFADDR);
- if (iface->address.s_net != htons(sa->sat_addr.s_net) ||
- iface->address.s_node != sa->sat_addr.s_node)
- iface->status |= ATIF_PROBE_FAIL;
-
- iface->address.s_net = htons(sa->sat_addr.s_net);
- iface->address.s_node = sa->sat_addr.s_node;
- }
-}
-
-
void aarp_probe_network(struct atalk_iface *atif)
{
- if (atif->dev->type == ARPHRD_LOCALTLK ||
- atif->dev->type == ARPHRD_PPP)
- aarp_send_probe_phase1(atif);
- else {
- unsigned int count;
+ unsigned int count;
- for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
- aarp_send_probe(atif->dev, &atif->address);
+ for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) {
+ aarp_send_probe(atif->dev, &atif->address);
- /* Defer 1/10th */
- msleep(100);
+ /* Defer 1/10th */
+ msleep(100);
- if (atif->status & ATIF_PROBE_FAIL)
- break;
- }
+ if (atif->status & ATIF_PROBE_FAIL)
+ break;
}
}
diff --git a/net/atm/lec.c b/net/atm/lec.c
index ffef658862db..a948dd47c3f3 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -181,6 +181,7 @@ static void
lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
+ unsigned int len = skb->len;
ATM_SKB(skb)->vcc = vcc;
atm_account_tx(vcc, skb);
@@ -191,7 +192,7 @@ lec_send(struct atm_vcc *vcc, struct sk_buff *skb)
}
dev->stats.tx_packets++;
- dev->stats.tx_bytes += skb->len;
+ dev->stats.tx_bytes += len;
}
static void lec_tx_timeout(struct net_device *dev, unsigned int txqueue)
diff --git a/net/atm/mpc.c b/net/atm/mpc.c
index 324e3ab96bb3..12da0269275c 100644
--- a/net/atm/mpc.c
+++ b/net/atm/mpc.c
@@ -1314,6 +1314,8 @@ static void MPOA_cache_impos_rcvd(struct k_message *msg,
holding_time = msg->content.eg_info.holding_time;
dprintk("(%s) entry = %p, holding_time = %u\n",
mpc->dev->name, entry, holding_time);
+ if (entry == NULL && !holding_time)
+ return;
if (entry == NULL && holding_time) {
entry = mpc->eg_ops->add_entry(msg, mpc);
mpc->eg_ops->put(entry);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index 9f3b8b682adb..3ee7dba34310 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1270,28 +1270,18 @@ static int __must_check ax25_connect(struct socket *sock,
}
}
- /*
- * Must bind first - autobinding in this may or may not work. If
- * the socket is already bound, check to see if the device has
- * been filled in, error if it hasn't.
- */
+ /* Must bind first - autobinding does not work. */
if (sock_flag(sk, SOCK_ZAPPED)) {
- /* check if we can remove this feature. It is broken. */
- printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n",
- current->comm);
- if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) {
- kfree(digi);
- goto out_release;
- }
+ kfree(digi);
+ err = -EINVAL;
+ goto out_release;
+ }
- ax25_fillin_cb(ax25, ax25->ax25_dev);
- ax25_cb_add(ax25);
- } else {
- if (ax25->ax25_dev == NULL) {
- kfree(digi);
- err = -EHOSTUNREACH;
- goto out_release;
- }
+ /* Check to see if the device has been filled in, error if it hasn't. */
+ if (ax25->ax25_dev == NULL) {
+ kfree(digi);
+ err = -EHOSTUNREACH;
+ goto out_release;
}
if (sk->sk_type == SOCK_SEQPACKET &&
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 69de75db0c9c..10577434f40b 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -373,80 +373,6 @@ ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev)
return ax25_rt;
}
-/*
- * Adjust path: If you specify a default route and want to connect
- * a target on the digipeater path but w/o having a special route
- * set before, the path has to be truncated from your target on.
- */
-static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat)
-{
- int k;
-
- for (k = 0; k < digipeat->ndigi; k++) {
- if (ax25cmp(addr, &digipeat->calls[k]) == 0)
- break;
- }
-
- digipeat->ndigi = k;
-}
-
-
-/*
- * Find which interface to use.
- */
-int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
-{
- ax25_uid_assoc *user;
- ax25_route *ax25_rt;
- int err = 0;
-
- ax25_route_lock_use();
- ax25_rt = ax25_get_route(addr, NULL);
- if (!ax25_rt) {
- ax25_route_lock_unuse();
- return -EHOSTUNREACH;
- }
- rcu_read_lock();
- if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) {
- err = -EHOSTUNREACH;
- goto put;
- }
-
- user = ax25_findbyuid(current_euid());
- if (user) {
- ax25->source_addr = user->call;
- ax25_uid_put(user);
- } else {
- if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) {
- err = -EPERM;
- goto put;
- }
- ax25->source_addr = *(ax25_address *)ax25->ax25_dev->dev->dev_addr;
- }
-
- if (ax25_rt->digipeat != NULL) {
- ax25->digipeat = kmemdup(ax25_rt->digipeat, sizeof(ax25_digi),
- GFP_ATOMIC);
- if (ax25->digipeat == NULL) {
- err = -ENOMEM;
- goto put;
- }
- ax25_adjust_path(addr, ax25->digipeat);
- }
-
- if (ax25->sk != NULL) {
- local_bh_disable();
- bh_lock_sock(ax25->sk);
- sock_reset_flag(ax25->sk, SOCK_ZAPPED);
- bh_unlock_sock(ax25->sk);
- local_bh_enable();
- }
-
-put:
- rcu_read_unlock();
- ax25_route_lock_unuse();
- return err;
-}
struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
ax25_address *dest, ax25_digi *digi)
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 07ae5dd1f150..b12645949ae5 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -325,8 +325,7 @@ batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
/* check if there is enough space for the optional TVLV */
next_buff_pos += ntohs(ogm_packet->tvlv_len);
- return (next_buff_pos <= packet_len) &&
- (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+ return next_buff_pos <= packet_len;
}
/* send a batman ogm to a given interface */
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
index e503ee0d896b..8f89ffe6020c 100644
--- a/net/batman-adv/bat_v_ogm.c
+++ b/net/batman-adv/bat_v_ogm.c
@@ -839,8 +839,7 @@ batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
/* check if there is enough space for the optional TVLV */
next_buff_pos += ntohs(ogm2_packet->tvlv_len);
- return (next_buff_pos <= packet_len) &&
- (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+ return next_buff_pos <= packet_len;
}
/**
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 449faf5a5487..8c814f790d17 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -12,6 +12,7 @@
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/crc16.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -38,7 +39,6 @@
#include <net/arp.h>
#include <net/genetlink.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -47,7 +47,6 @@
#include "log.h"
#include "netlink.h"
#include "originator.h"
-#include "soft-interface.h"
#include "translation-table.h"
static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};
@@ -2233,25 +2232,16 @@ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
int portid = NETLINK_CB(cb->skb).portid;
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
int idx = cb->args[1];
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
hash = bat_priv->bla.claim_hash;
@@ -2403,25 +2393,16 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
int portid = NETLINK_CB(cb->skb).portid;
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
int idx = cb->args[1];
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
hash = bat_priv->bla.backbone_hash;
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 801eff8a40e5..e5a07152d4ec 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -7,11 +7,11 @@
#include "distributed-arp-table.h"
#include "main.h"
-#include <linux/unaligned.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/byteorder/generic.h>
#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -32,11 +32,11 @@
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/udp.h>
+#include <linux/unaligned.h>
#include <linux/workqueue.h>
#include <net/arp.h>
#include <net/genetlink.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batman_adv.h>
#include "bridge_loop_avoidance.h"
@@ -46,7 +46,6 @@
#include "netlink.h"
#include "originator.h"
#include "send.h"
-#include "soft-interface.h"
#include "translation-table.h"
#include "tvlv.h"
@@ -937,25 +936,16 @@ int batadv_dat_cache_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
int portid = NETLINK_CB(cb->skb).portid;
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_hashtable *hash;
struct batadv_priv *bat_priv;
int bucket = cb->args[0];
int idx = cb->args[1];
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
hash = bat_priv->dat.hash;
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 0ddd8b4b3f4c..f68e34ed1f62 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -10,6 +10,7 @@
#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -31,7 +32,6 @@
#include <linux/sprintf.h>
#include <linux/stddef.h>
#include <linux/udp.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -40,7 +40,6 @@
#include "netlink.h"
#include "originator.h"
#include "routing.h"
-#include "soft-interface.h"
#include "translation-table.h"
/* These are the offsets of the "hw type" and "hw address length" in the dhcp
@@ -502,22 +501,13 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
int batadv_gw_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
struct batadv_hard_iface *primary_if = NULL;
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_priv *bat_priv;
- int ifindex;
int ret;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 8e0f44c71696..333e947afcce 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -637,6 +637,13 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
vhdr = (struct vlan_ethhdr *)(skb->data + header_len);
vid = ntohs(vhdr->h_vlan_TCI) & VLAN_VID_MASK;
+
+ /* VID 0 is only used to indicate "priority tag" frames which only
+ * contain priority information and no VID.
+ */
+ if (vid == 0)
+ return BATADV_NO_FLAGS;
+
vid |= BATADV_VLAN_HAS_TAG;
return vid;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 97ea71a052f8..964f3088af5b 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -7,13 +7,13 @@
#ifndef _NET_BATMAN_ADV_MAIN_H_
#define _NET_BATMAN_ADV_MAIN_H_
-#define BATADV_DRIVER_AUTHOR "Marek Lindner <mareklindner@neomailbox.ch>, " \
+#define BATADV_DRIVER_AUTHOR "Marek Lindner <marek.lindner@mailbox.org>, " \
"Simon Wunderlich <sw@simonwunderlich.de>"
#define BATADV_DRIVER_DESC "B.A.T.M.A.N. advanced"
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2024.3"
+#define BATADV_SOURCE_VERSION "2025.0"
#endif
/* B.A.T.M.A.N. parameters */
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 14088c4ff2f6..d95c418484fa 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -12,6 +12,7 @@
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -46,7 +47,6 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -56,7 +56,6 @@
#include "log.h"
#include "netlink.h"
#include "send.h"
-#include "soft-interface.h"
#include "translation-table.h"
#include "tvlv.h"
@@ -2104,21 +2103,13 @@ batadv_mcast_netlink_get_primary(struct netlink_callback *cb,
struct batadv_hard_iface **primary_if)
{
struct batadv_hard_iface *hard_iface = NULL;
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_priv *bat_priv;
- int ifindex;
int ret = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index 9362cd9d6f3d..eefba5600ded 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -158,8 +158,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
*
* Return: interface index, or 0.
*/
-int
-batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
+static int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype)
{
struct nlattr *attr = nlmsg_find_attr(nlh, GENL_HDRLEN, attrtype);
@@ -881,14 +880,14 @@ static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv,
}
/**
- * batadv_netlink_get_hardif() - Get hardif attributes
+ * batadv_netlink_cmd_get_hardif() - Get hardif attributes
* @skb: Netlink message with request data
* @info: receiver information
*
* Return: 0 on success or negative error number in case of failure
*/
-static int batadv_netlink_get_hardif(struct sk_buff *skb,
- struct genl_info *info)
+static int batadv_netlink_cmd_get_hardif(struct sk_buff *skb,
+ struct genl_info *info)
{
struct batadv_hard_iface *hard_iface = info->user_ptr[1];
struct batadv_priv *bat_priv = info->user_ptr[0];
@@ -964,28 +963,16 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb,
static int
batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_hard_iface *hard_iface;
struct batadv_priv *bat_priv;
- int ifindex;
int portid = NETLINK_CB(cb->skb).portid;
int skip = cb->args[0];
int i = 0;
- ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface)
- return -ENODEV;
-
- if (!batadv_softif_is_valid(soft_iface)) {
- dev_put(soft_iface);
- return -ENODEV;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
@@ -1150,23 +1137,17 @@ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info)
}
/**
- * batadv_get_softif_from_info() - Retrieve soft interface from genl attributes
+ * batadv_netlink_get_softif_from_ifindex() - Get soft-iface from ifindex
* @net: the applicable net namespace
- * @info: receiver information
+ * @ifindex: index of the soft interface
*
* Return: Pointer to soft interface (with increased refcnt) on success, error
* pointer on error
*/
static struct net_device *
-batadv_get_softif_from_info(struct net *net, struct genl_info *info)
+batadv_netlink_get_softif_from_ifindex(struct net *net, int ifindex)
{
struct net_device *soft_iface;
- int ifindex;
-
- if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
- return ERR_PTR(-EINVAL);
-
- ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
soft_iface = dev_get_by_index(net, ifindex);
if (!soft_iface)
@@ -1184,28 +1165,61 @@ err_put_softif:
}
/**
- * batadv_get_hardif_from_info() - Retrieve hardif from genl attributes
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_netlink_get_softif_from_info() - Get soft-iface from genl attributes
* @net: the applicable net namespace
* @info: receiver information
*
+ * Return: Pointer to soft interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct net_device *
+batadv_netlink_get_softif_from_info(struct net *net, struct genl_info *info)
+{
+ int ifindex;
+
+ if (!info->attrs[BATADV_ATTR_MESH_IFINDEX])
+ return ERR_PTR(-EINVAL);
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]);
+
+ return batadv_netlink_get_softif_from_ifindex(net, ifindex);
+}
+
+/**
+ * batadv_netlink_get_softif() - Retrieve soft interface from netlink callback
+ * @cb: callback structure containing arguments
+ *
+ * Return: Pointer to soft interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+struct net_device *batadv_netlink_get_softif(struct netlink_callback *cb)
+{
+ int ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_MESH_IFINDEX);
+ if (!ifindex)
+ return ERR_PTR(-ENONET);
+
+ return batadv_netlink_get_softif_from_ifindex(sock_net(cb->skb->sk),
+ ifindex);
+}
+
+/**
+ * batadv_netlink_get_hardif_from_ifindex() - Get hard-iface from ifindex
+ * @bat_priv: the bat priv with all the soft interface information
+ * @net: the applicable net namespace
+ * @ifindex: index of the hard interface
+ *
* Return: Pointer to hard interface (with increased refcnt) on success, error
* pointer on error
*/
static struct batadv_hard_iface *
-batadv_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net,
- struct genl_info *info)
+batadv_netlink_get_hardif_from_ifindex(struct batadv_priv *bat_priv,
+ struct net *net, int ifindex)
{
struct batadv_hard_iface *hard_iface;
struct net_device *hard_dev;
- unsigned int hardif_index;
-
- if (!info->attrs[BATADV_ATTR_HARD_IFINDEX])
- return ERR_PTR(-EINVAL);
-
- hardif_index = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]);
- hard_dev = dev_get_by_index(net, hardif_index);
+ hard_dev = dev_get_by_index(net, ifindex);
if (!hard_dev)
return ERR_PTR(-ENODEV);
@@ -1230,6 +1244,51 @@ err_put_harddev:
}
/**
+ * batadv_netlink_get_hardif_from_info() - Get hard-iface from genl attributes
+ * @bat_priv: the bat priv with all the soft interface information
+ * @net: the applicable net namespace
+ * @info: receiver information
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+static struct batadv_hard_iface *
+batadv_netlink_get_hardif_from_info(struct batadv_priv *bat_priv,
+ struct net *net, struct genl_info *info)
+{
+ int ifindex;
+
+ if (!info->attrs[BATADV_ATTR_HARD_IFINDEX])
+ return ERR_PTR(-EINVAL);
+
+ ifindex = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]);
+
+ return batadv_netlink_get_hardif_from_ifindex(bat_priv, net, ifindex);
+}
+
+/**
+ * batadv_netlink_get_hardif() - Retrieve hard interface from netlink callback
+ * @bat_priv: the bat priv with all the soft interface information
+ * @cb: callback structure containing arguments
+ *
+ * Return: Pointer to hard interface (with increased refcnt) on success, error
+ * pointer on error
+ */
+struct batadv_hard_iface *
+batadv_netlink_get_hardif(struct batadv_priv *bat_priv,
+ struct netlink_callback *cb)
+{
+ int ifindex = batadv_netlink_get_ifindex(cb->nlh,
+ BATADV_ATTR_HARD_IFINDEX);
+ if (!ifindex)
+ return ERR_PTR(-ENONET);
+
+ return batadv_netlink_get_hardif_from_ifindex(bat_priv,
+ sock_net(cb->skb->sk),
+ ifindex);
+}
+
+/**
* batadv_get_vlan_from_info() - Retrieve vlan from genl attributes
* @bat_priv: the bat priv with all the soft interface information
* @net: the applicable net namespace
@@ -1288,7 +1347,7 @@ static int batadv_pre_doit(const struct genl_split_ops *ops,
return -EINVAL;
if (ops->internal_flags & BATADV_FLAG_NEED_MESH) {
- soft_iface = batadv_get_softif_from_info(net, info);
+ soft_iface = batadv_netlink_get_softif_from_info(net, info);
if (IS_ERR(soft_iface))
return PTR_ERR(soft_iface);
@@ -1297,7 +1356,8 @@ static int batadv_pre_doit(const struct genl_split_ops *ops,
}
if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) {
- hard_iface = batadv_get_hardif_from_info(bat_priv, net, info);
+ hard_iface = batadv_netlink_get_hardif_from_info(bat_priv, net,
+ info);
if (IS_ERR(hard_iface)) {
ret = PTR_ERR(hard_iface);
goto err_put_softif;
@@ -1390,7 +1450,7 @@ static const struct genl_small_ops batadv_netlink_ops[] = {
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
/* can be retrieved by unprivileged users */
.dumpit = batadv_netlink_dump_hardif,
- .doit = batadv_netlink_get_hardif,
+ .doit = batadv_netlink_cmd_get_hardif,
.internal_flags = BATADV_FLAG_NEED_MESH |
BATADV_FLAG_NEED_HARDIF,
},
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index 876d2806a67d..2097c2ae98f1 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -15,7 +15,10 @@
void batadv_netlink_register(void);
void batadv_netlink_unregister(void);
-int batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype);
+struct net_device *batadv_netlink_get_softif(struct netlink_callback *cb);
+struct batadv_hard_iface *
+batadv_netlink_get_hardif(struct batadv_priv *bat_priv,
+ struct netlink_callback *cb);
int batadv_netlink_tpmeter_notify(struct batadv_priv *bat_priv, const u8 *dst,
u8 result, u32 test_time, u64 total_bytes,
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 8f6dd2c6ee41..bcc2e20e0cd6 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -9,6 +9,7 @@
#include <linux/atomic.h>
#include <linux/container_of.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -26,9 +27,7 @@
#include <linux/spinlock.h>
#include <linux/stddef.h>
#include <linux/workqueue.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
-#include <uapi/linux/batman_adv.h>
#include "bat_algo.h"
#include "distributed-arp-table.h"
@@ -41,7 +40,6 @@
#include "netlink.h"
#include "network-coding.h"
#include "routing.h"
-#include "soft-interface.h"
#include "translation-table.h"
/* hash class keys */
@@ -755,64 +753,48 @@ batadv_neigh_node_get_or_create(struct batadv_orig_node *orig_node,
*/
int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
+ struct batadv_hard_iface *primary_if, *hard_iface;
struct net_device *soft_iface;
- struct net_device *hard_iface = NULL;
- struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
struct batadv_priv *bat_priv;
- struct batadv_hard_iface *primary_if = NULL;
int ret;
- int ifindex, hard_ifindex;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
ret = -ENOENT;
- goto out;
+ goto out_put_soft_iface;
}
- hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_HARD_IFINDEX);
- if (hard_ifindex) {
- hard_iface = dev_get_by_index(net, hard_ifindex);
- if (hard_iface)
- hardif = batadv_hardif_get_by_netdev(hard_iface);
-
- if (!hardif) {
- ret = -ENODEV;
- goto out;
- }
-
- if (hardif->soft_iface != soft_iface) {
- ret = -ENOENT;
- goto out;
- }
+ hard_iface = batadv_netlink_get_hardif(bat_priv, cb);
+ if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) {
+ ret = PTR_ERR(hard_iface);
+ goto out_put_primary_if;
+ } else if (IS_ERR(hard_iface)) {
+ /* => PTR_ERR(hard_iface) == -ENONET
+ * => no hard-iface given, ok
+ */
+ hard_iface = BATADV_IF_DEFAULT;
}
if (!bat_priv->algo_ops->neigh.dump) {
ret = -EOPNOTSUPP;
- goto out;
+ goto out_put_hard_iface;
}
- bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hardif);
+ bat_priv->algo_ops->neigh.dump(msg, cb, bat_priv, hard_iface);
ret = msg->len;
- out:
- batadv_hardif_put(hardif);
- dev_put(hard_iface);
+out_put_hard_iface:
+ batadv_hardif_put(hard_iface);
+out_put_primary_if:
batadv_hardif_put(primary_if);
+out_put_soft_iface:
dev_put(soft_iface);
return ret;
@@ -1342,64 +1324,48 @@ static void batadv_purge_orig(struct work_struct *work)
*/
int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
+ struct batadv_hard_iface *primary_if, *hard_iface;
struct net_device *soft_iface;
- struct net_device *hard_iface = NULL;
- struct batadv_hard_iface *hardif = BATADV_IF_DEFAULT;
struct batadv_priv *bat_priv;
- struct batadv_hard_iface *primary_if = NULL;
int ret;
- int ifindex, hard_ifindex;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
primary_if = batadv_primary_if_get_selected(bat_priv);
if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) {
ret = -ENOENT;
- goto out;
+ goto out_put_soft_iface;
}
- hard_ifindex = batadv_netlink_get_ifindex(cb->nlh,
- BATADV_ATTR_HARD_IFINDEX);
- if (hard_ifindex) {
- hard_iface = dev_get_by_index(net, hard_ifindex);
- if (hard_iface)
- hardif = batadv_hardif_get_by_netdev(hard_iface);
-
- if (!hardif) {
- ret = -ENODEV;
- goto out;
- }
-
- if (hardif->soft_iface != soft_iface) {
- ret = -ENOENT;
- goto out;
- }
+ hard_iface = batadv_netlink_get_hardif(bat_priv, cb);
+ if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) {
+ ret = PTR_ERR(hard_iface);
+ goto out_put_primary_if;
+ } else if (IS_ERR(hard_iface)) {
+ /* => PTR_ERR(hard_iface) == -ENONET
+ * => no hard-iface given, ok
+ */
+ hard_iface = BATADV_IF_DEFAULT;
}
if (!bat_priv->algo_ops->orig.dump) {
ret = -EOPNOTSUPP;
- goto out;
+ goto out_put_hard_iface;
}
- bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hardif);
+ bat_priv->algo_ops->orig.dump(msg, cb, bat_priv, hard_iface);
ret = msg->len;
- out:
- batadv_hardif_put(hardif);
- dev_put(hard_iface);
+out_put_hard_iface:
+ batadv_hardif_put(hard_iface);
+out_put_primary_if:
batadv_hardif_put(primary_if);
+out_put_soft_iface:
dev_put(soft_iface);
return ret;
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index 2758aba47a2f..822d788a5f86 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -637,6 +637,14 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
if (proto != htons(ETH_P_8021Q))
return -EINVAL;
+ /* VID 0 is only used to indicate "priority tag" frames which only
+ * contain priority information and no VID. No management structures
+ * should be created for this VID and it should be handled like an
+ * untagged frame.
+ */
+ if (vid == 0)
+ return 0;
+
vid |= BATADV_VLAN_HAS_TAG;
/* if a new vlan is getting created and it already exists, it means that
@@ -684,6 +692,12 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
if (proto != htons(ETH_P_8021Q))
return -EINVAL;
+ /* "priority tag" frames are handled like "untagged" frames
+ * and no softif_vlan needs to be destroyed
+ */
+ if (vid == 0)
+ return 0;
+
vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG);
if (!vlan)
return -ENOENT;
@@ -783,13 +797,13 @@ static int batadv_softif_init_late(struct net_device *dev)
atomic_set(&bat_priv->mesh_state, BATADV_MESH_INACTIVE);
atomic_set(&bat_priv->bcast_seqno, 1);
atomic_set(&bat_priv->tt.vn, 0);
- atomic_set(&bat_priv->tt.local_changes, 0);
atomic_set(&bat_priv->tt.ogm_append_cnt, 0);
#ifdef CONFIG_BATMAN_ADV_BLA
atomic_set(&bat_priv->bla.num_requests, 0);
#endif
atomic_set(&bat_priv->tp_num, 0);
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
bat_priv->tt.last_changeset = NULL;
bat_priv->tt.last_changeset_len = 0;
bat_priv->isolation_mark = 0;
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 7d5de4cbb814..d4b71d34310f 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -15,6 +15,7 @@
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/crc32c.h>
+#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
@@ -39,7 +40,6 @@
#include <linux/workqueue.h>
#include <net/genetlink.h>
#include <net/netlink.h>
-#include <net/sock.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -423,8 +423,8 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
struct batadv_tt_change_node *tt_change_node, *entry, *safe;
struct batadv_tt_common_entry *common = &tt_local_entry->common;
u8 flags = common->flags | event_flags;
- bool event_removed = false;
bool del_op_requested, del_op_entry;
+ size_t changes;
tt_change_node = kmem_cache_alloc(batadv_tt_change_cache, GFP_ATOMIC);
if (!tt_change_node)
@@ -438,51 +438,45 @@ static void batadv_tt_local_event(struct batadv_priv *bat_priv,
del_op_requested = flags & BATADV_TT_CLIENT_DEL;
- /* check for ADD+DEL or DEL+ADD events */
+ /* check for ADD+DEL, DEL+ADD, ADD+ADD or DEL+DEL events */
spin_lock_bh(&bat_priv->tt.changes_list_lock);
+ changes = READ_ONCE(bat_priv->tt.local_changes);
list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
list) {
if (!batadv_compare_eth(entry->change.addr, common->addr))
continue;
- /* DEL+ADD in the same orig interval have no effect and can be
- * removed to avoid silly behaviour on the receiver side. The
- * other way around (ADD+DEL) can happen in case of roaming of
- * a client still in the NEW state. Roaming of NEW clients is
- * now possible due to automatically recognition of "temporary"
- * clients
- */
del_op_entry = entry->change.flags & BATADV_TT_CLIENT_DEL;
- if (!del_op_requested && del_op_entry)
- goto del;
- if (del_op_requested && !del_op_entry)
- goto del;
-
- /* this is a second add in the same originator interval. It
- * means that flags have been changed: update them!
- */
- if (!del_op_requested && !del_op_entry)
+ if (del_op_requested != del_op_entry) {
+ /* DEL+ADD in the same orig interval have no effect and
+ * can be removed to avoid silly behaviour on the
+ * receiver side. The other way around (ADD+DEL) can
+ * happen in case of roaming of a client still in the
+ * NEW state. Roaming of NEW clients is now possible due
+ * to automatically recognition of "temporary" clients
+ */
+ list_del(&entry->list);
+ kmem_cache_free(batadv_tt_change_cache, entry);
+ changes--;
+ } else {
+ /* this is a second add or del in the same originator
+ * interval. It could mean that flags have been changed
+ * (e.g. double add): update them
+ */
entry->change.flags = flags;
+ }
- continue;
-del:
- list_del(&entry->list);
- kmem_cache_free(batadv_tt_change_cache, entry);
kmem_cache_free(batadv_tt_change_cache, tt_change_node);
- event_removed = true;
- goto unlock;
+ goto update_changes;
}
/* track the change in the OGMinterval list */
list_add_tail(&tt_change_node->list, &bat_priv->tt.changes_list);
+ changes++;
-unlock:
+update_changes:
+ WRITE_ONCE(bat_priv->tt.local_changes, changes);
spin_unlock_bh(&bat_priv->tt.changes_list_lock);
-
- if (event_removed)
- atomic_dec(&bat_priv->tt.local_changes);
- else
- atomic_inc(&bat_priv->tt.local_changes);
}
/**
@@ -952,7 +946,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
size_t tt_extra_len = 0;
u16 tvlv_len;
- tt_diff_entries_num = atomic_read(&bat_priv->tt.local_changes);
+ tt_diff_entries_num = READ_ONCE(bat_priv->tt.local_changes);
tt_diff_len = batadv_tt_len(tt_diff_entries_num);
/* if we have too many changes for one packet don't send any
@@ -979,7 +973,7 @@ static void batadv_tt_tvlv_container_update(struct batadv_priv *bat_priv)
goto container_register;
spin_lock_bh(&bat_priv->tt.changes_list_lock);
- atomic_set(&bat_priv->tt.local_changes, 0);
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
list_for_each_entry_safe(entry, safe, &bat_priv->tt.changes_list,
list) {
@@ -1136,26 +1130,18 @@ batadv_tt_local_dump_bucket(struct sk_buff *msg, u32 portid,
*/
int batadv_tt_local_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_hashtable *hash;
int ret;
- int ifindex;
int bucket = cb->args[0];
int idx = cb->args[1];
int portid = NETLINK_CB(cb->skb).portid;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
@@ -1395,7 +1381,7 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
kmem_cache_free(batadv_tt_change_cache, entry);
}
- atomic_set(&bat_priv->tt.local_changes, 0);
+ WRITE_ONCE(bat_priv->tt.local_changes, 0);
spin_unlock_bh(&bat_priv->tt.changes_list_lock);
}
@@ -1911,28 +1897,20 @@ batadv_tt_global_dump_bucket(struct sk_buff *msg, u32 portid, u32 seq,
*/
int batadv_tt_global_dump(struct sk_buff *msg, struct netlink_callback *cb)
{
- struct net *net = sock_net(cb->skb->sk);
struct net_device *soft_iface;
struct batadv_priv *bat_priv;
struct batadv_hard_iface *primary_if = NULL;
struct batadv_hashtable *hash;
struct hlist_head *head;
int ret;
- int ifindex;
int bucket = cb->args[0];
int idx = cb->args[1];
int sub = cb->args[2];
int portid = NETLINK_CB(cb->skb).portid;
- ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX);
- if (!ifindex)
- return -EINVAL;
-
- soft_iface = dev_get_by_index(net, ifindex);
- if (!soft_iface || !batadv_softif_is_valid(soft_iface)) {
- ret = -ENODEV;
- goto out;
- }
+ soft_iface = batadv_netlink_get_softif(cb);
+ if (IS_ERR(soft_iface))
+ return PTR_ERR(soft_iface);
bat_priv = netdev_priv(soft_iface);
@@ -3656,7 +3634,7 @@ static void batadv_tt_local_commit_changes_nolock(struct batadv_priv *bat_priv)
{
lockdep_assert_held(&bat_priv->tt.commit_lock);
- if (atomic_read(&bat_priv->tt.local_changes) < 1) {
+ if (READ_ONCE(bat_priv->tt.local_changes) == 0) {
if (!batadv_atomic_dec_not_zero(&bat_priv->tt.ogm_append_cnt))
batadv_tt_tvlv_container_update(bat_priv);
return;
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 85a50096f5b2..fe89f08533fe 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1019,7 +1019,7 @@ struct batadv_priv_tt {
atomic_t ogm_append_cnt;
/** @local_changes: changes registered in an originator interval */
- atomic_t local_changes;
+ size_t local_changes;
/**
* @changes_list: tracks tt local changes within an originator interval
@@ -1041,7 +1041,7 @@ struct batadv_priv_tt {
*/
struct list_head roam_list;
- /** @changes_list_lock: lock protecting changes_list */
+ /** @changes_list_lock: lock protecting changes_list & local_changes */
spinlock_t changes_list_lock;
/** @req_list_lock: lock protecting req_list */
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 50cfec8ccac4..3c29778171c5 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -825,11 +825,16 @@ static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan,
unsigned long hdr_len,
unsigned long len, int nb)
{
+ struct sk_buff *skb;
+
/* Note that we must allocate using GFP_ATOMIC here as
* this function is called originally from netdev hard xmit
* function in atomic context.
*/
- return bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+ skb = bt_skb_alloc(hdr_len + len, GFP_ATOMIC);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+ return skb;
}
static void chan_suspend_cb(struct l2cap_chan *chan)
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 18ab5628f85a..94d9147612da 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -57,6 +57,7 @@ DEFINE_RWLOCK(hci_dev_list_lock);
/* HCI callback list */
LIST_HEAD(hci_cb_list);
+DEFINE_MUTEX(hci_cb_list_lock);
/* HCI ID Numbering */
static DEFINE_IDA(hci_index_ida);
@@ -1456,8 +1457,8 @@ static void hci_cmd_timeout(struct work_struct *work)
bt_dev_err(hdev, "command tx timeout");
}
- if (hdev->cmd_timeout)
- hdev->cmd_timeout(hdev);
+ if (hdev->reset)
+ hdev->reset(hdev);
atomic_set(&hdev->cmd_cnt, 1);
queue_work(hdev->workqueue, &hdev->cmd_work);
@@ -2181,26 +2182,6 @@ int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr,
return 0;
}
-int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr,
- u8 type)
-{
- struct bdaddr_list_with_flags *entry;
-
- if (!bacmp(bdaddr, BDADDR_ANY)) {
- hci_bdaddr_list_clear(list);
- return 0;
- }
-
- entry = hci_bdaddr_list_lookup_with_flags(list, bdaddr, type);
- if (!entry)
- return -ENOENT;
-
- list_del(&entry->list);
- kfree(entry);
-
- return 0;
-}
-
/* This function requires the caller holds hdev->lock */
struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev,
bdaddr_t *addr, u8 addr_type)
@@ -2992,7 +2973,9 @@ int hci_register_cb(struct hci_cb *cb)
{
BT_DBG("%p name %s", cb, cb->name);
- list_add_tail_rcu(&cb->list, &hci_cb_list);
+ mutex_lock(&hci_cb_list_lock);
+ list_add_tail(&cb->list, &hci_cb_list);
+ mutex_unlock(&hci_cb_list_lock);
return 0;
}
@@ -3002,8 +2985,9 @@ int hci_unregister_cb(struct hci_cb *cb)
{
BT_DBG("%p name %s", cb, cb->name);
- list_del_rcu(&cb->list);
- synchronize_rcu();
+ mutex_lock(&hci_cb_list_lock);
+ list_del(&cb->list);
+ mutex_unlock(&hci_cb_list_lock);
return 0;
}
@@ -3568,42 +3552,27 @@ static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type)
}
/* Schedule SCO */
-static void hci_sched_sco(struct hci_dev *hdev)
+static void hci_sched_sco(struct hci_dev *hdev, __u8 type)
{
struct hci_conn *conn;
struct sk_buff *skb;
- int quote;
+ int quote, *cnt;
+ unsigned int pkts = hdev->sco_pkts;
- BT_DBG("%s", hdev->name);
+ bt_dev_dbg(hdev, "type %u", type);
- if (!hci_conn_num(hdev, SCO_LINK))
+ if (!hci_conn_num(hdev, type) || !pkts)
return;
- while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) {
- while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
- BT_DBG("skb %p len %d", skb, skb->len);
- hci_send_frame(hdev, skb);
-
- conn->sent++;
- if (conn->sent == ~0)
- conn->sent = 0;
- }
- }
-}
-
-static void hci_sched_esco(struct hci_dev *hdev)
-{
- struct hci_conn *conn;
- struct sk_buff *skb;
- int quote;
-
- BT_DBG("%s", hdev->name);
-
- if (!hci_conn_num(hdev, ESCO_LINK))
- return;
+ /* Use sco_pkts if flow control has not been enabled which will limit
+ * the amount of buffer sent in a row.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ cnt = &pkts;
+ else
+ cnt = &hdev->sco_cnt;
- while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK,
- &quote))) {
+ while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
BT_DBG("skb %p len %d", skb, skb->len);
hci_send_frame(hdev, skb);
@@ -3611,8 +3580,17 @@ static void hci_sched_esco(struct hci_dev *hdev)
conn->sent++;
if (conn->sent == ~0)
conn->sent = 0;
+ (*cnt)--;
}
}
+
+ /* Rescheduled if all packets were sent and flow control is not enabled
+ * as there could be more packets queued that could not be sent and
+ * since no HCI_EV_NUM_COMP_PKTS event will be generated the reschedule
+ * needs to be forced.
+ */
+ if (!pkts && !hci_dev_test_flag(hdev, HCI_SCO_FLOWCTL))
+ queue_work(hdev->workqueue, &hdev->tx_work);
}
static void hci_sched_acl_pkt(struct hci_dev *hdev)
@@ -3648,8 +3626,8 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev)
chan->conn->sent++;
/* Send pending SCO packets right away */
- hci_sched_sco(hdev);
- hci_sched_esco(hdev);
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
}
}
@@ -3704,8 +3682,8 @@ static void hci_sched_le(struct hci_dev *hdev)
chan->conn->sent++;
/* Send pending SCO packets right away */
- hci_sched_sco(hdev);
- hci_sched_esco(hdev);
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
}
}
@@ -3750,8 +3728,8 @@ static void hci_tx_work(struct work_struct *work)
if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) {
/* Schedule queues and send stuff to HCI driver */
- hci_sched_sco(hdev);
- hci_sched_esco(hdev);
+ hci_sched_sco(hdev, SCO_LINK);
+ hci_sched_sco(hdev, ESCO_LINK);
hci_sched_iso(hdev);
hci_sched_acl(hdev);
hci_sched_le(hdev);
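As an illustrative sketch only, not taken from this patch (the protocol name and callback are hypothetical): with the hci_cb.match hook removed, a protocol that registers an hci_cb now filters on the link type inside each callback, and registration goes through the mutex-protected list path shown above.

static void myproto_connect_cfm(struct hci_conn *hcon, __u8 status)
{
	/* With hci_cb.match gone, each callback filters on link type itself */
	if (hcon->type != ACL_LINK)
		return;

	/* handle the confirmed connection */
}

static struct hci_cb myproto_cb = {
	.name        = "MYPROTO",
	.connect_cfm = myproto_connect_cfm,
};

/* Registration and removal now serialize on hci_cb_list_lock instead of RCU:
 * hci_register_cb(&myproto_cb); ... hci_unregister_cb(&myproto_cb);
 */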
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 2cc7a9306350..20d3cdcb14f6 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -151,7 +151,7 @@ static u8 hci_cc_exit_periodic_inq(struct hci_dev *hdev, void *data,
static u8 hci_cc_remote_name_req_cancel(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
- struct hci_ev_status *rp = data;
+ struct hci_rp_remote_name_req_cancel *rp = data;
bt_dev_dbg(hdev, "status 0x%2.2x", rp->status);
@@ -3391,23 +3391,30 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data,
hci_update_scan(hdev);
}
- params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
- if (params) {
- switch (params->auto_connect) {
- case HCI_AUTO_CONN_LINK_LOSS:
- if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ /* Re-enable passive scanning if the disconnected device is marked
+ * as auto-connectable.
+ */
+ if (conn->type == LE_LINK) {
+ params = hci_conn_params_lookup(hdev, &conn->dst,
+ conn->dst_type);
+ if (params) {
+ switch (params->auto_connect) {
+ case HCI_AUTO_CONN_LINK_LOSS:
+ if (ev->reason != HCI_ERROR_CONNECTION_TIMEOUT)
+ break;
+ fallthrough;
+
+ case HCI_AUTO_CONN_DIRECT:
+ case HCI_AUTO_CONN_ALWAYS:
+ hci_pend_le_list_del_init(params);
+ hci_pend_le_list_add(params,
+ &hdev->pend_le_conns);
+ hci_update_passive_scan(hdev);
break;
- fallthrough;
- case HCI_AUTO_CONN_DIRECT:
- case HCI_AUTO_CONN_ALWAYS:
- hci_pend_le_list_del_init(params);
- hci_pend_le_list_add(params, &hdev->pend_le_conns);
- hci_update_passive_scan(hdev);
- break;
-
- default:
- break;
+ default:
+ break;
+ }
}
}
@@ -4005,8 +4012,8 @@ static const struct hci_cc {
HCI_CC_STATUS(HCI_OP_INQUIRY_CANCEL, hci_cc_inquiry_cancel),
HCI_CC_STATUS(HCI_OP_PERIODIC_INQ, hci_cc_periodic_inq),
HCI_CC_STATUS(HCI_OP_EXIT_PERIODIC_INQ, hci_cc_exit_periodic_inq),
- HCI_CC_STATUS(HCI_OP_REMOTE_NAME_REQ_CANCEL,
- hci_cc_remote_name_req_cancel),
+ HCI_CC(HCI_OP_REMOTE_NAME_REQ_CANCEL, hci_cc_remote_name_req_cancel,
+ sizeof(struct hci_rp_remote_name_req_cancel)),
HCI_CC(HCI_OP_ROLE_DISCOVERY, hci_cc_role_discovery,
sizeof(struct hci_rp_role_discovery)),
HCI_CC(HCI_OP_READ_LINK_POLICY, hci_cc_read_link_policy,
@@ -4435,9 +4442,11 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
break;
case SCO_LINK:
+ case ESCO_LINK:
hdev->sco_cnt += count;
if (hdev->sco_cnt > hdev->sco_pkts)
hdev->sco_cnt = hdev->sco_pkts;
+
break;
case ISO_LINK:
@@ -6044,8 +6053,17 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
* a LE Direct Advertising Report event. In that case it is
* important to see if the address is matching the local
* controller address.
+ *
+ * If local privacy is not enabled the controller should not be
+ * generating such an event since, according to its documentation, it is
+ * only valid for filter_policy 0x02 and 0x03. The fact that it did
+ * generate an LE Direct Advertising Report means it is probably broken
+ * and won't generate any other event, which can potentially break the
+ * auto-connect logic, so when local privacy is not enabled this code
+ * ignores the direct_addr so the report is handled as a regular one.
*/
- if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr) {
+ if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr &&
+ hci_dev_test_flag(hdev, HCI_PRIVACY)) {
direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type,
&bdaddr_resolved);
@@ -6055,12 +6073,6 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
if (!hci_bdaddr_is_rpa(direct_addr, direct_addr_type))
return;
- /* If the controller is not using resolvable random
- * addresses, then this report can be ignored.
- */
- if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
- return;
-
/* If the local IRK of the controller does not match
* with the resolvable random address provided, then
* this report can be ignored.
@@ -6141,11 +6153,12 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr,
* event or send an immediate device found event if the data
* should not be stored for later.
*/
- if (!ext_adv && !has_pending_adv_report(hdev)) {
+ if (!has_pending_adv_report(hdev)) {
/* If the report will trigger a SCAN_REQ store it for
* later merging.
*/
- if (type == LE_ADV_IND || type == LE_ADV_SCAN_IND) {
+ if (!ext_adv && (type == LE_ADV_IND ||
+ type == LE_ADV_SCAN_IND)) {
store_pending_adv_report(hdev, bdaddr, bdaddr_type,
rssi, flags, data, len);
return;
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 7b2b04d6b856..14c3ee5c6a1e 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1066,7 +1066,7 @@ int hci_update_random_address_sync(struct hci_dev *hdev, bool require_privacy,
/* If Controller supports LL Privacy use own address type is
* 0x03
*/
- if (use_ll_privacy(hdev))
+ if (ll_privacy_capable(hdev))
*own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
else
*own_addr_type = ADDR_LE_DEV_RANDOM;
@@ -1786,30 +1786,6 @@ int hci_remove_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance,
HCI_CMD_TIMEOUT, sk);
}
-static int remove_ext_adv_sync(struct hci_dev *hdev, void *data)
-{
- struct adv_info *adv = data;
- u8 instance = 0;
-
- if (adv)
- instance = adv->instance;
-
- return hci_remove_ext_adv_instance_sync(hdev, instance, NULL);
-}
-
-int hci_remove_ext_adv_instance(struct hci_dev *hdev, u8 instance)
-{
- struct adv_info *adv = NULL;
-
- if (instance) {
- adv = hci_find_adv_instance(hdev, instance);
- if (!adv)
- return -EINVAL;
- }
-
- return hci_cmd_sync_queue(hdev, remove_ext_adv_sync, adv, NULL);
-}
-
int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason)
{
struct hci_cp_le_term_big cp;
@@ -2162,7 +2138,7 @@ static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val,
static int hci_le_set_addr_resolution_enable_sync(struct hci_dev *hdev, u8 val)
{
- if (!use_ll_privacy(hdev))
+ if (!ll_privacy_capable(hdev))
return 0;
/* If controller is not/already resolving we are done. */
@@ -2254,7 +2230,7 @@ static int hci_le_del_resolve_list_sync(struct hci_dev *hdev,
struct hci_cp_le_del_from_resolv_list cp;
struct bdaddr_list_with_irk *entry;
- if (!use_ll_privacy(hdev))
+ if (!ll_privacy_capable(hdev))
return 0;
/* Check if the IRK has been programmed */
@@ -2319,7 +2295,7 @@ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev,
struct bdaddr_list_with_irk *entry;
struct hci_conn_params *p;
- if (!use_ll_privacy(hdev))
+ if (!ll_privacy_capable(hdev))
return 0;
/* Attempt to program local identity address, type and irk if params is
@@ -2332,7 +2308,8 @@ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev,
hci_copy_identity_address(hdev, &cp.bdaddr, &cp.bdaddr_type);
memcpy(cp.peer_irk, hdev->irk, 16);
goto done;
- }
+ } else if (!(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION))
+ return 0;
irk = hci_find_irk_by_addr(hdev, &params->addr, params->addr_type);
if (!irk)
@@ -2379,6 +2356,10 @@ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev,
struct hci_cp_le_set_privacy_mode cp;
struct smp_irk *irk;
+ if (!ll_privacy_capable(hdev) ||
+ !(params->flags & HCI_CONN_FLAG_ADDRESS_RESOLUTION))
+ return 0;
+
/* If device privacy mode has already been set there is nothing to do */
if (params->privacy_mode == HCI_DEVICE_PRIVACY)
return 0;
@@ -2428,11 +2409,6 @@ static int hci_le_add_accept_list_sync(struct hci_dev *hdev,
if (*num_entries >= hdev->le_accept_list_size)
return -ENOSPC;
- /* Accept list can not be used with RPAs */
- if (!use_ll_privacy(hdev) &&
- hci_find_irk_by_addr(hdev, &params->addr, params->addr_type))
- return -EINVAL;
-
/* Attempt to program the device in the resolving list first to avoid
* having to rollback in case it fails since the resolving list is
* dynamic it can probably be smaller than the accept list.
@@ -2567,7 +2543,7 @@ static int hci_pause_addr_resolution(struct hci_dev *hdev)
{
int err;
- if (!use_ll_privacy(hdev))
+ if (!ll_privacy_capable(hdev))
return 0;
if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION))
@@ -2671,12 +2647,12 @@ static int hci_le_clear_accept_list_sync(struct hci_dev *hdev)
*
* Update is done using the following sequence:
*
- * use_ll_privacy((Disable Advertising) -> Disable Resolving List) ->
+ * ll_privacy_capable((Disable Advertising) -> Disable Resolving List) ->
* Remove Devices From Accept List ->
- * (has IRK && use_ll_privacy(Remove Devices From Resolving List))->
+ * (has IRK && ll_privacy_capable(Remove Devices From Resolving List))->
* Add Devices to Accept List ->
- * (has IRK && use_ll_privacy(Remove Devices From Resolving List)) ->
- * use_ll_privacy(Enable Resolving List -> (Enable Advertising)) ->
+ * (has IRK && ll_privacy_capable(Remove Devices From Resolving List)) ->
+ * ll_privacy_capable(Enable Resolving List -> (Enable Advertising)) ->
* Enable Scanning
*
* In case of failure advertising shall be restored to its original state and
@@ -2697,7 +2673,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev)
/* Pause advertising if resolving list can be used as controllers
* cannot accept resolving list modifications while advertising.
*/
- if (use_ll_privacy(hdev)) {
+ if (ll_privacy_capable(hdev)) {
err = hci_pause_advertising_sync(hdev);
if (err) {
bt_dev_err(hdev, "pause advertising failed: %d", err);
@@ -2842,7 +2818,7 @@ done:
bt_dev_err(hdev, "Unable to enable LL privacy: %d", err);
/* Resume advertising if it was paused */
- if (use_ll_privacy(hdev))
+ if (ll_privacy_capable(hdev))
hci_resume_advertising_sync(hdev);
/* Select filter policy to use accept list */
@@ -3100,7 +3076,7 @@ static int hci_passive_scan_sync(struct hci_dev *hdev)
* If there are devices to scan:
*
* Disable Scanning -> Update Accept List ->
- * use_ll_privacy((Disable Advertising) -> Disable Resolving List ->
+ * ll_privacy_capable((Disable Advertising) -> Disable Resolving List ->
* Update Resolving List -> Enable Resolving List -> (Enable Advertising)) ->
* Enable Scanning
*
@@ -3454,7 +3430,7 @@ int hci_update_name_sync(struct hci_dev *hdev)
*
* HCI_SSP_ENABLED(Enable SSP)
* HCI_LE_ENABLED(Enable LE)
- * HCI_LE_ENABLED(use_ll_privacy(Add local IRK to Resolving List) ->
+ * HCI_LE_ENABLED(ll_privacy_capable(Add local IRK to Resolving List) ->
* Update adv data)
* Enable Authentication
* lmp_bredr_capable(Set Fast Connectable -> Set Scan Type -> Set Class ->
@@ -3720,6 +3696,9 @@ static int hci_read_local_name_sync(struct hci_dev *hdev)
/* Read Voice Setting */
static int hci_read_voice_setting_sync(struct hci_dev *hdev)
{
+ if (!read_voice_setting_capable(hdev))
+ return 0;
+
return __hci_cmd_sync_status(hdev, HCI_OP_READ_VOICE_SETTING,
0, NULL, HCI_CMD_TIMEOUT);
}
@@ -3790,6 +3769,28 @@ static int hci_write_ca_timeout_sync(struct hci_dev *hdev)
sizeof(param), &param, HCI_CMD_TIMEOUT);
}
+/* Enable SCO flow control if supported */
+static int hci_write_sync_flowctl_sync(struct hci_dev *hdev)
+{
+ struct hci_cp_write_sync_flowctl cp;
+ int err;
+
+ /* Check if the controller supports SCO and HCI_OP_WRITE_SYNC_FLOWCTL */
+ if (!lmp_sco_capable(hdev) || !(hdev->commands[10] & BIT(4)) ||
+ !test_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks))
+ return 0;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = 0x01;
+
+ err = __hci_cmd_sync_status(hdev, HCI_OP_WRITE_SYNC_FLOWCTL,
+ sizeof(cp), &cp, HCI_CMD_TIMEOUT);
+ if (!err)
+ hci_dev_set_flag(hdev, HCI_SCO_FLOWCTL);
+
+ return err;
+}
+
/* BR Controller init stage 2 command sequence */
static const struct hci_init_stage br_init2[] = {
/* HCI_OP_READ_BUFFER_SIZE */
@@ -3808,6 +3809,8 @@ static const struct hci_init_stage br_init2[] = {
HCI_INIT(hci_clear_event_filter_sync),
/* HCI_OP_WRITE_CA_TIMEOUT */
HCI_INIT(hci_write_ca_timeout_sync),
+ /* HCI_OP_WRITE_SYNC_FLOWCTL */
+ HCI_INIT(hci_write_sync_flowctl_sync),
{}
};
@@ -4153,7 +4156,8 @@ static int hci_read_page_scan_type_sync(struct hci_dev *hdev)
* support the Read Page Scan Type command. Check support for
* this command in the bit mask of supported commands.
*/
- if (!(hdev->commands[13] & 0x01))
+ if (!(hdev->commands[13] & 0x01) ||
+ test_bit(HCI_QUIRK_BROKEN_READ_PAGE_SCAN_TYPE, &hdev->quirks))
return 0;
return __hci_cmd_sync_status(hdev, HCI_OP_READ_PAGE_SCAN_TYPE,
@@ -4229,6 +4233,14 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
if (use_enhanced_conn_complete(hdev))
events[1] |= 0x02; /* LE Enhanced Connection Complete */
+ /* Mark Device Privacy if Privacy Mode is supported */
+ if (privacy_mode_capable(hdev))
+ hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY;
+
+ /* Mark Address Resolution if LL Privacy is supported */
+ if (ll_privacy_capable(hdev))
+ hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION;
+
/* If the controller supports Extended Scanner Filter
* Policies, enable the corresponding event.
*/
@@ -5385,7 +5397,7 @@ int hci_stop_discovery_sync(struct hci_dev *hdev)
}
/* Resume advertising if it was paused */
- if (use_ll_privacy(hdev))
+ if (ll_privacy_capable(hdev))
hci_resume_advertising_sync(hdev);
/* No further actions needed for LE-only discovery */
@@ -5897,7 +5909,7 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval)
failed:
/* Resume advertising if it was paused */
- if (use_ll_privacy(hdev))
+ if (ll_privacy_capable(hdev))
hci_resume_advertising_sync(hdev);
/* Resume passive scanning */
@@ -6673,7 +6685,7 @@ int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
/* If Controller supports LL Privacy use own address type is
* 0x03
*/
- if (use_ll_privacy(hdev))
+ if (ll_privacy_capable(hdev))
*own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED;
else
*own_addr_type = ADDR_LE_DEV_RANDOM;
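Purely as an illustrative sketch, not part of this patch (the driver name and setup hook are hypothetical): a controller driver would opt in to the new synchronous flow control by declaring the quirk during setup, after which hci_write_sync_flowctl_sync() above sends HCI_OP_WRITE_SYNC_FLOWCTL and sets HCI_SCO_FLOWCTL.

static int myvendor_setup(struct hci_dev *hdev)
{
	/* Opt in: without this quirk hci_write_sync_flowctl_sync() is a
	 * no-op and SCO scheduling keeps using the sco_pkts fallback budget.
	 */
	set_bit(HCI_QUIRK_SYNC_FLOWCTL_SUPPORTED, &hdev->quirks);

	return 0;
}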
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index 4b54dbbf0729..041ce9adc378 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -90,9 +90,28 @@ static void bt_host_release(struct device *dev)
module_put(THIS_MODULE);
}
+static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct hci_dev *hdev = to_hci_dev(dev);
+
+ if (hdev->reset)
+ hdev->reset(hdev);
+
+ return count;
+}
+static DEVICE_ATTR_WO(reset);
+
+static struct attribute *bt_host_attrs[] = {
+ &dev_attr_reset.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(bt_host);
+
static const struct device_type bt_host = {
.name = "host",
.release = bt_host_release,
+ .groups = bt_host_groups,
};
void hci_init_sysfs(struct hci_dev *hdev)
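A hedged userspace sketch of the new write-only reset attribute; the sysfs path is assumed to follow the usual /sys/class/bluetooth/hciX layout, and per reset_store() above the written value is ignored, so any write triggers hdev->reset if the driver provides it.

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* Any write triggers the driver's reset hook; the value is ignored */
	int fd = open("/sys/class/bluetooth/hci0/reset", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, "1", 1) != 1) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}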
diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig
index 6746be07e222..e08aae35351a 100644
--- a/net/bluetooth/hidp/Kconfig
+++ b/net/bluetooth/hidp/Kconfig
@@ -1,8 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config BT_HIDP
tristate "HIDP protocol support"
- depends on BT_BREDR && INPUT && HID_SUPPORT
- select HID
+ depends on BT_BREDR && HID
help
HIDP (Human Interface Device Protocol) is a transport layer
for HID reports. HIDP is required for the Bluetooth Human
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 43d0ebe11100..0cb52a3308ba 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -1281,6 +1281,42 @@ static int iso_sock_accept(struct socket *sock, struct socket *newsock,
BT_DBG("new socket %p", ch);
+ /* A Broadcast Sink might require BIG sync to be terminated
+ * and re-established multiple times, while keeping the same
+ * PA sync handle active. To allow this, once all BIS
+ * connections have been accepted on a PA sync parent socket,
+ * "reset" socket state, to allow future BIG re-sync procedures.
+ */
+ if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) {
+ /* Iterate through the list of bound BIS indices
+ * and clear each BIS as it is accepted by
+ * user space, one by one.
+ */
+ for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) {
+ if (iso_pi(sk)->bc_bis[i] > 0) {
+ iso_pi(sk)->bc_bis[i] = 0;
+ iso_pi(sk)->bc_num_bis--;
+ break;
+ }
+ }
+
+ if (iso_pi(sk)->bc_num_bis == 0) {
+ /* Once the last BIS was accepted, reset parent
+ * socket parameters to mark that the listening
+ * process for BIS connections has been completed:
+ *
+ * 1. Reset the DEFER setup flag on the parent sk.
+ * 2. Clear the flag marking that the BIG create
+ * sync command is pending.
+ * 3. Transition socket state from BT_LISTEN to
+ * BT_CONNECTED.
+ */
+ set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags);
+ clear_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags);
+ sk->sk_state = BT_CONNECTED;
+ }
+ }
+
done:
release_sock(sk);
return err;
@@ -2151,11 +2187,6 @@ done:
return HCI_LM_ACCEPT;
}
-static bool iso_match(struct hci_conn *hcon)
-{
- return hcon->type == ISO_LINK || hcon->type == LE_LINK;
-}
-
static void iso_connect_cfm(struct hci_conn *hcon, __u8 status)
{
if (hcon->type != ISO_LINK) {
@@ -2337,7 +2368,6 @@ drop:
static struct hci_cb iso_cb = {
.name = "ISO",
- .match = iso_match,
.connect_cfm = iso_connect_cfm,
.disconn_cfm = iso_disconn_cfm,
};
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 728a5ce9b505..a55388fbf07c 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -119,7 +119,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
{
struct l2cap_chan *c;
- mutex_lock(&conn->chan_lock);
c = __l2cap_get_chan_by_scid(conn, cid);
if (c) {
/* Only lock if chan reference is not 0 */
@@ -127,7 +126,6 @@ static struct l2cap_chan *l2cap_get_chan_by_scid(struct l2cap_conn *conn,
if (c)
l2cap_chan_lock(c);
}
- mutex_unlock(&conn->chan_lock);
return c;
}
@@ -140,7 +138,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
{
struct l2cap_chan *c;
- mutex_lock(&conn->chan_lock);
c = __l2cap_get_chan_by_dcid(conn, cid);
if (c) {
/* Only lock if chan reference is not 0 */
@@ -148,7 +145,6 @@ static struct l2cap_chan *l2cap_get_chan_by_dcid(struct l2cap_conn *conn,
if (c)
l2cap_chan_lock(c);
}
- mutex_unlock(&conn->chan_lock);
return c;
}
@@ -418,7 +414,7 @@ static void l2cap_chan_timeout(struct work_struct *work)
if (!conn)
return;
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
/* __set_chan_timer() calls l2cap_chan_hold(chan) while scheduling
* this work. No need to call l2cap_chan_hold(chan) here again.
*/
@@ -439,7 +435,7 @@ static void l2cap_chan_timeout(struct work_struct *work)
l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
struct l2cap_chan *l2cap_chan_create(void)
@@ -642,9 +638,9 @@ void __l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
void l2cap_chan_add(struct l2cap_conn *conn, struct l2cap_chan *chan)
{
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
__l2cap_chan_add(conn, chan);
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
void l2cap_chan_del(struct l2cap_chan *chan, int err)
@@ -732,9 +728,9 @@ void l2cap_chan_list(struct l2cap_conn *conn, l2cap_chan_func_t func,
if (!conn)
return;
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
__l2cap_chan_list(conn, func, data);
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
EXPORT_SYMBOL_GPL(l2cap_chan_list);
@@ -746,7 +742,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work)
struct hci_conn *hcon = conn->hcon;
struct l2cap_chan *chan;
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
list_for_each_entry(chan, &conn->chan_l, list) {
l2cap_chan_lock(chan);
@@ -755,7 +751,7 @@ static void l2cap_conn_update_id_addr(struct work_struct *work)
l2cap_chan_unlock(chan);
}
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
static void l2cap_chan_le_connect_reject(struct l2cap_chan *chan)
@@ -949,6 +945,16 @@ static u8 l2cap_get_ident(struct l2cap_conn *conn)
return id;
}
+static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb,
+ u8 flags)
+{
+ /* Check if the hcon is still valid before attempting to send */
+ if (hci_conn_valid(conn->hcon->hdev, conn->hcon))
+ hci_send_acl(conn->hchan, skb, flags);
+ else
+ kfree_skb(skb);
+}
+
static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
void *data)
{
@@ -971,7 +977,7 @@ static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len,
bt_cb(skb)->force_active = BT_POWER_FORCE_ACTIVE_ON;
skb->priority = HCI_PRIO_MAX;
- hci_send_acl(conn->hchan, skb, flags);
+ l2cap_send_acl(conn, skb, flags);
}
static void l2cap_do_send(struct l2cap_chan *chan, struct sk_buff *skb)
@@ -1498,8 +1504,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
BT_DBG("conn %p", conn);
- mutex_lock(&conn->chan_lock);
-
list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
l2cap_chan_lock(chan);
@@ -1568,8 +1572,6 @@ static void l2cap_conn_start(struct l2cap_conn *conn)
l2cap_chan_unlock(chan);
}
-
- mutex_unlock(&conn->chan_lock);
}
static void l2cap_le_conn_ready(struct l2cap_conn *conn)
@@ -1615,7 +1617,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn)
if (hcon->type == ACL_LINK)
l2cap_request_info(conn);
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
list_for_each_entry(chan, &conn->chan_l, list) {
@@ -1633,7 +1635,7 @@ static void l2cap_conn_ready(struct l2cap_conn *conn)
l2cap_chan_unlock(chan);
}
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
if (hcon->type == LE_LINK)
l2cap_le_conn_ready(conn);
@@ -1648,14 +1650,10 @@ static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err)
BT_DBG("conn %p", conn);
- mutex_lock(&conn->chan_lock);
-
list_for_each_entry(chan, &conn->chan_l, list) {
if (test_bit(FLAG_FORCE_RELIABLE, &chan->flags))
l2cap_chan_set_err(chan, err);
}
-
- mutex_unlock(&conn->chan_lock);
}
static void l2cap_info_timeout(struct work_struct *work)
@@ -1666,7 +1664,9 @@ static void l2cap_info_timeout(struct work_struct *work)
conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE;
conn->info_ident = 0;
+ mutex_lock(&conn->lock);
l2cap_conn_start(conn);
+ mutex_unlock(&conn->lock);
}
/*
@@ -1758,6 +1758,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
BT_DBG("hcon %p conn %p, err %d", hcon, conn, err);
+ mutex_lock(&conn->lock);
+
kfree_skb(conn->rx_skb);
skb_queue_purge(&conn->pending_rx);
@@ -1776,8 +1778,6 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
/* Force the connection to be immediately dropped */
hcon->disc_timeout = 0;
- mutex_lock(&conn->chan_lock);
-
/* Kill channels */
list_for_each_entry_safe(chan, l, &conn->chan_l, list) {
l2cap_chan_hold(chan);
@@ -1791,15 +1791,14 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
l2cap_chan_put(chan);
}
- mutex_unlock(&conn->chan_lock);
-
- hci_chan_del(conn->hchan);
-
if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT)
cancel_delayed_work_sync(&conn->info_timer);
- hcon->l2cap_data = NULL;
+ hci_chan_del(conn->hchan);
conn->hchan = NULL;
+
+ hcon->l2cap_data = NULL;
+ mutex_unlock(&conn->lock);
l2cap_conn_put(conn);
}
@@ -2917,8 +2916,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
BT_DBG("conn %p", conn);
- mutex_lock(&conn->chan_lock);
-
list_for_each_entry(chan, &conn->chan_l, list) {
if (chan->chan_type != L2CAP_CHAN_RAW)
continue;
@@ -2933,8 +2930,6 @@ static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb)
if (chan->ops->recv(chan, nskb))
kfree_skb(nskb);
}
-
- mutex_unlock(&conn->chan_lock);
}
/* ---- L2CAP signalling commands ---- */
@@ -3957,12 +3952,12 @@ static void l2cap_connect(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd,
goto response;
}
- mutex_lock(&conn->chan_lock);
l2cap_chan_lock(pchan);
/* Check if the ACL is secure enough (if not SDP) */
if (psm != cpu_to_le16(L2CAP_PSM_SDP) &&
- !hci_conn_check_link_mode(conn->hcon)) {
+ (!hci_conn_check_link_mode(conn->hcon) ||
+ !l2cap_check_enc_key_size(conn->hcon))) {
conn->disc_reason = HCI_ERROR_AUTH_FAILURE;
result = L2CAP_CR_SEC_BLOCK;
goto response;
@@ -4064,7 +4059,6 @@ response:
}
l2cap_chan_unlock(pchan);
- mutex_unlock(&conn->chan_lock);
l2cap_chan_put(pchan);
}
@@ -4103,27 +4097,19 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x",
dcid, scid, result, status);
- mutex_lock(&conn->chan_lock);
-
if (scid) {
chan = __l2cap_get_chan_by_scid(conn, scid);
- if (!chan) {
- err = -EBADSLT;
- goto unlock;
- }
+ if (!chan)
+ return -EBADSLT;
} else {
chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
- if (!chan) {
- err = -EBADSLT;
- goto unlock;
- }
+ if (!chan)
+ return -EBADSLT;
}
chan = l2cap_chan_hold_unless_zero(chan);
- if (!chan) {
- err = -EBADSLT;
- goto unlock;
- }
+ if (!chan)
+ return -EBADSLT;
err = 0;
@@ -4161,9 +4147,6 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn,
l2cap_chan_unlock(chan);
l2cap_chan_put(chan);
-unlock:
- mutex_unlock(&conn->chan_lock);
-
return err;
}
@@ -4451,11 +4434,7 @@ static inline int l2cap_disconnect_req(struct l2cap_conn *conn,
chan->ops->set_shutdown(chan);
- l2cap_chan_unlock(chan);
- mutex_lock(&conn->chan_lock);
- l2cap_chan_lock(chan);
l2cap_chan_del(chan, ECONNRESET);
- mutex_unlock(&conn->chan_lock);
chan->ops->close(chan);
@@ -4492,11 +4471,7 @@ static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn,
return 0;
}
- l2cap_chan_unlock(chan);
- mutex_lock(&conn->chan_lock);
- l2cap_chan_lock(chan);
l2cap_chan_del(chan, 0);
- mutex_unlock(&conn->chan_lock);
chan->ops->close(chan);
@@ -4694,13 +4669,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x",
dcid, mtu, mps, credits, result);
- mutex_lock(&conn->chan_lock);
-
chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
- if (!chan) {
- err = -EBADSLT;
- goto unlock;
- }
+ if (!chan)
+ return -EBADSLT;
err = 0;
@@ -4748,9 +4719,6 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
l2cap_chan_unlock(chan);
-unlock:
- mutex_unlock(&conn->chan_lock);
-
return err;
}
@@ -4862,7 +4830,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
goto response;
}
- mutex_lock(&conn->chan_lock);
l2cap_chan_lock(pchan);
if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
@@ -4928,7 +4895,6 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
response_unlock:
l2cap_chan_unlock(pchan);
- mutex_unlock(&conn->chan_lock);
l2cap_chan_put(pchan);
if (result == L2CAP_CR_PEND)
@@ -5062,7 +5028,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
goto response;
}
- mutex_lock(&conn->chan_lock);
l2cap_chan_lock(pchan);
if (!smp_sufficient_security(conn->hcon, pchan->sec_level,
@@ -5137,7 +5102,6 @@ static inline int l2cap_ecred_conn_req(struct l2cap_conn *conn,
unlock:
l2cap_chan_unlock(pchan);
- mutex_unlock(&conn->chan_lock);
l2cap_chan_put(pchan);
response:
@@ -5174,8 +5138,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
BT_DBG("mtu %u mps %u credits %u result 0x%4.4x", mtu, mps, credits,
result);
- mutex_lock(&conn->chan_lock);
-
cmd_len -= sizeof(*rsp);
list_for_each_entry_safe(chan, tmp, &conn->chan_l, list) {
@@ -5261,8 +5223,6 @@ static inline int l2cap_ecred_conn_rsp(struct l2cap_conn *conn,
l2cap_chan_unlock(chan);
}
- mutex_unlock(&conn->chan_lock);
-
return err;
}
@@ -5375,8 +5335,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn,
if (cmd_len < sizeof(*rej))
return -EPROTO;
- mutex_lock(&conn->chan_lock);
-
chan = __l2cap_get_chan_by_ident(conn, cmd->ident);
if (!chan)
goto done;
@@ -5391,7 +5349,6 @@ static inline int l2cap_le_command_rej(struct l2cap_conn *conn,
l2cap_chan_put(chan);
done:
- mutex_unlock(&conn->chan_lock);
return 0;
}
@@ -6846,8 +6803,12 @@ static void process_pending_rx(struct work_struct *work)
BT_DBG("");
+ mutex_lock(&conn->lock);
+
while ((skb = skb_dequeue(&conn->pending_rx)))
l2cap_recv_frame(conn, skb);
+
+ mutex_unlock(&conn->lock);
}
static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
@@ -6886,7 +6847,7 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR;
mutex_init(&conn->ident_lock);
- mutex_init(&conn->chan_lock);
+ mutex_init(&conn->lock);
INIT_LIST_HEAD(&conn->chan_l);
INIT_LIST_HEAD(&conn->users);
@@ -7077,7 +7038,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
}
}
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
l2cap_chan_lock(chan);
if (cid && __l2cap_get_chan_by_dcid(conn, cid)) {
@@ -7118,7 +7079,7 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
chan_unlock:
l2cap_chan_unlock(chan);
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
done:
hci_dev_unlock(hdev);
hci_dev_put(hdev);
@@ -7222,11 +7183,6 @@ static struct l2cap_chan *l2cap_global_fixed_chan(struct l2cap_chan *c,
return NULL;
}
-static bool l2cap_match(struct hci_conn *hcon)
-{
- return hcon->type == ACL_LINK || hcon->type == LE_LINK;
-}
-
static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
{
struct hci_dev *hdev = hcon->hdev;
@@ -7234,6 +7190,9 @@ static void l2cap_connect_cfm(struct hci_conn *hcon, u8 status)
struct l2cap_chan *pchan;
u8 dst_type;
+ if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+ return;
+
BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status);
if (status) {
@@ -7298,6 +7257,9 @@ int l2cap_disconn_ind(struct hci_conn *hcon)
static void l2cap_disconn_cfm(struct hci_conn *hcon, u8 reason)
{
+ if (hcon->type != ACL_LINK && hcon->type != LE_LINK)
+ return;
+
BT_DBG("hcon %p reason %d", hcon, reason);
l2cap_conn_del(hcon, bt_to_errno(reason));
@@ -7330,7 +7292,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
BT_DBG("conn %p status 0x%2.2x encrypt %u", conn, status, encrypt);
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
list_for_each_entry(chan, &conn->chan_l, list) {
l2cap_chan_lock(chan);
@@ -7404,7 +7366,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt)
l2cap_chan_unlock(chan);
}
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
}
/* Append fragment into frame respecting the maximum len of rx_skb */
@@ -7471,19 +7433,45 @@ static void l2cap_recv_reset(struct l2cap_conn *conn)
conn->rx_len = 0;
}
+struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c)
+{
+ if (!c)
+ return NULL;
+
+ BT_DBG("conn %p orig refcnt %u", c, kref_read(&c->ref));
+
+ if (!kref_get_unless_zero(&c->ref))
+ return NULL;
+
+ return c;
+}
+
void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
{
- struct l2cap_conn *conn = hcon->l2cap_data;
+ struct l2cap_conn *conn;
int len;
+ /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */
+ hci_dev_lock(hcon->hdev);
+
+ conn = hcon->l2cap_data;
+
if (!conn)
conn = l2cap_conn_add(hcon);
- if (!conn)
- goto drop;
+ conn = l2cap_conn_hold_unless_zero(conn);
+
+ hci_dev_unlock(hcon->hdev);
+
+ if (!conn) {
+ kfree_skb(skb);
+ return;
+ }
BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags);
+ mutex_lock(&conn->lock);
+
switch (flags) {
case ACL_START:
case ACL_START_NO_FLUSH:
@@ -7508,7 +7496,7 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
if (len == skb->len) {
/* Complete frame received */
l2cap_recv_frame(conn, skb);
- return;
+ goto unlock;
}
BT_DBG("Start: total len %d, frag len %u", len, skb->len);
@@ -7516,8 +7504,24 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
if (skb->len > len) {
BT_ERR("Frame is too long (len %u, expected len %d)",
skb->len, len);
+ /* PTS test cases L2CAP/COS/CED/BI-14-C and BI-15-C
+ * (Multiple Signaling Command in one PDU, Data
+ * Truncated, BR/EDR) send a C-frame to the IUT with
+ * PDU Length set to 8 and Channel ID set to the
+ * correct signaling channel for the logical link.
+ * The Information payload contains one L2CAP_ECHO_REQ
+ * packet with Data Length set to 0 with 0 octets of
+ * echo data, and one invalid command packet whose data
+ * is truncated in the PDU but present in the HCI packet.
+ *
+ * Shorten the socket buffer to the PDU length so
+ * the valid commands in the PDU can be processed
+ * before marking the socket unreliable.
+ */
+ skb->len = len;
+ l2cap_recv_frame(conn, skb);
l2cap_conn_unreliable(conn, ECOMM);
- goto drop;
+ goto unlock;
}
/* Append fragment into frame (with header) */
@@ -7572,11 +7576,13 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
drop:
kfree_skb(skb);
+unlock:
+ mutex_unlock(&conn->lock);
+ l2cap_conn_put(conn);
}
static struct hci_cb l2cap_cb = {
.name = "L2CAP",
- .match = l2cap_match,
.connect_cfm = l2cap_connect_cfm,
.disconn_cfm = l2cap_disconn_cfm,
.security_cfm = l2cap_security_cfm,
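The new l2cap_conn_hold_unless_zero() and its SCO counterpart follow the standard kref pattern; for reference, a generic sketch of that pattern on a hypothetical object (not L2CAP-specific and not part of this patch):

struct my_obj {
	struct kref ref;
};

static void my_obj_release(struct kref *kref)
{
	kfree(container_of(kref, struct my_obj, ref));
}

static struct my_obj *my_obj_hold_unless_zero(struct my_obj *obj)
{
	/* Take a reference only if the object is not already being freed */
	if (!obj || !kref_get_unless_zero(&obj->ref))
		return NULL;
	return obj;
}

static void my_obj_put(struct my_obj *obj)
{
	kref_put(&obj->ref, my_obj_release);
}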
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 46ea0bee2259..acd11b268b98 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -1326,9 +1326,10 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
/* prevent sk structure from being freed whilst unlocked */
sock_hold(sk);
- chan = l2cap_pi(sk)->chan;
/* prevent chan structure from being freed whilst unlocked */
- l2cap_chan_hold(chan);
+ chan = l2cap_chan_hold_unless_zero(l2cap_pi(sk)->chan);
+ if (!chan)
+ goto shutdown_already;
BT_DBG("chan %p state %s", chan, state_to_string(chan->state));
@@ -1358,22 +1359,20 @@ static int l2cap_sock_shutdown(struct socket *sock, int how)
release_sock(sk);
l2cap_chan_lock(chan);
- conn = chan->conn;
- if (conn)
- /* prevent conn structure from being freed */
- l2cap_conn_get(conn);
+ /* prevent conn structure from being freed */
+ conn = l2cap_conn_hold_unless_zero(chan->conn);
l2cap_chan_unlock(chan);
if (conn)
/* mutex lock must be taken before l2cap_chan_lock() */
- mutex_lock(&conn->chan_lock);
+ mutex_lock(&conn->lock);
l2cap_chan_lock(chan);
l2cap_chan_close(chan, 0);
l2cap_chan_unlock(chan);
if (conn) {
- mutex_unlock(&conn->chan_lock);
+ mutex_unlock(&conn->lock);
l2cap_conn_put(conn);
}
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index aa8b0df41291..621c555f639b 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -210,7 +210,7 @@ static const u16 mgmt_untrusted_events[] = {
MGMT_EV_EXP_FEATURE_CHANGED,
};
-#define CACHE_TIMEOUT msecs_to_jiffies(2 * 1000)
+#define CACHE_TIMEOUT secs_to_jiffies(2)
#define ZERO_KEY "\x00\x00\x00\x00\x00\x00\x00\x00" \
"\x00\x00\x00\x00\x00\x00\x00\x00"
@@ -4417,12 +4417,6 @@ static const u8 le_simultaneous_roles_uuid[16] = {
0x96, 0x46, 0xc0, 0x42, 0xb5, 0x10, 0x1b, 0x67,
};
-/* 15c0a148-c273-11ea-b3de-0242ac130004 */
-static const u8 rpa_resolution_uuid[16] = {
- 0x04, 0x00, 0x13, 0xac, 0x42, 0x02, 0xde, 0xb3,
- 0xea, 0x11, 0x73, 0xc2, 0x48, 0xa1, 0xc0, 0x15,
-};
-
/* 6fbaf188-05e0-496a-9885-d6ddfdb4e03e */
static const u8 iso_socket_uuid[16] = {
0x3e, 0xe0, 0xb4, 0xfd, 0xdd, 0xd6, 0x85, 0x98,
@@ -4473,17 +4467,6 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
idx++;
}
- if (hdev && ll_privacy_capable(hdev)) {
- if (hci_dev_test_flag(hdev, HCI_ENABLE_LL_PRIVACY))
- flags = BIT(0) | BIT(1);
- else
- flags = BIT(1);
-
- memcpy(rp->features[idx].uuid, rpa_resolution_uuid, 16);
- rp->features[idx].flags = cpu_to_le32(flags);
- idx++;
- }
-
if (hdev && (aosp_has_quality_report(hdev) ||
hdev->set_quality_report)) {
if (hci_dev_test_flag(hdev, HCI_QUALITY_REPORT))
@@ -4540,27 +4523,6 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
return status;
}
-static int exp_ll_privacy_feature_changed(bool enabled, struct hci_dev *hdev,
- struct sock *skip)
-{
- struct mgmt_ev_exp_feature_changed ev;
-
- memset(&ev, 0, sizeof(ev));
- memcpy(ev.uuid, rpa_resolution_uuid, 16);
- ev.flags = cpu_to_le32((enabled ? BIT(0) : 0) | BIT(1));
-
- // Do we need to be atomic with the conn_flags?
- if (enabled && privacy_mode_capable(hdev))
- hdev->conn_flags |= HCI_CONN_FLAG_DEVICE_PRIVACY;
- else
- hdev->conn_flags &= ~HCI_CONN_FLAG_DEVICE_PRIVACY;
-
- return mgmt_limited_event(MGMT_EV_EXP_FEATURE_CHANGED, hdev,
- &ev, sizeof(ev),
- HCI_MGMT_EXP_FEATURE_EVENTS, skip);
-
-}
-
static int exp_feature_changed(struct hci_dev *hdev, const u8 *uuid,
bool enabled, struct sock *skip)
{
@@ -4601,16 +4563,6 @@ static int set_zero_key_func(struct sock *sk, struct hci_dev *hdev,
}
#endif
- if (hdev && use_ll_privacy(hdev) && !hdev_is_powered(hdev)) {
- bool changed;
-
- changed = hci_dev_test_and_clear_flag(hdev,
- HCI_ENABLE_LL_PRIVACY);
- if (changed)
- exp_feature_changed(hdev, rpa_resolution_uuid, false,
- sk);
- }
-
hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE,
@@ -4716,71 +4668,6 @@ static int set_mgmt_mesh_func(struct sock *sk, struct hci_dev *hdev,
return err;
}
-static int set_rpa_resolution_func(struct sock *sk, struct hci_dev *hdev,
- struct mgmt_cp_set_exp_feature *cp,
- u16 data_len)
-{
- struct mgmt_rp_set_exp_feature rp;
- bool val, changed;
- int err;
- u32 flags;
-
- /* Command requires to use the controller index */
- if (!hdev)
- return mgmt_cmd_status(sk, MGMT_INDEX_NONE,
- MGMT_OP_SET_EXP_FEATURE,
- MGMT_STATUS_INVALID_INDEX);
-
- /* Changes can only be made when controller is powered down */
- if (hdev_is_powered(hdev))
- return mgmt_cmd_status(sk, hdev->id,
- MGMT_OP_SET_EXP_FEATURE,
- MGMT_STATUS_REJECTED);
-
- /* Parameters are limited to a single octet */
- if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1)
- return mgmt_cmd_status(sk, hdev->id,
- MGMT_OP_SET_EXP_FEATURE,
- MGMT_STATUS_INVALID_PARAMS);
-
- /* Only boolean on/off is supported */
- if (cp->param[0] != 0x00 && cp->param[0] != 0x01)
- return mgmt_cmd_status(sk, hdev->id,
- MGMT_OP_SET_EXP_FEATURE,
- MGMT_STATUS_INVALID_PARAMS);
-
- val = !!cp->param[0];
-
- if (val) {
- changed = !hci_dev_test_and_set_flag(hdev,
- HCI_ENABLE_LL_PRIVACY);
- hci_dev_clear_flag(hdev, HCI_ADVERTISING);
-
- /* Enable LL privacy + supported settings changed */
- flags = BIT(0) | BIT(1);
- } else {
- changed = hci_dev_test_and_clear_flag(hdev,
- HCI_ENABLE_LL_PRIVACY);
-
- /* Disable LL privacy + supported settings changed */
- flags = BIT(1);
- }
-
- memcpy(rp.uuid, rpa_resolution_uuid, 16);
- rp.flags = cpu_to_le32(flags);
-
- hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS);
-
- err = mgmt_cmd_complete(sk, hdev->id,
- MGMT_OP_SET_EXP_FEATURE, 0,
- &rp, sizeof(rp));
-
- if (changed)
- exp_ll_privacy_feature_changed(val, hdev, sk);
-
- return err;
-}
-
static int set_quality_report_func(struct sock *sk, struct hci_dev *hdev,
struct mgmt_cp_set_exp_feature *cp,
u16 data_len)
@@ -5032,7 +4919,6 @@ static const struct mgmt_exp_feature {
EXP_FEAT(debug_uuid, set_debug_func),
#endif
EXP_FEAT(mgmt_mesh_uuid, set_mgmt_mesh_func),
- EXP_FEAT(rpa_resolution_uuid, set_rpa_resolution_func),
EXP_FEAT(quality_report_uuid, set_quality_report_func),
EXP_FEAT(offload_codecs_uuid, set_offload_codec_func),
EXP_FEAT(le_simultaneous_roles_uuid, set_le_simultaneous_roles_func),
@@ -5062,22 +4948,6 @@ static int set_exp_feature(struct sock *sk, struct hci_dev *hdev,
MGMT_STATUS_NOT_SUPPORTED);
}
-static u32 get_params_flags(struct hci_dev *hdev,
- struct hci_conn_params *params)
-{
- u32 flags = hdev->conn_flags;
-
- /* Devices using RPAs can only be programmed in the acceptlist if
- * LL Privacy has been enable otherwise they cannot mark
- * HCI_CONN_FLAG_REMOTE_WAKEUP.
- */
- if ((flags & HCI_CONN_FLAG_REMOTE_WAKEUP) && !use_ll_privacy(hdev) &&
- hci_find_irk_by_addr(hdev, &params->addr, params->addr_type))
- flags &= ~HCI_CONN_FLAG_REMOTE_WAKEUP;
-
- return flags;
-}
-
static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
u16 data_len)
{
@@ -5112,7 +4982,6 @@ static int get_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
if (!params)
goto done;
- supported_flags = get_params_flags(hdev, params);
current_flags = params->flags;
}
@@ -5192,7 +5061,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data,
goto unlock;
}
- supported_flags = get_params_flags(hdev, params);
+ supported_flags = hdev->conn_flags;
if ((supported_flags | current_flags) != supported_flags) {
bt_dev_warn(hdev, "Bad flag given (0x%x) vs supported (0x%0x)",
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 4c56ca5a216c..ad5177e3a69b 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -2134,11 +2134,6 @@ static int rfcomm_run(void *unused)
return 0;
}
-static bool rfcomm_match(struct hci_conn *hcon)
-{
- return hcon->type == ACL_LINK;
-}
-
static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
{
struct rfcomm_session *s;
@@ -2185,7 +2180,6 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
static struct hci_cb rfcomm_cb = {
.name = "RFCOMM",
- .match = rfcomm_match,
.security_cfm = rfcomm_security_cfm
};
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index aa7bfe26cb40..5d1bc0d6aee0 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -107,6 +107,14 @@ static void sco_conn_put(struct sco_conn *conn)
kref_put(&conn->ref, sco_conn_free);
}
+static struct sco_conn *sco_conn_hold(struct sco_conn *conn)
+{
+ BT_DBG("conn %p refcnt %u", conn, kref_read(&conn->ref));
+
+ kref_get(&conn->ref);
+ return conn;
+}
+
static struct sco_conn *sco_conn_hold_unless_zero(struct sco_conn *conn)
{
if (!conn)
@@ -1353,6 +1361,7 @@ static void sco_conn_ready(struct sco_conn *conn)
bacpy(&sco_pi(sk)->src, &conn->hcon->src);
bacpy(&sco_pi(sk)->dst, &conn->hcon->dst);
+ sco_conn_hold(conn);
hci_conn_hold(conn->hcon);
__sco_chan_add(conn, sk, parent);
@@ -1398,27 +1407,30 @@ int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags)
return lm;
}
-static bool sco_match(struct hci_conn *hcon)
-{
- return hcon->type == SCO_LINK || hcon->type == ESCO_LINK;
-}
-
static void sco_connect_cfm(struct hci_conn *hcon, __u8 status)
{
+ if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+ return;
+
BT_DBG("hcon %p bdaddr %pMR status %u", hcon, &hcon->dst, status);
if (!status) {
struct sco_conn *conn;
conn = sco_conn_add(hcon);
- if (conn)
+ if (conn) {
sco_conn_ready(conn);
+ sco_conn_put(conn);
+ }
} else
sco_conn_del(hcon, bt_to_errno(status));
}
static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason)
{
+ if (hcon->type != SCO_LINK && hcon->type != ESCO_LINK)
+ return;
+
BT_DBG("hcon %p reason %d", hcon, reason);
sco_conn_del(hcon, bt_to_errno(reason));
@@ -1444,7 +1456,6 @@ drop:
static struct hci_cb sco_cb = {
.name = "SCO",
- .match = sco_match,
.connect_cfm = sco_connect_cfm,
.disconn_cfm = sco_disconn_cfm,
};
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 8612023bec60..7cb192cbd65f 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -153,7 +153,7 @@ static void xdp_test_run_init_page(netmem_ref netmem, void *arg)
new_ctx->data = new_ctx->data_meta + meta_len;
xdp_update_frame_from_buff(new_ctx, frm);
- frm->mem = new_ctx->rxq->mem;
+ frm->mem_type = new_ctx->rxq->mem.type;
memcpy(&head->orig_ctx, new_ctx, sizeof(head->orig_ctx));
}
@@ -246,7 +246,7 @@ static void reset_ctx(struct xdp_page_head *head)
head->ctx.data_meta = head->orig_ctx.data_meta;
head->ctx.data_end = head->orig_ctx.data_end;
xdp_update_frame_from_buff(&head->ctx, head->frame);
- head->frame->mem = head->orig_ctx.rxq->mem;
+ head->frame->mem_type = head->orig_ctx.rxq->mem.type;
}
static int xdp_recv_frames(struct xdp_frame **frames, int nframes,
@@ -1015,6 +1015,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
case BPF_PROG_TYPE_LWT_IN:
case BPF_PROG_TYPE_LWT_OUT:
case BPF_PROG_TYPE_LWT_XMIT:
+ case BPF_PROG_TYPE_CGROUP_SKB:
is_direct_pkt_access = true;
break;
default:
diff --git a/net/bridge/br.c b/net/bridge/br.c
index 2cab878e0a39..183fcb362f9e 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -51,6 +51,13 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
}
}
+ if (is_vlan_dev(dev)) {
+ struct net_device *real_dev = vlan_dev_real_dev(dev);
+
+ if (netif_is_bridge_master(real_dev))
+ br_vlan_vlan_upper_event(real_dev, dev, event);
+ }
+
/* not a port of a bridge */
p = br_port_get_rtnl(dev);
if (!p)
diff --git a/net/bridge/br_arp_nd_proxy.c b/net/bridge/br_arp_nd_proxy.c
index c7869a286df4..115a23054a58 100644
--- a/net/bridge/br_arp_nd_proxy.c
+++ b/net/bridge/br_arp_nd_proxy.c
@@ -229,7 +229,7 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
#endif
#if IS_ENABLED(CONFIG_IPV6)
-struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *msg)
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *msg)
{
struct nd_msg *m;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 82bac2426631..902694c0ce64 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -955,6 +955,7 @@ int br_fdb_dump(struct sk_buff *skb,
struct net_device *filter_dev,
int *idx)
{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_fdb_entry *f;
int err = 0;
@@ -970,7 +971,7 @@ int br_fdb_dump(struct sk_buff *skb,
rcu_read_lock();
hlist_for_each_entry_rcu(f, &br->fdb_list, fdb_node) {
- if (*idx < cb->args[2])
+ if (*idx < ctx->fdb_idx)
goto skip;
if (filter_dev && (!f->dst || f->dst->dev != filter_dev)) {
if (filter_dev != dev)
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index e19b583ff2c6..29097e984b4f 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -201,6 +201,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
enum br_pkt_type pkt_type, bool local_rcv, bool local_orig,
u16 vid)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET;
struct net_bridge_port *prev = NULL;
struct net_bridge_port *p;
@@ -234,8 +235,11 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
continue;
prev = maybe_deliver(prev, p, skb, local_orig);
- if (IS_ERR(prev))
+ if (IS_ERR(prev)) {
+ reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM :
+ SKB_DROP_REASON_NOT_SPECIFIED;
goto out;
+ }
}
if (!prev)
@@ -249,7 +253,7 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
out:
if (!local_rcv)
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -289,6 +293,7 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
struct net_bridge_mcast *brmctx,
bool local_rcv, bool local_orig)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NO_TX_TARGET;
struct net_bridge_port *prev = NULL;
struct net_bridge_port_group *p;
bool allow_mode_include = true;
@@ -329,8 +334,11 @@ void br_multicast_flood(struct net_bridge_mdb_entry *mdst,
}
prev = maybe_deliver(prev, port, skb, local_orig);
- if (IS_ERR(prev))
+ if (IS_ERR(prev)) {
+ reason = PTR_ERR(prev) == -ENOMEM ? SKB_DROP_REASON_NOMEM :
+ SKB_DROP_REASON_NOT_SPECIFIED;
goto out;
+ }
delivered:
if ((unsigned long)lport >= (unsigned long)port)
p = rcu_dereference(p->next);
@@ -349,6 +357,6 @@ delivered:
out:
if (!local_rcv)
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
#endif
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index ceaa5a89b947..232133a0fd21 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -75,6 +75,7 @@ static int br_pass_frame_up(struct sk_buff *skb, bool promisc)
/* note: already called with rcu_read_lock */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct net_bridge_port *p = br_port_get_rcu(skb->dev);
enum br_pkt_type pkt_type = BR_PKT_UNICAST;
struct net_bridge_fdb_entry *dst = NULL;
@@ -96,8 +97,10 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
if (br_mst_is_enabled(br)) {
state = BR_STATE_FORWARDING;
} else {
- if (p->state == BR_STATE_DISABLED)
+ if (p->state == BR_STATE_DISABLED) {
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
goto drop;
+ }
state = p->state;
}
@@ -155,8 +158,10 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
}
}
- if (state == BR_STATE_LEARNING)
+ if (state == BR_STATE_LEARNING) {
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
goto drop;
+ }
BR_INPUT_SKB_CB(skb)->brdev = br->dev;
BR_INPUT_SKB_CB(skb)->src_port_isolated = !!(p->flags & BR_ISOLATED);
@@ -223,7 +228,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
out:
return 0;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
goto out;
}
EXPORT_SYMBOL_GPL(br_handle_frame_finish);
@@ -324,6 +329,7 @@ static int br_process_frame_type(struct net_bridge_port *p,
*/
static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
struct net_bridge_port *p;
struct sk_buff *skb = *pskb;
const unsigned char *dest = eth_hdr(skb)->h_dest;
@@ -331,8 +337,10 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
return RX_HANDLER_PASS;
- if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
+ if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) {
+ reason = SKB_DROP_REASON_MAC_INVALID_SOURCE;
goto drop;
+ }
skb = skb_share_check(skb, GFP_ATOMIC);
if (!skb)
@@ -374,6 +382,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
return RX_HANDLER_PASS;
case 0x01: /* IEEE MAC (Pause) */
+ reason = SKB_DROP_REASON_MAC_IEEE_MAC_CONTROL;
goto drop;
case 0x0E: /* 802.1AB LLDP */
@@ -423,8 +432,9 @@ defer_stp_filtering:
return nf_hook_bridge_pre(skb, pskb);
default:
+ reason = SKB_DROP_REASON_BRIDGE_INGRESS_STP_STATE;
drop:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
}
return RX_HANDLER_CONSUMED;
}
diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c
index f213ed108361..6bc0a11f2ed3 100644
--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -394,10 +394,26 @@ static int old_deviceless(struct net *net, void __user *data)
return -EOPNOTSUPP;
}
-int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd,
- struct ifreq *ifr, void __user *uarg)
+int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg)
{
int ret = -EOPNOTSUPP;
+ struct ifreq ifr;
+
+ if (cmd == SIOCBRADDIF || cmd == SIOCBRDELIF) {
+ void __user *data;
+ char *colon;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (get_user_ifreq(&ifr, &data, uarg))
+ return -EFAULT;
+
+ ifr.ifr_name[IFNAMSIZ - 1] = 0;
+ colon = strchr(ifr.ifr_name, ':');
+ if (colon)
+ *colon = 0;
+ }
rtnl_lock();
@@ -430,7 +446,21 @@ int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd,
break;
case SIOCBRADDIF:
case SIOCBRDELIF:
- ret = add_del_if(br, ifr->ifr_ifindex, cmd == SIOCBRADDIF);
+ {
+ struct net_device *dev;
+
+ dev = __dev_get_by_name(net, ifr.ifr_name);
+ if (!dev || !netif_device_present(dev)) {
+ ret = -ENODEV;
+ break;
+ }
+ if (!netif_is_bridge_master(dev)) {
+ ret = -EOPNOTSUPP;
+ break;
+ }
+
+ ret = add_del_if(netdev_priv(dev), ifr.ifr_ifindex, cmd == SIOCBRADDIF);
+ }
break;
}
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 451e45b9a6a5..94cbe967d1c1 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -393,38 +393,10 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_
reason = ip_route_input(skb, iph->daddr, iph->saddr,
ip4h_dscp(iph), dev);
if (reason) {
- struct in_device *in_dev = __in_dev_get_rcu(dev);
-
- /* If err equals -EHOSTUNREACH the error is due to a
- * martian destination or due to the fact that
- * forwarding is disabled. For most martian packets,
- * ip_route_output_key() will fail. It won't fail for 2 types of
- * martian destinations: loopback destinations and destination
- * 0.0.0.0. In both cases the packet will be dropped because the
- * destination is the loopback device and not the bridge. */
- if (reason != SKB_DROP_REASON_IP_INADDRERRORS || !in_dev ||
- IN_DEV_FORWARD(in_dev))
- goto free_skb;
-
- rt = ip_route_output(net, iph->daddr, 0,
- ip4h_dscp(iph), 0,
- RT_SCOPE_UNIVERSE);
- if (!IS_ERR(rt)) {
- /* - Bridged-and-DNAT'ed traffic doesn't
- * require ip_forwarding. */
- if (rt->dst.dev == dev) {
- skb_dst_drop(skb);
- skb_dst_set(skb, &rt->dst);
- goto bridged_dnat;
- }
- ip_rt_put(rt);
- }
-free_skb:
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
return 0;
} else {
if (skb_dst(skb)->dev == dev) {
-bridged_dnat:
skb->dev = br_indev;
nf_bridge_update_protocol(skb);
nf_bridge_push_encap_header(skb);
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 9853cfbb9d14..d5b3c5936a79 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -949,8 +949,7 @@ br_port_get_check_rtnl(const struct net_device *dev)
/* br_ioctl.c */
int br_dev_siocdevprivate(struct net_device *dev, struct ifreq *rq,
void __user *data, int cmd);
-int br_ioctl_stub(struct net *net, struct net_bridge *br, unsigned int cmd,
- struct ifreq *ifr, void __user *uarg);
+int br_ioctl_stub(struct net *net, unsigned int cmd, void __user *uarg);
/* br_multicast.c */
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
@@ -1571,6 +1570,9 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v,
void br_vlan_port_event(struct net_bridge_port *p, unsigned long event);
int br_vlan_bridge_event(struct net_device *dev, unsigned long event,
void *ptr);
+void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event);
int br_vlan_rtnl_init(void);
void br_vlan_rtnl_uninit(void);
void br_vlan_notify(const struct net_bridge *br,
@@ -1802,6 +1804,12 @@ static inline int br_vlan_bridge_event(struct net_device *dev,
return 0;
}
+static inline void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event)
+{
+}
+
static inline int br_vlan_rtnl_init(void)
{
return 0;
@@ -2290,6 +2298,6 @@ void br_do_proxy_suppress_arp(struct sk_buff *skb, struct net_bridge *br,
u16 vid, struct net_bridge_port *p);
void br_do_suppress_nd(struct sk_buff *skb, struct net_bridge *br,
u16 vid, struct net_bridge_port *p, struct nd_msg *msg);
-struct nd_msg *br_is_nd_neigh_msg(struct sk_buff *skb, struct nd_msg *m);
+struct nd_msg *br_is_nd_neigh_msg(const struct sk_buff *skb, struct nd_msg *m);
bool br_is_neigh_suppress_enabled(const struct net_bridge_port *p, u16 vid);
#endif
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index ea733542244c..c1176a5e02c4 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -1002,7 +1002,7 @@ static const struct attribute_group bridge_group = {
* Returns the number of bytes read.
*/
static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
- struct bin_attribute *bin_attr,
+ const struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
struct device *dev = kobj_to_dev(kobj);
@@ -1023,10 +1023,10 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
return n;
}
-static struct bin_attribute bridge_forward = {
+static const struct bin_attribute bridge_forward = {
.attr = { .name = SYSFS_BRIDGE_FDB,
.mode = 0444, },
- .read = brforward_read,
+ .read_new = brforward_read,
};
/*
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 89f51ea4cabe..939a3aa78d5c 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -715,8 +715,8 @@ static int br_vlan_add_existing(struct net_bridge *br,
u16 flags, bool *changed,
struct netlink_ext_ack *extack)
{
- bool would_change = __vlan_flags_would_change(vlan, flags);
bool becomes_brentry = false;
+ bool would_change = false;
int err;
if (!br_vlan_is_brentry(vlan)) {
@@ -725,6 +725,8 @@ static int br_vlan_add_existing(struct net_bridge *br,
return -EINVAL;
becomes_brentry = true;
+ } else {
+ would_change = __vlan_flags_would_change(vlan, flags);
}
/* Master VLANs that aren't brentries weren't notified before,
@@ -1664,6 +1666,18 @@ static void br_vlan_set_all_vlan_dev_state(struct net_bridge_port *p)
}
}
+static void br_vlan_toggle_bridge_binding(struct net_device *br_dev,
+ bool enable)
+{
+ struct net_bridge *br = netdev_priv(br_dev);
+
+ if (enable)
+ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true);
+ else
+ br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING,
+ br_vlan_has_upper_bind_vlan_dev(br_dev));
+}
+
static void br_vlan_upper_change(struct net_device *dev,
struct net_device *upper_dev,
bool linking)
@@ -1673,13 +1687,9 @@ static void br_vlan_upper_change(struct net_device *dev,
if (!br_vlan_is_bind_vlan_dev(upper_dev))
return;
- if (linking) {
+ br_vlan_toggle_bridge_binding(dev, linking);
+ if (linking)
br_vlan_set_vlan_dev_state(br, upper_dev);
- br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING, true);
- } else {
- br_opt_toggle(br, BROPT_VLAN_BRIDGE_BINDING,
- br_vlan_has_upper_bind_vlan_dev(dev));
- }
}
struct br_vlan_link_state_walk_data {
@@ -1764,6 +1774,30 @@ int br_vlan_bridge_event(struct net_device *dev, unsigned long event, void *ptr)
return ret;
}
+void br_vlan_vlan_upper_event(struct net_device *br_dev,
+ struct net_device *vlan_dev,
+ unsigned long event)
+{
+ struct vlan_dev_priv *vlan = vlan_dev_priv(vlan_dev);
+ struct net_bridge *br = netdev_priv(br_dev);
+ bool bridge_binding;
+
+ switch (event) {
+ case NETDEV_CHANGE:
+ case NETDEV_UP:
+ break;
+ default:
+ return;
+ }
+
+ bridge_binding = vlan->flags & VLAN_FLAG_BRIDGE_BINDING;
+ br_vlan_toggle_bridge_binding(br_dev, bridge_binding);
+ if (bridge_binding)
+ br_vlan_set_vlan_dev_state(br, vlan_dev);
+ else if (!bridge_binding && netif_carrier_ok(br_dev))
+ netif_carrier_on(vlan_dev);
+}
+
/* Must be protected by RTNL. */
void br_vlan_port_event(struct net_bridge_port *p, unsigned long event)
{
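The br_vlan hunks above let BROPT_VLAN_BRIDGE_BINDING be recalculated not only when a VLAN upper is linked or unlinked but also when the bridge_binding flag is toggled on an already-linked VLAN device, which is what the new br_vlan_vlan_upper_event() handles. The fragment below is a hedged sketch of how the notifier side might feed NETDEV_UP/NETDEV_CHANGE events into that helper; the real hook lives in net/bridge/br.c and net/8021q/vlan_dev.c in this series, so the handler name and the exact filtering here are assumptions, not the patch.

/* Hypothetical notifier fragment: forward NETDEV_UP/NETDEV_CHANGE of a
 * VLAN-on-bridge device to the helper added above so bridge_binding
 * toggles take effect at runtime. */
static int br_vlan_upper_notify(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net_device *real_dev;

	if (!is_vlan_dev(dev))
		return NOTIFY_DONE;

	real_dev = vlan_dev_real_dev(dev);
	if (!netif_is_bridge_master(real_dev))
		return NOTIFY_DONE;

	br_vlan_vlan_upper_event(real_dev, dev, event);
	return NOTIFY_DONE;
}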
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 01f3fbb3b67d..65230e81fa08 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -287,8 +287,8 @@ int can_send(struct sk_buff *skb, int loop)
netif_rx(newskb);
/* update statistics */
- pkg_stats->tx_frames++;
- pkg_stats->tx_frames_delta++;
+ atomic_long_inc(&pkg_stats->tx_frames);
+ atomic_long_inc(&pkg_stats->tx_frames_delta);
return 0;
@@ -647,8 +647,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
int matches;
/* update statistics */
- pkg_stats->rx_frames++;
- pkg_stats->rx_frames_delta++;
+ atomic_long_inc(&pkg_stats->rx_frames);
+ atomic_long_inc(&pkg_stats->rx_frames_delta);
/* create non-zero unique skb identifier together with *skb */
while (!(can_skb_prv(skb)->skbcnt))
@@ -669,8 +669,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
consume_skb(skb);
if (matches > 0) {
- pkg_stats->matches++;
- pkg_stats->matches_delta++;
+ atomic_long_inc(&pkg_stats->matches);
+ atomic_long_inc(&pkg_stats->matches_delta);
}
}
diff --git a/net/can/af_can.h b/net/can/af_can.h
index 7c2d9161e224..22f3352c77fe 100644
--- a/net/can/af_can.h
+++ b/net/can/af_can.h
@@ -66,9 +66,9 @@ struct receiver {
struct can_pkg_stats {
unsigned long jiffies_init;
- unsigned long rx_frames;
- unsigned long tx_frames;
- unsigned long matches;
+ atomic_long_t rx_frames;
+ atomic_long_t tx_frames;
+ atomic_long_t matches;
unsigned long total_rx_rate;
unsigned long total_tx_rate;
@@ -82,9 +82,9 @@ struct can_pkg_stats {
unsigned long max_tx_rate;
unsigned long max_rx_match_ratio;
- unsigned long rx_frames_delta;
- unsigned long tx_frames_delta;
- unsigned long matches_delta;
+ atomic_long_t rx_frames_delta;
+ atomic_long_t tx_frames_delta;
+ atomic_long_t matches_delta;
};
/* persistent statistics */
diff --git a/net/can/proc.c b/net/can/proc.c
index bbce97825f13..25fdf060e30d 100644
--- a/net/can/proc.c
+++ b/net/can/proc.c
@@ -118,6 +118,13 @@ void can_stat_update(struct timer_list *t)
struct can_pkg_stats *pkg_stats = net->can.pkg_stats;
unsigned long j = jiffies; /* snapshot */
+ long rx_frames = atomic_long_read(&pkg_stats->rx_frames);
+ long tx_frames = atomic_long_read(&pkg_stats->tx_frames);
+ long matches = atomic_long_read(&pkg_stats->matches);
+ long rx_frames_delta = atomic_long_read(&pkg_stats->rx_frames_delta);
+ long tx_frames_delta = atomic_long_read(&pkg_stats->tx_frames_delta);
+ long matches_delta = atomic_long_read(&pkg_stats->matches_delta);
+
/* restart counting in timer context on user request */
if (user_reset)
can_init_stats(net);
@@ -127,35 +134,33 @@ void can_stat_update(struct timer_list *t)
can_init_stats(net);
/* prevent overflow in calc_rate() */
- if (pkg_stats->rx_frames > (ULONG_MAX / HZ))
+ if (rx_frames > (LONG_MAX / HZ))
can_init_stats(net);
/* prevent overflow in calc_rate() */
- if (pkg_stats->tx_frames > (ULONG_MAX / HZ))
+ if (tx_frames > (LONG_MAX / HZ))
can_init_stats(net);
/* matches overflow - very improbable */
- if (pkg_stats->matches > (ULONG_MAX / 100))
+ if (matches > (LONG_MAX / 100))
can_init_stats(net);
/* calc total values */
- if (pkg_stats->rx_frames)
- pkg_stats->total_rx_match_ratio = (pkg_stats->matches * 100) /
- pkg_stats->rx_frames;
+ if (rx_frames)
+ pkg_stats->total_rx_match_ratio = (matches * 100) / rx_frames;
pkg_stats->total_tx_rate = calc_rate(pkg_stats->jiffies_init, j,
- pkg_stats->tx_frames);
+ tx_frames);
pkg_stats->total_rx_rate = calc_rate(pkg_stats->jiffies_init, j,
- pkg_stats->rx_frames);
+ rx_frames);
/* calc current values */
- if (pkg_stats->rx_frames_delta)
+ if (rx_frames_delta)
pkg_stats->current_rx_match_ratio =
- (pkg_stats->matches_delta * 100) /
- pkg_stats->rx_frames_delta;
+ (matches_delta * 100) / rx_frames_delta;
- pkg_stats->current_tx_rate = calc_rate(0, HZ, pkg_stats->tx_frames_delta);
- pkg_stats->current_rx_rate = calc_rate(0, HZ, pkg_stats->rx_frames_delta);
+ pkg_stats->current_tx_rate = calc_rate(0, HZ, tx_frames_delta);
+ pkg_stats->current_rx_rate = calc_rate(0, HZ, rx_frames_delta);
/* check / update maximum values */
if (pkg_stats->max_tx_rate < pkg_stats->current_tx_rate)
@@ -168,9 +173,9 @@ void can_stat_update(struct timer_list *t)
pkg_stats->max_rx_match_ratio = pkg_stats->current_rx_match_ratio;
/* clear values for 'current rate' calculation */
- pkg_stats->tx_frames_delta = 0;
- pkg_stats->rx_frames_delta = 0;
- pkg_stats->matches_delta = 0;
+ atomic_long_set(&pkg_stats->tx_frames_delta, 0);
+ atomic_long_set(&pkg_stats->rx_frames_delta, 0);
+ atomic_long_set(&pkg_stats->matches_delta, 0);
/* restart timer (one second) */
mod_timer(&net->can.stattimer, round_jiffies(jiffies + HZ));
@@ -214,9 +219,12 @@ static int can_stats_proc_show(struct seq_file *m, void *v)
struct can_rcv_lists_stats *rcv_lists_stats = net->can.rcv_lists_stats;
seq_putc(m, '\n');
- seq_printf(m, " %8ld transmitted frames (TXF)\n", pkg_stats->tx_frames);
- seq_printf(m, " %8ld received frames (RXF)\n", pkg_stats->rx_frames);
- seq_printf(m, " %8ld matched frames (RXMF)\n", pkg_stats->matches);
+ seq_printf(m, " %8ld transmitted frames (TXF)\n",
+ atomic_long_read(&pkg_stats->tx_frames));
+ seq_printf(m, " %8ld received frames (RXF)\n",
+ atomic_long_read(&pkg_stats->rx_frames));
+ seq_printf(m, " %8ld matched frames (RXMF)\n",
+ atomic_long_read(&pkg_stats->matches));
seq_putc(m, '\n');
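The af_can.c and proc.c hunks convert the CAN statistics counters, which were bumped from several contexts with no locking, to atomic_long_t, and can_stat_update() now snapshots each counter once so its ratio and rate calculations work on one consistent value. A stand-alone C11 sketch of the same snapshot-then-compute pattern (the names and threading setup are illustrative, not kernel code):

#include <stdatomic.h>
#include <stdio.h>

static atomic_long rx_frames;	/* bumped from "datapath" threads */
static atomic_long matches;

void on_frame(int matched)
{
	atomic_fetch_add_explicit(&rx_frames, 1, memory_order_relaxed);
	if (matched)
		atomic_fetch_add_explicit(&matches, 1, memory_order_relaxed);
}

void stat_tick(void)
{
	/* Read each counter exactly once so the ratio stays self-consistent
	 * even while on_frame() keeps running on other threads. */
	long rx = atomic_load_explicit(&rx_frames, memory_order_relaxed);
	long m  = atomic_load_explicit(&matches, memory_order_relaxed);

	if (rx)
		printf("match ratio: %ld%%\n", (m * 100) / rx);
}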
diff --git a/net/can/raw.c b/net/can/raw.c
index 255c0a8f39d6..46e8ed9d64da 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -962,7 +962,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
skb->dev = dev;
- skb->priority = READ_ONCE(sk->sk_priority);
+ skb->priority = sockc.priority;
skb->mark = READ_ONCE(sk->sk_mark);
skb->tstamp = sockc.transmit_time;
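raw_sendmsg() now takes the skb priority from sockc instead of always reading sk->sk_priority, so a sender can override the priority per message through ancillary data. A hedged user-space sketch, assuming SO_PRIORITY is honored as a SOL_SOCKET control message on CAN_RAW sockets:

#include <linux/can.h>
#include <linux/can/raw.h>
#include <net/if.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Send one CAN frame whose priority comes from SO_PRIORITY cmsg data
 * rather than from the socket-wide sk_priority. */
int send_prio_frame(const char *ifname)
{
	int fd = socket(PF_CAN, SOCK_RAW, CAN_RAW);
	struct sockaddr_can addr = {
		.can_family = AF_CAN,
		.can_ifindex = (int)if_nametoindex(ifname),
	};
	struct can_frame cf = { .can_id = 0x123, .can_dlc = 2,
				.data = { 0xde, 0xad } };
	struct iovec iov = { .iov_base = &cf, .iov_len = sizeof(cf) };
	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = { 0 };
	struct msghdr msg = {
		.msg_name = &addr, .msg_namelen = sizeof(addr),
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
	uint32_t prio = 3;	/* per-message priority */
	int ret;

	if (fd < 0)
		return -1;

	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SO_PRIORITY;
	cm->cmsg_len = CMSG_LEN(sizeof(prio));
	memcpy(CMSG_DATA(cm), &prio, sizeof(prio));

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	ret = sendmsg(fd, &msg, 0) < 0 ? -1 : 0;
	close(fd);
	return ret;
}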
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 2f4ed83a75ae..2e538399757f 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -50,15 +50,16 @@ void bpf_sk_storage_free(struct sock *sk)
{
struct bpf_local_storage *sk_storage;
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
- if (!sk_storage) {
- rcu_read_unlock();
- return;
- }
+ if (!sk_storage)
+ goto out;
bpf_local_storage_destroy(sk_storage);
+out:
rcu_read_unlock();
+ migrate_enable();
}
static void bpf_sk_storage_map_free(struct bpf_map *map)
@@ -160,6 +161,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL);
+ migrate_disable();
rcu_read_lock();
sk_storage = rcu_dereference(sk->sk_bpf_storage);
@@ -212,6 +214,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
out:
rcu_read_unlock();
+ migrate_enable();
/* In case of an error, don't free anything explicitly here, the
* caller is responsible to call bpf_sk_storage_free.
@@ -352,11 +355,6 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = {
static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
{
- const struct btf *btf_vmlinux;
- const struct btf_type *t;
- const char *tname;
- u32 btf_id;
-
if (prog->aux->dst_prog)
return false;
@@ -371,13 +369,7 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return true;
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
- btf_vmlinux = bpf_get_btf_vmlinux();
- if (IS_ERR_OR_NULL(btf_vmlinux))
- return false;
- btf_id = prog->aux->attach_btf_id;
- t = btf_type_by_id(btf_vmlinux, btf_id);
- tname = btf_name_by_offset(btf_vmlinux, t->name_off);
- return !!strncmp(tname, "bpf_sk_storage",
+ return !!strncmp(prog->aux->attach_func_name, "bpf_sk_storage",
strlen("bpf_sk_storage"));
default:
return false;
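In bpf_sk_storage_free() the early return is folded into a goto so that the newly added migrate_enable(), like rcu_read_unlock(), runs on every exit path. The same single-exit unwind idiom in portable C, with made-up resources:

#include <stdio.h>
#include <stdlib.h>

/* Both "guards" (here a buffer and a file) must be released on every
 * path, so early returns become jumps to one exit point that unwinds
 * in reverse order of acquisition. */
int process(const char *path)
{
	int ret = -1;
	char *buf = malloc(4096);
	FILE *f = NULL;

	if (!buf)
		goto out;

	f = fopen(path, "r");
	if (!f)
		goto out;

	ret = fread(buf, 1, 4096, f) > 0 ? 0 : -1;
out:
	if (f)
		fclose(f);
	free(buf);
	return ret;
}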
diff --git a/net/core/dev.c b/net/core/dev.c
index 2b09714761c6..2f7f5fd9ffec 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -92,6 +92,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
+#include <linux/ethtool_netlink.h>
#include <linux/skbuff.h>
#include <linux/kthread.h>
#include <linux/bpf.h>
@@ -105,6 +106,7 @@
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/gro.h>
+#include <net/netdev_queues.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/checksum.h>
@@ -180,8 +182,6 @@ static DEFINE_SPINLOCK(napi_hash_lock);
static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
-static DECLARE_RWSEM(devnet_rename_sem);
-
static inline void dev_base_seq_inc(struct net *net)
{
unsigned int val = net->dev_base_seq + 1;
@@ -460,7 +460,7 @@ EXPORT_PER_CPU_SYMBOL(softnet_data);
* PP consumers must pay attention to run APIs in the appropriate context
* (e.g. NAPI context).
*/
-static DEFINE_PER_CPU(struct page_pool *, system_page_pool);
+DEFINE_PER_CPU(struct page_pool *, system_page_pool);
#ifdef CONFIG_LOCKDEP
/*
@@ -767,7 +767,8 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
}
/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
+static struct napi_struct *
+netdev_napi_by_id(struct net *net, unsigned int napi_id)
{
struct napi_struct *napi;
@@ -784,6 +785,49 @@ struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id)
}
/**
+ * netdev_napi_by_id_lock() - find a device by NAPI ID and lock it
+ * @net: the applicable net namespace
+ * @napi_id: ID of a NAPI of a target device
+ *
+ * Find a NAPI instance with @napi_id. Lock its device.
+ * The device must be in %NETREG_REGISTERED state for lookup to succeed.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to NAPI, its device with lock held, NULL if not found.
+ */
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id)
+{
+ struct napi_struct *napi;
+ struct net_device *dev;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (!napi || READ_ONCE(napi->dev->reg_state) != NETREG_REGISTERED) {
+ rcu_read_unlock();
+ return NULL;
+ }
+
+ dev = napi->dev;
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev);
+ if (!dev)
+ return NULL;
+
+ rcu_read_lock();
+ napi = netdev_napi_by_id(net, napi_id);
+ if (napi && napi->dev != dev)
+ napi = NULL;
+ rcu_read_unlock();
+
+ if (!napi)
+ netdev_unlock(dev);
+ return napi;
+}
+
+/**
* __dev_get_by_name - find a device by its name
* @net: the applicable net namespace
* @name: name to find
@@ -957,7 +1001,6 @@ EXPORT_SYMBOL(netdev_get_by_index);
* its reference counter increased so the caller must be careful
* about locking. The caller must hold RCU lock.
*/
-
struct net_device *dev_get_by_napi_id(unsigned int napi_id)
{
struct napi_struct *napi;
@@ -971,7 +1014,73 @@ struct net_device *dev_get_by_napi_id(unsigned int napi_id)
return napi ? napi->dev : NULL;
}
-EXPORT_SYMBOL(dev_get_by_napi_id);
+
+/* Release the held reference on the net_device and, if it is still
+ * registered, take its instance lock. If the device is being unregistered,
+ * NULL is returned (the reference is released either way).
+ *
+ * This helper is intended for locking net_device after it has been looked up
+ * using a lockless lookup helper. Lock prevents the instance from going away.
+ */
+struct net_device *__netdev_put_lock(struct net_device *dev)
+{
+ netdev_lock(dev);
+ if (dev->reg_state > NETREG_REGISTERED) {
+ netdev_unlock(dev);
+ dev_put(dev);
+ return NULL;
+ }
+ dev_put(dev);
+ return dev;
+}
+
+/**
+ * netdev_get_by_index_lock() - find a device by its ifindex
+ * @net: the applicable net namespace
+ * @ifindex: index of device
+ *
+ * Search for an interface by index. If a valid device
+ * with @ifindex is found it will be returned with netdev->lock held.
+ * netdev_unlock() must be called to release it.
+ *
+ * Return: pointer to a device with lock held, NULL if not found.
+ */
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex)
+{
+ struct net_device *dev;
+
+ dev = dev_get_by_index(net, ifindex);
+ if (!dev)
+ return NULL;
+
+ return __netdev_put_lock(dev);
+}
+
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index)
+{
+ if (dev)
+ netdev_unlock(dev);
+
+ do {
+ rcu_read_lock();
+ dev = xa_find(&net->dev_by_index, index, ULONG_MAX, XA_PRESENT);
+ if (!dev) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ dev_hold(dev);
+ rcu_read_unlock();
+
+ dev = __netdev_put_lock(dev);
+ if (dev)
+ return dev;
+
+ (*index)++;
+ } while (true);
+}
static DEFINE_SEQLOCK(netdev_rename_lock);
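All of these lookup helpers funnel through __netdev_put_lock(): take a temporary reference, acquire the per-netdev instance lock, and fail the lookup if the device has already moved past NETREG_REGISTERED. A minimal caller sketch (kernel-style, not part of the patch; do_configure() is hypothetical):

	struct net_device *dev;

	dev = netdev_get_by_index_lock(net, ifindex);
	if (!dev)
		return -ENODEV;

	/* dev->lock is held: the device cannot be unregistered here and
	 * its reg_state is known to be NETREG_REGISTERED. */
	do_configure(dev);

	netdev_unlock(dev);
	return 0;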
@@ -1271,23 +1380,16 @@ static int dev_get_valid_name(struct net *net, struct net_device *dev,
*/
int dev_change_name(struct net_device *dev, const char *newname)
{
+ struct net *net = dev_net(dev);
unsigned char old_assign_type;
char oldname[IFNAMSIZ];
int err = 0;
int ret;
- struct net *net;
-
- ASSERT_RTNL();
- BUG_ON(!dev_net(dev));
-
- net = dev_net(dev);
- down_write(&devnet_rename_sem);
+ ASSERT_RTNL_NET(net);
- if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
- up_write(&devnet_rename_sem);
+ if (!strncmp(newname, dev->name, IFNAMSIZ))
return 0;
- }
memcpy(oldname, dev->name, IFNAMSIZ);
@@ -1295,10 +1397,8 @@ int dev_change_name(struct net_device *dev, const char *newname)
err = dev_get_valid_name(net, dev, newname);
write_sequnlock_bh(&netdev_rename_lock);
- if (err < 0) {
- up_write(&devnet_rename_sem);
+ if (err < 0)
return err;
- }
if (oldname[0] && !strchr(oldname, '%'))
netdev_info(dev, "renamed from %s%s\n", oldname,
@@ -1314,12 +1414,9 @@ rollback:
memcpy(dev->name, oldname, IFNAMSIZ);
write_sequnlock_bh(&netdev_rename_lock);
WRITE_ONCE(dev->name_assign_type, old_assign_type);
- up_write(&devnet_rename_sem);
return ret;
}
- up_write(&devnet_rename_sem);
-
netdev_adjacent_rename_links(dev, oldname);
netdev_name_node_del(dev->name_node);
@@ -1335,7 +1432,6 @@ rollback:
/* err >= 0 after dev_alloc_name() or stores the first errno */
if (err >= 0) {
err = ret;
- down_write(&devnet_rename_sem);
write_seqlock_bh(&netdev_rename_lock);
memcpy(dev->name, oldname, IFNAMSIZ);
write_sequnlock_bh(&netdev_rename_lock);
@@ -1543,7 +1639,7 @@ static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
if (ret)
clear_bit(__LINK_STATE_START, &dev->state);
else {
- dev->flags |= IFF_UP;
+ netif_set_up(dev, true);
dev_set_rx_mode(dev);
dev_activate(dev);
add_device_randomness(dev->dev_addr, dev->addr_len);
@@ -1622,7 +1718,7 @@ static void __dev_close_many(struct list_head *head)
if (ops->ndo_stop)
ops->ndo_stop(dev);
- dev->flags &= ~IFF_UP;
+ netif_set_up(dev, false);
netpoll_poll_enable(dev);
}
}
@@ -1832,14 +1928,19 @@ int register_netdevice_notifier(struct notifier_block *nb)
/* Close race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
+
+ /* When RTNL is removed, we need protection for netdev_chain. */
rtnl_lock();
+
err = raw_notifier_chain_register(&netdev_chain, nb);
if (err)
goto unlock;
if (dev_boot_phase)
goto unlock;
for_each_net(net) {
+ __rtnl_net_lock(net);
err = call_netdevice_register_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
if (err)
goto rollback;
}
@@ -1850,8 +1951,11 @@ unlock:
return err;
rollback:
- for_each_net_continue_reverse(net)
+ for_each_net_continue_reverse(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
@@ -1884,8 +1988,11 @@ int unregister_netdevice_notifier(struct notifier_block *nb)
if (err)
goto unlock;
- for_each_net(net)
+ for_each_net(net) {
+ __rtnl_net_lock(net);
call_netdevice_unregister_net_notifiers(nb, net);
+ __rtnl_net_unlock(net);
+ }
unlock:
rtnl_unlock();
@@ -1949,9 +2056,10 @@ int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __register_netdevice_notifier_net(net, nb, false);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_net);
@@ -1977,9 +2085,10 @@ int unregister_netdevice_notifier_net(struct net *net,
{
int err;
- rtnl_lock();
+ rtnl_net_lock(net);
err = __unregister_netdevice_notifier_net(net, nb);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_net);
@@ -1992,19 +2101,56 @@ static void __move_netdevice_notifier_net(struct net *src_net,
__register_netdevice_notifier_net(dst_net, nb, true);
}
+static void rtnl_net_dev_lock(struct net_device *dev)
+{
+ bool again;
+
+ do {
+ struct net *net;
+
+ again = false;
+
+ /* netns might be being dismantled. */
+ rcu_read_lock();
+ net = dev_net_rcu(dev);
+ net_passive_inc(net);
+ rcu_read_unlock();
+
+ rtnl_net_lock(net);
+
+#ifdef CONFIG_NET_NS
+ /* dev might have been moved to another netns. */
+ if (!net_eq(net, rcu_access_pointer(dev->nd_net.net))) {
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+ again = true;
+ }
+#endif
+ } while (again);
+}
+
+static void rtnl_net_dev_unlock(struct net_device *dev)
+{
+ struct net *net = dev_net(dev);
+
+ rtnl_net_unlock(net);
+ net_passive_dec(net);
+}
+
int register_netdevice_notifier_dev_net(struct net_device *dev,
struct notifier_block *nb,
struct netdev_net_notifier *nn)
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
if (!err) {
nn->nb = nb;
list_add(&nn->list, &dev->net_notifier_list);
}
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
@@ -2015,10 +2161,11 @@ int unregister_netdevice_notifier_dev_net(struct net_device *dev,
{
int err;
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
list_del(&nn->list);
err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
+
return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
@@ -3296,7 +3443,7 @@ void netif_device_attach(struct net_device *dev)
if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
netif_running(dev)) {
netif_tx_wake_all_queues(dev);
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_device_attach);
@@ -3725,6 +3872,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
{
netdev_features_t features;
+ if (!skb_frags_readable(skb))
+ goto out_kfree_skb;
+
features = netif_skb_features(skb);
skb = validate_xmit_vlan(skb, features);
if (unlikely(!skb))
@@ -4610,7 +4760,7 @@ use_local_napi:
* we have to raise NET_RX_SOFTIRQ.
*/
if (!sd->in_net_rx_action)
- __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+ raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
#ifdef CONFIG_RPS
@@ -4999,7 +5149,7 @@ static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
}
u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
void *orig_data, *orig_data_end, *hard_start;
struct netdev_rx_queue *rxqueue;
@@ -5101,7 +5251,7 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
}
static int
-netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
+netif_skb_check_for_xdp(struct sk_buff **pskb, const struct bpf_prog *prog)
{
struct sk_buff *skb = *pskb;
int err, hroom, troom;
@@ -5125,7 +5275,7 @@ netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog)
static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct sk_buff *skb = *pskb;
u32 mac_len, act = XDP_DROP;
@@ -5178,7 +5328,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff **pskb,
* and DDOS attacks will be more effective. In-driver-XDP use dedicated TX
* queues, so they do not have this starvation issue.
*/
-void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
+void generic_xdp_tx(struct sk_buff *skb, const struct bpf_prog *xdp_prog)
{
struct net_device *dev = skb->dev;
struct netdev_queue *txq;
@@ -5203,7 +5353,7 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
-int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb)
+int do_xdp_generic(const struct bpf_prog *xdp_prog, struct sk_buff **pskb)
{
struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
@@ -5542,8 +5692,14 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
orig_dev = skb->dev;
skb_reset_network_header(skb);
+#if !defined(CONFIG_DEBUG_NET)
+ /* We plan to no longer reset the transport header here.
+ * Give some time to fuzzers and dev build to catch bugs
+ * in network stacks.
+ */
if (!skb_transport_header_was_set(skb))
skb_reset_transport_header(skb);
+#endif
skb_reset_mac_len(skb);
pt_prev = NULL;
@@ -6027,8 +6183,6 @@ void netif_receive_skb_list(struct list_head *head)
}
EXPORT_SYMBOL(netif_receive_skb_list);
-static DEFINE_PER_CPU(struct work_struct, flush_works);
-
/* Network device is going away, flush any packets still pending */
static void flush_backlog(struct work_struct *work)
{
@@ -6085,36 +6239,54 @@ static bool flush_required(int cpu)
return true;
}
+struct flush_backlogs {
+ cpumask_t flush_cpus;
+ struct work_struct w[];
+};
+
+static struct flush_backlogs *flush_backlogs_alloc(void)
+{
+ return kmalloc(struct_size_t(struct flush_backlogs, w, nr_cpu_ids),
+ GFP_KERNEL);
+}
+
+static struct flush_backlogs *flush_backlogs_fallback;
+static DEFINE_MUTEX(flush_backlogs_mutex);
+
static void flush_all_backlogs(void)
{
- static cpumask_t flush_cpus;
+ struct flush_backlogs *ptr = flush_backlogs_alloc();
unsigned int cpu;
- /* since we are under rtnl lock protection we can use static data
- * for the cpumask and avoid allocating on stack the possibly
- * large mask
- */
- ASSERT_RTNL();
+ if (!ptr) {
+ mutex_lock(&flush_backlogs_mutex);
+ ptr = flush_backlogs_fallback;
+ }
+ cpumask_clear(&ptr->flush_cpus);
cpus_read_lock();
- cpumask_clear(&flush_cpus);
for_each_online_cpu(cpu) {
if (flush_required(cpu)) {
- queue_work_on(cpu, system_highpri_wq,
- per_cpu_ptr(&flush_works, cpu));
- cpumask_set_cpu(cpu, &flush_cpus);
+ INIT_WORK(&ptr->w[cpu], flush_backlog);
+ queue_work_on(cpu, system_highpri_wq, &ptr->w[cpu]);
+ __cpumask_set_cpu(cpu, &ptr->flush_cpus);
}
}
/* we can have in flight packet[s] on the cpus we are not flushing,
* synchronize_net() in unregister_netdevice_many() will take care of
- * them
+ * them.
*/
- for_each_cpu(cpu, &flush_cpus)
- flush_work(per_cpu_ptr(&flush_works, cpu));
+ for_each_cpu(cpu, &ptr->flush_cpus)
+ flush_work(&ptr->w[cpu]);
cpus_read_unlock();
+
+ if (ptr != flush_backlogs_fallback)
+ kfree(ptr);
+ else
+ mutex_unlock(&flush_backlogs_mutex);
}
static void net_rps_send_ipi(struct softnet_data *remsd)
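flush_all_backlogs() used to rely on static per-cpu work structs and a static cpumask, which was safe only because every caller held the one global RTNL lock; with RTNL being split per netns that assumption goes away, so the state now lives in a heap-allocated object with a flexible array of work items, plus a mutex-protected fallback object if the allocation fails. A portable sketch of the flexible-array sizing idiom (the kernel additionally uses struct_size_t() for overflow-checked arithmetic):

#include <stdlib.h>

struct backlog_flush_demo {
	unsigned long cpu_mask;		/* stand-in for cpumask_t */
	int work[];			/* one slot per possible CPU */
};

struct backlog_flush_demo *alloc_flush(unsigned int nr_cpus)
{
	/* sizeof(*p) covers the fixed header; the trailing array is
	 * sized at run time for however many CPUs the system can have. */
	return malloc(sizeof(struct backlog_flush_demo) +
		      nr_cpus * sizeof(int));
}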
@@ -6688,6 +6860,8 @@ int dev_set_threaded(struct net_device *dev, bool threaded)
struct napi_struct *napi;
int err = 0;
+ netdev_assert_locked_or_invisible(dev);
+
if (dev->threaded == threaded)
return 0;
@@ -6766,13 +6940,14 @@ static void napi_restore_config(struct napi_struct *n)
n->gro_flush_timeout = n->config->gro_flush_timeout;
n->irq_suspend_timeout = n->config->irq_suspend_timeout;
/* a NAPI ID might be stored in the config, if so use it. if not, use
- * napi_hash_add to generate one for us. It will be saved to the config
- * in napi_disable.
+ * napi_hash_add to generate one for us.
*/
- if (n->config->napi_id)
+ if (n->config->napi_id) {
napi_hash_add_with_id(n, n->config->napi_id);
- else
+ } else {
napi_hash_add(n);
+ n->config->napi_id = n->napi_id;
+ }
}
static void napi_save_config(struct napi_struct *n)
@@ -6780,13 +6955,62 @@ static void napi_save_config(struct napi_struct *n)
n->config->defer_hard_irqs = n->defer_hard_irqs;
n->config->gro_flush_timeout = n->gro_flush_timeout;
n->config->irq_suspend_timeout = n->irq_suspend_timeout;
- n->config->napi_id = n->napi_id;
napi_hash_del(n);
}
-void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+/* Netlink wants the NAPI list to be sorted by ID, if adding a NAPI which will
+ * inherit an existing ID try to insert it at the right position.
+ */
+static void
+netif_napi_dev_list_add(struct net_device *dev, struct napi_struct *napi)
+{
+ unsigned int new_id, pos_id;
+ struct list_head *higher;
+ struct napi_struct *pos;
+
+ new_id = UINT_MAX;
+ if (napi->config && napi->config->napi_id)
+ new_id = napi->config->napi_id;
+
+ higher = &dev->napi_list;
+ list_for_each_entry(pos, &dev->napi_list, dev_list) {
+ if (pos->napi_id >= MIN_NAPI_ID)
+ pos_id = pos->napi_id;
+ else if (pos->config)
+ pos_id = pos->config->napi_id;
+ else
+ pos_id = UINT_MAX;
+
+ if (pos_id <= new_id)
+ break;
+ higher = &pos->dev_list;
+ }
+ list_add_rcu(&napi->dev_list, higher); /* adds after higher */
+}
+
+/* Double check that napi_get_frags() allocates skbs with
+ * skb->head being backed by slab, not a page fragment.
+ * This is to make sure bug fixed in 3226b158e67c
+ * ("net: avoid 32 x truesize under-estimation for tiny skbs")
+ * does not accidentally come back.
+ */
+static void napi_get_frags_check(struct napi_struct *napi)
+{
+ struct sk_buff *skb;
+
+ local_bh_disable();
+ skb = napi_get_frags(napi);
+ WARN_ON_ONCE(skb && skb->head_frag);
+ napi_free_frags(napi);
+ local_bh_enable();
+}
+
+void netif_napi_add_weight_locked(struct net_device *dev,
+ struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int),
+ int weight)
{
+ netdev_assert_locked(dev);
if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
return;
@@ -6810,7 +7034,7 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
napi->list_owner = -1;
set_bit(NAPI_STATE_SCHED, &napi->state);
set_bit(NAPI_STATE_NPSVC, &napi->state);
- list_add_rcu(&napi->dev_list, &dev->napi_list);
+ netif_napi_dev_list_add(dev, napi);
/* default settings from sysfs are applied to all NAPIs. any per-NAPI
* configuration will be loaded in napi_enable
@@ -6825,15 +7049,17 @@ void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi,
*/
if (dev->threaded && napi_kthread_create(napi))
dev->threaded = false;
- netif_napi_set_irq(napi, -1);
+ netif_napi_set_irq_locked(napi, -1);
}
-EXPORT_SYMBOL(netif_napi_add_weight);
+EXPORT_SYMBOL(netif_napi_add_weight_locked);
-void napi_disable(struct napi_struct *n)
+void napi_disable_locked(struct napi_struct *n)
{
unsigned long val, new;
might_sleep();
+ netdev_assert_locked(n->dev);
+
set_bit(NAPI_STATE_DISABLE, &n->state);
val = READ_ONCE(n->state);
@@ -6856,16 +7082,25 @@ void napi_disable(struct napi_struct *n)
clear_bit(NAPI_STATE_DISABLE, &n->state);
}
-EXPORT_SYMBOL(napi_disable);
+EXPORT_SYMBOL(napi_disable_locked);
/**
- * napi_enable - enable NAPI scheduling
- * @n: NAPI context
+ * napi_disable() - prevent NAPI from scheduling
+ * @n: NAPI context
*
- * Resume NAPI from being scheduled on this context.
- * Must be paired with napi_disable.
+ * Stop NAPI from being scheduled on this context.
+ * Waits till any outstanding processing completes.
+ * Takes netdev_lock() for associated net_device.
*/
-void napi_enable(struct napi_struct *n)
+void napi_disable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_disable_locked(n);
+ netdev_unlock(n->dev);
+}
+EXPORT_SYMBOL(napi_disable);
+
+void napi_enable_locked(struct napi_struct *n)
{
unsigned long new, val = READ_ONCE(n->state);
@@ -6882,6 +7117,22 @@ void napi_enable(struct napi_struct *n)
new |= NAPIF_STATE_THREADED;
} while (!try_cmpxchg(&n->state, &val, new));
}
+EXPORT_SYMBOL(napi_enable_locked);
+
+/**
+ * napi_enable() - enable NAPI scheduling
+ * @n: NAPI context
+ *
+ * Enable scheduling of a NAPI instance.
+ * Must be paired with napi_disable().
+ * Takes netdev_lock() for associated net_device.
+ */
+void napi_enable(struct napi_struct *n)
+{
+ netdev_lock(n->dev);
+ napi_enable_locked(n);
+ netdev_unlock(n->dev);
+}
EXPORT_SYMBOL(napi_enable);
static void flush_gro_hash(struct napi_struct *napi)
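napi_disable() and napi_enable() are now thin wrappers that take the netdev instance lock around the new *_locked variants, so drivers pick the call based on context. A sketch of the intended split; priv, dev and reconfigure_rings() are illustrative:

	/* Caller does not hold the instance lock: the plain helpers take
	 * and release netdev_lock(dev) internally. */
	napi_disable(&priv->napi);
	reconfigure_rings(priv);
	napi_enable(&priv->napi);

	/* Caller already runs under netdev_lock(dev), e.g. from another
	 * locked netdev op: use the _locked variants to avoid deadlock. */
	netdev_assert_locked(dev);
	napi_disable_locked(&priv->napi);
	reconfigure_rings(priv);
	napi_enable_locked(&priv->napi);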
@@ -6898,8 +7149,10 @@ static void flush_gro_hash(struct napi_struct *napi)
}
/* Must be called in process context */
-void __netif_napi_del(struct napi_struct *napi)
+void __netif_napi_del_locked(struct napi_struct *napi)
{
+ netdev_assert_locked(napi->dev);
+
if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
return;
@@ -6919,7 +7172,7 @@ void __netif_napi_del(struct napi_struct *napi)
napi->thread = NULL;
}
}
-EXPORT_SYMBOL(__netif_napi_del);
+EXPORT_SYMBOL(__netif_napi_del_locked);
static int __napi_poll(struct napi_struct *n, bool *repoll)
{
@@ -9535,11 +9788,31 @@ u8 dev_xdp_prog_count(struct net_device *dev)
}
EXPORT_SYMBOL_GPL(dev_xdp_prog_count);
+u8 dev_xdp_sb_prog_count(struct net_device *dev)
+{
+ u8 count = 0;
+ int i;
+
+ for (i = 0; i < __MAX_XDP_MODE; i++)
+ if (dev->xdp_state[i].prog &&
+ !dev->xdp_state[i].prog->aux->xdp_has_frags)
+ count++;
+ return count;
+}
+
int dev_xdp_propagate(struct net_device *dev, struct netdev_bpf *bpf)
{
if (!dev->netdev_ops->ndo_bpf)
return -EOPNOTSUPP;
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ bpf->command == XDP_SETUP_PROG &&
+ bpf->prog && !bpf->prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "unable to propagate XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
if (dev_get_min_mp_channel_count(dev)) {
NL_SET_ERR_MSG(bpf->extack, "unable to propagate XDP to device using memory provider");
return -EBUSY;
@@ -9577,6 +9850,12 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
struct netdev_bpf xdp;
int err;
+ if (dev->cfg->hds_config == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ prog && !prog->aux->xdp_has_frags) {
+ NL_SET_ERR_MSG(extack, "unable to install XDP to device using tcp-data-split");
+ return -EBUSY;
+ }
+
if (dev_get_min_mp_channel_count(dev)) {
NL_SET_ERR_MSG(extack, "unable to install XDP to device using memory provider");
return -EBUSY;
@@ -10061,6 +10340,15 @@ static void dev_index_release(struct net *net, int ifindex)
WARN_ON(xa_erase(&net->dev_by_index, ifindex));
}
+static bool from_cleanup_net(void)
+{
+#ifdef CONFIG_NET_NS
+ return current == cleanup_net_task;
+#else
+ return false;
+#endif
+}
+
/* Delayed registration/unregisteration */
LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
@@ -10657,7 +10945,9 @@ int register_netdevice(struct net_device *dev)
ret = netdev_register_kobject(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED);
+ netdev_unlock(dev);
if (ret)
goto err_uninit_notify;
@@ -10725,26 +11015,20 @@ err_free_name:
EXPORT_SYMBOL(register_netdevice);
/* Initialize the core of a dummy net device.
- * This is useful if you are calling this function after alloc_netdev(),
- * since it does not memset the net_device fields.
+ * This applies the setup steps that dummy netdevs need and that normal
+ * netdevs get by going through register_netdevice().
*/
-static void init_dummy_netdev_core(struct net_device *dev)
+static void init_dummy_netdev(struct net_device *dev)
{
/* make sure we BUG if trying to hit standard
* register/unregister code path
*/
dev->reg_state = NETREG_DUMMY;
- /* NAPI wants this */
- INIT_LIST_HEAD(&dev->napi_list);
-
/* a dummy interface is started by default */
set_bit(__LINK_STATE_PRESENT, &dev->state);
set_bit(__LINK_STATE_START, &dev->state);
- /* napi_busy_loop stats accounting wants this */
- dev_net_set(dev, &init_net);
-
/* Note : We dont allocate pcpu_refcnt for dummy devices,
* because users of this 'device' dont need to change
* its refcount.
@@ -10752,28 +11036,6 @@ static void init_dummy_netdev_core(struct net_device *dev)
}
/**
- * init_dummy_netdev - init a dummy network device for NAPI
- * @dev: device to init
- *
- * This takes a network device structure and initializes the minimum
- * amount of fields so it can be used to schedule NAPI polls without
- * registering a full blown interface. This is to be used by drivers
- * that need to tie several hardware interfaces to a single NAPI
- * poll scheduler due to HW limitations.
- */
-void init_dummy_netdev(struct net_device *dev)
-{
- /* Clear everything. Note we don't initialize spinlocks
- * as they aren't supposed to be taken by any of the
- * NAPI code and this dummy netdev is supposed to be
- * only ever used for NAPI polls
- */
- memset(dev, 0, sizeof(struct net_device));
- init_dummy_netdev_core(dev);
-}
-EXPORT_SYMBOL_GPL(init_dummy_netdev);
-
-/**
* register_netdev - register a network device
* @dev: device to register
*
@@ -10788,12 +11050,16 @@ EXPORT_SYMBOL_GPL(init_dummy_netdev);
*/
int register_netdev(struct net_device *dev)
{
+ struct net *net = dev_net(dev);
int err;
- if (rtnl_lock_killable())
+ if (rtnl_net_lock_killable(net))
return -EINTR;
+
err = register_netdevice(dev);
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
+
return err;
}
EXPORT_SYMBOL(register_netdev);
@@ -10955,7 +11221,9 @@ void netdev_run_todo(void)
continue;
}
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED);
+ netdev_unlock(dev);
linkwatch_sync_dev(dev);
}
@@ -11103,6 +11371,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
const struct net_device_ops *ops = dev->netdev_ops;
const struct net_device_core_stats __percpu *p;
+ /*
+ * IPv{4,6} and udp tunnels share common stat helpers and use
+ * different stat type (NETDEV_PCPU_STAT_TSTATS vs
+ * NETDEV_PCPU_STAT_DSTATS). Ensure the accounting is consistent.
+ */
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_bytes) !=
+ offsetof(struct pcpu_dstats, rx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, rx_packets) !=
+ offsetof(struct pcpu_dstats, rx_packets));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_bytes) !=
+ offsetof(struct pcpu_dstats, tx_bytes));
+ BUILD_BUG_ON(offsetof(struct pcpu_sw_netstats, tx_packets) !=
+ offsetof(struct pcpu_dstats, tx_packets));
+
if (ops->ndo_get_stats64) {
memset(storage, 0, sizeof(*storage));
ops->ndo_get_stats64(dev, storage);
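The BUILD_BUG_ON() block added to dev_get_stats() pins a layout contract: tunnel code shares stat helpers between NETDEV_PCPU_STAT_TSTATS and NETDEV_PCPU_STAT_DSTATS devices, which only works while the common counters sit at identical offsets in both per-cpu structs. The same idea in portable C11:

#include <assert.h>
#include <stddef.h>

struct tstats_demo { unsigned long long rx_bytes, rx_packets; };
struct dstats_demo { unsigned long long rx_bytes, rx_packets, rx_drops; };

/* Fails at compile time if someone reorders one struct but not the other. */
static_assert(offsetof(struct tstats_demo, rx_packets) ==
	      offsetof(struct dstats_demo, rx_packets),
	      "shared counters must stay at matching offsets");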
@@ -11341,6 +11623,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
if (!dev->ethtool)
goto free_all;
+ dev->cfg = kzalloc(sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg)
+ goto free_all;
+ dev->cfg_pending = dev->cfg;
+
napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config));
dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT);
if (!dev->napi_config)
@@ -11370,6 +11657,22 @@ free_dev:
}
EXPORT_SYMBOL(alloc_netdev_mqs);
+static void netdev_napi_exit(struct net_device *dev)
+{
+ if (!list_empty(&dev->napi_list)) {
+ struct napi_struct *p, *n;
+
+ netdev_lock(dev);
+ list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+ __netif_napi_del_locked(p);
+ netdev_unlock(dev);
+
+ synchronize_net();
+ }
+
+ kvfree(dev->napi_config);
+}
+
/**
* free_netdev - free network device
* @dev: device
@@ -11381,8 +11684,6 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
*/
void free_netdev(struct net_device *dev)
{
- struct napi_struct *p, *n;
-
might_sleep();
/* When called immediately after register_netdevice() failed the unwind
@@ -11395,8 +11696,8 @@ void free_netdev(struct net_device *dev)
return;
}
- mutex_destroy(&dev->lock);
-
+ WARN_ON(dev->cfg != dev->cfg_pending);
+ kfree(dev->cfg);
kfree(dev->ethtool);
netif_free_tx_queues(dev);
netif_free_rx_queues(dev);
@@ -11406,10 +11707,7 @@ void free_netdev(struct net_device *dev)
/* Flush device addresses */
dev_addr_flush(dev);
- list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
- netif_napi_del(p);
-
- kvfree(dev->napi_config);
+ netdev_napi_exit(dev);
ref_tracker_dir_exit(&dev->refcnt_tracker);
#ifdef CONFIG_PCPU_DEV_REFCNT
@@ -11423,6 +11721,8 @@ void free_netdev(struct net_device *dev)
netdev_free_phy_link_topology(dev);
+ mutex_destroy(&dev->lock);
+
/* Compatibility with error handling in drivers */
if (dev->reg_state == NETREG_UNINITIALIZED ||
dev->reg_state == NETREG_DUMMY) {
@@ -11447,7 +11747,7 @@ EXPORT_SYMBOL(free_netdev);
struct net_device *alloc_netdev_dummy(int sizeof_priv)
{
return alloc_netdev(sizeof_priv, "dummy#", NET_NAME_UNKNOWN,
- init_dummy_netdev_core);
+ init_dummy_netdev);
}
EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
@@ -11460,7 +11760,7 @@ EXPORT_SYMBOL_GPL(alloc_netdev_dummy);
void synchronize_net(void)
{
might_sleep();
- if (rtnl_is_locked())
+ if (from_cleanup_net() || rtnl_is_locked())
synchronize_rcu_expedited();
else
synchronize_rcu();
@@ -11561,7 +11861,9 @@ void unregister_netdevice_many_notify(struct list_head *head,
list_for_each_entry(dev, head, unreg_list) {
/* And unlink it from device chain. */
unlist_netdevice(dev);
+ netdev_lock(dev);
WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING);
+ netdev_unlock(dev);
}
flush_all_backlogs();
@@ -11663,9 +11965,9 @@ EXPORT_SYMBOL(unregister_netdevice_many);
*/
void unregister_netdev(struct net_device *dev)
{
- rtnl_lock();
+ rtnl_net_dev_lock(dev);
unregister_netdevice(dev);
- rtnl_unlock();
+ rtnl_net_dev_unlock(dev);
}
EXPORT_SYMBOL(unregister_netdev);
@@ -12211,11 +12513,18 @@ static int net_page_pool_create(int cpuid)
.nid = cpu_to_mem(cpuid),
};
struct page_pool *pp_ptr;
+ int err;
pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid);
if (IS_ERR(pp_ptr))
return -ENOMEM;
+ err = xdp_reg_page_pool(pp_ptr);
+ if (err) {
+ page_pool_destroy(pp_ptr);
+ return err;
+ }
+
per_cpu(system_page_pool, cpuid) = pp_ptr;
#endif
return 0;
@@ -12281,12 +12590,13 @@ static int __init net_dev_init(void)
* Initialise the packet receive queues.
*/
+ flush_backlogs_fallback = flush_backlogs_alloc();
+ if (!flush_backlogs_fallback)
+ goto out;
+
for_each_possible_cpu(i) {
- struct work_struct *flush = per_cpu_ptr(&flush_works, i);
struct softnet_data *sd = &per_cpu(softnet_data, i);
- INIT_WORK(flush, flush_backlog);
-
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
#ifdef CONFIG_XFRM_OFFLOAD
@@ -12349,6 +12659,7 @@ out:
if (!pp_ptr)
continue;
+ xdp_unreg_page_pool(pp_ptr);
page_pool_destroy(pp_ptr);
per_cpu(system_page_pool, i) = NULL;
}
diff --git a/net/core/dev.h b/net/core/dev.h
index deb5eae5749f..a5b166bbd169 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -2,6 +2,7 @@
#ifndef _NET_CORE_DEV_H
#define _NET_CORE_DEV_H
+#include <linux/cleanup.h>
#include <linux/types.h>
#include <linux/rwsem.h>
#include <linux/netdevice.h>
@@ -22,7 +23,22 @@ struct sd_flow_limit {
extern int netdev_flow_limit_table_len;
-struct napi_struct *netdev_napi_by_id(struct net *net, unsigned int napi_id);
+struct napi_struct *
+netdev_napi_by_id_lock(struct net *net, unsigned int napi_id);
+struct net_device *dev_get_by_napi_id(unsigned int napi_id);
+
+struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex);
+struct net_device *__netdev_put_lock(struct net_device *dev);
+struct net_device *
+netdev_xa_find_lock(struct net *net, struct net_device *dev,
+ unsigned long *index);
+
+DEFINE_FREE(netdev_unlock, struct net_device *, if (_T) netdev_unlock(_T));
+
+#define for_each_netdev_lock_scoped(net, var_name, ifindex) \
+ for (struct net_device *var_name __free(netdev_unlock) = NULL; \
+ (var_name = netdev_xa_find_lock(net, var_name, &ifindex)); \
+ ifindex++)
#ifdef CONFIG_PROC_FS
int __init dev_proc_init(void);
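dev.h pairs the new lookup helpers with a DEFINE_FREE() cleanup and a scoped iterator, so a walker can visit every device in a netns with that device's instance lock held, and the lock is dropped automatically when the loop variable leaves scope, including on break or early return. A usage sketch with an invented loop body:

	unsigned long ifindex = 0;

	for_each_netdev_lock_scoped(net, dev, ifindex) {
		/* dev is non-NULL and netdev_lock(dev) is held here; the
		 * __free(netdev_unlock) cleanup releases it when this
		 * iteration's scope ends. */
		if (dump_one_dev(skb, dev))	/* hypothetical */
			break;
	}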
@@ -111,6 +127,18 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
void unregister_netdevice_many_notify(struct list_head *head,
u32 portid, const struct nlmsghdr *nlh);
+static inline void netif_set_up(struct net_device *dev, bool value)
+{
+ if (value)
+ dev->flags |= IFF_UP;
+ else
+ dev->flags &= ~IFF_UP;
+
+ netdev_lock(dev);
+ dev->up = value;
+ netdev_unlock(dev);
+}
+
static inline void netif_set_gso_max_size(struct net_device *dev,
unsigned int size)
{
@@ -311,5 +339,8 @@ static inline void dev_xmit_recursion_dec(void)
int dev_set_hwtstamp_phylib(struct net_device *dev,
struct kernel_hwtstamp_config *cfg,
struct netlink_ext_ack *extack);
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg);
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg);
#endif
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index 166e404f7c03..90716bd736f3 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -242,9 +242,9 @@ static void __hw_addr_unsync_one(struct netdev_hw_addr_list *to_list,
__hw_addr_del_entry(from_list, ha, false, false);
}
-static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
- struct netdev_hw_addr_list *from_list,
- int addr_len)
+int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
+ struct netdev_hw_addr_list *from_list,
+ int addr_len)
{
int err = 0;
struct netdev_hw_addr *ha, *tmp;
@@ -260,6 +260,7 @@ static int __hw_addr_sync_multiple(struct netdev_hw_addr_list *to_list,
}
return err;
}
+EXPORT_SYMBOL(__hw_addr_sync_multiple);
/* This function only works where there is a strict 1-1 relationship
* between source and destination of they synch. If you ever need to
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 46d43b950471..57f79f8e8466 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -6,6 +6,7 @@
#include <linux/rtnetlink.h>
#include <linux/net_tstamp.h>
#include <linux/phylib_stubs.h>
+#include <linux/ptp_clock_kernel.h>
#include <linux/wireless.h>
#include <linux/if_bridge.h>
#include <net/dsa_stubs.h>
@@ -184,7 +185,7 @@ static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cm
return err;
}
-static int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
+int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg)
{
enum hwtstamp_tx_types tx_type;
enum hwtstamp_rx_filters rx_filter;
@@ -266,9 +267,24 @@ static int dev_eth_ioctl(struct net_device *dev,
* -EOPNOTSUPP for phylib for now, which is still more accurate than letting
* the netdev handle the GET request.
*/
-static int dev_get_hwtstamp_phylib(struct net_device *dev,
- struct kernel_hwtstamp_config *cfg)
+int dev_get_hwtstamp_phylib(struct net_device *dev,
+ struct kernel_hwtstamp_config *cfg)
{
+ struct hwtstamp_provider *hwprov;
+
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ cfg->qualifier = hwprov->desc.qualifier;
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev)
+ return phy_hwtstamp_get(hwprov->phydev, cfg);
+
+ if (hwprov->source == HWTSTAMP_SOURCE_NETDEV)
+ return dev->netdev_ops->ndo_hwtstamp_get(dev, cfg);
+
+ return -EOPNOTSUPP;
+ }
+
if (phy_is_default_hwtstamp(dev->phydev))
return phy_hwtstamp_get(dev->phydev, cfg);
@@ -324,11 +340,32 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
- bool phy_ts = phy_is_default_hwtstamp(dev->phydev);
struct kernel_hwtstamp_config old_cfg = {};
+ struct hwtstamp_provider *hwprov;
+ struct phy_device *phydev;
bool changed = false;
+ bool phy_ts;
int err;
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source == HWTSTAMP_SOURCE_PHYLIB &&
+ hwprov->phydev) {
+ phy_ts = true;
+ phydev = hwprov->phydev;
+ } else if (hwprov->source == HWTSTAMP_SOURCE_NETDEV) {
+ phy_ts = false;
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ cfg->qualifier = hwprov->desc.qualifier;
+ } else {
+ phy_ts = phy_is_default_hwtstamp(dev->phydev);
+ if (phy_ts)
+ phydev = dev->phydev;
+ }
+
cfg->source = phy_ts ? HWTSTAMP_SOURCE_PHYLIB : HWTSTAMP_SOURCE_NETDEV;
if (phy_ts && dev->see_all_hwtstamp_requests) {
@@ -350,7 +387,7 @@ int dev_set_hwtstamp_phylib(struct net_device *dev,
changed = kernel_hwtstamp_config_changed(&old_cfg, cfg);
if (phy_ts) {
- err = phy_hwtstamp_set(dev->phydev, cfg, extack);
+ err = phy_hwtstamp_set(phydev, cfg, extack);
if (err) {
if (changed)
ops->ndo_hwtstamp_set(dev, &old_cfg, NULL);
@@ -506,7 +543,7 @@ static int dev_siocwandev(struct net_device *dev, struct if_settings *ifs)
}
/*
- * Perform the SIOCxIFxxx calls, inside rtnl_lock()
+ * Perform the SIOCxIFxxx calls, inside rtnl_net_lock()
*/
static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
unsigned int cmd)
@@ -514,7 +551,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
int err;
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
const struct net_device_ops *ops;
- netdevice_tracker dev_tracker;
if (!dev)
return -ENODEV;
@@ -577,19 +613,6 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data,
case SIOCWANDEV:
return dev_siocwandev(dev, &ifr->ifr_settings);
- case SIOCBRADDIF:
- case SIOCBRDELIF:
- if (!netif_device_present(dev))
- return -ENODEV;
- if (!netif_is_bridge_master(dev))
- return -EOPNOTSUPP;
- netdev_hold(dev, &dev_tracker, GFP_KERNEL);
- rtnl_unlock();
- err = br_ioctl_call(net, netdev_priv(dev), cmd, ifr, NULL);
- netdev_put(dev, &dev_tracker);
- rtnl_lock();
- return err;
-
case SIOCDEVPRIVATE ... SIOCDEVPRIVATE + 15:
return dev_siocdevprivate(dev, ifr, data, cmd);
@@ -733,9 +756,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
dev_load(net, ifr->ifr_name);
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (colon)
*colon = ':';
return ret;
@@ -770,8 +795,6 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
case SIOCBONDRELEASE:
case SIOCBONDSETHWADDR:
case SIOCBONDCHANGEACTIVE:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
case SIOCSHWTSTAMP:
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
@@ -779,9 +802,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
case SIOCBONDSLAVEINFOQUERY:
case SIOCBONDINFOQUERY:
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
+
if (need_copyout)
*need_copyout = false;
return ret;
@@ -804,9 +829,10 @@ int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
(cmd >= SIOCDEVPRIVATE &&
cmd <= SIOCDEVPRIVATE + 15)) {
dev_load(net, ifr->ifr_name);
- rtnl_lock();
+
+ rtnl_net_lock(net);
ret = dev_ifsioc(net, ifr, data, cmd);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return ret;
}
return -ENOTTY;
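dev_ioctl() switches from the global rtnl_lock() to rtnl_net_lock(net), the per-netns flavour introduced while RTNL is being split up; today it still takes RTNL plus the namespace's own mutex, so the critical section is unchanged but the per-netns lock is already exercised. The calling pattern, sketched with a hypothetical helper:

	struct net *net = dev_net(dev);
	int err;

	rtnl_net_lock(net);		/* RTNL plus this netns' mutex */
	err = change_dev_config(net, dev);
	rtnl_net_unlock(net);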
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 11b91c12ee11..0e5a2c672efd 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -8,6 +8,7 @@
*/
#include <linux/dma-buf.h>
+#include <linux/ethtool_netlink.h>
#include <linux/genalloc.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
@@ -108,6 +109,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
struct netdev_rx_queue *rxq;
unsigned long xa_idx;
unsigned int rxq_idx;
+ int err;
if (binding->list.next)
list_del(&binding->list);
@@ -119,7 +121,8 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
rxq_idx = get_netdev_rx_queue_index(rxq);
- WARN_ON(netdev_rx_queue_restart(binding->dev, rxq_idx));
+ err = netdev_rx_queue_restart(binding->dev, rxq_idx);
+ WARN_ON(err && err != -ENETDOWN);
}
xa_erase(&net_devmem_dmabuf_bindings, binding->id);
@@ -140,6 +143,16 @@ int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
return -ERANGE;
}
+ if (dev->cfg->hds_config != ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(extack, "tcp-data-split is disabled");
+ return -EINVAL;
+ }
+
+ if (dev->cfg->hds_thresh) {
+ NL_SET_ERR_MSG(extack, "hds-thresh is not zero");
+ return -EINVAL;
+ }
+
rxq = __netif_get_rx_queue(dev, rxq_idx);
if (rxq->mp_params.mp_priv) {
NL_SET_ERR_MSG(extack, "designated queue already memory provider bound");
@@ -331,11 +344,11 @@ int mp_dmabuf_devmem_init(struct page_pool *pool)
if (!binding)
return -EINVAL;
- if (!pool->dma_map)
- return -EOPNOTSUPP;
-
- if (pool->dma_sync)
- return -EOPNOTSUPP;
+ /* dma-buf dma addresses do not need and should not be used with
+ * dma_sync_for_cpu/device. Force disable dma_sync.
+ */
+ pool->dma_sync = false;
+ pool->dma_sync_for_cpu = false;
if (pool->p.order != 0)
return -E2BIG;
diff --git a/net/core/dst.c b/net/core/dst.c
index 9552a90d4772..6d76b799ce64 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -165,6 +165,14 @@ static void dst_count_dec(struct dst_entry *dst)
void dst_release(struct dst_entry *dst)
{
if (dst && rcuref_put(&dst->__rcuref)) {
+#ifdef CONFIG_DST_CACHE
+ if (dst->flags & DST_METADATA) {
+ struct metadata_dst *md_dst = (struct metadata_dst *)dst;
+
+ if (md_dst->type == METADATA_IP_TUNNEL)
+ dst_cache_reset_now(&md_dst->u.tun_info.dst_cache);
+ }
+#endif
dst_count_dec(dst);
call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
}
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index ff1cebd71f7b..94a7872ab231 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -772,6 +772,8 @@ static const struct nla_policy fib_rule_policy[FRA_MAX + 1] = {
[FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
[FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) },
[FRA_DSCP] = NLA_POLICY_MAX(NLA_U8, INET_DSCP_MASK >> 2),
+ [FRA_FLOWLABEL] = { .type = NLA_BE32 },
+ [FRA_FLOWLABEL_MASK] = { .type = NLA_BE32 },
};
int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh,
diff --git a/net/core/filter.c b/net/core/filter.c
index d59a7ea646ca..b0df9b7d16d3 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -218,24 +218,36 @@ BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x)
return 0;
}
+static int bpf_skb_load_helper_convert_offset(const struct sk_buff *skb, int offset)
+{
+ if (likely(offset >= 0))
+ return offset;
+
+ if (offset >= SKF_NET_OFF)
+ return offset - SKF_NET_OFF + skb_network_offset(skb);
+
+ if (offset >= SKF_LL_OFF && skb_mac_header_was_set(skb))
+ return offset - SKF_LL_OFF + skb_mac_offset(skb);
+
+ return INT_MIN;
+}
+
BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- u8 tmp, *ptr;
+ u8 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return *(u8 *)(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return tmp;
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return *(u8 *)ptr;
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return *(u8 *)(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return tmp;
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
@@ -248,21 +260,19 @@ BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- __be16 tmp, *ptr;
+ __be16 tmp;
const int len = sizeof(tmp);
- if (offset >= 0) {
- if (headlen - offset >= len)
- return get_unaligned_be16(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be16_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be16(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be16(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be16_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
@@ -275,21 +285,19 @@ BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb,
BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *,
data, int, headlen, int, offset)
{
- __be32 tmp, *ptr;
+ __be32 tmp;
const int len = sizeof(tmp);
- if (likely(offset >= 0)) {
- if (headlen - offset >= len)
- return get_unaligned_be32(data + offset);
- if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
- return be32_to_cpu(tmp);
- } else {
- ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len);
- if (likely(ptr))
- return get_unaligned_be32(ptr);
- }
+ offset = bpf_skb_load_helper_convert_offset(skb, offset);
+ if (offset == INT_MIN)
+ return -EFAULT;
- return -EFAULT;
+ if (headlen - offset >= len)
+ return get_unaligned_be32(data + offset);
+ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp)))
+ return be32_to_cpu(tmp);
+ else
+ return -EFAULT;
}
BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb,
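All three load helpers above gain bpf_skb_load_helper_convert_offset(), which resolves the classic-BPF magic negative offsets up front: SKF_NET_OFF-relative loads become network-header relative, SKF_LL_OFF-relative loads become mac-header relative (when a mac header is set), and anything else now fails with -EFAULT instead of going through bpf_internal_load_pointer_neg_helper(). These offsets are user-visible through classic socket filters, for example reading the IP protocol byte relative to the network header:

#include <linux/filter.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Classic BPF: accept only UDP by matching the byte at
 * network header + 9 (struct iphdr::protocol). */
static struct sock_filter udp_only[] = {
	BPF_STMT(BPF_LD | BPF_B | BPF_ABS, SKF_NET_OFF + 9),
	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, IPPROTO_UDP, 0, 1),
	BPF_STMT(BPF_RET | BPF_K, 0xffff),	/* accept */
	BPF_STMT(BPF_RET | BPF_K, 0),		/* drop */
};

int attach_udp_only(int fd)
{
	struct sock_fprog prog = {
		.len = sizeof(udp_only) / sizeof(udp_only[0]),
		.filter = udp_only,
	};

	return setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
			  &prog, sizeof(prog));
}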
@@ -4128,13 +4136,13 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
}
static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
- struct xdp_mem_info *mem_info, bool release)
+ enum xdp_mem_type mem_type, bool release)
{
struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
if (release) {
xsk_buff_del_tail(zc_frag);
- __xdp_return(NULL, mem_info, false, zc_frag);
+ __xdp_return(0, mem_type, false, zc_frag);
} else {
zc_frag->data_end -= shrink;
}
@@ -4143,19 +4151,16 @@ static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
int shrink)
{
- struct xdp_mem_info *mem_info = &xdp->rxq->mem;
+ enum xdp_mem_type mem_type = xdp->rxq->mem.type;
bool release = skb_frag_size(frag) == shrink;
- if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
- bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
+ if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
+ bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
goto out;
}
- if (release) {
- struct page *page = skb_frag_page(frag);
-
- __xdp_return(page_address(page), mem_info, false, NULL);
- }
+ if (release)
+ __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
out:
return release;
@@ -4357,9 +4362,9 @@ u32 xdp_master_redirect(struct xdp_buff *xdp)
EXPORT_SYMBOL_GPL(xdp_master_redirect);
static inline int __xdp_do_redirect_xsk(struct bpf_redirect_info *ri,
- struct net_device *dev,
+ const struct net_device *dev,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
@@ -4380,10 +4385,10 @@ err:
return err;
}
-static __always_inline int __xdp_do_redirect_frame(struct bpf_redirect_info *ri,
- struct net_device *dev,
- struct xdp_frame *xdpf,
- struct bpf_prog *xdp_prog)
+static __always_inline int
+__xdp_do_redirect_frame(struct bpf_redirect_info *ri, struct net_device *dev,
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
enum bpf_map_type map_type = ri->map_type;
void *fwd = ri->tgt_value;
@@ -4452,7 +4457,7 @@ err:
}
int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog)
+ const struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
@@ -4466,7 +4471,8 @@ int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
EXPORT_SYMBOL_GPL(xdp_do_redirect);
int xdp_do_redirect_frame(struct net_device *dev, struct xdp_buff *xdp,
- struct xdp_frame *xdpf, struct bpf_prog *xdp_prog)
+ struct xdp_frame *xdpf,
+ const struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
@@ -4481,9 +4487,9 @@ EXPORT_SYMBOL_GPL(xdp_do_redirect_frame);
static int xdp_do_generic_redirect_map(struct net_device *dev,
struct sk_buff *skb,
struct xdp_buff *xdp,
- struct bpf_prog *xdp_prog, void *fwd,
- enum bpf_map_type map_type, u32 map_id,
- u32 flags)
+ const struct bpf_prog *xdp_prog,
+ void *fwd, enum bpf_map_type map_type,
+ u32 map_id, u32 flags)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
struct bpf_map *map;
@@ -4537,7 +4543,8 @@ err:
}
int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
- struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
+ struct xdp_buff *xdp,
+ const struct bpf_prog *xdp_prog)
{
struct bpf_redirect_info *ri = bpf_net_ctx_get_ri();
enum bpf_map_type map_type = ri->map_type;
@@ -9079,7 +9086,8 @@ static bool xdp_is_valid_access(int off, int size,
return __is_valid_xdp_access(off, size);
}
-void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, u32 act)
+void bpf_warn_invalid_xdp_action(const struct net_device *dev,
+ const struct bpf_prog *prog, u32 act)
{
const u32 act_max = XDP_REDIRECT;
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 711cd3b4347a..4417a18b3e95 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -23,6 +23,8 @@
#include <net/ip6_fib.h>
#include <net/rtnh.h>
+#include "dev.h"
+
DEFINE_STATIC_KEY_FALSE(nf_hooks_lwtunnel_enabled);
EXPORT_SYMBOL_GPL(nf_hooks_lwtunnel_enabled);
@@ -325,13 +327,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_cmp_encap);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
+ goto drop;
+ }
- if (!dst)
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -341,8 +353,11 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->output))
+ if (likely(ops && ops->output)) {
+ dev_xmit_recursion_inc();
ret = ops->output(net, sk, skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
@@ -359,13 +374,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_output);
int lwtunnel_xmit(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
+
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
+ goto drop;
+ }
- if (!dst)
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
goto drop;
+ }
lwtstate = dst->lwtstate;
@@ -376,8 +401,11 @@ int lwtunnel_xmit(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->xmit))
+ if (likely(ops && ops->xmit)) {
+ dev_xmit_recursion_inc();
ret = ops->xmit(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
@@ -394,13 +422,23 @@ EXPORT_SYMBOL_GPL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb)
{
- struct dst_entry *dst = skb_dst(skb);
const struct lwtunnel_encap_ops *ops;
struct lwtunnel_state *lwtstate;
- int ret = -EINVAL;
+ struct dst_entry *dst;
+ int ret;
- if (!dst)
+ if (dev_xmit_recursion()) {
+ net_crit_ratelimited("%s(): recursion limit reached on datapath\n",
+ __func__);
+ ret = -ENETDOWN;
goto drop;
+ }
+
+ dst = skb_dst(skb);
+ if (!dst) {
+ ret = -EINVAL;
+ goto drop;
+ }
lwtstate = dst->lwtstate;
if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
@@ -410,8 +448,11 @@ int lwtunnel_input(struct sk_buff *skb)
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
- if (likely(ops && ops->input))
+ if (likely(ops && ops->input)) {
+ dev_xmit_recursion_inc();
ret = ops->input(skb);
+ dev_xmit_recursion_dec();
+ }
rcu_read_unlock();
if (ret == -EOPNOTSUPP)
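/*
 * Illustrative sketch (not part of the patch): the recursion guard pattern
 * that lwtunnel_output(), lwtunnel_xmit() and lwtunnel_input() now share.
 * The dev_xmit_recursion*() helpers come from net/core/dev.h (hence the new
 * include above); the encap_fn() callback and this wrapper are hypothetical.
 */
static int lwt_guarded_call(struct net *net, struct sock *sk,
			    struct sk_buff *skb,
			    int (*encap_fn)(struct net *, struct sock *,
					    struct sk_buff *))
{
	int ret;

	if (dev_xmit_recursion()) {
		/* A mis-built tunnel stack re-entered the datapath */
		kfree_skb(skb);
		return -ENETDOWN;
	}

	dev_xmit_recursion_inc();
	ret = encap_fn(net, sk, skb);
	dev_xmit_recursion_dec();

	return ret;
}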
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index bd0251bd74a1..1a620f903c56 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2250,6 +2250,7 @@ static const struct nla_policy nl_neightbl_policy[NDTA_MAX+1] = {
static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = {
[NDTPA_IFINDEX] = { .type = NLA_U32 },
[NDTPA_QUEUE_LEN] = { .type = NLA_U32 },
+ [NDTPA_QUEUE_LENBYTES] = { .type = NLA_U32 },
[NDTPA_PROXY_QLEN] = { .type = NLA_U32 },
[NDTPA_APP_PROBES] = { .type = NLA_U32 },
[NDTPA_UCAST_PROBES] = { .type = NLA_U32 },
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 2d9afc6e2161..07cb99b114bd 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -36,7 +36,7 @@ static const char fmt_uint[] = "%u\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
-/* Caller holds RTNL or RCU */
+/* Caller holds RTNL, netdev->lock or RCU */
static inline int dev_isalive(const struct net_device *dev)
{
return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED;
@@ -108,6 +108,36 @@ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr,
return ret;
}
+/* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */
+static ssize_t
+netdev_lock_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t len,
+ int (*set)(struct net_device *, unsigned long))
+{
+ struct net_device *netdev = to_net_dev(dev);
+ struct net *net = dev_net(netdev);
+ unsigned long new;
+ int ret;
+
+ if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ ret = kstrtoul(buf, 0, &new);
+ if (ret)
+ return ret;
+
+ netdev_lock(netdev);
+
+ if (dev_isalive(netdev)) {
+ ret = (*set)(netdev, new);
+ if (ret == 0)
+ ret = len;
+ }
+ netdev_unlock(netdev);
+
+ return ret;
+}
+
NETDEVICE_SHOW_RO(dev_id, fmt_hex);
NETDEVICE_SHOW_RO(dev_port, fmt_dec);
NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec);
@@ -420,7 +450,7 @@ static ssize_t gro_flush_timeout_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_gro_flush_timeout);
+ return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout);
}
NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong);
@@ -440,7 +470,8 @@ static ssize_t napi_defer_hard_irqs_store(struct device *dev,
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs);
+ return netdev_lock_store(dev, attr, buf, len,
+ change_napi_defer_hard_irqs);
}
NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint);
@@ -638,7 +669,7 @@ static ssize_t threaded_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t len)
{
- return netdev_store(dev, attr, buf, len, modify_napi_threaded);
+ return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded);
}
static DEVICE_ATTR_RW(threaded);
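/*
 * Illustrative sketch (not part of the patch): how a further per-netdev
 * attribute could be wired through netdev_lock_store(). The attribute name
 * and the no-op setter are hypothetical; the point is that the setter runs
 * under netdev_lock() rather than rtnl_lock().
 */
static int change_example_param(struct net_device *dev, unsigned long val)
{
	/* Called with netdev_lock(dev) held by netdev_lock_store() */
	return 0;
}

static ssize_t example_param_store(struct device *dev,
				   struct device_attribute *attr,
				   const char *buf, size_t len)
{
	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	return netdev_lock_store(dev, attr, buf, len, change_example_param);
}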
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index b71aa96eeee2..4303f2a49262 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -588,6 +588,8 @@ static void unhash_nsid(struct net *net, struct net *last)
static LLIST_HEAD(cleanup_list);
+struct task_struct *cleanup_net_task;
+
static void cleanup_net(struct work_struct *work)
{
const struct pernet_operations *ops;
@@ -596,6 +598,8 @@ static void cleanup_net(struct work_struct *work)
LIST_HEAD(net_exit_list);
LIST_HEAD(dev_kill_list);
+ cleanup_net_task = current;
+
/* Atomically snapshot the list of namespaces to cleanup */
net_kill_list = llist_del_all(&cleanup_list);
@@ -670,6 +674,7 @@ static void cleanup_net(struct work_struct *work)
put_user_ns(net->user_ns);
net_passive_dec(net);
}
+ cleanup_net_task = NULL;
}
/**
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index a3bdaf075b6b..7832abc5ca6e 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -52,6 +52,8 @@ XDP_METADATA_KFUNC_xxx
xsk_features |= NETDEV_XSK_FLAGS_TX_TIMESTAMP;
if (netdev->xsk_tx_metadata_ops->tmo_request_checksum)
xsk_features |= NETDEV_XSK_FLAGS_TX_CHECKSUM;
+ if (netdev->xsk_tx_metadata_ops->tmo_request_launch_time)
+ xsk_features |= NETDEV_XSK_FLAGS_TX_LAUNCH_TIME_FIFO;
}
if (nla_put_u32(rsp, NETDEV_A_DEV_IFINDEX, netdev->ifindex) ||
@@ -167,7 +169,7 @@ netdev_nl_napi_fill_one(struct sk_buff *rsp, struct napi_struct *napi,
void *hdr;
pid_t pid;
- if (!(napi->dev->flags & IFF_UP))
+ if (!napi->dev->up)
return 0;
hdr = genlmsg_iput(rsp, info);
@@ -229,20 +231,15 @@ int netdev_nl_napi_get_doit(struct sk_buff *skb, struct genl_info *info)
if (!rsp)
return -ENOMEM;
- rtnl_lock();
- rcu_read_lock();
-
- napi = netdev_napi_by_id(genl_info_net(info), napi_id);
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
if (napi) {
err = netdev_nl_napi_fill_one(rsp, napi, info);
+ netdev_unlock(napi->dev);
} else {
NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
err = -ENOENT;
}
- rcu_read_unlock();
- rtnl_unlock();
-
if (err) {
goto err_free_msg;
} else if (!rsp->len) {
@@ -263,14 +260,21 @@ netdev_nl_napi_dump_one(struct net_device *netdev, struct sk_buff *rsp,
struct netdev_nl_dump_ctx *ctx)
{
struct napi_struct *napi;
+ unsigned int prev_id;
int err = 0;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
+ prev_id = UINT_MAX;
list_for_each_entry(napi, &netdev->napi_list, dev_list) {
if (napi->napi_id < MIN_NAPI_ID)
continue;
+
+ /* Dump continuation below depends on the list being sorted */
+ WARN_ON_ONCE(napi->napi_id >= prev_id);
+ prev_id = napi->napi_id;
+
if (ctx->napi_id && napi->napi_id >= ctx->napi_id)
continue;
@@ -294,22 +298,22 @@ int netdev_nl_napi_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
if (info->attrs[NETDEV_A_NAPI_IFINDEX])
ifindex = nla_get_u32(info->attrs[NETDEV_A_NAPI_IFINDEX]);
- rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_napi_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
ctx->napi_id = 0;
}
}
- rtnl_unlock();
return err;
}
@@ -350,20 +354,15 @@ int netdev_nl_napi_set_doit(struct sk_buff *skb, struct genl_info *info)
napi_id = nla_get_u32(info->attrs[NETDEV_A_NAPI_ID]);
- rtnl_lock();
- rcu_read_lock();
-
- napi = netdev_napi_by_id(genl_info_net(info), napi_id);
+ napi = netdev_napi_by_id_lock(genl_info_net(info), napi_id);
if (napi) {
err = netdev_nl_napi_set_config(napi, info);
+ netdev_unlock(napi->dev);
} else {
NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_NAPI_ID]);
err = -ENOENT;
}
- rcu_read_unlock();
- rtnl_unlock();
-
return err;
}
@@ -435,7 +434,7 @@ netdev_nl_queue_fill(struct sk_buff *rsp, struct net_device *netdev, u32 q_idx,
{
int err;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return -ENOENT;
err = netdev_nl_queue_validate(netdev, q_idx, q_type);
@@ -467,11 +466,13 @@ int netdev_nl_queue_get_doit(struct sk_buff *skb, struct genl_info *info)
rtnl_lock();
- netdev = __dev_get_by_index(genl_info_net(info), ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex);
+ if (netdev) {
err = netdev_nl_queue_fill(rsp, netdev, q_id, q_type, info);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
rtnl_unlock();
@@ -492,7 +493,7 @@ netdev_nl_queue_dump_one(struct net_device *netdev, struct sk_buff *rsp,
{
int err = 0;
- if (!(netdev->flags & IFF_UP))
+ if (!netdev->up)
return err;
for (; ctx->rxq_idx < netdev->real_num_rx_queues; ctx->rxq_idx++) {
@@ -525,13 +526,15 @@ int netdev_nl_queue_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
rtnl_lock();
if (ifindex) {
- netdev = __dev_get_by_index(net, ifindex);
- if (netdev)
+ netdev = netdev_get_by_index_lock(net, ifindex);
+ if (netdev) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
- else
+ netdev_unlock(netdev);
+ } else {
err = -ENODEV;
+ }
} else {
- for_each_netdev_dump(net, netdev, ctx->ifindex) {
+ for_each_netdev_lock_scoped(net, netdev, ctx->ifindex) {
err = netdev_nl_queue_dump_one(netdev, skb, info, ctx);
if (err < 0)
break;
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index e217a5838c87..db82786fa0c4 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -79,3 +79,4 @@ err_free_new_mem:
return err;
}
+EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 96a6ed37d4cc..0ab722d95a2d 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -319,6 +319,7 @@ static int netpoll_owner_active(struct net_device *dev)
static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
{
netdev_tx_t status = NETDEV_TX_BUSY;
+ netdev_tx_t ret = NET_XMIT_DROP;
struct net_device *dev;
unsigned long tries;
/* It is up to the caller to keep npinfo alive. */
@@ -327,11 +328,12 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
lockdep_assert_irqs_disabled();
dev = np->dev;
+ rcu_read_lock();
npinfo = rcu_dereference_bh(dev->npinfo);
if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) {
dev_kfree_skb_irq(skb);
- return NET_XMIT_DROP;
+ goto out;
}
/* don't get messages out of order, and no recursion */
@@ -370,7 +372,10 @@ static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
skb_queue_tail(&npinfo->txq, skb);
schedule_delayed_work(&npinfo->tx_work,0);
}
- return NETDEV_TX_OK;
+ ret = NETDEV_TX_OK;
+out:
+ rcu_read_unlock();
+ return ret;
}
netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
@@ -390,7 +395,7 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb)
}
EXPORT_SYMBOL(netpoll_send_skb);
-void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
+int netpoll_send_udp(struct netpoll *np, const char *msg, int len)
{
int total_len, ip_len, udp_len;
struct sk_buff *skb;
@@ -414,7 +419,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb = find_skb(np, total_len + np->dev->needed_tailroom,
total_len - len);
if (!skb)
- return;
+ return -ENOMEM;
skb_copy_to_linear_data(skb, msg, len);
skb_put(skb, len);
@@ -490,7 +495,7 @@ void netpoll_send_udp(struct netpoll *np, const char *msg, int len)
skb->dev = np->dev;
- netpoll_send_skb(np, skb);
+ return (int)netpoll_send_skb(np, skb);
}
EXPORT_SYMBOL(netpoll_send_udp);
@@ -636,7 +641,8 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
goto out;
}
- if (!rcu_access_pointer(ndev->npinfo)) {
+ npinfo = rtnl_dereference(ndev->npinfo);
+ if (!npinfo) {
npinfo = kmalloc(sizeof(*npinfo), GFP_KERNEL);
if (!npinfo) {
err = -ENOMEM;
@@ -656,7 +662,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
goto free_npinfo;
}
} else {
- npinfo = rtnl_dereference(ndev->npinfo);
refcount_inc(&npinfo->refcnt);
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 32570333068d..ede82c610936 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -201,6 +201,7 @@ static int page_pool_init(struct page_pool *pool,
memcpy(&pool->slow, &params->slow, sizeof(pool->slow));
pool->cpuid = cpuid;
+ pool->dma_sync_for_cpu = true;
/* Validate only known flags were used */
if (pool->slow.flags & ~PP_FLAG_ALL)
@@ -287,6 +288,9 @@ static int page_pool_init(struct page_pool *pool,
}
if (pool->mp_priv) {
+ if (!pool->dma_map || !pool->dma_sync)
+ return -EOPNOTSUPP;
+
err = mp_dmabuf_devmem_init(pool);
if (err) {
pr_warn("%s() mem-provider init failed %d\n", __func__,
@@ -532,12 +536,11 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
if (unlikely(pool->alloc.count > 0))
return pool->alloc.cache[--pool->alloc.count];
- /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */
+ /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */
memset(&pool->alloc.cache, 0, sizeof(void *) * bulk);
- nr_pages = alloc_pages_bulk_array_node(gfp,
- pool->p.nid, bulk,
- (struct page **)pool->alloc.cache);
+ nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk,
+ (struct page **)pool->alloc.cache);
if (unlikely(!nr_pages))
return 0;
@@ -574,7 +577,7 @@ static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool,
/* For using page_pool replace: alloc_pages() API calls, but provide
* synchronization guarantee for allocation side.
*/
-netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
+netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp)
{
netmem_ref netmem;
@@ -590,14 +593,14 @@ netmem_ref page_pool_alloc_netmem(struct page_pool *pool, gfp_t gfp)
netmem = __page_pool_alloc_pages_slow(pool, gfp);
return netmem;
}
-EXPORT_SYMBOL(page_pool_alloc_netmem);
+EXPORT_SYMBOL(page_pool_alloc_netmems);
+ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL);
struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
{
- return netmem_to_page(page_pool_alloc_netmem(pool, gfp));
+ return netmem_to_page(page_pool_alloc_netmems(pool, gfp));
}
EXPORT_SYMBOL(page_pool_alloc_pages);
-ALLOW_ERROR_INJECTION(page_pool_alloc_pages, NULL);
/* Calculate distance between two u32 values, valid if distance is below 2^(31)
* https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
@@ -839,69 +842,104 @@ void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page,
}
EXPORT_SYMBOL(page_pool_put_unrefed_page);
-/**
- * page_pool_put_page_bulk() - release references on multiple pages
- * @pool: pool from which pages were allocated
- * @data: array holding page pointers
- * @count: number of pages in @data
- *
- * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring
- * producer lock. If the ptr_ring is full, page_pool_put_page_bulk()
- * will release leftover pages to the page allocator.
- * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx
- * completion loop for the XDP_REDIRECT use case.
- *
- * Please note the caller must not use data area after running
- * page_pool_put_page_bulk(), as this function overwrites it.
- */
-void page_pool_put_page_bulk(struct page_pool *pool, void **data,
- int count)
+static void page_pool_recycle_ring_bulk(struct page_pool *pool,
+ netmem_ref *bulk,
+ u32 bulk_len)
{
- int i, bulk_len = 0;
- bool allow_direct;
bool in_softirq;
+ u32 i;
- allow_direct = page_pool_napi_local(pool);
-
- for (i = 0; i < count; i++) {
- netmem_ref netmem = page_to_netmem(virt_to_head_page(data[i]));
-
- /* It is not the last user for the page frag case */
- if (!page_pool_is_last_ref(netmem))
- continue;
-
- netmem = __page_pool_put_page(pool, netmem, -1, allow_direct);
- /* Approved for bulk recycling in ptr_ring cache */
- if (netmem)
- data[bulk_len++] = (__force void *)netmem;
- }
-
- if (!bulk_len)
- return;
-
- /* Bulk producer into ptr_ring page_pool cache */
+ /* Bulk produce into ptr_ring page_pool cache */
in_softirq = page_pool_producer_lock(pool);
+
for (i = 0; i < bulk_len; i++) {
- if (__ptr_ring_produce(&pool->ring, data[i])) {
+ if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) {
/* ring full */
recycle_stat_inc(pool, ring_full);
break;
}
}
- recycle_stat_add(pool, ring, i);
+
page_pool_producer_unlock(pool, in_softirq);
+ recycle_stat_add(pool, ring, i);
- /* Hopefully all pages was return into ptr_ring */
+ /* Hopefully all pages were returned into ptr_ring */
if (likely(i == bulk_len))
return;
- /* ptr_ring cache full, free remaining pages outside producer lock
- * since put_page() with refcnt == 1 can be an expensive operation
+ /*
+ * ptr_ring cache is full, free remaining pages outside producer lock
+ * since put_page() with refcnt == 1 can be an expensive operation.
*/
for (; i < bulk_len; i++)
- page_pool_return_page(pool, (__force netmem_ref)data[i]);
+ page_pool_return_page(pool, bulk[i]);
}
-EXPORT_SYMBOL(page_pool_put_page_bulk);
+
+/**
+ * page_pool_put_netmem_bulk() - release references on multiple netmems
+ * @data: array holding netmem references
+ * @count: number of entries in @data
+ *
+ * Tries to refill a number of netmems into the ptr_ring cache while holding the ptr_ring
+ * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk()
+ * will release leftover netmems to the memory provider.
+ * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx
+ * completion loop for the XDP_REDIRECT use case.
+ *
+ * Please note the caller must not use the data area after running
+ * page_pool_put_netmem_bulk(), as this function overwrites it.
+ */
+void page_pool_put_netmem_bulk(netmem_ref *data, u32 count)
+{
+ u32 bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ netmem_ref netmem = netmem_compound_head(data[i]);
+
+ if (page_pool_unref_and_test(netmem))
+ data[bulk_len++] = netmem;
+ }
+
+ count = bulk_len;
+ while (count) {
+ netmem_ref bulk[XDP_BULK_QUEUE_SIZE];
+ struct page_pool *pool = NULL;
+ bool allow_direct;
+ u32 foreign = 0;
+
+ bulk_len = 0;
+
+ for (u32 i = 0; i < count; i++) {
+ struct page_pool *netmem_pp;
+ netmem_ref netmem = data[i];
+
+ netmem_pp = netmem_get_pp(netmem);
+ if (unlikely(!pool)) {
+ pool = netmem_pp;
+ allow_direct = page_pool_napi_local(pool);
+ } else if (netmem_pp != pool) {
+ /*
+ * If the netmem belongs to a different
+ * page_pool, save it for another round.
+ */
+ data[foreign++] = netmem;
+ continue;
+ }
+
+ netmem = __page_pool_put_page(pool, netmem, -1,
+ allow_direct);
+ /* Approved for bulk recycling in ptr_ring cache */
+ if (netmem)
+ bulk[bulk_len++] = netmem;
+ }
+
+ if (bulk_len)
+ page_pool_recycle_ring_bulk(pool, bulk, bulk_len);
+
+ count = foreign;
+ }
+}
+EXPORT_SYMBOL(page_pool_put_netmem_bulk);
static netmem_ref page_pool_drain_frag(struct page_pool *pool,
netmem_ref netmem)
@@ -957,7 +995,7 @@ netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool,
}
if (!netmem) {
- netmem = page_pool_alloc_netmem(pool, gfp);
+ netmem = page_pool_alloc_netmems(pool, gfp);
if (unlikely(!netmem)) {
pool->frag_page = 0;
return 0;
@@ -1066,7 +1104,13 @@ static void page_pool_release_retry(struct work_struct *wq)
int inflight;
inflight = page_pool_release(pool);
- if (!inflight)
+ /* In rare cases, a driver bug may cause inflight to go negative.
+ * Don't reschedule release if inflight is 0 or negative.
+	 * - If 0, the page_pool has been destroyed.
+	 * - If negative, we will never recover.
+	 * In both cases no reschedule is necessary.
+ */
+ if (inflight <= 0)
return;
/* Periodic warning for page pools the user can't see */
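/*
 * Illustrative sketch (not part of the patch): bulk-recycling a batch of
 * netmems, e.g. from an XDP_REDIRECT Tx completion ring, through the
 * page_pool_put_netmem_bulk() helper added above. Unlike the old
 * page_pool_put_page_bulk(), the array holds netmem_ref entries and may mix
 * netmems owned by different page_pools; the owning pool is resolved per
 * entry. The completion-ring layout is hypothetical.
 */
static void example_tx_complete(netmem_ref *done, u32 count)
{
	/* The helper overwrites the array, so it must not be reused after
	 * this call (see the kernel-doc above).
	 */
	page_pool_put_netmem_bulk(done, count);
}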
diff --git a/net/core/page_pool_user.c b/net/core/page_pool_user.c
index 6677e0c2e256..d5e214c30c31 100644
--- a/net/core/page_pool_user.c
+++ b/net/core/page_pool_user.c
@@ -356,7 +356,7 @@ void page_pool_unlist(struct page_pool *pool)
int page_pool_check_memory_provider(struct net_device *dev,
struct netdev_rx_queue *rxq)
{
- struct net_devmem_dmabuf_binding *binding = rxq->mp_params.mp_priv;
+ void *binding = rxq->mp_params.mp_priv;
struct page_pool *pool;
struct hlist_node *n;
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 4cb547fae91f..82b6a2c3c141 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3883,17 +3883,14 @@ static int __net_init pktgen_create_thread(int cpu, struct pktgen_net *pn)
list_add_tail(&t->th_list, &pn->pktgen_threads);
init_completion(&t->start_done);
- p = kthread_create_on_node(pktgen_thread_worker,
- t,
- cpu_to_node(cpu),
- "kpktgend_%d", cpu);
+ p = kthread_create_on_cpu(pktgen_thread_worker, t, cpu, "kpktgend_%d");
if (IS_ERR(p)) {
pr_err("kthread_create_on_node() failed for cpu %d\n", t->cpu);
list_del(&t->th_list);
kfree(t);
return PTR_ERR(p);
}
- kthread_bind(p, cpu);
+
t->tsk = p;
pe = proc_create_data(t->tsk->comm, 0600, pn->proc_dir,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b7cc30fd80e8..80e006940f51 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -84,7 +84,6 @@ int rtnl_lock_killable(void)
{
return mutex_lock_killable(&rtnl_mutex);
}
-EXPORT_SYMBOL(rtnl_lock_killable);
static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
@@ -221,6 +220,16 @@ int rtnl_net_trylock(struct net *net)
}
EXPORT_SYMBOL(rtnl_net_trylock);
+int rtnl_net_lock_killable(struct net *net)
+{
+ int ret = rtnl_lock_killable();
+
+ if (!ret)
+ __rtnl_net_lock(net);
+
+ return ret;
+}
+
static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
{
if (net_eq(net_a, net_b))
@@ -1162,6 +1171,9 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
/* IFLA_VF_STATS_TX_DROPPED */
nla_total_size_64bit(sizeof(__u64)));
}
+ if (dev->netdev_ops->ndo_get_vf_guid)
+ size += num_vfs * 2 *
+ nla_total_size(sizeof(struct ifla_vf_guid));
return size;
} else
return 0;
@@ -4766,15 +4778,16 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
int *idx,
struct netdev_hw_addr_list *list)
{
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
struct netdev_hw_addr *ha;
- int err;
u32 portid, seq;
+ int err;
portid = NETLINK_CB(cb->skb).portid;
seq = cb->nlh->nlmsg_seq;
list_for_each_entry(ha, &list->list, list) {
- if (*idx < cb->args[2])
+ if (*idx < ctx->fdb_idx)
goto skip;
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
@@ -4913,18 +4926,16 @@ static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh,
static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
- struct net_device *dev;
- struct net_device *br_dev = NULL;
- const struct net_device_ops *ops = NULL;
- const struct net_device_ops *cops = NULL;
+ const struct net_device_ops *ops = NULL, *cops = NULL;
+ struct ndo_fdb_dump_context *ctx = (void *)cb->ctx;
+ struct net_device *dev, *br_dev = NULL;
struct net *net = sock_net(skb->sk);
- struct hlist_head *head;
int brport_idx = 0;
int br_idx = 0;
- int h, s_h;
- int idx = 0, s_idx;
- int err = 0;
int fidx = 0;
+ int err;
+
+ NL_ASSERT_CTX_FITS(struct ndo_fdb_dump_context);
if (cb->strict_check)
err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx,
@@ -4943,70 +4954,51 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
- s_h = cb->args[0];
- s_idx = cb->args[1];
-
- for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
- idx = 0;
- head = &net->dev_index_head[h];
- hlist_for_each_entry(dev, head, index_hlist) {
-
- if (brport_idx && (dev->ifindex != brport_idx))
- continue;
-
- if (!br_idx) { /* user did not specify a specific bridge */
- if (netif_is_bridge_port(dev)) {
- br_dev = netdev_master_upper_dev_get(dev);
- cops = br_dev->netdev_ops;
- }
- } else {
- if (dev != br_dev &&
- !netif_is_bridge_port(dev))
- continue;
+ for_each_netdev_dump(net, dev, ctx->ifindex) {
+ if (brport_idx && (dev->ifindex != brport_idx))
+ continue;
- if (br_dev != netdev_master_upper_dev_get(dev) &&
- !netif_is_bridge_master(dev))
- continue;
- cops = ops;
+ if (!br_idx) { /* user did not specify a specific bridge */
+ if (netif_is_bridge_port(dev)) {
+ br_dev = netdev_master_upper_dev_get(dev);
+ cops = br_dev->netdev_ops;
}
+ } else {
+ if (dev != br_dev &&
+ !netif_is_bridge_port(dev))
+ continue;
- if (idx < s_idx)
- goto cont;
+ if (br_dev != netdev_master_upper_dev_get(dev) &&
+ !netif_is_bridge_master(dev))
+ continue;
+ cops = ops;
+ }
- if (netif_is_bridge_port(dev)) {
- if (cops && cops->ndo_fdb_dump) {
- err = cops->ndo_fdb_dump(skb, cb,
- br_dev, dev,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
- }
+ if (netif_is_bridge_port(dev)) {
+ if (cops && cops->ndo_fdb_dump) {
+ err = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
+ &fidx);
+ if (err == -EMSGSIZE)
+ break;
}
+ }
- if (dev->netdev_ops->ndo_fdb_dump)
- err = dev->netdev_ops->ndo_fdb_dump(skb, cb,
- dev, NULL,
- &fidx);
- else
- err = ndo_dflt_fdb_dump(skb, cb, dev, NULL,
- &fidx);
- if (err == -EMSGSIZE)
- goto out;
+ if (dev->netdev_ops->ndo_fdb_dump)
+ err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
+ &fidx);
+ else
+ err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx);
+ if (err == -EMSGSIZE)
+ break;
- cops = NULL;
+ cops = NULL;
- /* reset fdb offset to 0 for rest of the interfaces */
- cb->args[2] = 0;
- fidx = 0;
-cont:
- idx++;
- }
+ /* reset fdb offset to 0 for rest of the interfaces */
+ ctx->fdb_idx = 0;
+ fidx = 0;
}
-out:
- cb->args[0] = h;
- cb->args[1] = idx;
- cb->args[2] = fidx;
+ ctx->fdb_idx = fidx;
return skb->len;
}
diff --git a/net/core/rtnl_net_debug.c b/net/core/rtnl_net_debug.c
index f406045cbd0e..f3272b09c255 100644
--- a/net/core/rtnl_net_debug.c
+++ b/net/core/rtnl_net_debug.c
@@ -27,7 +27,6 @@ static int rtnl_net_debug_event(struct notifier_block *nb,
case NETDEV_CHANGEADDR:
case NETDEV_PRE_CHANGEADDR:
case NETDEV_GOING_DOWN:
- case NETDEV_CHANGENAME:
case NETDEV_FEAT_CHANGE:
case NETDEV_BONDING_FAILOVER:
case NETDEV_PRE_UP:
@@ -60,18 +59,10 @@ static int rtnl_net_debug_event(struct notifier_block *nb,
ASSERT_RTNL();
break;
- /* Once an event fully supports RTNL_NET, move it here
- * and remove "if (0)" below.
- *
- * case NETDEV_XXX:
- * ASSERT_RTNL_NET(net);
- * break;
- */
- }
-
- /* Just to avoid unused-variable error for dev and net. */
- if (0)
+ case NETDEV_CHANGENAME:
ASSERT_RTNL_NET(net);
+ break;
+ }
return NOTIFY_DONE;
}
@@ -111,7 +102,7 @@ static int __init rtnl_net_debug_init(void)
{
int ret;
- ret = register_pernet_device(&rtnl_net_debug_net_ops);
+ ret = register_pernet_subsys(&rtnl_net_debug_net_ops);
if (ret)
return ret;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index bed75273f8c4..b1c81687e9d8 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -223,67 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
-#if PAGE_SIZE == SZ_4K
-
-#define NAPI_HAS_SMALL_PAGE_FRAG 1
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
-
-/* specialized page frag allocator using a single order 0 page
- * and slicing it into 1K sized fragment. Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
- void *va;
- u16 offset;
- bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
- struct page *page;
- int offset;
-
- offset = nc->offset - SZ_1K;
- if (likely(offset >= 0))
- goto use_frag;
-
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
- if (!page)
- return NULL;
-
- nc->va = page_address(page);
- nc->pfmemalloc = page_is_pfmemalloc(page);
- offset = PAGE_SIZE - SZ_1K;
- page_ref_add(page, offset / SZ_1K);
-
-use_frag:
- nc->offset = offset;
- return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG 0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
- return NULL;
-}
-
-#endif
-
struct napi_alloc_cache {
local_lock_t bh_lock;
struct page_frag_cache page;
- struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -293,23 +235,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
- struct sk_buff *skb;
-
- local_bh_disable();
- skb = napi_get_frags(napi);
- WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
- napi_free_frags(napi);
- local_bh_enable();
-}
-
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -816,11 +741,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
- * When the small frag allocator is available, prefer it over kmalloc
- * for small fragments
*/
- if ((!NAPI_HAS_SMALL_PAGE_FRAG &&
- len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -830,32 +752,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
goto skb_success;
}
+ len = SKB_HEAD_ALIGN(len);
+
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc = this_cpu_ptr(&napi_alloc_cache);
- if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
- /* we are artificially inflating the allocation size, but
- * that is not as bad as it may look like, as:
- * - 'len' less than GRO_MAX_HEAD makes little sense
- * - On most systems, larger 'len' values lead to fragment
- * size above 512 bytes
- * - kmalloc would use the kmalloc-1k slab for such values
- * - Builds with smaller GRO_MAX_HEAD will very likely do
- * little networking, as that implies no WiFi and no
- * tunnels support, and 32 bits arches.
- */
- len = SZ_1K;
- data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
- pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
- } else {
- len = SKB_HEAD_ALIGN(len);
-
- data = page_frag_alloc(&nc->page, len, gfp_mask);
- pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
- }
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
@@ -1013,7 +919,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_pp_cow_data);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
- struct bpf_prog *prog)
+ const struct bpf_prog *prog)
{
if (!prog->aux->xdp_has_frags)
return -EINVAL;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 61f3f3d4e528..0ddc4c718833 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -549,6 +549,9 @@ static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb,
return num_sge;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+ psock->ingress_bytes += len;
+#endif
copied = len;
msg->sg.start = 0;
msg->sg.size = copied;
@@ -1144,6 +1147,10 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock)
if (!ret)
sk_psock_set_state(psock, SK_PSOCK_RX_STRP_ENABLED);
+ if (sk_is_tcp(sk)) {
+ psock->strp.cb.read_sock = tcp_bpf_strp_read_sock;
+ psock->copied_seq = tcp_sk(sk)->copied_seq;
+ }
return ret;
}
diff --git a/net/core/sock.c b/net/core/sock.c
index 9d5dd99cc581..45df78655214 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -454,6 +454,13 @@ static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
return 0;
}
+static bool sk_set_prio_allowed(const struct sock *sk, int val)
+{
+ return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
+ sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
+}
+
static bool sock_needs_netstamp(const struct sock *sk)
{
switch (sk->sk_family) {
@@ -1193,9 +1200,7 @@ int sk_setsockopt(struct sock *sk, int level, int optname,
/* handle options which do not require locking the socket. */
switch (optname) {
case SO_PRIORITY:
- if ((val >= 0 && val <= 6) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
- sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
+ if (sk_set_prio_allowed(sk, val)) {
sock_set_priority(sk, val);
return 0;
}
@@ -1517,6 +1522,10 @@ set_sndbuf:
sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
break;
+ case SO_RCVPRIORITY:
+ sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
+ break;
+
case SO_RXQ_OVFL:
sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
break;
@@ -1945,6 +1954,10 @@ int sk_getsockopt(struct sock *sk, int level, int optname,
v.val = sock_flag(sk, SOCK_RCVMARK);
break;
+ case SO_RCVPRIORITY:
+ v.val = sock_flag(sk, SOCK_RCVPRIORITY);
+ break;
+
case SO_RXQ_OVFL:
v.val = sock_flag(sk, SOCK_RXQ_OVFL);
break;
@@ -2102,6 +2115,8 @@ lenout:
*/
static inline void sock_lock_init(struct sock *sk)
{
+ sk_owner_clear(sk);
+
if (sk->sk_kern_sock)
sock_lock_init_class_and_name(
sk,
@@ -2198,6 +2213,9 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
cgroup_sk_free(&sk->sk_cgrp_data);
mem_cgroup_sk_free(sk);
security_sk_free(sk);
+
+ sk_owner_put(sk);
+
if (slab != NULL)
kmem_cache_free(slab, sk);
else
@@ -2962,6 +2980,13 @@ int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
case SCM_RIGHTS:
case SCM_CREDENTIALS:
break;
+ case SO_PRIORITY:
+ if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
+ return -EINVAL;
+ if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
+ return -EPERM;
+ sockc->priority = *(u32 *)CMSG_DATA(cmsg);
+ break;
default:
return -EINVAL;
}
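/*
 * Illustrative userspace sketch (not part of the patch): passing a
 * per-packet priority as SOL_SOCKET/SO_PRIORITY ancillary data, which the
 * __sock_cmsg_send() hunk above now accepts (subject to the same
 * sk_set_prio_allowed() policy as setsockopt). Error handling is trimmed.
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t send_with_priority(int fd, const void *buf, size_t len,
				  unsigned int prio)
{
	char control[CMSG_SPACE(sizeof(prio))] = {};
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = control,
		.msg_controllen = sizeof(control),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SO_PRIORITY;
	cmsg->cmsg_len = CMSG_LEN(sizeof(prio));
	memcpy(CMSG_DATA(cmsg), &prio, sizeof(prio));

	return sendmsg(fd, &msg, 0);
}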
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 3717fb152ecc..a50a7ef49ae8 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -9,6 +9,7 @@
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
#include <linux/export.h>
+#include <linux/ptp_clock_kernel.h>
static unsigned int classify(const struct sk_buff *skb)
{
@@ -21,19 +22,39 @@ static unsigned int classify(const struct sk_buff *skb)
void skb_clone_tx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
struct sk_buff *clone;
unsigned int type;
- if (!skb->sk || !skb->dev ||
- !phy_is_default_hwtstamp(skb->dev->phydev))
+ if (!skb->sk || !skb->dev)
return;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+
type = classify(skb);
if (type == PTP_CLASS_NONE)
return;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->txtstamp)) {
clone = skb_clone_sk(skb);
if (!clone)
@@ -45,12 +66,33 @@ EXPORT_SYMBOL_GPL(skb_clone_tx_timestamp);
bool skb_defer_rx_timestamp(struct sk_buff *skb)
{
+ struct hwtstamp_provider *hwprov;
struct mii_timestamper *mii_ts;
+ struct phy_device *phydev;
unsigned int type;
- if (!skb->dev || !phy_is_default_hwtstamp(skb->dev->phydev))
+ if (!skb->dev)
return false;
+ rcu_read_lock();
+ hwprov = rcu_dereference(skb->dev->hwprov);
+ if (hwprov) {
+ if (hwprov->source != HWTSTAMP_SOURCE_PHYLIB ||
+ !hwprov->phydev) {
+ rcu_read_unlock();
+ return false;
+ }
+
+ phydev = hwprov->phydev;
+ } else {
+ phydev = skb->dev->phydev;
+ if (!phy_is_default_hwtstamp(phydev)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
if (skb_headroom(skb) < ETH_HLEN)
return false;
@@ -63,7 +105,7 @@ bool skb_defer_rx_timestamp(struct sk_buff *skb)
if (type == PTP_CLASS_NONE)
return false;
- mii_ts = skb->dev->phydev->mii_ts;
+ mii_ts = phydev->mii_ts;
if (likely(mii_ts->rxtstamp))
return mii_ts->rxtstamp(mii_ts, skb, type);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 2315feed94ef..2c6ab6fb452f 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -357,6 +357,9 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
if (IS_ERR(xdp_alloc))
return PTR_ERR(xdp_alloc);
+ if (type == MEM_TYPE_XSK_BUFF_POOL && allocator)
+ xsk_pool_set_rxq_info(allocator, xdp_rxq);
+
if (trace_mem_connect_enabled() && xdp_alloc)
trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
@@ -364,33 +367,87 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+/**
+ * xdp_reg_page_pool - register &page_pool as a memory provider for XDP
+ * @pool: &page_pool to register
+ *
+ * Can be used to register pools manually without connecting to any XDP RxQ
+ * info, so that the XDP layer will be aware of them. Then, they can be
+ * attached to an RxQ info manually via xdp_rxq_info_attach_page_pool().
+ *
+ * Return: %0 on success, -errno on error.
+ */
+int xdp_reg_page_pool(struct page_pool *pool)
+{
+ struct xdp_mem_info mem;
+
+ return xdp_reg_mem_model(&mem, MEM_TYPE_PAGE_POOL, pool);
+}
+EXPORT_SYMBOL_GPL(xdp_reg_page_pool);
+
+/**
+ * xdp_unreg_page_pool - unregister &page_pool from the memory providers list
+ * @pool: &page_pool to unregister
+ *
+ * A shorthand for manually unregistering page pools. If the pool was previously
+ * attached to an RxQ info, it must be detached first.
+ */
+void xdp_unreg_page_pool(const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_unreg_mem_model(&mem);
+}
+EXPORT_SYMBOL_GPL(xdp_unreg_page_pool);
+
+/**
+ * xdp_rxq_info_attach_page_pool - attach registered pool to RxQ info
+ * @xdp_rxq: XDP RxQ info to attach the pool to
+ * @pool: pool to attach
+ *
+ * If the pool was registered manually, this function must be called instead
+ * of xdp_rxq_info_reg_mem_model() to connect it to the RxQ info.
+ */
+void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq,
+ const struct page_pool *pool)
+{
+ struct xdp_mem_info mem = {
+ .type = MEM_TYPE_PAGE_POOL,
+ .id = pool->xdp_mem_id,
+ };
+
+ xdp_rxq_info_attach_mem_model(xdp_rxq, &mem);
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_attach_page_pool);
+
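/*
 * Illustrative sketch (not part of the patch): the manual registration flow
 * described in the kernel-doc above. Only xdp_reg_page_pool(),
 * xdp_rxq_info_attach_page_pool() and xdp_unreg_page_pool() come from this
 * change; the queue callbacks and the detach helper shown in teardown are
 * assumptions.
 */
static int example_queue_setup(struct page_pool *pool, struct xdp_rxq_info *rxq)
{
	int err;

	err = xdp_reg_page_pool(pool);
	if (err)
		return err;

	xdp_rxq_info_attach_page_pool(rxq, pool);
	return 0;
}

static void example_queue_teardown(struct page_pool *pool,
				   struct xdp_rxq_info *rxq)
{
	xdp_rxq_info_detach_mem_model(rxq);	/* assumed counterpart helper */
	xdp_unreg_page_pool(pool);
}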
/* XDP RX runs under NAPI protection, and in different delivery error
* scenarios (e.g. queue full), it is possible to return the xdp_frame
* while still leveraging this protection. The @napi_direct boolean
 * is used for those call sites, thus allowing for faster recycling
* of xdp_frames/pages in those cases.
*/
-void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
- struct xdp_buff *xdp)
+void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type,
+ bool napi_direct, struct xdp_buff *xdp)
{
- struct page *page;
-
- switch (mem->type) {
+ switch (mem_type) {
case MEM_TYPE_PAGE_POOL:
- page = virt_to_head_page(data);
+ netmem = netmem_compound_head(netmem);
if (napi_direct && xdp_return_frame_no_direct())
napi_direct = false;
/* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE)
		 * as mem->type knows this is a page_pool page
*/
- page_pool_put_full_page(page->pp, page, napi_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem,
+ napi_direct);
break;
case MEM_TYPE_PAGE_SHARED:
- page_frag_free(data);
+ page_frag_free(__netmem_address(netmem));
break;
case MEM_TYPE_PAGE_ORDER0:
- page = virt_to_page(data); /* Assumes order0 page*/
- put_page(page);
+ put_page(__netmem_to_page(netmem));
break;
case MEM_TYPE_XSK_BUFF_POOL:
/* NB! Only valid from an xdp_buff! */
@@ -398,7 +455,7 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
break;
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
- WARN(1, "Incorrect XDP memory type (%d) usage", mem->type);
+ WARN(1, "Incorrect XDP memory type (%d) usage", mem_type);
break;
}
}
@@ -406,38 +463,34 @@ void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
void xdp_return_frame(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ false, NULL);
- __xdp_return(page_address(page), &xdpf->mem, false, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, false, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, false, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_frame_has_frags(xdpf)))
goto out;
sinfo = xdp_get_shared_info_from_frame(xdpf);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]), xdpf->mem_type,
+ true, NULL);
- __xdp_return(page_address(page), &xdpf->mem, true, NULL);
- }
out:
- __xdp_return(xdpf->data, &xdpf->mem, true, NULL);
+ __xdp_return(virt_to_netmem(xdpf->data), xdpf->mem_type, true, NULL);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
@@ -451,46 +504,19 @@ EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
* xdp_frame_bulk is usually stored/allocated on the function
* call-stack to avoid locking penalties.
*/
-void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq)
-{
- struct xdp_mem_allocator *xa = bq->xa;
-
- if (unlikely(!xa || !bq->count))
- return;
-
- page_pool_put_page_bulk(xa->page_pool, bq->q, bq->count);
- /* bq->xa is not cleared to save lookup, if mem.id same in next bulk */
- bq->count = 0;
-}
-EXPORT_SYMBOL_GPL(xdp_flush_frame_bulk);
/* Must be called with rcu_read_lock held */
void xdp_return_frame_bulk(struct xdp_frame *xdpf,
struct xdp_frame_bulk *bq)
{
- struct xdp_mem_info *mem = &xdpf->mem;
- struct xdp_mem_allocator *xa;
-
- if (mem->type != MEM_TYPE_PAGE_POOL) {
+ if (xdpf->mem_type != MEM_TYPE_PAGE_POOL) {
xdp_return_frame(xdpf);
return;
}
- xa = bq->xa;
- if (unlikely(!xa)) {
- xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- bq->count = 0;
- bq->xa = xa;
- }
-
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
- if (unlikely(mem->id != xa->mem.id)) {
- xdp_flush_frame_bulk(bq);
- bq->xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- }
-
if (unlikely(xdp_frame_has_frags(xdpf))) {
struct skb_shared_info *sinfo;
int i;
@@ -499,31 +525,40 @@ void xdp_return_frame_bulk(struct xdp_frame *xdpf,
for (i = 0; i < sinfo->nr_frags; i++) {
skb_frag_t *frag = &sinfo->frags[i];
- bq->q[bq->count++] = skb_frag_address(frag);
+ bq->q[bq->count++] = skb_frag_netmem(frag);
if (bq->count == XDP_BULK_QUEUE_SIZE)
xdp_flush_frame_bulk(bq);
}
}
- bq->q[bq->count++] = xdpf->data;
+ bq->q[bq->count++] = virt_to_netmem(xdpf->data);
}
EXPORT_SYMBOL_GPL(xdp_return_frame_bulk);
+/**
+ * xdp_return_frag -- free one XDP frag or decrement its refcount
+ * @netmem: network memory reference to release
+ * @xdp: &xdp_buff to release the frag for
+ */
+void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp)
+{
+ __xdp_return(netmem, xdp->rxq->mem.type, true, NULL);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frag);
+
void xdp_return_buff(struct xdp_buff *xdp)
{
struct skb_shared_info *sinfo;
- int i;
if (likely(!xdp_buff_has_frags(xdp)))
goto out;
sinfo = xdp_get_shared_info_from_buff(xdp);
- for (i = 0; i < sinfo->nr_frags; i++) {
- struct page *page = skb_frag_page(&sinfo->frags[i]);
+ for (u32 i = 0; i < sinfo->nr_frags; i++)
+ __xdp_return(skb_frag_netmem(&sinfo->frags[i]),
+ xdp->rxq->mem.type, true, xdp);
- __xdp_return(page_address(page), &xdp->rxq->mem, true, xdp);
- }
out:
- __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp);
+ __xdp_return(virt_to_netmem(xdp->data), xdp->rxq->mem.type, true, xdp);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
@@ -569,7 +604,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp)
xdpf->headroom = 0;
xdpf->metasize = metasize;
xdpf->frame_sz = PAGE_SIZE;
- xdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
+ xdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
xsk_buff_free(xdp);
return xdpf;
@@ -593,6 +628,173 @@ int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(xdp_alloc_skb_bulk);
+/**
+ * xdp_build_skb_from_buff - create an skb from &xdp_buff
+ * @xdp: &xdp_buff to convert to an skb
+ *
+ * Perform common operations to create a new skb to pass up the stack from
+ * &xdp_buff: allocate an skb head from the NAPI percpu cache, initialize
+ * skb data pointers and offsets, set the recycle bit if the buff is
+ * PP-backed, record the Rx queue index, set the protocol and update the
+ * frags info.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
+{
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ const struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
+ u32 nr_frags = 0;
+ int metalen;
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+ sinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = sinfo->nr_frags;
+ }
+
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+ return NULL;
+
+ skb_reserve(skb, xdp->data - xdp->data_hard_start);
+ __skb_put(skb, xdp->data_end - xdp->data);
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0)
+ skb_metadata_set(skb, metalen);
+
+ if (rxq->mem.type == MEM_TYPE_PAGE_POOL)
+ skb_mark_for_recycle(skb);
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(nr_frags)) {
+ u32 tsize;
+
+ tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size, tsize,
+ xdp_buff_is_frag_pfmemalloc(xdp));
+ }
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_buff);
+
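/*
 * Illustrative sketch (not part of the patch): the XDP_PASS path of a
 * hypothetical driver using the new helper instead of open-coding skb
 * construction. Only xdp_build_skb_from_buff() (and xdp_return_buff() for
 * the drop path) come from this file.
 */
static void example_rx_pass(struct napi_struct *napi, struct xdp_buff *xdp)
{
	struct sk_buff *skb;

	skb = xdp_build_skb_from_buff(xdp);
	if (unlikely(!skb)) {
		/* The buff is still owned by the driver on failure */
		xdp_return_buff(xdp);
		return;
	}

	napi_gro_receive(napi, skb);
}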
+/**
+ * xdp_copy_frags_from_zc - copy frags from XSk buff to skb
+ * @skb: skb to copy frags to
+ * @xdp: XSk &xdp_buff from which the frags will be copied
+ * @pp: &page_pool backing page allocation, if available
+ *
+ * Copy all frags from XSk &xdp_buff to the skb to pass it up the stack.
+ * Allocate a new buffer for each frag, copy it and attach it to the skb.
+ *
+ * Return: true on success, false on netmem allocation failure.
+ */
+static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
+ const struct xdp_buff *xdp,
+ struct page_pool *pp)
+{
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+ const struct skb_shared_info *xinfo;
+ u32 nr_frags, tsize = 0;
+ bool pfmemalloc = false;
+
+ xinfo = xdp_get_shared_info_from_buff(xdp);
+ nr_frags = xinfo->nr_frags;
+
+ for (u32 i = 0; i < nr_frags; i++) {
+ u32 len = skb_frag_size(&xinfo->frags[i]);
+ u32 offset, truesize = len;
+ netmem_ref netmem;
+
+ netmem = page_pool_dev_alloc_netmem(pp, &offset, &truesize);
+ if (unlikely(!netmem)) {
+ sinfo->nr_frags = i;
+ return false;
+ }
+
+ memcpy(__netmem_address(netmem),
+ __netmem_address(xinfo->frags[i].netmem),
+ LARGEST_ALIGN(len));
+ __skb_fill_netmem_desc_noacc(sinfo, i, netmem, offset, len);
+
+ tsize += truesize;
+ pfmemalloc |= netmem_is_pfmemalloc(netmem);
+ }
+
+ xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
+ tsize, pfmemalloc);
+
+ return true;
+}
+
+/**
+ * xdp_build_skb_from_zc - create an skb from XSk &xdp_buff
+ * @xdp: source XSk buff
+ *
+ * Similar to xdp_build_skb_from_buff(), but for XSk frames. Allocate an skb
+ * head, a new buffer for the head, copy the data and initialize the skb fields.
+ * If there are frags, allocate new buffers for them and copy the data over.
+ * Buffers are allocated from the system per-CPU page pools so that they can be
+ * recycled.
+ * If the new skb was built successfully, @xdp is returned to the XSk pool's
+ * freelist. On error, it remains untouched and the caller must take care of it.
+ *
+ * Return: new &sk_buff on success, %NULL on error.
+ */
+struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp)
+{
+ struct page_pool *pp = this_cpu_read(system_page_pool);
+ const struct xdp_rxq_info *rxq = xdp->rxq;
+ u32 len = xdp->data_end - xdp->data_meta;
+ u32 truesize = xdp->frame_sz;
+ struct sk_buff *skb;
+ int metalen;
+ void *data;
+
+ if (!IS_ENABLED(CONFIG_PAGE_POOL))
+ return NULL;
+
+ data = page_pool_dev_alloc_va(pp, &truesize);
+ if (unlikely(!data))
+ return NULL;
+
+ skb = napi_build_skb(data, truesize);
+ if (unlikely(!skb)) {
+ page_pool_free_va(pp, data, true);
+ return NULL;
+ }
+
+ skb_mark_for_recycle(skb);
+ skb_reserve(skb, xdp->data_meta - xdp->data_hard_start);
+
+ memcpy(__skb_put(skb, len), xdp->data_meta, LARGEST_ALIGN(len));
+
+ metalen = xdp->data - xdp->data_meta;
+ if (metalen > 0) {
+ skb_metadata_set(skb, metalen);
+ __skb_pull(skb, metalen);
+ }
+
+ skb_record_rx_queue(skb, rxq->queue_index);
+
+ if (unlikely(xdp_buff_has_frags(xdp)) &&
+ unlikely(!xdp_copy_frags_from_zc(skb, xdp, pp))) {
+ napi_consume_skb(skb, true);
+ return NULL;
+ }
+
+ xsk_buff_free(xdp);
+
+ skb->protocol = eth_type_trans(skb, rxq->dev);
+
+ return skb;
+}
+EXPORT_SYMBOL_GPL(xdp_build_skb_from_zc);
+
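/*
 * Illustrative sketch (not part of the patch): XDP_PASS handling on a
 * zero-copy (XSk) Rx queue. On success the helper has already returned
 * @xdp to the XSk pool; on failure the driver still owns it, matching the
 * kernel-doc above. The surrounding driver structure is hypothetical.
 */
static void example_zc_rx_pass(struct napi_struct *napi, struct xdp_buff *xdp)
{
	struct sk_buff *skb;

	skb = xdp_build_skb_from_zc(xdp);
	if (unlikely(!skb)) {
		/* Drop: a copy could not be allocated */
		xsk_buff_free(xdp);
		return;
	}

	napi_gro_receive(napi, skb);
}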
struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct sk_buff *skb,
struct net_device *dev)
@@ -639,7 +841,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
* - RX ring dev queue index (skb_record_rx_queue)
*/
- if (xdpf->mem.type == MEM_TYPE_PAGE_POOL)
+ if (xdpf->mem_type == MEM_TYPE_PAGE_POOL)
skb_mark_for_recycle(skb);
/* Allow SKB to reuse area used by xdp_frame */
@@ -686,8 +888,7 @@ struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf)
nxdpf = addr;
nxdpf->data = addr + headroom;
nxdpf->frame_sz = PAGE_SIZE;
- nxdpf->mem.type = MEM_TYPE_PAGE_ORDER0;
- nxdpf->mem.id = 0;
+ nxdpf->mem_type = MEM_TYPE_PAGE_ORDER0;
return nxdpf;
}
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 5926159a6f20..be515ba821e2 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -15,6 +15,7 @@
#include <net/icmp.h>
#include <net/inet_common.h>
+#include <net/inet_dscp.h>
#include <net/inet_hashtables.h>
#include <net/inet_sock.h>
#include <net/protocol.h>
@@ -473,7 +474,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
.flowi4_oif = inet_iif(skb),
.daddr = iph->saddr,
.saddr = iph->daddr,
- .flowi4_tos = ip_sock_rt_tos(sk),
+ .flowi4_tos = inet_dscp_to_dsfield(inet_sk_dscp(inet_sk(sk))),
.flowi4_scope = ip_sock_rt_scope(sk),
.flowi4_proto = sk->sk_protocol,
.fl4_sport = dccp_hdr(skb)->dccph_dport,
diff --git a/net/dccp/sysctl.c b/net/dccp/sysctl.c
index 3fc474d6e57d..b15845fd6300 100644
--- a/net/dccp/sysctl.c
+++ b/net/dccp/sysctl.c
@@ -11,10 +11,6 @@
#include "dccp.h"
#include "feat.h"
-#ifndef CONFIG_SYSCTL
-#error This file should not be compiled without CONFIG_SYSCTL defined
-#endif
-
/* Boundary values */
static int u8_max = 0xFF;
static unsigned long seqw_min = DCCPF_SEQ_WMIN,
diff --git a/net/devlink/core.c b/net/devlink/core.c
index f49cd83f1955..7203c39532fc 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -117,7 +117,7 @@ static struct devlink_rel *devlink_rel_alloc(void)
err = xa_alloc_cyclic(&devlink_rels, &rel->index, rel,
xa_limit_32b, &next, GFP_KERNEL);
- if (err) {
+ if (err < 0) {
kfree(rel);
return ERR_PTR(err);
}
diff --git a/net/devlink/health.c b/net/devlink/health.c
index b8d3084e6fe0..57db6799722a 100644
--- a/net/devlink/health.c
+++ b/net/devlink/health.c
@@ -1238,3 +1238,70 @@ int devlink_nl_health_reporter_test_doit(struct sk_buff *skb,
return reporter->ops->test(reporter, info->extack);
}
+
+/**
+ * devlink_fmsg_dump_skb - Dump sk_buff structure
+ * @fmsg: devlink formatted message pointer
+ * @skb: pointer to skb
+ *
+ * Dump diagnostic information about the sk_buff structure, like headroom, length,
+ * tailroom, MAC, etc.
+ */
+void devlink_fmsg_dump_skb(struct devlink_fmsg *fmsg, const struct sk_buff *skb)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ struct sock *sk = skb->sk;
+ bool has_mac, has_trans;
+
+ has_mac = skb_mac_header_was_set(skb);
+ has_trans = skb_transport_header_was_set(skb);
+
+ devlink_fmsg_pair_nest_start(fmsg, "skb");
+ devlink_fmsg_obj_nest_start(fmsg);
+ devlink_fmsg_put(fmsg, "actual len", skb->len);
+ devlink_fmsg_put(fmsg, "head len", skb_headlen(skb));
+ devlink_fmsg_put(fmsg, "data len", skb->data_len);
+ devlink_fmsg_put(fmsg, "tail len", skb_tailroom(skb));
+ devlink_fmsg_put(fmsg, "MAC", has_mac ? skb->mac_header : -1);
+ devlink_fmsg_put(fmsg, "MAC len",
+ has_mac ? skb_mac_header_len(skb) : -1);
+ devlink_fmsg_put(fmsg, "network hdr", skb->network_header);
+ devlink_fmsg_put(fmsg, "network hdr len",
+ has_trans ? skb_network_header_len(skb) : -1);
+ devlink_fmsg_put(fmsg, "transport hdr",
+ has_trans ? skb->transport_header : -1);
+ devlink_fmsg_put(fmsg, "csum", (__force u32)skb->csum);
+ devlink_fmsg_put(fmsg, "csum_ip_summed", (u8)skb->ip_summed);
+ devlink_fmsg_put(fmsg, "csum_complete_sw", !!skb->csum_complete_sw);
+ devlink_fmsg_put(fmsg, "csum_valid", !!skb->csum_valid);
+ devlink_fmsg_put(fmsg, "csum_level", (u8)skb->csum_level);
+ devlink_fmsg_put(fmsg, "sw_hash", !!skb->sw_hash);
+ devlink_fmsg_put(fmsg, "l4_hash", !!skb->l4_hash);
+ devlink_fmsg_put(fmsg, "proto", ntohs(skb->protocol));
+ devlink_fmsg_put(fmsg, "pkt_type", (u8)skb->pkt_type);
+ devlink_fmsg_put(fmsg, "iif", skb->skb_iif);
+
+ if (sk) {
+ devlink_fmsg_pair_nest_start(fmsg, "sk");
+ devlink_fmsg_obj_nest_start(fmsg);
+		devlink_fmsg_put(fmsg, "family", sk->sk_family);
+ devlink_fmsg_put(fmsg, "type", sk->sk_type);
+ devlink_fmsg_put(fmsg, "proto", sk->sk_protocol);
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+ }
+
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+
+ devlink_fmsg_pair_nest_start(fmsg, "shinfo");
+ devlink_fmsg_obj_nest_start(fmsg);
+ devlink_fmsg_put(fmsg, "tx_flags", sh->tx_flags);
+ devlink_fmsg_put(fmsg, "nr_frags", sh->nr_frags);
+ devlink_fmsg_put(fmsg, "gso_size", sh->gso_size);
+ devlink_fmsg_put(fmsg, "gso_type", sh->gso_type);
+ devlink_fmsg_put(fmsg, "gso_segs", sh->gso_segs);
+ devlink_fmsg_obj_nest_end(fmsg);
+ devlink_fmsg_pair_nest_end(fmsg);
+}
+EXPORT_SYMBOL_GPL(devlink_fmsg_dump_skb);
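/*
 * Illustrative sketch (not part of the patch): calling the new helper from a
 * driver's health reporter dump callback, assuming the standard reporter
 * dump signature. How the offending skb reaches priv_ctx is hypothetical.
 */
static int example_rx_reporter_dump(struct devlink_health_reporter *reporter,
				    struct devlink_fmsg *fmsg, void *priv_ctx,
				    struct netlink_ext_ack *extack)
{
	struct sk_buff *bad_skb = priv_ctx;	/* assumed to carry the skb */

	if (bad_skb)
		devlink_fmsg_dump_skb(fmsg, bad_skb);

	return 0;
}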
diff --git a/net/devlink/port.c b/net/devlink/port.c
index be9158b4453c..939081a0e615 100644
--- a/net/devlink/port.c
+++ b/net/devlink/port.c
@@ -1376,7 +1376,7 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
*
* @devlink_port: devlink port
* @controller: associated controller number for the devlink port instance
- * @pf: associated PF for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
* @external: indicates if the port is for an external controller
*/
void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 controller,
@@ -1402,8 +1402,9 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
*
* @devlink_port: devlink port
* @controller: associated controller number for the devlink port instance
- * @pf: associated PF for the devlink port instance
- * @vf: associated VF of a PF for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @vf: associated PCI VF number of a PF for the devlink port instance;
+ * VF number starts from 0 for the first PCI virtual function
* @external: indicates if the port is for an external controller
*/
void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 controller,
@@ -1430,8 +1431,8 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
*
* @devlink_port: devlink port
* @controller: associated controller number for the devlink port instance
- * @pf: associated PF for the devlink port instance
- * @sf: associated SF of a PF for the devlink port instance
+ * @pf: associated PCI function number for the devlink port instance
+ * @sf: associated SF number of a PF for the devlink port instance
* @external: indicates if the port is for an external controller
*/
void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller,
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 5a7c0e565a89..436a7e1b412a 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -862,6 +862,16 @@ static void dsa_tree_teardown_lags(struct dsa_switch_tree *dst)
kfree(dst->lags);
}
+static void dsa_tree_teardown_routing_table(struct dsa_switch_tree *dst)
+{
+ struct dsa_link *dl, *next;
+
+ list_for_each_entry_safe(dl, next, &dst->rtable, list) {
+ list_del(&dl->list);
+ kfree(dl);
+ }
+}
+
static int dsa_tree_setup(struct dsa_switch_tree *dst)
{
bool complete;
@@ -879,7 +889,7 @@ static int dsa_tree_setup(struct dsa_switch_tree *dst)
err = dsa_tree_setup_cpu_ports(dst);
if (err)
- return err;
+ goto teardown_rtable;
err = dsa_tree_setup_switches(dst);
if (err)
@@ -911,14 +921,14 @@ teardown_switches:
dsa_tree_teardown_switches(dst);
teardown_cpu_ports:
dsa_tree_teardown_cpu_ports(dst);
+teardown_rtable:
+ dsa_tree_teardown_routing_table(dst);
return err;
}
static void dsa_tree_teardown(struct dsa_switch_tree *dst)
{
- struct dsa_link *dl, *next;
-
if (!dst->setup)
return;
@@ -932,10 +942,7 @@ static void dsa_tree_teardown(struct dsa_switch_tree *dst)
dsa_tree_teardown_cpu_ports(dst);
- list_for_each_entry_safe(dl, next, &dst->rtable, list) {
- list_del(&dl->list);
- kfree(dl);
- }
+ dsa_tree_teardown_routing_table(dst);
pr_info("DSA: tree %d torn down\n", dst->index);
@@ -1367,7 +1374,7 @@ static int dsa_switch_parse_of(struct dsa_switch *ds, struct device_node *dn)
return dsa_switch_parse_ports_of(ds, dn);
}
-static int dev_is_class(struct device *dev, void *class)
+static int dev_is_class(struct device *dev, const void *class)
{
if (dev->class != NULL && !strcmp(dev->class->name, class))
return 1;
@@ -1478,12 +1485,44 @@ static int dsa_switch_parse(struct dsa_switch *ds, struct dsa_chip_data *cd)
static void dsa_switch_release_ports(struct dsa_switch *ds)
{
+ struct dsa_mac_addr *a, *tmp;
struct dsa_port *dp, *next;
+ struct dsa_vlan *v, *n;
dsa_switch_for_each_port_safe(dp, next, ds) {
- WARN_ON(!list_empty(&dp->fdbs));
- WARN_ON(!list_empty(&dp->mdbs));
- WARN_ON(!list_empty(&dp->vlans));
+ /* These are either entries that upper layers lost track of
+ * (probably due to bugs), or installed through interfaces
+ * where one does not necessarily have to remove them, like
+ * ndo_dflt_fdb_add().
+ */
+ list_for_each_entry_safe(a, tmp, &dp->fdbs, list) {
+ dev_info(ds->dev,
+ "Cleaning up unicast address %pM vid %u from port %d\n",
+ a->addr, a->vid, dp->index);
+ list_del(&a->list);
+ kfree(a);
+ }
+
+ list_for_each_entry_safe(a, tmp, &dp->mdbs, list) {
+ dev_info(ds->dev,
+ "Cleaning up multicast address %pM vid %u from port %d\n",
+ a->addr, a->vid, dp->index);
+ list_del(&a->list);
+ kfree(a);
+ }
+
+ /* These are entries that upper layers have lost track of,
+ * probably due to bugs, but also due to dsa_port_do_vlan_del()
+ * having failed and the VLAN entry still lingering on.
+ */
+ list_for_each_entry_safe(v, n, &dp->vlans, list) {
+ dev_info(ds->dev,
+ "Cleaning up vid %u from port %d\n",
+ v->vid, dp->index);
+ list_del(&v->list);
+ kfree(v);
+ }
+
list_del(&dp->list);
kfree(dp);
}
diff --git a/net/dsa/port.c b/net/dsa/port.c
index ee0aaec4c8e0..5c9d1798e830 100644
--- a/net/dsa/port.c
+++ b/net/dsa/port.c
@@ -1575,6 +1575,22 @@ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp,
cpu_dp->tag_ops = tag_ops;
}
+/* dsa_supports_eee - indicate that EEE is supported
+ * @ds: pointer to &struct dsa_switch
+ * @port: port index
+ *
+ * A default implementation for the .support_eee() DSA operations member,
+ * which drivers can use to indicate that they support EEE on all of their
+ * user ports.
+ *
+ * Returns: true
+ */
+bool dsa_supports_eee(struct dsa_switch *ds, int port)
+{
+ return true;
+}
+EXPORT_SYMBOL_GPL(dsa_supports_eee);
+
static void dsa_port_phylink_mac_config(struct phylink_config *config,
unsigned int mode,
const struct phylink_link_state *state)
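A hedged sketch of how a DSA driver whose hardware supports EEE on every user port could opt in via the new helper; foo_switch_ops is a placeholder and only the relevant member is shown:

#include <net/dsa.h>

static const struct dsa_switch_ops foo_switch_ops = {
	/* All user ports support EEE, so the generic helper suffices. */
	.support_eee	= dsa_supports_eee,
};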
diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c
index 3ee53e28ec2e..53e03fd8071b 100644
--- a/net/dsa/tag_8021q.c
+++ b/net/dsa/tag_8021q.c
@@ -197,7 +197,7 @@ static int dsa_port_do_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid)
err = ds->ops->tag_8021q_vlan_del(ds, port, vid);
if (err) {
- refcount_inc(&v->refcount);
+ refcount_set(&v->refcount, 1);
return err;
}
diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index 281bbac5539d..c33d4bf17929 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -66,7 +66,7 @@ static int ksz_connect(struct dsa_switch *ds)
if (!priv)
return -ENOMEM;
- xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit",
+ xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
ds->dst->index, ds->index);
if (IS_ERR(xmit_worker)) {
ret = PTR_ERR(xmit_worker);
diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c
index 11ea8cfd6266..3929584791e4 100644
--- a/net/dsa/tag_ocelot_8021q.c
+++ b/net/dsa/tag_ocelot_8021q.c
@@ -110,7 +110,7 @@ static int ocelot_connect(struct dsa_switch *ds)
if (!priv)
return -ENOMEM;
- priv->xmit_worker = kthread_create_worker(0, "felix_xmit");
+ priv->xmit_worker = kthread_run_worker(0, "felix_xmit");
if (IS_ERR(priv->xmit_worker)) {
err = PTR_ERR(priv->xmit_worker);
kfree(priv);
diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c
index 3e902af7eea6..02adec693811 100644
--- a/net/dsa/tag_sja1105.c
+++ b/net/dsa/tag_sja1105.c
@@ -707,7 +707,7 @@ static int sja1105_connect(struct dsa_switch *ds)
spin_lock_init(&priv->meta_lock);
- xmit_worker = kthread_create_worker(0, "dsa%d:%d_xmit",
+ xmit_worker = kthread_run_worker(0, "dsa%d:%d_xmit",
ds->dst->index, ds->index);
if (IS_ERR(xmit_worker)) {
err = PTR_ERR(xmit_worker);
diff --git a/net/dsa/user.c b/net/dsa/user.c
index 06c30a9e29ff..291ab1b4acc4 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -515,12 +515,13 @@ dsa_user_port_fdb_do_dump(const unsigned char *addr, u16 vid,
bool is_static, void *data)
{
struct dsa_user_dump_ctx *dump = data;
+ struct ndo_fdb_dump_context *ctx = (void *)dump->cb->ctx;
u32 portid = NETLINK_CB(dump->cb->skb).portid;
u32 seq = dump->cb->nlh->nlmsg_seq;
struct nlmsghdr *nlh;
struct ndmsg *ndm;
- if (dump->idx < dump->cb->args[2])
+ if (dump->idx < ctx->fdb_idx)
goto skip;
nlh = nlmsg_put(dump->skb, portid, seq, RTM_NEWNEIGH,
@@ -1149,6 +1150,16 @@ dsa_user_get_rmon_stats(struct net_device *dev,
ds->ops->get_rmon_stats(ds, dp->index, rmon_stats, ranges);
}
+static void dsa_user_get_ts_stats(struct net_device *dev,
+ struct ethtool_ts_stats *ts_stats)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_switch *ds = dp->ds;
+
+ if (ds->ops->get_ts_stats)
+ ds->ops->get_ts_stats(ds, dp->index, ts_stats);
+}
+
static void dsa_user_net_selftest(struct net_device *ndev,
struct ethtool_test *etest, u64 *buf)
{
@@ -1228,8 +1239,12 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e)
struct dsa_switch *ds = dp->ds;
int ret;
+ /* Check whether the switch supports EEE */
+ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index))
+ return -EOPNOTSUPP;
+
/* Port's PHY and MAC both need to be EEE capable */
- if (!dev->phydev || !dp->pl)
+ if (!dev->phydev)
return -ENODEV;
if (!ds->ops->set_mac_eee)
@@ -1246,18 +1261,14 @@ static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e)
{
struct dsa_port *dp = dsa_user_to_port(dev);
struct dsa_switch *ds = dp->ds;
- int ret;
-
- /* Port's PHY and MAC both need to be EEE capable */
- if (!dev->phydev || !dp->pl)
- return -ENODEV;
- if (!ds->ops->get_mac_eee)
+ /* Check whether the switch supports EEE */
+ if (!ds->ops->support_eee || !ds->ops->support_eee(ds, dp->index))
return -EOPNOTSUPP;
- ret = ds->ops->get_mac_eee(ds, dp->index, e);
- if (ret)
- return ret;
+ /* Port's PHY and MAC both need to be EEE capable */
+ if (!dev->phydev)
+ return -ENODEV;
return phylink_ethtool_get_eee(dp->pl, e);
}
@@ -2500,6 +2511,7 @@ static const struct ethtool_ops dsa_user_ethtool_ops = {
.get_eth_mac_stats = dsa_user_get_eth_mac_stats,
.get_eth_ctrl_stats = dsa_user_get_eth_ctrl_stats,
.get_rmon_stats = dsa_user_get_rmon_stats,
+ .get_ts_stats = dsa_user_get_ts_stats,
.set_wol = dsa_user_set_wol,
.get_wol = dsa_user_get_wol,
.set_eee = dsa_user_set_eee,
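A sketch of the switch-driver side of the new ethtool hook, assuming the .get_ts_stats member added to dsa_switch_ops alongside this change; bar_* names and the zeroed counters are placeholders for real hardware statistics:

#include <net/dsa.h>
#include <linux/ethtool.h>

static void bar_get_ts_stats(struct dsa_switch *ds, int port,
			     struct ethtool_ts_stats *ts_stats)
{
	/* Report per-port PTP TX timestamp counters kept by the driver. */
	ts_stats->tx_stats.pkts = 0;
	ts_stats->tx_stats.lost = 0;
	ts_stats->tx_stats.err = 0;
}

static const struct dsa_switch_ops bar_switch_ops = {
	.get_ts_stats	= bar_get_ts_stats,
};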
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index 9b540644ba31..a1490c4afe6b 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -9,4 +9,4 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \
module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \
- phy.o
+ phy.o tsconfig.o
diff --git a/net/ethtool/cmis.h b/net/ethtool/cmis.h
index 1e790413db0e..4a9a946cabf0 100644
--- a/net/ethtool/cmis.h
+++ b/net/ethtool/cmis.h
@@ -101,7 +101,6 @@ struct ethtool_cmis_cdb_rpl {
};
u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs);
-u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs);
void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args,
enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl,
diff --git a/net/ethtool/cmis_cdb.c b/net/ethtool/cmis_cdb.c
index d159dc121bde..3057576bc81e 100644
--- a/net/ethtool/cmis_cdb.c
+++ b/net/ethtool/cmis_cdb.c
@@ -16,15 +16,6 @@ u32 ethtool_cmis_get_max_lpl_size(u8 num_of_byte_octs)
return 8 * (1 + min_t(u8, num_of_byte_octs, 15));
}
-/* For accessing the EPL field on page 9Fh, the allowable length extension is
- * min(i, 255) byte octets where i specifies the allowable additional number of
- * byte octets in a READ or a WRITE.
- */
-u32 ethtool_cmis_get_max_epl_size(u8 num_of_byte_octs)
-{
- return 8 * (1 + min_t(u8, num_of_byte_octs, 255));
-}
-
void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args,
enum ethtool_cmis_cdb_cmd_id cmd, u8 *lpl,
u8 lpl_len, u8 *epl, u16 epl_len,
@@ -33,19 +24,16 @@ void ethtool_cmis_cdb_compose_args(struct ethtool_cmis_cdb_cmd_args *args,
{
args->req.id = cpu_to_be16(cmd);
args->req.lpl_len = lpl_len;
- if (lpl) {
+ if (lpl)
memcpy(args->req.payload, lpl, args->req.lpl_len);
- args->read_write_len_ext =
- ethtool_cmis_get_max_lpl_size(read_write_len_ext);
- }
if (epl) {
args->req.epl_len = cpu_to_be16(epl_len);
args->req.epl = epl;
- args->read_write_len_ext =
- ethtool_cmis_get_max_epl_size(read_write_len_ext);
}
args->max_duration = max_duration;
+ args->read_write_len_ext =
+ ethtool_cmis_get_max_lpl_size(read_write_len_ext);
args->msleep_pre_rpl = msleep_pre_rpl;
args->rpl_exp_len = rpl_exp_len;
args->flags = flags;
@@ -363,7 +351,7 @@ ethtool_cmis_module_poll(struct net_device *dev,
struct netlink_ext_ack extack = {};
int err;
- ethtool_cmis_page_init(&page_data, 0, offset, sizeof(rpl));
+ ethtool_cmis_page_init(&page_data, 0, offset, sizeof(*rpl));
page_data.data = (u8 *)rpl;
err = ops->get_module_eeprom_by_page(dev, &page_data, &extack);
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 05ce4f8080b3..e2f8a41cc108 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -5,9 +5,13 @@
#include <linux/phy.h>
#include <linux/rtnetlink.h>
#include <linux/ptp_clock_kernel.h>
+#include <linux/phy_link_topology.h>
+#include <net/netdev_queues.h>
#include "netlink.h"
#include "common.h"
+#include "../core/dev.h"
+
const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
[NETIF_F_SG_BIT] = "tx-scatter-gather",
@@ -459,6 +463,11 @@ const char ts_rx_filter_names[][ETH_GSTRING_LEN] = {
};
static_assert(ARRAY_SIZE(ts_rx_filter_names) == __HWTSTAMP_FILTER_CNT);
+const char ts_flags_names[][ETH_GSTRING_LEN] = {
+ [const_ilog2(HWTSTAMP_FLAG_BONDED_PHC_INDEX)] = "bonded-phc-index",
+};
+static_assert(ARRAY_SIZE(ts_flags_names) == __HWTSTAMP_FLAG_CNT);
+
const char udp_tunnel_type_names[][ETH_GSTRING_LEN] = {
[ETHTOOL_UDP_TUNNEL_TYPE_VXLAN] = "vxlan",
[ETHTOOL_UDP_TUNNEL_TYPE_GENEVE] = "geneve",
@@ -763,20 +772,114 @@ int ethtool_check_ops(const struct ethtool_ops *ops)
return 0;
}
-int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info)
+void ethtool_ringparam_get_cfg(struct net_device *dev,
+ struct ethtool_ringparam *param,
+ struct kernel_ethtool_ringparam *kparam,
+ struct netlink_ext_ack *extack)
{
- const struct ethtool_ops *ops = dev->ethtool_ops;
- struct phy_device *phydev = dev->phydev;
- int err = 0;
+ memset(param, 0, sizeof(*param));
+ memset(kparam, 0, sizeof(*kparam));
+
+ param->cmd = ETHTOOL_GRINGPARAM;
+ dev->ethtool_ops->get_ringparam(dev, param, kparam, extack);
+
+ /* Driver gives us the current state; we want to return the current config */
+ kparam->tcp_data_split = dev->cfg->hds_config;
+ kparam->hds_thresh = dev->cfg->hds_thresh;
+}
+static void ethtool_init_tsinfo(struct kernel_ethtool_ts_info *info)
+{
memset(info, 0, sizeof(*info));
info->cmd = ETHTOOL_GET_TS_INFO;
info->phc_index = -1;
+}
+
+int ethtool_net_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ int err;
+
+ if (!ops->get_ts_info)
+ return -ENODEV;
+
+ /* Does the PTP clock come from the netdev? */
+ ethtool_init_tsinfo(info);
+ info->phc_qualifier = hwprov_desc->qualifier;
+ err = ops->get_ts_info(dev, info);
+ if (err)
+ return err;
+
+ if (info->phc_index == hwprov_desc->index &&
+ net_support_hwtstamp_qualifier(dev, hwprov_desc->qualifier))
+ return 0;
+
+ return -ENODEV;
+}
+
+struct phy_device *
+ethtool_phy_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ int err;
+
+ /* Only precise qualifier is supported in phydev */
+ if (hwprov_desc->qualifier != HWTSTAMP_PROVIDER_QUALIFIER_PRECISE)
+ return ERR_PTR(-ENODEV);
+
+ /* Look in the phy topology */
+ if (dev->link_topo) {
+ struct phy_device_node *pdn;
+ unsigned long phy_index;
+
+ xa_for_each(&dev->link_topo->phys, phy_index, pdn) {
+ if (!phy_has_tsinfo(pdn->phy))
+ continue;
+
+ ethtool_init_tsinfo(info);
+ err = phy_ts_info(pdn->phy, info);
+ if (err)
+ return ERR_PTR(err);
+
+ if (info->phc_index == hwprov_desc->index)
+ return pdn->phy;
+ }
+ return ERR_PTR(-ENODEV);
+ }
- if (phy_is_default_hwtstamp(phydev) && phy_has_tsinfo(phydev))
- err = phy_ts_info(phydev, info);
- else if (ops->get_ts_info)
- err = ops->get_ts_info(dev, info);
+ /* Otherwise look at dev->phydev */
+ if (phy_has_tsinfo(dev->phydev)) {
+ ethtool_init_tsinfo(info);
+ err = phy_ts_info(dev->phydev, info);
+ if (err)
+ return ERR_PTR(err);
+
+ if (info->phc_index == hwprov_desc->index)
+ return dev->phydev;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+int ethtool_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ int err;
+
+ err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc);
+ if (err == -ENODEV) {
+ struct phy_device *phy;
+
+ phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc);
+ if (IS_ERR(phy))
+ err = PTR_ERR(phy);
+ else
+ err = 0;
+ }
info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
SOF_TIMESTAMPING_SOFTWARE;
@@ -784,6 +887,59 @@ int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info
return err;
}
+int __ethtool_get_ts_info(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info)
+{
+ struct hwtstamp_provider *hwprov;
+ int err = 0;
+
+ rcu_read_lock();
+ hwprov = rcu_dereference(dev->hwprov);
+ /* No provider specified, use default behavior */
+ if (!hwprov) {
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct phy_device *phydev = dev->phydev;
+
+ ethtool_init_tsinfo(info);
+ if (phy_is_default_hwtstamp(phydev) &&
+ phy_has_tsinfo(phydev))
+ err = phy_ts_info(phydev, info);
+ else if (ops->get_ts_info)
+ err = ops->get_ts_info(dev, info);
+
+ info->so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ rcu_read_unlock();
+ return err;
+ }
+
+ err = ethtool_get_ts_info_by_phc(dev, info, &hwprov->desc);
+ rcu_read_unlock();
+ return err;
+}
+
+bool net_support_hwtstamp_qualifier(struct net_device *dev,
+ enum hwtstamp_provider_qualifier qualifier)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+
+ if (!ops)
+ return false;
+
+ /* To preserve the old behavior, accept the precise qualifier when
+ * the NIC does not describe its supported qualifiers.
+ */
+ if (!ops->supported_hwtstamp_qualifiers &&
+ qualifier == HWTSTAMP_PROVIDER_QUALIFIER_PRECISE)
+ return true;
+
+ if (ops->supported_hwtstamp_qualifiers & BIT(qualifier))
+ return true;
+
+ return false;
+}
+
int ethtool_get_phc_vclocks(struct net_device *dev, int **vclock_index)
{
struct kernel_ethtool_ts_info info = { };
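A sketch of the driver side, assuming the HWTSTAMP_PROVIDER_QUALIFIER_* enumerators introduced with this series: a NIC that fills supported_hwtstamp_qualifiers lets net_support_hwtstamp_qualifier() accept non-precise providers, while leaving the field at zero keeps the precise-only default. foo_ethtool_ops is illustrative only:

#include <linux/bits.h>
#include <linux/ethtool.h>
#include <linux/net_tstamp.h>

static const struct ethtool_ops foo_ethtool_ops = {
	/* Hypothetical NIC exposing both a precise and an approximate PHC. */
	.supported_hwtstamp_qualifiers =
		BIT(HWTSTAMP_PROVIDER_QUALIFIER_PRECISE) |
		BIT(HWTSTAMP_PROVIDER_QUALIFIER_APPROX),
};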
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index 4a2de3ce7354..a1088c2441d0 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -13,6 +13,7 @@
ETHTOOL_LINK_MODE_ ## speed ## base ## type ## _ ## duplex ## _BIT
#define __SOF_TIMESTAMPING_CNT (const_ilog2(SOF_TIMESTAMPING_LAST) + 1)
+#define __HWTSTAMP_FLAG_CNT (const_ilog2(HWTSTAMP_FLAG_LAST) + 1)
struct link_mode_info {
int speed;
@@ -21,6 +22,7 @@ struct link_mode_info {
};
struct genl_info;
+struct hwtstamp_provider_desc;
extern const char
netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN];
@@ -37,6 +39,7 @@ extern const char wol_mode_names[][ETH_GSTRING_LEN];
extern const char sof_timestamping_names[][ETH_GSTRING_LEN];
extern const char ts_tx_type_names[][ETH_GSTRING_LEN];
extern const char ts_rx_filter_names[][ETH_GSTRING_LEN];
+extern const char ts_flags_names[][ETH_GSTRING_LEN];
extern const char udp_tunnel_type_names[][ETH_GSTRING_LEN];
int __ethtool_get_link(struct net_device *dev);
@@ -48,7 +51,25 @@ int ethtool_check_max_channel(struct net_device *dev,
struct ethtool_channels channels,
struct genl_info *info);
int ethtool_check_rss_ctx_busy(struct net_device *dev, u32 rss_context);
+
+void ethtool_ringparam_get_cfg(struct net_device *dev,
+ struct ethtool_ringparam *param,
+ struct kernel_ethtool_ringparam *kparam,
+ struct netlink_ext_ack *extack);
+
int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info);
+int ethtool_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+int ethtool_net_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+struct phy_device *
+ethtool_phy_get_ts_info_by_phc(struct net_device *dev,
+ struct kernel_ethtool_ts_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc);
+bool net_support_hwtstamp_qualifier(struct net_device *dev,
+ enum hwtstamp_provider_qualifier qualifier);
extern const struct ethtool_phy_ops *ethtool_phy_ops;
extern const struct ethtool_pse_ops *ethtool_pse_ops;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 7609ce2b2c5e..1c3ba2247776 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -2059,8 +2059,8 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr)
static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
{
- struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM };
struct kernel_ethtool_ringparam kernel_ringparam;
+ struct ethtool_ringparam ringparam, max;
int ret;
if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam)
@@ -2069,7 +2069,7 @@ static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr)
if (copy_from_user(&ringparam, useraddr, sizeof(ringparam)))
return -EFAULT;
- dev->ethtool_ops->get_ringparam(dev, &max, &kernel_ringparam, NULL);
+ ethtool_ringparam_get_cfg(dev, &max, &kernel_ringparam, NULL);
/* ensure new ring parameters are within the maximums */
if (ringparam.rx_pending > max.rx_max_pending ||
diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c
index e233dfc8ca4b..e088a30d1dd2 100644
--- a/net/ethtool/netlink.c
+++ b/net/ethtool/netlink.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <net/netdev_queues.h>
#include <net/sock.h>
#include <linux/ethtool_netlink.h>
#include <linux/phy_link_topology.h>
@@ -394,6 +395,8 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = {
[ETHTOOL_MSG_PLCA_GET_STATUS] = &ethnl_plca_status_request_ops,
[ETHTOOL_MSG_MM_GET] = &ethnl_mm_request_ops,
[ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops,
+ [ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops,
+ [ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops,
};
static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb)
@@ -490,7 +493,7 @@ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info)
ret = ops->prepare_data(req_info, reply_data, info);
rtnl_unlock();
if (ret < 0)
- goto err_cleanup;
+ goto err_dev;
ret = ops->reply_size(req_info, reply_data);
if (ret < 0)
goto err_cleanup;
@@ -548,7 +551,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info);
rtnl_unlock();
if (ret < 0)
- goto out;
+ goto out_cancel;
ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr);
if (ret < 0)
goto out;
@@ -557,6 +560,7 @@ static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev,
out:
if (ctx->ops->cleanup_data)
ctx->ops->cleanup_data(ctx->reply_data);
+out_cancel:
ctx->reply_data->dev = NULL;
if (ret < 0)
genlmsg_cancel(skb, ehdr);
@@ -665,6 +669,7 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info)
const struct ethnl_request_ops *ops;
struct ethnl_req_info req_info = {};
const u8 cmd = info->genlhdr->cmd;
+ struct net_device *dev;
int ret;
ops = ethnl_default_requests[cmd];
@@ -686,20 +691,36 @@ static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info)
goto out_dev;
}
+ dev = req_info.dev;
+
rtnl_lock();
- ret = ethnl_ops_begin(req_info.dev);
+ dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg),
+ GFP_KERNEL_ACCOUNT);
+ if (!dev->cfg_pending) {
+ ret = -ENOMEM;
+ goto out_tie_cfg;
+ }
+
+ ret = ethnl_ops_begin(dev);
if (ret < 0)
- goto out_rtnl;
+ goto out_free_cfg;
ret = ops->set(&req_info, info);
- if (ret <= 0)
+ if (ret < 0)
+ goto out_ops;
+
+ swap(dev->cfg, dev->cfg_pending);
+ if (!ret)
goto out_ops;
- ethtool_notify(req_info.dev, ops->set_ntf_cmd, NULL);
+ ethtool_notify(dev, ops->set_ntf_cmd, NULL);
ret = 0;
out_ops:
- ethnl_ops_complete(req_info.dev);
-out_rtnl:
+ ethnl_ops_complete(dev);
+out_free_cfg:
+ kfree(dev->cfg_pending);
+out_tie_cfg:
+ dev->cfg_pending = dev->cfg;
rtnl_unlock();
out_dev:
ethnl_parse_header_dev_put(&req_info);
@@ -760,7 +781,7 @@ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd,
ethnl_init_reply_data(reply_data, ops, dev);
ret = ops->prepare_data(req_info, reply_data, &info);
if (ret < 0)
- goto err_cleanup;
+ goto err_rep;
ret = ops->reply_size(req_info, reply_data);
if (ret < 0)
goto err_cleanup;
@@ -795,6 +816,7 @@ err_skb:
err_cleanup:
if (ops->cleanup_data)
ops->cleanup_data(reply_data);
+err_rep:
kfree(reply_data);
kfree(req_info);
return;
@@ -1074,9 +1096,9 @@ static const struct genl_ops ethtool_genl_ops[] = {
{
.cmd = ETHTOOL_MSG_TSINFO_GET,
.doit = ethnl_default_doit,
- .start = ethnl_default_start,
- .dumpit = ethnl_default_dumpit,
- .done = ethnl_default_done,
+ .start = ethnl_tsinfo_start,
+ .dumpit = ethnl_tsinfo_dumpit,
+ .done = ethnl_tsinfo_done,
.policy = ethnl_tsinfo_get_policy,
.maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1,
},
@@ -1243,6 +1265,22 @@ static const struct genl_ops ethtool_genl_ops[] = {
.policy = ethnl_phy_get_policy,
.maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1,
},
+ {
+ .cmd = ETHTOOL_MSG_TSCONFIG_GET,
+ .doit = ethnl_default_doit,
+ .start = ethnl_default_start,
+ .dumpit = ethnl_default_dumpit,
+ .done = ethnl_default_done,
+ .policy = ethnl_tsconfig_get_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1,
+ },
+ {
+ .cmd = ETHTOOL_MSG_TSCONFIG_SET,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .doit = ethnl_default_set_doit,
+ .policy = ethnl_tsconfig_set_policy,
+ .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1,
+ },
};
static const struct genl_multicast_group ethtool_nl_mcgrps[] = {
diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h
index 5e176938d6d2..ec6ab5443a6f 100644
--- a/net/ethtool/netlink.h
+++ b/net/ethtool/netlink.h
@@ -436,6 +436,7 @@ extern const struct ethnl_request_ops ethnl_plca_cfg_request_ops;
extern const struct ethnl_request_ops ethnl_plca_status_request_ops;
extern const struct ethnl_request_ops ethnl_mm_request_ops;
extern const struct ethnl_request_ops ethnl_phy_request_ops;
+extern const struct ethnl_request_ops ethnl_tsconfig_request_ops;
extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1];
extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1];
@@ -456,7 +457,7 @@ extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANT
extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1];
extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1];
extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1];
-extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX + 1];
+extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_HDS_THRESH_MAX + 1];
extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1];
extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1];
extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1];
@@ -465,7 +466,7 @@ extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_STATS_SRC
extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1];
extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];
extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1];
-extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1];
+extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1];
extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1];
extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1];
extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1];
@@ -486,6 +487,8 @@ extern const struct nla_policy ethnl_mm_get_policy[ETHTOOL_A_MM_HEADER + 1];
extern const struct nla_policy ethnl_mm_set_policy[ETHTOOL_A_MM_MAX + 1];
extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE_FW_FLASH_PASSWORD + 1];
extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1];
+extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1];
+extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1];
int ethnl_set_features(struct sk_buff *skb, struct genl_info *info);
int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info);
@@ -500,11 +503,15 @@ int ethnl_phy_start(struct netlink_callback *cb);
int ethnl_phy_doit(struct sk_buff *skb, struct genl_info *info);
int ethnl_phy_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
int ethnl_phy_done(struct netlink_callback *cb);
+int ethnl_tsinfo_start(struct netlink_callback *cb);
+int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int ethnl_tsinfo_done(struct netlink_callback *cb);
extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN];
extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN];
extern const char stats_eth_mac_names[__ETHTOOL_A_STATS_ETH_MAC_CNT][ETH_GSTRING_LEN];
extern const char stats_eth_ctrl_names[__ETHTOOL_A_STATS_ETH_CTRL_CNT][ETH_GSTRING_LEN];
extern const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN];
+extern const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN];
#endif /* _NET_ETHTOOL_NETLINK_H */
diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c
index 71843de832cc..4f6b99eab2a6 100644
--- a/net/ethtool/pse-pd.c
+++ b/net/ethtool/pse-pd.c
@@ -19,7 +19,7 @@ struct pse_req_info {
struct pse_reply_data {
struct ethnl_reply_data base;
- struct pse_control_status status;
+ struct ethtool_pse_control_status status;
};
#define PSE_REPDATA(__reply_base) \
@@ -80,7 +80,7 @@ static int pse_reply_size(const struct ethnl_req_info *req_base,
const struct ethnl_reply_data *reply_base)
{
const struct pse_reply_data *data = PSE_REPDATA(reply_base);
- const struct pse_control_status *st = &data->status;
+ const struct ethtool_pse_control_status *st = &data->status;
int len = 0;
if (st->podl_admin_state > 0)
@@ -114,7 +114,7 @@ static int pse_reply_size(const struct ethnl_req_info *req_base,
}
static int pse_put_pw_limit_ranges(struct sk_buff *skb,
- const struct pse_control_status *st)
+ const struct ethtool_pse_control_status *st)
{
const struct ethtool_c33_pse_pw_limit_range *pw_limit_ranges;
int i;
@@ -146,7 +146,7 @@ static int pse_fill_reply(struct sk_buff *skb,
const struct ethnl_reply_data *reply_base)
{
const struct pse_reply_data *data = PSE_REPDATA(reply_base);
- const struct pse_control_status *st = &data->status;
+ const struct ethtool_pse_control_status *st = &data->status;
if (st->podl_admin_state > 0 &&
nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE,
diff --git a/net/ethtool/rings.c b/net/ethtool/rings.c
index b7865a14fdf8..aeedd5ec6b8c 100644
--- a/net/ethtool/rings.c
+++ b/net/ethtool/rings.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <net/netdev_queues.h>
+
#include "netlink.h"
#include "common.h"
@@ -37,6 +39,10 @@ static int rings_prepare_data(const struct ethnl_req_info *req_base,
ret = ethnl_ops_begin(dev);
if (ret < 0)
return ret;
+
+ data->kernel_ringparam.tcp_data_split = dev->cfg->hds_config;
+ data->kernel_ringparam.hds_thresh = dev->cfg->hds_thresh;
+
dev->ethtool_ops->get_ringparam(dev, &data->ringparam,
&data->kernel_ringparam, info->extack);
ethnl_ops_complete(dev);
@@ -61,7 +67,9 @@ static int rings_reply_size(const struct ethnl_req_info *req_base,
nla_total_size(sizeof(u8)) + /* _RINGS_TX_PUSH */
nla_total_size(sizeof(u8))) + /* _RINGS_RX_PUSH */
nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN */
- nla_total_size(sizeof(u32)); /* _RINGS_TX_PUSH_BUF_LEN_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_TX_PUSH_BUF_LEN_MAX */
+ nla_total_size(sizeof(u32)) + /* _RINGS_HDS_THRESH */
+ nla_total_size(sizeof(u32)); /* _RINGS_HDS_THRESH_MAX */
}
static int rings_fill_reply(struct sk_buff *skb,
@@ -108,7 +116,12 @@ static int rings_fill_reply(struct sk_buff *skb,
(nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN_MAX,
kr->tx_push_buf_max_len) ||
nla_put_u32(skb, ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN,
- kr->tx_push_buf_len))))
+ kr->tx_push_buf_len))) ||
+ ((supported_ring_params & ETHTOOL_RING_USE_HDS_THRS) &&
+ (nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH,
+ kr->hds_thresh) ||
+ nla_put_u32(skb, ETHTOOL_A_RINGS_HDS_THRESH_MAX,
+ kr->hds_thresh_max))))
return -EMSGSIZE;
return 0;
@@ -130,6 +143,7 @@ const struct nla_policy ethnl_rings_set_policy[] = {
[ETHTOOL_A_RINGS_TX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1),
[ETHTOOL_A_RINGS_RX_PUSH] = NLA_POLICY_MAX(NLA_U8, 1),
[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN] = { .type = NLA_U32 },
+ [ETHTOOL_A_RINGS_HDS_THRESH] = { .type = NLA_U32 },
};
static int
@@ -155,6 +169,14 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info,
return -EOPNOTSUPP;
}
+ if (tb[ETHTOOL_A_RINGS_HDS_THRESH] &&
+ !(ops->supported_ring_params & ETHTOOL_RING_USE_HDS_THRS)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_HDS_THRESH],
+ "setting hds-thresh is not supported");
+ return -EOPNOTSUPP;
+ }
+
if (tb[ETHTOOL_A_RINGS_CQE_SIZE] &&
!(ops->supported_ring_params & ETHTOOL_RING_USE_CQE_SIZE)) {
NL_SET_ERR_MSG_ATTR(info->extack,
@@ -193,16 +215,16 @@ ethnl_set_rings_validate(struct ethnl_req_info *req_info,
static int
ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
{
- struct kernel_ethtool_ringparam kernel_ringparam = {};
- struct ethtool_ringparam ringparam = {};
+ struct kernel_ethtool_ringparam kernel_ringparam;
struct net_device *dev = req_info->dev;
+ struct ethtool_ringparam ringparam;
struct nlattr **tb = info->attrs;
const struct nlattr *err_attr;
bool mod = false;
int ret;
- dev->ethtool_ops->get_ringparam(dev, &ringparam,
- &kernel_ringparam, info->extack);
+ ethtool_ringparam_get_cfg(dev, &ringparam, &kernel_ringparam,
+ info->extack);
ethnl_update_u32(&ringparam.rx_pending, tb[ETHTOOL_A_RINGS_RX], &mod);
ethnl_update_u32(&ringparam.rx_mini_pending,
@@ -222,9 +244,32 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
tb[ETHTOOL_A_RINGS_RX_PUSH], &mod);
ethnl_update_u32(&kernel_ringparam.tx_push_buf_len,
tb[ETHTOOL_A_RINGS_TX_PUSH_BUF_LEN], &mod);
+ ethnl_update_u32(&kernel_ringparam.hds_thresh,
+ tb[ETHTOOL_A_RINGS_HDS_THRESH], &mod);
if (!mod)
return 0;
+ if (kernel_ringparam.tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_ENABLED &&
+ dev_xdp_sb_prog_count(dev)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_RINGS_TCP_DATA_SPLIT],
+ "tcp-data-split can not be enabled with single buffer XDP");
+ return -EINVAL;
+ }
+
+ if (dev_get_min_mp_channel_count(dev)) {
+ if (kernel_ringparam.tcp_data_split !=
+ ETHTOOL_TCP_DATA_SPLIT_ENABLED) {
+ NL_SET_ERR_MSG(info->extack,
+ "can't disable tcp-data-split while device has memory provider enabled");
+ return -EINVAL;
+ } else if (kernel_ringparam.hds_thresh) {
+ NL_SET_ERR_MSG(info->extack,
+ "can't set a non-zero hds_thresh while the device has a memory provider enabled");
+ return -EINVAL;
+ }
+ }
+
/* ensure new ring parameters are within limits */
if (ringparam.rx_pending > ringparam.rx_max_pending)
err_attr = tb[ETHTOOL_A_RINGS_RX];
@@ -234,6 +279,8 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
err_attr = tb[ETHTOOL_A_RINGS_RX_JUMBO];
else if (ringparam.tx_pending > ringparam.tx_max_pending)
err_attr = tb[ETHTOOL_A_RINGS_TX];
+ else if (kernel_ringparam.hds_thresh > kernel_ringparam.hds_thresh_max)
+ err_attr = tb[ETHTOOL_A_RINGS_HDS_THRESH];
else
err_attr = NULL;
if (err_attr) {
@@ -250,6 +297,9 @@ ethnl_set_rings(struct ethnl_req_info *req_info, struct genl_info *info)
return -EINVAL;
}
+ dev->cfg_pending->hds_config = kernel_ringparam.tcp_data_split;
+ dev->cfg_pending->hds_thresh = kernel_ringparam.hds_thresh;
+
ret = dev->ethtool_ops->set_ringparam(dev, &ringparam,
&kernel_ringparam, info->extack);
return ret < 0 ? ret : 1;
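A hedged sketch of the driver side of the new HDS threshold knob; the foo_* names and the hard-coded limit are assumptions, and only the ring-parameter plumbing is shown:

#include <linux/ethtool.h>

static void foo_get_ringparam(struct net_device *dev,
			      struct ethtool_ringparam *ring,
			      struct kernel_ethtool_ringparam *kernel_ring,
			      struct netlink_ext_ack *extack)
{
	/* Report the largest header/data split threshold the HW accepts. */
	kernel_ring->hds_thresh_max = 256;	/* assumed HW limit */
}

static int foo_set_ringparam(struct net_device *dev,
			     struct ethtool_ringparam *ring,
			     struct kernel_ethtool_ringparam *kernel_ring,
			     struct netlink_ext_ack *extack)
{
	/* kernel_ring->hds_thresh has already been validated against
	 * hds_thresh_max by ethnl_set_rings() above; program it into HW here.
	 */
	return 0;
}

static const struct ethtool_ops foo_ethtool_ops = {
	.supported_ring_params	= ETHTOOL_RING_USE_HDS_THRS,
	.get_ringparam		= foo_get_ringparam,
	.set_ringparam		= foo_set_ringparam,
};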
diff --git a/net/ethtool/stats.c b/net/ethtool/stats.c
index 273ae4ff343f..3ca8eb2a3b31 100644
--- a/net/ethtool/stats.c
+++ b/net/ethtool/stats.c
@@ -36,6 +36,7 @@ const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN] = {
[ETHTOOL_STATS_ETH_MAC] = "eth-mac",
[ETHTOOL_STATS_ETH_CTRL] = "eth-ctrl",
[ETHTOOL_STATS_RMON] = "rmon",
+ [ETHTOOL_STATS_PHY] = "phydev",
};
const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN] = {
@@ -80,6 +81,15 @@ const char stats_rmon_names[__ETHTOOL_A_STATS_RMON_CNT][ETH_GSTRING_LEN] = {
[ETHTOOL_A_STATS_RMON_JABBER] = "etherStatsJabbers",
};
+const char stats_phy_names[__ETHTOOL_A_STATS_PHY_CNT][ETH_GSTRING_LEN] = {
+ [ETHTOOL_A_STATS_PHY_RX_PKTS] = "RxFrames",
+ [ETHTOOL_A_STATS_PHY_RX_BYTES] = "RxOctets",
+ [ETHTOOL_A_STATS_PHY_RX_ERRORS] = "RxErrors",
+ [ETHTOOL_A_STATS_PHY_TX_PKTS] = "TxFrames",
+ [ETHTOOL_A_STATS_PHY_TX_BYTES] = "TxOctets",
+ [ETHTOOL_A_STATS_PHY_TX_ERRORS] = "TxErrors",
+};
+
const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_SRC + 1] = {
[ETHTOOL_A_STATS_HEADER] =
NLA_POLICY_NESTED(ethnl_header_policy),
@@ -156,7 +166,8 @@ static int stats_prepare_data(const struct ethnl_req_info *req_base,
data->ctrl_stats.src = src;
data->rmon_stats.src = src;
- if (test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask) &&
+ if ((test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask) ||
+ test_bit(ETHTOOL_STATS_ETH_PHY, req_info->stat_mask)) &&
src == ETHTOOL_MAC_STATS_SRC_AGGREGATE) {
if (phydev)
phy_ethtool_get_phy_stats(phydev, &data->phy_stats,
@@ -212,6 +223,10 @@ static int stats_reply_size(const struct ethnl_req_info *req_base,
nla_total_size(4)) * /* _A_STATS_GRP_HIST_BKT_HI */
ETHTOOL_RMON_HIST_MAX * 2;
}
+ if (test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask)) {
+ n_stats += sizeof(struct ethtool_phy_stats) / sizeof(u64);
+ n_grps++;
+ }
len += n_grps * (nla_total_size(0) + /* _A_STATS_GRP */
nla_total_size(4) + /* _A_STATS_GRP_ID */
@@ -265,6 +280,25 @@ static int stats_put_phy_stats(struct sk_buff *skb,
return 0;
}
+static int stats_put_phydev_stats(struct sk_buff *skb,
+ const struct stats_reply_data *data)
+{
+ if (stat_put(skb, ETHTOOL_A_STATS_PHY_RX_PKTS,
+ data->phydev_stats.rx_packets) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_RX_BYTES,
+ data->phydev_stats.rx_bytes) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_RX_ERRORS,
+ data->phydev_stats.rx_errors) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_TX_PKTS,
+ data->phydev_stats.tx_packets) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_TX_BYTES,
+ data->phydev_stats.tx_bytes) ||
+ stat_put(skb, ETHTOOL_A_STATS_PHY_TX_ERRORS,
+ data->phydev_stats.tx_errors))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int stats_put_mac_stats(struct sk_buff *skb,
const struct stats_reply_data *data)
{
@@ -441,6 +475,9 @@ static int stats_fill_reply(struct sk_buff *skb,
if (!ret && test_bit(ETHTOOL_STATS_RMON, req_info->stat_mask))
ret = stats_put_stats(skb, data, ETHTOOL_STATS_RMON,
ETH_SS_STATS_RMON, stats_put_rmon_stats);
+ if (!ret && test_bit(ETHTOOL_STATS_PHY, req_info->stat_mask))
+ ret = stats_put_stats(skb, data, ETHTOOL_STATS_PHY,
+ ETH_SS_STATS_PHY, stats_put_phydev_stats);
return ret;
}
diff --git a/net/ethtool/strset.c b/net/ethtool/strset.c
index b9400d18f01d..f6a67109beda 100644
--- a/net/ethtool/strset.c
+++ b/net/ethtool/strset.c
@@ -75,6 +75,11 @@ static const struct strset_info info_template[] = {
.count = __HWTSTAMP_FILTER_CNT,
.strings = ts_rx_filter_names,
},
+ [ETH_SS_TS_FLAGS] = {
+ .per_dev = false,
+ .count = __HWTSTAMP_FLAG_CNT,
+ .strings = ts_flags_names,
+ },
[ETH_SS_UDP_TUNNEL_TYPES] = {
.per_dev = false,
.count = __ETHTOOL_UDP_TUNNEL_TYPE_CNT,
@@ -105,6 +110,11 @@ static const struct strset_info info_template[] = {
.count = __ETHTOOL_A_STATS_RMON_CNT,
.strings = stats_rmon_names,
},
+ [ETH_SS_STATS_PHY] = {
+ .per_dev = false,
+ .count = __ETHTOOL_A_STATS_PHY_CNT,
+ .strings = stats_phy_names,
+ },
};
struct strset_req_info {
diff --git a/net/ethtool/ts.h b/net/ethtool/ts.h
new file mode 100644
index 000000000000..d901a879a671
--- /dev/null
+++ b/net/ethtool/ts.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef _NET_ETHTOOL_TS_H
+#define _NET_ETHTOOL_TS_H
+
+#include "netlink.h"
+
+static const struct nla_policy
+ethnl_ts_hwtst_prov_policy[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_MAX + 1] = {
+ [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX] = { .type = NLA_U32 },
+ [ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER] =
+ NLA_POLICY_MAX(NLA_U32, HWTSTAMP_PROVIDER_QUALIFIER_CNT - 1)
+};
+
+int ts_parse_hwtst_provider(const struct nlattr *nest,
+ struct hwtstamp_provider_desc *hwprov_desc,
+ struct netlink_ext_ack *extack,
+ bool *mod);
+
+#endif /* _NET_ETHTOOL_TS_H */
diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c
new file mode 100644
index 000000000000..2be356bdfe87
--- /dev/null
+++ b/net/ethtool/tsconfig.c
@@ -0,0 +1,457 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/net_tstamp.h>
+#include <linux/ptp_clock_kernel.h>
+
+#include "netlink.h"
+#include "common.h"
+#include "bitset.h"
+#include "../core/dev.h"
+#include "ts.h"
+
+struct tsconfig_req_info {
+ struct ethnl_req_info base;
+};
+
+struct tsconfig_reply_data {
+ struct ethnl_reply_data base;
+ struct hwtstamp_provider_desc hwprov_desc;
+ struct {
+ u32 tx_type;
+ u32 rx_filter;
+ u32 flags;
+ } hwtst_config;
+};
+
+#define TSCONFIG_REPDATA(__reply_base) \
+ container_of(__reply_base, struct tsconfig_reply_data, base)
+
+const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1] = {
+ [ETHTOOL_A_TSCONFIG_HEADER] =
+ NLA_POLICY_NESTED(ethnl_header_policy),
+};
+
+static int tsconfig_prepare_data(const struct ethnl_req_info *req_base,
+ struct ethnl_reply_data *reply_base,
+ const struct genl_info *info)
+{
+ struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+ struct hwtstamp_provider *hwprov = NULL;
+ struct net_device *dev = reply_base->dev;
+ struct kernel_hwtstamp_config cfg = {};
+ int ret;
+
+ if (!dev->netdev_ops->ndo_hwtstamp_get)
+ return -EOPNOTSUPP;
+
+ ret = ethnl_ops_begin(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = dev_get_hwtstamp_phylib(dev, &cfg);
+ if (ret)
+ goto out;
+
+ data->hwtst_config.tx_type = BIT(cfg.tx_type);
+ data->hwtst_config.rx_filter = BIT(cfg.rx_filter);
+ data->hwtst_config.flags = cfg.flags;
+
+ data->hwprov_desc.index = -1;
+ hwprov = rtnl_dereference(dev->hwprov);
+ if (hwprov) {
+ data->hwprov_desc.index = hwprov->desc.index;
+ data->hwprov_desc.qualifier = hwprov->desc.qualifier;
+ } else {
+ struct kernel_ethtool_ts_info ts_info = {};
+
+ ts_info.phc_index = -1;
+ ret = __ethtool_get_ts_info(dev, &ts_info);
+ if (ret)
+ goto out;
+
+ if (ts_info.phc_index == -1) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ data->hwprov_desc.index = ts_info.phc_index;
+ data->hwprov_desc.qualifier = ts_info.phc_qualifier;
+ }
+
+out:
+ ethnl_ops_complete(dev);
+ return ret;
+}
+
+static int tsconfig_reply_size(const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int len = 0;
+ int ret;
+
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT > 32);
+ BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32);
+
+ if (data->hwtst_config.flags) {
+ ret = ethnl_bitset32_size(&data->hwtst_config.flags,
+ NULL, __HWTSTAMP_FLAG_CNT,
+ ts_flags_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSCONFIG_HWTSTAMP_FLAGS */
+ }
+
+ if (data->hwtst_config.tx_type) {
+ ret = ethnl_bitset32_size(&data->hwtst_config.tx_type,
+ NULL, __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSCONFIG_TX_TYPES */
+ }
+ if (data->hwtst_config.rx_filter) {
+ ret = ethnl_bitset32_size(&data->hwtst_config.rx_filter,
+ NULL, __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ len += ret; /* _TSCONFIG_RX_FILTERS */
+ }
+
+ if (data->hwprov_desc.index >= 0)
+ /* _TSCONFIG_HWTSTAMP_PROVIDER */
+ len += nla_total_size(0) +
+ 2 * nla_total_size(sizeof(u32));
+
+ return len;
+}
+
+static int tsconfig_fill_reply(struct sk_buff *skb,
+ const struct ethnl_req_info *req_base,
+ const struct ethnl_reply_data *reply_base)
+{
+ const struct tsconfig_reply_data *data = TSCONFIG_REPDATA(reply_base);
+ bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS;
+ int ret;
+
+ if (data->hwtst_config.flags) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS,
+ &data->hwtst_config.flags, NULL,
+ __HWTSTAMP_FLAG_CNT,
+ ts_flags_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (data->hwtst_config.tx_type) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_TX_TYPES,
+ &data->hwtst_config.tx_type, NULL,
+ __HWTSTAMP_TX_CNT,
+ ts_tx_type_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (data->hwtst_config.rx_filter) {
+ ret = ethnl_put_bitset32(skb, ETHTOOL_A_TSCONFIG_RX_FILTERS,
+ &data->hwtst_config.rx_filter,
+ NULL, __HWTSTAMP_FILTER_CNT,
+ ts_rx_filter_names, compact);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (data->hwprov_desc.index >= 0) {
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX,
+ data->hwprov_desc.index) ||
+ nla_put_u32(skb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER,
+ data->hwprov_desc.qualifier)) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, nest);
+ }
+ return 0;
+}
+
+/* TSCONFIG_SET */
+const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1] = {
+ [ETHTOOL_A_TSCONFIG_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
+ [ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER] =
+ NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy),
+ [ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_TSCONFIG_RX_FILTERS] = { .type = NLA_NESTED },
+ [ETHTOOL_A_TSCONFIG_TX_TYPES] = { .type = NLA_NESTED },
+};
+
+static int tsconfig_send_reply(struct net_device *dev, struct genl_info *info)
+{
+ struct tsconfig_reply_data *reply_data;
+ struct tsconfig_req_info *req_info;
+ struct sk_buff *rskb;
+ void *reply_payload;
+ int reply_len = 0;
+ int ret;
+
+ req_info = kzalloc(sizeof(*req_info), GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kmalloc(sizeof(*reply_data), GFP_KERNEL);
+ if (!reply_data) {
+ kfree(req_info);
+ return -ENOMEM;
+ }
+
+ ASSERT_RTNL();
+ reply_data->base.dev = dev;
+ ret = tsconfig_prepare_data(&req_info->base, &reply_data->base, info);
+ if (ret < 0)
+ goto err_cleanup;
+
+ ret = tsconfig_reply_size(&req_info->base, &reply_data->base);
+ if (ret < 0)
+ goto err_cleanup;
+
+ reply_len = ret + ethnl_reply_header_size();
+ rskb = ethnl_reply_init(reply_len, dev, ETHTOOL_MSG_TSCONFIG_SET_REPLY,
+ ETHTOOL_A_TSCONFIG_HEADER, info, &reply_payload);
+ if (!rskb)
+ goto err_cleanup;
+
+ ret = tsconfig_fill_reply(rskb, &req_info->base, &reply_data->base);
+ if (ret < 0)
+ goto err_cleanup;
+
+ genlmsg_end(rskb, reply_payload);
+ ret = genlmsg_reply(rskb, info);
+
+err_cleanup:
+ kfree(reply_data);
+ kfree(req_info);
+ return ret;
+}
+
+static int ethnl_set_tsconfig_validate(struct ethnl_req_info *req_base,
+ struct genl_info *info)
+{
+ const struct net_device_ops *ops = req_base->dev->netdev_ops;
+
+ if (!ops->ndo_hwtstamp_set || !ops->ndo_hwtstamp_get)
+ return -EOPNOTSUPP;
+
+ return 1;
+}
+
+static struct hwtstamp_provider *
+tsconfig_set_hwprov_from_desc(struct net_device *dev,
+ struct genl_info *info,
+ struct hwtstamp_provider_desc *hwprov_desc)
+{
+ struct kernel_ethtool_ts_info ts_info;
+ struct hwtstamp_provider *hwprov;
+ struct nlattr **tb = info->attrs;
+ struct phy_device *phy = NULL;
+ enum hwtstamp_source source;
+ int ret;
+
+ ret = ethtool_net_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
+ if (!ret) {
+ /* Found */
+ source = HWTSTAMP_SOURCE_NETDEV;
+ } else {
+ phy = ethtool_phy_get_ts_info_by_phc(dev, &ts_info, hwprov_desc);
+ if (IS_ERR(phy)) {
+ if (PTR_ERR(phy) == -ENODEV)
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
+ "phc not in this net device topology");
+ return ERR_CAST(phy);
+ }
+
+ source = HWTSTAMP_SOURCE_PHYLIB;
+ }
+
+ hwprov = kzalloc(sizeof(*hwprov), GFP_KERNEL);
+ if (!hwprov)
+ return ERR_PTR(-ENOMEM);
+
+ hwprov->desc.index = hwprov_desc->index;
+ hwprov->desc.qualifier = hwprov_desc->qualifier;
+ hwprov->source = source;
+ hwprov->phydev = phy;
+
+ return hwprov;
+}
+
+static int ethnl_set_tsconfig(struct ethnl_req_info *req_base,
+ struct genl_info *info)
+{
+ struct kernel_hwtstamp_config hwtst_config = {0};
+ bool hwprov_mod = false, config_mod = false;
+ struct hwtstamp_provider *hwprov = NULL;
+ struct net_device *dev = req_base->dev;
+ struct nlattr **tb = info->attrs;
+ int ret;
+
+ BUILD_BUG_ON(__HWTSTAMP_TX_CNT >= 32);
+ BUILD_BUG_ON(__HWTSTAMP_FILTER_CNT >= 32);
+ BUILD_BUG_ON(__HWTSTAMP_FLAG_CNT > 32);
+
+ if (!netif_device_present(dev))
+ return -ENODEV;
+
+ if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER]) {
+ struct hwtstamp_provider_desc __hwprov_desc = {.index = -1};
+ struct hwtstamp_provider *__hwprov;
+
+ __hwprov = rtnl_dereference(dev->hwprov);
+ if (__hwprov) {
+ __hwprov_desc.index = __hwprov->desc.index;
+ __hwprov_desc.qualifier = __hwprov->desc.qualifier;
+ }
+
+ ret = ts_parse_hwtst_provider(tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_PROVIDER],
+ &__hwprov_desc, info->extack,
+ &hwprov_mod);
+ if (ret < 0)
+ return ret;
+
+ if (hwprov_mod) {
+ hwprov = tsconfig_set_hwprov_from_desc(dev, info,
+ &__hwprov_desc);
+ if (IS_ERR(hwprov))
+ return PTR_ERR(hwprov);
+ }
+ }
+
+ /* Fetch the current hwtstamp config if we are not changing the
+ * hwtstamp source; otherwise start from a zeroed config.
+ */
+ if (!hwprov_mod) {
+ ret = dev_get_hwtstamp_phylib(dev, &hwtst_config);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ goto err_free_hwprov;
+ }
+
+ /* Get the hwtstamp config from netlink */
+ if (tb[ETHTOOL_A_TSCONFIG_TX_TYPES]) {
+ u32 req_tx_type;
+
+ req_tx_type = BIT(hwtst_config.tx_type);
+ ret = ethnl_update_bitset32(&req_tx_type,
+ __HWTSTAMP_TX_CNT,
+ tb[ETHTOOL_A_TSCONFIG_TX_TYPES],
+ ts_tx_type_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Select only one tx type at a time */
+ if (ffs(req_tx_type) != fls(req_tx_type)) {
+ ret = -EINVAL;
+ goto err_free_hwprov;
+ }
+
+ hwtst_config.tx_type = ffs(req_tx_type) - 1;
+ }
+
+ if (tb[ETHTOOL_A_TSCONFIG_RX_FILTERS]) {
+ u32 req_rx_filter;
+
+ req_rx_filter = BIT(hwtst_config.rx_filter);
+ ret = ethnl_update_bitset32(&req_rx_filter,
+ __HWTSTAMP_FILTER_CNT,
+ tb[ETHTOOL_A_TSCONFIG_RX_FILTERS],
+ ts_rx_filter_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Select only one rx filter at a time */
+ if (ffs(req_rx_filter) != fls(req_rx_filter)) {
+ ret = -EINVAL;
+ goto err_free_hwprov;
+ }
+
+ hwtst_config.rx_filter = ffs(req_rx_filter) - 1;
+ }
+
+ if (tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS]) {
+ ret = ethnl_update_bitset32(&hwtst_config.flags,
+ __HWTSTAMP_FLAG_CNT,
+ tb[ETHTOOL_A_TSCONFIG_HWTSTAMP_FLAGS],
+ ts_flags_names, info->extack,
+ &config_mod);
+ if (ret < 0)
+ goto err_free_hwprov;
+ }
+
+ ret = net_hwtstamp_validate(&hwtst_config);
+ if (ret)
+ goto err_free_hwprov;
+
+ if (hwprov_mod) {
+ struct kernel_hwtstamp_config zero_config = {0};
+ struct hwtstamp_provider *__hwprov;
+
+ /* Disable the current timestamping provider before enabling
+ * another one.
+ */
+ ret = dev_set_hwtstamp_phylib(dev, &zero_config, info->extack);
+ if (ret < 0)
+ goto err_free_hwprov;
+
+ /* Change the selected hwtstamp source */
+ __hwprov = rcu_replace_pointer_rtnl(dev->hwprov, hwprov);
+ if (__hwprov)
+ kfree_rcu(__hwprov, rcu_head);
+ }
+
+ if (config_mod) {
+ ret = dev_set_hwtstamp_phylib(dev, &hwtst_config,
+ info->extack);
+ if (ret < 0)
+ return ret;
+ }
+
+ if (hwprov_mod || config_mod) {
+ ret = tsconfig_send_reply(dev, info);
+ if (ret && ret != -EOPNOTSUPP) {
+ NL_SET_ERR_MSG(info->extack,
+ "error while reading the new configuration set");
+ return ret;
+ }
+ }
+
+ /* tsconfig has no notification */
+ return 0;
+
+err_free_hwprov:
+ kfree(hwprov);
+
+ return ret;
+}
+
+const struct ethnl_request_ops ethnl_tsconfig_request_ops = {
+ .request_cmd = ETHTOOL_MSG_TSCONFIG_GET,
+ .reply_cmd = ETHTOOL_MSG_TSCONFIG_GET_REPLY,
+ .hdr_attr = ETHTOOL_A_TSCONFIG_HEADER,
+ .req_info_size = sizeof(struct tsconfig_req_info),
+ .reply_data_size = sizeof(struct tsconfig_reply_data),
+
+ .prepare_data = tsconfig_prepare_data,
+ .reply_size = tsconfig_reply_size,
+ .fill_reply = tsconfig_fill_reply,
+
+ .set_validate = ethnl_set_tsconfig_validate,
+ .set = ethnl_set_tsconfig,
+};
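The TX-type and RX-filter attributes above are carried as one-hot bitsets; the ffs()/fls() check enforces that exactly one bit is set before converting back to the enum value. A standalone illustration of that round trip in plain userspace C (not kernel code; fls32() stands in for the kernel's fls()):

#include <stdio.h>
#include <strings.h>		/* ffs() */

static int fls32(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int tx_type = 1;		/* e.g. HWTSTAMP_TX_ON == 1 */
	unsigned int bitset = 1u << tx_type;	/* one-hot request: 0x2 */

	if (ffs(bitset) == fls32(bitset))	/* exactly one bit set */
		printf("decoded tx_type = %d\n", ffs(bitset) - 1); /* prints 1 */
	return 0;
}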
diff --git a/net/ethtool/tsinfo.c b/net/ethtool/tsinfo.c
index 03d12d6f79ca..ad3866c5a902 100644
--- a/net/ethtool/tsinfo.c
+++ b/net/ethtool/tsinfo.c
@@ -1,13 +1,18 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/net_tstamp.h>
+#include <linux/phy.h>
+#include <linux/phy_link_topology.h>
+#include <linux/ptp_clock_kernel.h>
#include "netlink.h"
#include "common.h"
#include "bitset.h"
+#include "ts.h"
struct tsinfo_req_info {
struct ethnl_req_info base;
+ struct hwtstamp_provider_desc hwprov_desc;
};
struct tsinfo_reply_data {
@@ -16,34 +21,96 @@ struct tsinfo_reply_data {
struct ethtool_ts_stats stats;
};
+#define TSINFO_REQINFO(__req_base) \
+ container_of(__req_base, struct tsinfo_req_info, base)
+
#define TSINFO_REPDATA(__reply_base) \
container_of(__reply_base, struct tsinfo_reply_data, base)
#define ETHTOOL_TS_STAT_CNT \
(__ETHTOOL_A_TS_STAT_CNT - (ETHTOOL_A_TS_STAT_UNSPEC + 1))
-const struct nla_policy ethnl_tsinfo_get_policy[] = {
+const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_MAX + 1] = {
[ETHTOOL_A_TSINFO_HEADER] =
NLA_POLICY_NESTED(ethnl_header_policy_stats),
+ [ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER] =
+ NLA_POLICY_NESTED(ethnl_ts_hwtst_prov_policy),
};
+int ts_parse_hwtst_provider(const struct nlattr *nest,
+ struct hwtstamp_provider_desc *hwprov_desc,
+ struct netlink_ext_ack *extack,
+ bool *mod)
+{
+ struct nlattr *tb[ARRAY_SIZE(ethnl_ts_hwtst_prov_policy)];
+ int ret;
+
+ ret = nla_parse_nested(tb,
+ ARRAY_SIZE(ethnl_ts_hwtst_prov_policy) - 1,
+ nest,
+ ethnl_ts_hwtst_prov_policy, extack);
+ if (ret < 0)
+ return ret;
+
+ if (NL_REQ_ATTR_CHECK(extack, nest, tb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX) ||
+ NL_REQ_ATTR_CHECK(extack, nest, tb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER))
+ return -EINVAL;
+
+ ethnl_update_u32(&hwprov_desc->index,
+ tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX],
+ mod);
+ ethnl_update_u32(&hwprov_desc->qualifier,
+ tb[ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER],
+ mod);
+
+ return 0;
+}
+
+static int
+tsinfo_parse_request(struct ethnl_req_info *req_base, struct nlattr **tb,
+ struct netlink_ext_ack *extack)
+{
+ struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
+ bool mod = false;
+
+ req->hwprov_desc.index = -1;
+
+ if (!tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER])
+ return 0;
+
+ return ts_parse_hwtst_provider(tb[ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER],
+ &req->hwprov_desc, extack, &mod);
+}
+
static int tsinfo_prepare_data(const struct ethnl_req_info *req_base,
struct ethnl_reply_data *reply_base,
const struct genl_info *info)
{
struct tsinfo_reply_data *data = TSINFO_REPDATA(reply_base);
+ struct tsinfo_req_info *req = TSINFO_REQINFO(req_base);
struct net_device *dev = reply_base->dev;
int ret;
ret = ethnl_ops_begin(dev);
if (ret < 0)
return ret;
+
+ if (req->hwprov_desc.index != -1) {
+ ret = ethtool_get_ts_info_by_phc(dev, &data->ts_info,
+ &req->hwprov_desc);
+ ethnl_ops_complete(dev);
+ return ret;
+ }
+
if (req_base->flags & ETHTOOL_FLAG_STATS) {
ethtool_stats_init((u64 *)&data->stats,
sizeof(data->stats) / sizeof(u64));
if (dev->ethtool_ops->get_ts_stats)
dev->ethtool_ops->get_ts_stats(dev, &data->stats);
}
+
ret = __ethtool_get_ts_info(dev, &data->ts_info);
ethnl_ops_complete(dev);
@@ -87,8 +154,11 @@ static int tsinfo_reply_size(const struct ethnl_req_info *req_base,
return ret;
len += ret; /* _TSINFO_RX_FILTERS */
}
- if (ts_info->phc_index >= 0)
+ if (ts_info->phc_index >= 0) {
len += nla_total_size(sizeof(u32)); /* _TSINFO_PHC_INDEX */
+ /* _TSINFO_HWTSTAMP_PROVIDER */
+ len += nla_total_size(0) + 2 * nla_total_size(sizeof(u32));
+ }
if (req_base->flags & ETHTOOL_FLAG_STATS)
len += nla_total_size(0) + /* _TSINFO_STATS */
nla_total_size_64bit(sizeof(u64)) * ETHTOOL_TS_STAT_CNT;
@@ -116,6 +186,8 @@ static int tsinfo_put_stats(struct sk_buff *skb,
if (tsinfo_put_stat(skb, stats->tx_stats.pkts,
ETHTOOL_A_TS_STAT_TX_PKTS) ||
+ tsinfo_put_stat(skb, stats->tx_stats.onestep_pkts_unconfirmed,
+ ETHTOOL_A_TS_STAT_TX_ONESTEP_PKTS_UNCONFIRMED) ||
tsinfo_put_stat(skb, stats->tx_stats.lost,
ETHTOOL_A_TS_STAT_TX_LOST) ||
tsinfo_put_stat(skb, stats->tx_stats.err,
@@ -163,9 +235,29 @@ static int tsinfo_fill_reply(struct sk_buff *skb,
if (ret < 0)
return ret;
}
- if (ts_info->phc_index >= 0 &&
- nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX, ts_info->phc_index))
- return -EMSGSIZE;
+ if (ts_info->phc_index >= 0) {
+ struct nlattr *nest;
+
+ ret = nla_put_u32(skb, ETHTOOL_A_TSINFO_PHC_INDEX,
+ ts_info->phc_index);
+ if (ret)
+ return -EMSGSIZE;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX,
+ ts_info->phc_index) ||
+ nla_put_u32(skb,
+ ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER,
+ ts_info->phc_qualifier)) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, nest);
+ }
if (req_base->flags & ETHTOOL_FLAG_STATS &&
tsinfo_put_stats(skb, &data->stats))
return -EMSGSIZE;
@@ -173,6 +265,264 @@ static int tsinfo_fill_reply(struct sk_buff *skb,
return 0;
}
+struct ethnl_tsinfo_dump_ctx {
+ struct tsinfo_req_info *req_info;
+ struct tsinfo_reply_data *reply_data;
+ unsigned long pos_ifindex;
+ bool netdev_dump_done;
+ unsigned long pos_phyindex;
+ enum hwtstamp_provider_qualifier pos_phcqualifier;
+};
+
+static void *ethnl_tsinfo_prepare_dump(struct sk_buff *skb,
+ struct net_device *dev,
+ struct tsinfo_reply_data *reply_data,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ void *ehdr = NULL;
+
+ ehdr = ethnl_dump_put(skb, cb,
+ ETHTOOL_MSG_TSINFO_GET_REPLY);
+ if (!ehdr)
+ return ERR_PTR(-EMSGSIZE);
+
+ reply_data = ctx->reply_data;
+ memset(reply_data, 0, sizeof(*reply_data));
+ reply_data->base.dev = dev;
+ reply_data->ts_info.cmd = ETHTOOL_GET_TS_INFO;
+ reply_data->ts_info.phc_index = -1;
+
+ return ehdr;
+}
+
+static int ethnl_tsinfo_end_dump(struct sk_buff *skb,
+ struct net_device *dev,
+ struct tsinfo_req_info *req_info,
+ struct tsinfo_reply_data *reply_data,
+ void *ehdr)
+{
+ int ret;
+
+ reply_data->ts_info.so_timestamping |= SOF_TIMESTAMPING_RX_SOFTWARE |
+ SOF_TIMESTAMPING_SOFTWARE;
+
+ ret = ethnl_fill_reply_header(skb, dev, ETHTOOL_A_TSINFO_HEADER);
+ if (ret < 0)
+ return ret;
+
+ ret = tsinfo_fill_reply(skb, &req_info->base, &reply_data->base);
+ if (ret < 0)
+ return ret;
+
+ reply_data->base.dev = NULL;
+ genlmsg_end(skb, ehdr);
+
+ return ret;
+}
+
+static int ethnl_tsinfo_dump_one_phydev(struct sk_buff *skb,
+ struct net_device *dev,
+ struct phy_device *phydev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ void *ehdr = NULL;
+ int ret = 0;
+
+ if (!phy_has_tsinfo(phydev))
+ return -EOPNOTSUPP;
+
+ reply_data = ctx->reply_data;
+ req_info = ctx->req_info;
+ ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
+ if (IS_ERR(ehdr))
+ return PTR_ERR(ehdr);
+
+ ret = phy_ts_info(phydev, &reply_data->ts_info);
+ if (ret < 0)
+ goto err;
+
+ ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data, ehdr);
+ if (ret < 0)
+ goto err;
+
+ return ret;
+err:
+ genlmsg_cancel(skb, ehdr);
+ return ret;
+}
+
+static int ethnl_tsinfo_dump_one_netdev(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ void *ehdr = NULL;
+ int ret = 0;
+
+ if (!ops->get_ts_info)
+ return -EOPNOTSUPP;
+
+ reply_data = ctx->reply_data;
+ req_info = ctx->req_info;
+ for (; ctx->pos_phcqualifier < HWTSTAMP_PROVIDER_QUALIFIER_CNT;
+ ctx->pos_phcqualifier++) {
+ if (!net_support_hwtstamp_qualifier(dev,
+ ctx->pos_phcqualifier))
+ continue;
+
+ ehdr = ethnl_tsinfo_prepare_dump(skb, dev, reply_data, cb);
+ if (IS_ERR(ehdr)) {
+ ret = PTR_ERR(ehdr);
+ goto err;
+ }
+
+ reply_data->ts_info.phc_qualifier = ctx->pos_phcqualifier;
+ ret = ops->get_ts_info(dev, &reply_data->ts_info);
+ if (ret < 0)
+ goto err;
+
+ ret = ethnl_tsinfo_end_dump(skb, dev, req_info, reply_data,
+ ehdr);
+ if (ret < 0)
+ goto err;
+ }
+
+ return ret;
+
+err:
+ genlmsg_cancel(skb, ehdr);
+ return ret;
+}
+
+static int ethnl_tsinfo_dump_one_net_topo(struct sk_buff *skb,
+ struct net_device *dev,
+ struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct phy_device_node *pdn;
+ int ret = 0;
+
+ if (!ctx->netdev_dump_done) {
+ ret = ethnl_tsinfo_dump_one_netdev(skb, dev, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ ctx->netdev_dump_done = true;
+ }
+
+ if (!dev->link_topo) {
+ if (phy_has_tsinfo(dev->phydev)) {
+ ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
+ dev->phydev, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ }
+
+ return 0;
+ }
+
+ xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn,
+ ctx->pos_phyindex) {
+ if (phy_has_tsinfo(pdn->phy)) {
+ ret = ethnl_tsinfo_dump_one_phydev(skb, dev,
+ pdn->phy, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+int ethnl_tsinfo_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct net *net = sock_net(skb->sk);
+ struct net_device *dev;
+ int ret = 0;
+
+ rtnl_lock();
+ if (ctx->req_info->base.dev) {
+ ret = ethnl_tsinfo_dump_one_net_topo(skb,
+ ctx->req_info->base.dev,
+ cb);
+ } else {
+ for_each_netdev_dump(net, dev, ctx->pos_ifindex) {
+ ret = ethnl_tsinfo_dump_one_net_topo(skb, dev, cb);
+ if (ret < 0 && ret != -EOPNOTSUPP)
+ break;
+ ctx->pos_phyindex = 0;
+ ctx->netdev_dump_done = false;
+ ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
+ }
+ }
+ rtnl_unlock();
+
+ return ret;
+}
+
+int ethnl_tsinfo_start(struct netlink_callback *cb)
+{
+ const struct genl_dumpit_info *info = genl_dumpit_info(cb);
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct nlattr **tb = info->info.attrs;
+ struct tsinfo_reply_data *reply_data;
+ struct tsinfo_req_info *req_info;
+ int ret;
+
+ BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx));
+
+ req_info = kzalloc(sizeof(*req_info), GFP_KERNEL);
+ if (!req_info)
+ return -ENOMEM;
+ reply_data = kzalloc(sizeof(*reply_data), GFP_KERNEL);
+ if (!reply_data) {
+ ret = -ENOMEM;
+ goto free_req_info;
+ }
+
+ ret = ethnl_parse_header_dev_get(&req_info->base,
+ tb[ETHTOOL_A_TSINFO_HEADER],
+ sock_net(cb->skb->sk), cb->extack,
+ false);
+ if (ret < 0)
+ goto free_reply_data;
+
+ ctx->req_info = req_info;
+ ctx->reply_data = reply_data;
+ ctx->pos_ifindex = 0;
+ ctx->pos_phyindex = 0;
+ ctx->netdev_dump_done = false;
+ ctx->pos_phcqualifier = HWTSTAMP_PROVIDER_QUALIFIER_PRECISE;
+
+ return 0;
+
+free_reply_data:
+ kfree(reply_data);
+free_req_info:
+ kfree(req_info);
+
+ return ret;
+}
+
+int ethnl_tsinfo_done(struct netlink_callback *cb)
+{
+ struct ethnl_tsinfo_dump_ctx *ctx = (void *)cb->ctx;
+ struct tsinfo_req_info *req_info = ctx->req_info;
+
+ ethnl_parse_header_dev_put(&req_info->base);
+ kfree(ctx->reply_data);
+ kfree(ctx->req_info);
+
+ return 0;
+}
+
const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
.request_cmd = ETHTOOL_MSG_TSINFO_GET,
.reply_cmd = ETHTOOL_MSG_TSINFO_GET_REPLY,
@@ -180,6 +530,7 @@ const struct ethnl_request_ops ethnl_tsinfo_request_ops = {
.req_info_size = sizeof(struct tsinfo_req_info),
.reply_data_size = sizeof(struct tsinfo_reply_data),
+ .parse_request = tsinfo_parse_request,
.prepare_data = tsinfo_prepare_data,
.reply_size = tsinfo_reply_size,
.fill_reply = tsinfo_fill_reply,
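The tsinfo rework above does two things: a GET request can now carry ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER to query one specific hardware timestamp provider (via ethtool_get_ts_info_by_phc()), and a dump walks every provider a device offers, including PHYs discovered through the link topology. Purely as an illustration — attribute names are taken from the policy above, the nesting layout is assumed — a provider-filtered request looks like this:
/* Sketch of a provider-filtered TSINFO_GET request:
 *
 *   ETHTOOL_A_TSINFO_HEADER
 *     ETHTOOL_A_HEADER_DEV_NAME                 = "eth0"
 *   ETHTOOL_A_TSINFO_HWTSTAMP_PROVIDER
 *     ETHTOOL_A_TS_HWTSTAMP_PROVIDER_INDEX      (u32, PTP clock index)
 *     ETHTOOL_A_TS_HWTSTAMP_PROVIDER_QUALIFIER  (u32, HWTSTAMP_PROVIDER_QUALIFIER_*)
 */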
diff --git a/net/hsr/hsr_debugfs.c b/net/hsr/hsr_debugfs.c
index 1a195efc79cd..5b2cfac3b2ba 100644
--- a/net/hsr/hsr_debugfs.c
+++ b/net/hsr/hsr_debugfs.c
@@ -57,14 +57,11 @@ DEFINE_SHOW_ATTRIBUTE(hsr_node_table);
void hsr_debugfs_rename(struct net_device *dev)
{
struct hsr_priv *priv = netdev_priv(dev);
- struct dentry *d;
+ int err;
- d = debugfs_rename(hsr_debugfs_root_dir, priv->node_tbl_root,
- hsr_debugfs_root_dir, dev->name);
- if (IS_ERR(d))
+ err = debugfs_change_name(priv->node_tbl_root, "%s", dev->name);
+ if (err)
netdev_warn(dev, "failed to rename\n");
- else
- priv->node_tbl_root = d;
}
/* hsr_debugfs_init - create hsr node_table file for dumping
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 03eadd6c51fd..b6fb18469439 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -663,6 +663,19 @@ bool is_hsr_master(struct net_device *dev)
}
EXPORT_SYMBOL(is_hsr_master);
+struct net_device *hsr_get_port_ndev(struct net_device *ndev,
+ enum hsr_port_type pt)
+{
+ struct hsr_priv *hsr = netdev_priv(ndev);
+ struct hsr_port *port;
+
+ hsr_for_each_port(hsr, port)
+ if (port->type == pt)
+ return port->dev;
+ return NULL;
+}
+EXPORT_SYMBOL(hsr_get_port_ndev);
+
/* Default multicast address for HSR Supervision frames */
static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = {
0x01, 0x15, 0x4e, 0x00, 0x01, 0x00
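hsr_get_port_ndev() above gives other kernel code a way to map an HSR/PRP master device to one of its member ports. A minimal, hypothetical caller (it assumes the port list is stable, e.g. RTNL is held, and that example_get_slave_a() is not a real in-tree function):
/* Hypothetical driver helper: find the physical netdev behind slave A. */
static struct net_device *example_get_slave_a(struct net_device *hsr_dev)
{
	if (!is_hsr_master(hsr_dev))
		return NULL;

	return hsr_get_port_ndev(hsr_dev, HSR_PT_SLAVE_A);
}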
diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h
index fcfeb79bb040..7561845b8bf6 100644
--- a/net/hsr/hsr_main.h
+++ b/net/hsr/hsr_main.h
@@ -121,15 +121,6 @@ struct hsrv1_ethhdr_sp {
struct hsr_sup_tag hsr_sup;
} __packed;
-enum hsr_port_type {
- HSR_PT_NONE = 0, /* Must be 0, used by framereg */
- HSR_PT_SLAVE_A,
- HSR_PT_SLAVE_B,
- HSR_PT_INTERLINK,
- HSR_PT_MASTER,
- HSR_PT_PORTS, /* This must be the last item in the enum */
-};
-
/* PRP Redundancy Control Trailer (RCT).
* As defined in IEC-62439-4:2012, the PRP RCT is really { sequence Nr,
* Lan identifier (LanId), LSDU_size and PRP_suffix = 0x88FB }.
@@ -163,6 +154,7 @@ struct hsr_port {
struct net_device *dev;
struct hsr_priv *hsr;
enum hsr_port_type type;
+ struct rcu_head rcu;
};
struct hsr_frame_info;
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 464f683e016d..2a802a5de2ac 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -204,7 +204,6 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev,
}
list_add_tail_rcu(&port->port_list, &hsr->ports);
- synchronize_rcu();
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
netdev_update_features(master->dev);
@@ -235,7 +234,5 @@ void hsr_del_port(struct hsr_port *port)
netdev_upper_dev_unlink(port->dev, master->dev);
}
- synchronize_rcu();
-
- kfree(port);
+ kfree_rcu(port, rcu);
}
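With the rcu_head added to struct hsr_port in hsr_main.h above, the blocking synchronize_rcu() calls become unnecessary: the writer unlinks the port and kfree_rcu() defers the actual free until every RCU reader has left. A sketch of the pattern (do_something() is a placeholder):
	/* reader side */
	rcu_read_lock();
	hsr_for_each_port(hsr, port)
		do_something(port);
	rcu_read_unlock();

	/* writer side */
	list_del_rcu(&port->port_list);
	kfree_rcu(port, rcu);	/* actual free happens after a grace period */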
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 8095e82de808..21f46ee7b6e9 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1309,8 +1309,6 @@ int inet_sk_rebuild_header(struct sock *sk)
{
struct rtable *rt = dst_rtable(__sk_dst_check(sk, 0));
struct inet_sock *inet = inet_sk(sk);
- __be32 daddr;
- struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
int err;
@@ -1319,17 +1317,9 @@ int inet_sk_rebuild_header(struct sock *sk)
return 0;
/* Reroute. */
- rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rcu_read_unlock();
fl4 = &inet->cork.fl.u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
- inet->inet_dport, inet->inet_sport,
- sk->sk_protocol, ip_sock_rt_tos(sk),
- sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (!IS_ERR(rt)) {
err = 0;
sk_setup_caps(sk, &rt->dst);
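This is the first of several IPv4 call sites (datagram.c, inet_connection_sock.c and ip_output.c below do the same) that replace an open-coded flow setup with inet_sk_init_flowi4() plus ip_route_output_flow(). For reference, this is roughly what each removed block used to do, which the new helper presumably centralizes:
	__be32 daddr = inet->inet_daddr;
	const struct ip_options_rcu *inet_opt;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;	/* source routing: route to the first hop */
	rcu_read_unlock();

	rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr,
				   inet->inet_dport, inet->inet_sport,
				   sk->sk_protocol, ip_sock_rt_tos(sk),
				   sk->sk_bound_dev_if);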
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 4aca1f05edd3..4b5bc6eb52e7 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -102,8 +102,6 @@ EXPORT_SYMBOL(ip4_datagram_connect);
void ip4_datagram_release_cb(struct sock *sk)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct dst_entry *dst;
struct flowi4 fl4;
struct rtable *rt;
@@ -115,14 +113,9 @@ void ip4_datagram_release_cb(struct sock *sk)
rcu_read_unlock();
return;
}
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
- rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, &fl4);
+ rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
dst = !IS_ERR(rt) ? &rt->dst : NULL;
sk_dst_set(sk, dst);
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f3281312eb5e..0e4076866c0a 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -279,7 +279,7 @@ static void esp_output_done(void *data, int err)
x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
esp_output_tail_tcp(x, skb);
else
- xfrm_output_resume(skb->sk, skb, err);
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
}
@@ -816,7 +816,8 @@ int esp_input_done2(struct sk_buff *skb, int err)
}
skb_pull_rcsum(skb, hlen);
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -ihl);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 8325224ef072..9517b8667e00 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -249,6 +249,12 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
int err = -EINVAL;
struct fib4_rule *rule4 = (struct fib4_rule *) rule;
+ if (tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) {
+ NL_SET_ERR_MSG(extack,
+ "Flow label cannot be specified for IPv4 FIB rules");
+ goto errout;
+ }
+
if (!inet_validate_dscp(frh->tos)) {
NL_SET_ERR_MSG(extack,
"Invalid dsfield (tos): ECN bits must be 0");
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 161f5526b86c..d6411ac81096 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -2999,7 +2999,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%s\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%u\t%08X\t%d\t%u\t%u",
nhc->nhc_dev ? nhc->nhc_dev->name : "*",
prefix, gw, flags, 0, 0,
fi->fib_priority,
@@ -3011,7 +3011,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
} else {
seq_printf(seq,
"*\t%08X\t%08X\t%04X\t%d\t%u\t"
- "%d\t%08X\t%d\t%u\t%u",
+ "%u\t%08X\t%d\t%u\t%u",
prefix, 0, flags, 0, 0, 0,
mask, 0, 0, 0);
}
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 6a238398acc9..3da126cea884 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -88,6 +88,8 @@
#include <linux/byteorder/generic.h>
#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/addrconf.h>
#include <net/arp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -1430,6 +1432,65 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
*mc_hash = im->next_hash;
}
+static int inet_fill_ifmcaddr(struct sk_buff *skb, struct net_device *dev,
+ const struct ip_mc_list *im, int event)
+{
+ struct ifa_cacheinfo ci;
+ struct ifaddrmsg *ifm;
+ struct nlmsghdr *nlh;
+
+ nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct ifaddrmsg), 0);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ ifm = nlmsg_data(nlh);
+ ifm->ifa_family = AF_INET;
+ ifm->ifa_prefixlen = 32;
+ ifm->ifa_flags = IFA_F_PERMANENT;
+ ifm->ifa_scope = RT_SCOPE_UNIVERSE;
+ ifm->ifa_index = dev->ifindex;
+
+ ci.cstamp = (READ_ONCE(im->mca_cstamp) - INITIAL_JIFFIES) * 100UL / HZ;
+ ci.tstamp = ci.cstamp;
+ ci.ifa_prefered = INFINITY_LIFE_TIME;
+ ci.ifa_valid = INFINITY_LIFE_TIME;
+
+ if (nla_put_in_addr(skb, IFA_MULTICAST, im->multiaddr) < 0 ||
+ nla_put(skb, IFA_CACHEINFO, sizeof(ci), &ci) < 0) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static void inet_ifmcaddr_notify(struct net_device *dev,
+ const struct ip_mc_list *im, int event)
+{
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(__be32)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet_fill_ifmcaddr(skb, dev, im, event);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV4_MCADDR, err);
+}
/*
* A socket has joined a multicast group on device dev.
@@ -1473,6 +1534,8 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
im->interface = in_dev;
in_dev_hold(in_dev);
im->multiaddr = addr;
+ im->mca_cstamp = jiffies;
+ im->mca_tstamp = im->mca_cstamp;
/* initial mode is (EX, empty) */
im->sfmode = mode;
im->sfcount[mode] = 1;
@@ -1492,6 +1555,7 @@ static void ____ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
igmpv3_del_delrec(in_dev, im);
#endif
igmp_group_added(im);
+ inet_ifmcaddr_notify(in_dev->dev, im, RTM_NEWMULTICAST);
if (!in_dev->dead)
ip_rt_multicast_event(in_dev);
out:
@@ -1705,6 +1769,8 @@ void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp)
*ip = i->next_rcu;
in_dev->mc_count--;
__igmp_group_dropped(i, gfp);
+ inet_ifmcaddr_notify(in_dev->dev, i,
+ RTM_DELMULTICAST);
ip_mc_clear_src(i);
if (!in_dev->dead)
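IPv4 multicast joins and leaves are now announced over rtnetlink as RTM_NEWMULTICAST / RTM_DELMULTICAST messages on the new RTNLGRP_IPV4_MCADDR group, each carrying an ifaddrmsg plus IFA_MULTICAST and IFA_CACHEINFO as built by inet_fill_ifmcaddr() above. A minimal user-space listener sketch — it assumes RTNLGRP_IPV4_MCADDR is exported by <linux/rtnetlink.h> on a kernel carrying this change, and it skips error handling:
#include <stdio.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>

#ifndef SOL_NETLINK
#define SOL_NETLINK 270
#endif

int main(void)
{
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	unsigned int grp = RTNLGRP_IPV4_MCADDR;	/* assumed uapi constant */
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	char buf[8192];

	bind(fd, (struct sockaddr *)&sa, sizeof(sa));
	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &grp, sizeof(grp));

	for (;;) {
		int len = recv(fd, buf, sizeof(buf), 0);
		struct nlmsghdr *nlh;

		for (nlh = (struct nlmsghdr *)buf; NLMSG_OK(nlh, len);
		     nlh = NLMSG_NEXT(nlh, len))
			printf("%s on ifindex %u\n",
			       nlh->nlmsg_type == RTM_NEWMULTICAST ? "join" : "leave",
			       ((struct ifaddrmsg *)NLMSG_DATA(nlh))->ifa_index);
	}
}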
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 6872b5aff73e..e4decfb270fa 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -1561,20 +1561,13 @@ EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr);
static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *fl)
{
const struct inet_sock *inet = inet_sk(sk);
- const struct ip_options_rcu *inet_opt;
- __be32 daddr = inet->inet_daddr;
struct flowi4 *fl4;
struct rtable *rt;
rcu_read_lock();
- inet_opt = rcu_dereference(inet->inet_opt);
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
fl4 = &fl->u.ip4;
- rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr,
- inet->inet_saddr, inet->inet_dport,
- inet->inet_sport, sk->sk_protocol,
- ip_sock_rt_tos(sk), sk->sk_bound_dev_if);
+ inet_sk_init_flowi4(inet, fl4);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (IS_ERR(rt))
rt = NULL;
if (rt)
diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c
index e02484f4d22b..b8b23a77ceb4 100644
--- a/net/ipv4/inetpeer.c
+++ b/net/ipv4/inetpeer.c
@@ -246,23 +246,27 @@ void inet_putpeer(struct inet_peer *p)
#define XRLIM_BURST_FACTOR 6
bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout)
{
- unsigned long now, token;
+ unsigned long now, token, otoken, delta;
bool rc = false;
if (!peer)
return true;
- token = peer->rate_tokens;
+ token = otoken = READ_ONCE(peer->rate_tokens);
now = jiffies;
- token += now - peer->rate_last;
- peer->rate_last = now;
- if (token > XRLIM_BURST_FACTOR * timeout)
- token = XRLIM_BURST_FACTOR * timeout;
+ delta = now - READ_ONCE(peer->rate_last);
+ if (delta) {
+ WRITE_ONCE(peer->rate_last, now);
+ token += delta;
+ if (token > XRLIM_BURST_FACTOR * timeout)
+ token = XRLIM_BURST_FACTOR * timeout;
+ }
if (token >= timeout) {
token -= timeout;
rc = true;
}
- peer->rate_tokens = token;
+ if (token != otoken)
+ WRITE_ONCE(peer->rate_tokens, token);
return rc;
}
EXPORT_SYMBOL(inet_peer_xrlim_allow);
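The behaviour is unchanged — a token bucket where tokens accrue with elapsed jiffies, are capped at XRLIM_BURST_FACTOR * timeout, and each allowed message costs timeout — but rate_last/rate_tokens are now accessed with READ_ONCE/WRITE_ONCE and only written when they actually change, so concurrent lockless callers no longer trample each other. A worked trace with timeout = HZ:
	/*
	 * long-idle peer:    token capped at 6*HZ
	 * 6 back-to-back:    each message costs HZ -> all allowed, token ~ 0
	 * 7th immediately:   token < HZ            -> rejected (rate limited)
	 * one second later:  delta = HZ refills    -> next message allowed
	 */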
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f1f31ebfc793..ed1b6b44faf8 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -924,15 +924,18 @@ static int ipgre_open(struct net_device *dev)
struct ip_tunnel *t = netdev_priv(dev);
if (ipv4_is_multicast(t->parms.iph.daddr)) {
- struct flowi4 fl4;
+ struct flowi4 fl4 = {
+ .flowi4_oif = t->parms.link,
+ .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)),
+ .flowi4_scope = RT_SCOPE_UNIVERSE,
+ .flowi4_proto = IPPROTO_GRE,
+ .saddr = t->parms.iph.saddr,
+ .daddr = t->parms.iph.daddr,
+ .fl4_gre_key = t->parms.o_key,
+ };
struct rtable *rt;
- rt = ip_route_output_gre(t->net, &fl4,
- t->parms.iph.daddr,
- t->parms.iph.saddr,
- t->parms.o_key,
- t->parms.iph.tos & INET_DSCP_MASK,
- t->parms.link);
+ rt = ip_route_output_key(t->net, &fl4);
if (IS_ERR(rt))
return -EADDRNOTAVAIL;
dev = rt->dst.dev;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index f0a4dda246ab..30a5e9460d00 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -314,7 +314,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
int tcp_v4_early_demux(struct sk_buff *skb);
int udp_v4_early_demux(struct sk_buff *skb);
-static int ip_rcv_finish_core(struct net *net, struct sock *sk,
+static int ip_rcv_finish_core(struct net *net,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
{
@@ -442,7 +442,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
if (!skb)
return NET_RX_SUCCESS;
- ret = ip_rcv_finish_core(net, sk, skb, dev, NULL);
+ ret = ip_rcv_finish_core(net, skb, dev, NULL);
if (ret != NET_RX_DROP)
ret = dst_input(skb);
return ret;
@@ -589,8 +589,7 @@ static struct sk_buff *ip_extract_route_hint(const struct net *net,
return skb;
}
-static void ip_list_rcv_finish(struct net *net, struct sock *sk,
- struct list_head *head)
+static void ip_list_rcv_finish(struct net *net, struct list_head *head)
{
struct sk_buff *skb, *next, *hint = NULL;
struct dst_entry *curr_dst = NULL;
@@ -607,7 +606,7 @@ static void ip_list_rcv_finish(struct net *net, struct sock *sk,
skb = l3mdev_ip_rcv(skb);
if (!skb)
continue;
- if (ip_rcv_finish_core(net, sk, skb, dev, hint) == NET_RX_DROP)
+ if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
continue;
dst = skb_dst(skb);
@@ -633,7 +632,7 @@ static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
{
NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
head, dev, NULL, ip_rcv_finish);
- ip_list_rcv_finish(net, NULL, head);
+ ip_list_rcv_finish(net, head);
}
/* Receive a list of IP packets */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 0065b1996c94..ea7a260bec8a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -478,24 +478,16 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
/* Make sure we can route this packet. */
rt = dst_rtable(__sk_dst_check(sk, 0));
if (!rt) {
- __be32 daddr;
+ inet_sk_init_flowi4(inet, fl4);
- /* Use correct destination address if we have options. */
- daddr = inet->inet_daddr;
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
+ /* sctp_v4_xmit() uses its own DSCP value */
+ fl4->flowi4_tos = tos & INET_DSCP_MASK;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(net, fl4, sk,
- daddr, inet->inet_saddr,
- inet->inet_dport,
- inet->inet_sport,
- sk->sk_protocol,
- tos & INET_DSCP_MASK,
- sk->sk_bound_dev_if);
+ rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt))
goto no_route;
sk_setup_caps(sk, &rt->dst);
@@ -1169,7 +1161,10 @@ alloc_new_skb:
/* [!] NOTE: copy will be negative if pagedlen>0
* because then the equation reduces to -fraggap.
*/
- if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
+ if (copy > 0 &&
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
@@ -1213,8 +1208,9 @@ alloc_new_skb:
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
@@ -1252,7 +1248,8 @@ alloc_new_skb:
get_page(pfrag->page);
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (getfrag(from,
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
page_address(pfrag->page) + pfrag->offset,
offset, copy, skb->len, skb) < 0)
goto error_efault;
@@ -1328,7 +1325,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
cork->ttl = ipc->ttl;
cork->tos = ipc->tos;
cork->mark = ipc->sockc.mark;
- cork->priority = ipc->priority;
+ cork->priority = ipc->sockc.priority;
cork->transmit_time = ipc->sockc.transmit_time;
cork->tx_flags = 0;
sock_tx_timestamp(sk, &ipc->sockc, &cork->tx_flags);
@@ -1465,7 +1462,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
ip_options_build(skb, opt, cork->addr, rt);
}
- skb->priority = (cork->tos != -1) ? cork->priority: READ_ONCE(sk->sk_priority);
+ skb->priority = cork->priority;
skb->mark = cork->mark;
if (sk_is_tcp(sk))
skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index cf377377b52d..6d9c5c20b1c4 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -128,20 +128,20 @@ static void ip_cmsg_recv_checksum(struct msghdr *msg, struct sk_buff *skb,
static void ip_cmsg_recv_security(struct msghdr *msg, struct sk_buff *skb)
{
- char *secdata;
- u32 seclen, secid;
+ struct lsm_context ctx;
+ u32 secid;
int err;
err = security_socket_getpeersec_dgram(NULL, skb, &secid);
if (err)
return;
- err = security_secid_to_secctx(secid, &secdata, &seclen);
- if (err)
+ err = security_secid_to_secctx(secid, &ctx);
+ if (err < 0)
return;
- put_cmsg(msg, SOL_IP, SCM_SECURITY, seclen, secdata);
- security_release_secctx(secdata, seclen);
+ put_cmsg(msg, SOL_IP, SCM_SECURITY, ctx.len, ctx.context);
+ security_release_secctx(&ctx);
}
static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
@@ -315,7 +315,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc,
if (val < 0 || val > 255)
return -EINVAL;
ipc->tos = val;
- ipc->priority = rt_tos2priority(ipc->tos);
+ ipc->sockc.priority = rt_tos2priority(ipc->tos);
break;
case IP_PROTOCOL:
if (cmsg->cmsg_len != CMSG_LEN(sizeof(int)))
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index a3676155be78..f65d2f727381 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -416,7 +416,7 @@ int skb_tunnel_check_pmtu(struct sk_buff *skb, struct dst_entry *encap_dst,
skb_dst_update_pmtu_no_confirm(skb, mtu);
- if (!reply || skb->pkt_type == PACKET_HOST)
+ if (!reply)
return 0;
if (skb->protocol == htons(ETH_P_IP))
@@ -451,7 +451,7 @@ static const struct nla_policy
geneve_opt_policy[LWTUNNEL_IP_OPT_GENEVE_MAX + 1] = {
[LWTUNNEL_IP_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
[LWTUNNEL_IP_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
- [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+ [LWTUNNEL_IP_OPT_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 },
};
static const struct nla_policy
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 40053a02bae1..affd21a0f572 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -189,6 +189,7 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED),
SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED),
SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED),
+ SNMP_MIB_ITEM("PAWSOldAck", LINUX_MIB_PAWS_OLD_ACK),
SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS),
SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED),
SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST),
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 0e9e01967ec9..4304a68d1db0 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -358,7 +358,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
skb_reserve(skb, hlen);
skb->protocol = htons(ETH_P_IP);
- skb->priority = READ_ONCE(sk->sk_priority);
+ skb->priority = sockc->priority;
skb->mark = sockc->mark;
skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
skb_dst_set(skb, &rt->dst);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index cf84704af25c..753704f75b2c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3282,6 +3282,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct flowi4 fl4 = {};
__be32 dst = 0;
__be32 src = 0;
+ dscp_t dscp;
kuid_t uid;
u32 iif;
int err;
@@ -3296,6 +3297,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
dst = nla_get_in_addr_default(tb[RTA_DST], 0);
iif = nla_get_u32_default(tb[RTA_IIF], 0);
mark = nla_get_u32_default(tb[RTA_MARK], 0);
+ dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
if (tb[RTA_UID])
uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
else
@@ -3320,7 +3322,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = rtm->rtm_tos & INET_DSCP_MASK;
+ fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
@@ -3344,9 +3346,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.flowi4_iif = iif; /* for rt_fill_info */
skb->dev = dev;
skb->mark = mark;
- err = ip_route_input_rcu(skb, dst, src,
- inet_dsfield_to_dscp(rtm->rtm_tos),
- dev, &res) ? -EINVAL : 0;
+ err = ip_route_input_rcu(skb, dst, src, dscp, dev,
+ &res) ? -EINVAL : 0;
rt = skb_rtable(skb);
if (err == 0 && rt->dst.error)
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a79b2a52ce01..42cb5dc9cb24 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,6 +45,7 @@ static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024;
static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
+static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -1066,6 +1067,15 @@ static struct ctl_table ipv4_net_table[] = {
.extra2 = SYSCTL_TWO,
},
{
+ .procname = "tcp_tw_reuse_delay",
+ .data = &init_net.ipv4.sysctl_tcp_tw_reuse_delay,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = &tcp_tw_reuse_delay_max,
+ },
+ {
.procname = "tcp_max_syn_backlog",
.data = &init_net.ipv4.sysctl_max_syn_backlog,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d74281eca14f..57df7c1d2faa 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1565,12 +1565,13 @@ EXPORT_SYMBOL(tcp_recv_skb);
* or for 'peeking' the socket using this routine
* (although both would be easy to implement).
*/
-int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
- sk_read_actor_t recv_actor)
+static int __tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
{
struct sk_buff *skb;
struct tcp_sock *tp = tcp_sk(sk);
- u32 seq = tp->copied_seq;
+ u32 seq = *copied_seq;
u32 offset;
int copied = 0;
@@ -1624,9 +1625,12 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_eat_recv_skb(sk, skb);
if (!desc->count)
break;
- WRITE_ONCE(tp->copied_seq, seq);
+ WRITE_ONCE(*copied_seq, seq);
}
- WRITE_ONCE(tp->copied_seq, seq);
+ WRITE_ONCE(*copied_seq, seq);
+
+ if (noack)
+ goto out;
tcp_rcv_space_adjust(sk);
@@ -1635,10 +1639,25 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
tcp_recv_skb(sk, seq, &offset);
tcp_cleanup_rbuf(sk, copied);
}
+out:
return copied;
}
+
+int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, false,
+ &tcp_sk(sk)->copied_seq);
+}
EXPORT_SYMBOL(tcp_read_sock);
+int tcp_read_sock_noack(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack,
+ u32 *copied_seq)
+{
+ return __tcp_read_sock(sk, desc, recv_actor, noack, copied_seq);
+}
+
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
struct sk_buff *skb;
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 47f65b1b70ca..ba581785adb4 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -646,6 +646,42 @@ static int tcp_bpf_assert_proto_ops(struct proto *ops)
ops->sendmsg == tcp_sendmsg ? 0 : -ENOTSUPP;
}
+#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER)
+int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ struct sock *sk = strp->sk;
+ struct sk_psock *psock;
+ struct tcp_sock *tp;
+ int copied = 0;
+
+ tp = tcp_sk(sk);
+ rcu_read_lock();
+ psock = sk_psock(sk);
+ if (WARN_ON_ONCE(!psock)) {
+ desc->error = -EINVAL;
+ goto out;
+ }
+
+ psock->ingress_bytes = 0;
+ copied = tcp_read_sock_noack(sk, desc, recv_actor, true,
+ &psock->copied_seq);
+ if (copied < 0)
+ goto out;
+ /* recv_actor may redirect skb to another socket (SK_REDIRECT) or
+ * just put skb into ingress queue of current socket (SK_PASS).
+ * For SK_REDIRECT, we need to ack the frame immediately but for
+ * SK_PASS, we want to delay the ack until tcp_bpf_recvmsg_parser().
+ */
+ tp->copied_seq = psock->copied_seq - psock->ingress_bytes;
+ tcp_rcv_space_adjust(sk);
+ __tcp_cleanup_rbuf(sk, copied - psock->ingress_bytes);
+out:
+ rcu_read_unlock();
+ return copied;
+}
+#endif /* CONFIG_BPF_STREAM_PARSER */
+
int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
{
int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 0ee22e10fcfa..0cbf81bf3d45 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4456,34 +4456,40 @@ static u32 tcp_tsval_replay(const struct sock *sk)
return inet_csk(sk)->icsk_rto * 1200 / HZ;
}
-static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb)
+static enum skb_drop_reason tcp_disordered_ack_check(const struct sock *sk,
+ const struct sk_buff *skb)
{
const struct tcp_sock *tp = tcp_sk(sk);
const struct tcphdr *th = tcp_hdr(skb);
- u32 seq = TCP_SKB_CB(skb)->seq;
+ SKB_DR_INIT(reason, TCP_RFC7323_PAWS);
u32 ack = TCP_SKB_CB(skb)->ack_seq;
+ u32 seq = TCP_SKB_CB(skb)->seq;
- return /* 1. Pure ACK with correct sequence number. */
- (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) &&
+ /* 1. Is this not a pure ACK ? */
+ if (!th->ack || seq != TCP_SKB_CB(skb)->end_seq)
+ return reason;
- /* 2. ... and duplicate ACK. */
- ack == tp->snd_una &&
+ /* 2. Is its sequence not the expected one ? */
+ if (seq != tp->rcv_nxt)
+ return before(seq, tp->rcv_nxt) ?
+ SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK :
+ reason;
- /* 3. ... and does not update window. */
- !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) &&
+ /* 3. Is this not a duplicate ACK ? */
+ if (ack != tp->snd_una)
+ return reason;
- /* 4. ... and sits in replay window. */
- (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <=
- tcp_tsval_replay(sk);
-}
+ /* 4. Is this updating the window ? */
+ if (tcp_may_update_window(tp, ack, seq, ntohs(th->window) <<
+ tp->rx_opt.snd_wscale))
+ return reason;
-static inline bool tcp_paws_discard(const struct sock *sk,
- const struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
+ /* 5. Is this not in the replay window ? */
+ if ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) >
+ tcp_tsval_replay(sk))
+ return reason;
- return !tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW) &&
- !tcp_disordered_ack(sk, skb);
+ return 0;
}
/* Check segment sequence number for validity.
@@ -5955,23 +5961,35 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
- if (tcp_fast_parse_options(sock_net(sk), skb, th, tp) &&
- tp->rx_opt.saw_tstamp &&
- tcp_paws_discard(sk, skb)) {
- if (!th->rst) {
- if (unlikely(th->syn))
- goto syn_challenge;
- NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
- if (!tcp_oow_rate_limited(sock_net(sk), skb,
- LINUX_MIB_TCPACKSKIPPEDPAWS,
- &tp->last_oow_ack_time))
- tcp_send_dupack(sk, skb);
- SKB_DR_SET(reason, TCP_RFC7323_PAWS);
- goto discard;
- }
- /* Reset is accepted even if it did not pass PAWS. */
+ if (!tcp_fast_parse_options(sock_net(sk), skb, th, tp) ||
+ !tp->rx_opt.saw_tstamp ||
+ tcp_paws_check(&tp->rx_opt, TCP_PAWS_WINDOW))
+ goto step1;
+
+ reason = tcp_disordered_ack_check(sk, skb);
+ if (!reason)
+ goto step1;
+ /* Reset is accepted even if it did not pass PAWS. */
+ if (th->rst)
+ goto step1;
+ if (unlikely(th->syn))
+ goto syn_challenge;
+
+ /* Old ACKs are common, increment PAWS_OLD_ACK
+ * and do not send a dupack.
+ */
+ if (reason == SKB_DROP_REASON_TCP_RFC7323_PAWS_ACK) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWS_OLD_ACK);
+ goto discard;
}
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED);
+ if (!tcp_oow_rate_limited(sock_net(sk), skb,
+ LINUX_MIB_TCPACKSKIPPEDPAWS,
+ &tp->last_oow_ack_time))
+ tcp_send_dupack(sk, skb);
+ goto discard;
+step1:
/* Step 1: check sequence number */
reason = tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq);
if (reason) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 96d68f9b1bb9..2632844d2c35 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -120,6 +120,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
struct tcp_sock *tp = tcp_sk(sk);
int ts_recent_stamp;
+ u32 reuse_thresh;
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2)
reuse = 0;
@@ -162,9 +163,10 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
and use initial timestamp retrieved from peer table.
*/
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
+ reuse_thresh = READ_ONCE(tw->tw_entry_stamp) +
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse_delay);
if (ts_recent_stamp &&
- (!twp || (reuse && time_after32(ktime_get_seconds(),
- ts_recent_stamp)))) {
+ (!twp || (reuse && time_after32(tcp_clock_ms(), reuse_thresh)))) {
/* inet_twsk_hashdance_schedule() sets sk_refcnt after putting twsk
* and releasing the bucket lock.
*/
@@ -3457,6 +3459,7 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
net->ipv4.sysctl_tcp_tw_reuse = 2;
+ net->ipv4.sysctl_tcp_tw_reuse_delay = 1 * MSEC_PER_SEC;
net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 789e495d3bd6..dfdb7a4608a8 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -157,8 +157,11 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
rcv_nxt);
if (tmp_opt.saw_tstamp) {
+ u64 ts = tcp_clock_ms();
+
+ WRITE_ONCE(tw->tw_entry_stamp, ts);
WRITE_ONCE(tcptw->tw_ts_recent_stamp,
- ktime_get_seconds());
+ div_u64(ts, MSEC_PER_SEC));
WRITE_ONCE(tcptw->tw_ts_recent,
tmp_opt.rcv_tsval);
}
@@ -316,6 +319,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
tw->tw_mark = sk->sk_mark;
tw->tw_priority = READ_ONCE(sk->sk_priority);
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
+ /* refreshed when we enter true TIME-WAIT state */
+ tw->tw_entry_stamp = tcp_time_stamp_ms(tp);
tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt;
tcptw->tw_rcv_wnd = tcp_receive_window(tp);
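Together with the sysctl and tcp_ipv4.c hunks above, TIME-WAIT reuse is now gated on a millisecond clock instead of second-granularity timestamps: tw_entry_stamp is recorded when the socket enters TIME-WAIT (and refreshed when it reaches true TIME-WAIT), and a colliding connect() may only recycle the tuple once net.ipv4.tcp_tw_reuse_delay milliseconds (default 1000, at most TCP_PAWS_MSL * MSEC_PER_SEC) have elapsed since then. In sketch form:
	/*
	 * reuse_thresh = tw_entry_stamp + sysctl_tcp_tw_reuse_delay   (both in ms)
	 * reuse is allowed only if the usual tcp_tw_reuse/timestamp checks hold
	 * and time_after32(tcp_clock_ms(), reuse_thresh) is true.
	 */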
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index a9bb9ce5438e..3fe85ecec236 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1626,12 +1626,12 @@ static bool udp_skb_has_head_state(struct sk_buff *skb)
}
/* fully reclaim rmem/fwd memory allocated for skb */
-static void udp_rmem_release(struct sock *sk, int size, int partial,
- bool rx_queue_lock_held)
+static void udp_rmem_release(struct sock *sk, unsigned int size,
+ int partial, bool rx_queue_lock_held)
{
struct udp_sock *up = udp_sk(sk);
struct sk_buff_head *sk_queue;
- int amt;
+ unsigned int amt;
if (likely(partial)) {
up->forward_deficit += size;
@@ -1651,10 +1651,8 @@ static void udp_rmem_release(struct sock *sk, int size, int partial,
if (!rx_queue_lock_held)
spin_lock(&sk_queue->lock);
-
- sk_forward_alloc_add(sk, size);
- amt = (sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
- sk_forward_alloc_add(sk, -amt);
+ amt = (size + sk->sk_forward_alloc - partial) & ~(PAGE_SIZE - 1);
+ sk_forward_alloc_add(sk, size - amt);
if (amt)
__sk_mem_reduce_allocated(sk, amt >> PAGE_SHIFT);
@@ -1726,17 +1724,25 @@ static int udp_rmem_schedule(struct sock *sk, int size)
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff_head *list = &sk->sk_receive_queue;
- int rmem, err = -ENOMEM;
+ unsigned int rmem, rcvbuf;
spinlock_t *busy = NULL;
- int size, rcvbuf;
+ int size, err = -ENOMEM;
- /* Immediately drop when the receive queue is full.
- * Always allow at least one packet.
- */
rmem = atomic_read(&sk->sk_rmem_alloc);
rcvbuf = READ_ONCE(sk->sk_rcvbuf);
- if (rmem > rcvbuf)
- goto drop;
+ size = skb->truesize;
+
+ /* Immediately drop when the receive queue is full.
+ * Cast to unsigned int performs the boundary check for INT_MAX.
+ */
+ if (rmem + size > rcvbuf) {
+ if (rcvbuf > INT_MAX >> 1)
+ goto drop;
+
+ /* Always allow at least one packet for small buffers. */
+ if (rmem > rcvbuf)
+ goto drop;
+ }
/* Under mem pressure, it might be helpful to help udp_recvmsg()
* having linear skbs :
@@ -1746,10 +1752,10 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
*/
if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
-
+ size = skb->truesize;
busy = busylock_acquire(sk);
}
- size = skb->truesize;
+
udp_set_dev_scratch(skb);
atomic_add(size, &sk->sk_rmem_alloc);
@@ -1836,7 +1842,7 @@ EXPORT_SYMBOL_GPL(skb_consume_udp);
static struct sk_buff *__first_packet_length(struct sock *sk,
struct sk_buff_head *rcvq,
- int *total)
+ unsigned int *total)
{
struct sk_buff *skb;
@@ -1869,8 +1875,8 @@ static int first_packet_length(struct sock *sk)
{
struct sk_buff_head *rcvq = &udp_sk(sk)->reader_queue;
struct sk_buff_head *sk_queue = &sk->sk_receive_queue;
+ unsigned int total = 0;
struct sk_buff *skb;
- int total = 0;
int res;
spin_lock_bh(&rcvq->lock);
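The accounting in __udp_enqueue_schedule_skb() now does its admission check in unsigned arithmetic and re-reads skb->truesize after skb_condense(), so sk_rmem_alloc can no longer creep past INT_MAX. Two illustrative cases:
	/*
	 * rcvbuf = 256 KiB, rmem = 255 KiB, truesize = 4 KiB:
	 *   rmem + size > rcvbuf, but rcvbuf <= INT_MAX/2 and rmem <= rcvbuf,
	 *   so this one packet is still queued (the "at least one packet"
	 *   behaviour is kept for small buffers).
	 *
	 * rcvbuf near INT_MAX (> INT_MAX/2), rmem close to rcvbuf:
	 *   any packet that would overshoot is dropped immediately, so the
	 *   unsigned sum can never wrap past INT_MAX.
	 */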
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 0e765466d7f7..54a8ea004da2 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -852,7 +852,7 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
struct inet6_dev *idev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
int changed = (!idev->cnf.forwarding) ^ (!newf);
@@ -865,13 +865,12 @@ static void addrconf_forward_change(struct net *net, __s32 newf)
static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int newf)
{
- struct net *net;
+ struct net *net = (struct net *)table->extra2;
int old;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
- net = (struct net *)table->extra2;
old = *p;
WRITE_ONCE(*p, newf);
@@ -881,7 +880,7 @@ static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int
NETCONFA_FORWARDING,
NETCONFA_IFINDEX_DEFAULT,
net->ipv6.devconf_dflt);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
@@ -903,7 +902,7 @@ static int addrconf_fixup_forwarding(const struct ctl_table *table, int *p, int
net->ipv6.devconf_all);
} else if ((!newf) ^ (!old))
dev_forward_change((struct inet6_dev *)table->extra1);
- rtnl_unlock();
+ rtnl_net_unlock(net);
if (newf)
rt6_purge_dflt_routers(net);
@@ -916,7 +915,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf)
struct inet6_dev *idev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
int changed = (!idev->cnf.ignore_routes_with_linkdown) ^ (!newf);
@@ -933,13 +932,12 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf)
static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int newf)
{
- struct net *net;
+ struct net *net = (struct net *)table->extra2;
int old;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
- net = (struct net *)table->extra2;
old = *p;
WRITE_ONCE(*p, newf);
@@ -950,7 +948,7 @@ static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int ne
NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
NETCONFA_IFINDEX_DEFAULT,
net->ipv6.devconf_dflt);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
@@ -964,7 +962,8 @@ static int addrconf_fixup_linkdown(const struct ctl_table *table, int *p, int ne
NETCONFA_IFINDEX_ALL,
net->ipv6.devconf_all);
}
- rtnl_unlock();
+
+ rtnl_net_unlock(net);
return 1;
}
@@ -2980,11 +2979,11 @@ int addrconf_set_dstaddr(struct net *net, void __user *arg)
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
return -EFAULT;
- rtnl_lock();
+ rtnl_net_lock(net);
dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
if (dev && dev->type == ARPHRD_SIT)
err = addrconf_set_sit_dstaddr(net, dev, &ireq);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return err;
}
@@ -3008,39 +3007,25 @@ static int ipv6_mc_config(struct sock *sk, bool join,
/*
* Manual configuration of address on an interface
*/
-static int inet6_addr_add(struct net *net, int ifindex,
- struct ifa6_config *cfg,
+static int inet6_addr_add(struct net *net, struct net_device *dev,
+ struct ifa6_config *cfg, clock_t expires, u32 flags,
struct netlink_ext_ack *extack)
{
struct inet6_ifaddr *ifp;
struct inet6_dev *idev;
- struct net_device *dev;
- unsigned long timeout;
- clock_t expires;
- u32 flags;
- ASSERT_RTNL();
+ ASSERT_RTNL_NET(net);
if (cfg->plen > 128) {
NL_SET_ERR_MSG_MOD(extack, "Invalid prefix length");
return -EINVAL;
}
- /* check the lifetime */
- if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft) {
- NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid");
- return -EINVAL;
- }
-
if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR && cfg->plen != 64) {
NL_SET_ERR_MSG_MOD(extack, "address with \"mngtmpaddr\" flag must have a prefix length of 64");
return -EINVAL;
}
- dev = __dev_get_by_index(net, ifindex);
- if (!dev)
- return -ENODEV;
-
idev = addrconf_add_dev(dev);
if (IS_ERR(idev)) {
NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
@@ -3049,7 +3034,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
int ret = ipv6_mc_config(net->ipv6.mc_autojoin_sk,
- true, cfg->pfx, ifindex);
+ true, cfg->pfx, dev->ifindex);
if (ret < 0) {
NL_SET_ERR_MSG_MOD(extack, "Multicast auto join failed");
@@ -3059,24 +3044,6 @@ static int inet6_addr_add(struct net *net, int ifindex,
cfg->scope = ipv6_addr_scope(cfg->pfx);
- timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- expires = jiffies_to_clock_t(timeout * HZ);
- cfg->valid_lft = timeout;
- flags = RTF_EXPIRES;
- } else {
- expires = 0;
- flags = 0;
- cfg->ifa_flags |= IFA_F_PERMANENT;
- }
-
- timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- if (timeout == 0)
- cfg->ifa_flags |= IFA_F_DEPRECATED;
- cfg->preferred_lft = timeout;
- }
-
ifp = ipv6_add_addr(idev, cfg, true, extack);
if (!IS_ERR(ifp)) {
if (!(cfg->ifa_flags & IFA_F_NOPREFIXROUTE)) {
@@ -3104,7 +3071,7 @@ static int inet6_addr_add(struct net *net, int ifindex,
return 0;
} else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) {
ipv6_mc_config(net->ipv6.mc_autojoin_sk, false,
- cfg->pfx, ifindex);
+ cfg->pfx, dev->ifindex);
}
return PTR_ERR(ifp);
@@ -3129,7 +3096,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags,
return -ENODEV;
}
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (!idev) {
NL_SET_ERR_MSG_MOD(extack, "IPv6 is disabled on this device");
return -ENXIO;
@@ -3170,6 +3137,7 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
.preferred_lft = INFINITY_LIFE_TIME,
.valid_lft = INFINITY_LIFE_TIME,
};
+ struct net_device *dev;
struct in6_ifreq ireq;
int err;
@@ -3182,9 +3150,13 @@ int addrconf_add_ifaddr(struct net *net, void __user *arg)
cfg.pfx = &ireq.ifr6_addr;
cfg.plen = ireq.ifr6_prefixlen;
- rtnl_lock();
- err = inet6_addr_add(net, ireq.ifr6_ifindex, &cfg, NULL);
- rtnl_unlock();
+ rtnl_net_lock(net);
+ dev = __dev_get_by_index(net, ireq.ifr6_ifindex);
+ if (dev)
+ err = inet6_addr_add(net, dev, &cfg, 0, 0, NULL);
+ else
+ err = -ENODEV;
+ rtnl_net_unlock(net);
return err;
}
@@ -3199,10 +3171,10 @@ int addrconf_del_ifaddr(struct net *net, void __user *arg)
if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq)))
return -EFAULT;
- rtnl_lock();
+ rtnl_net_lock(net);
err = inet6_addr_del(net, ireq.ifr6_ifindex, 0, &ireq.ifr6_addr,
ireq.ifr6_prefixlen, NULL);
- rtnl_unlock();
+ rtnl_net_unlock(net);
return err;
}
@@ -4205,6 +4177,7 @@ static void addrconf_dad_work(struct work_struct *w)
struct inet6_dev *idev = ifp->idev;
bool bump_id, disable_ipv6 = false;
struct in6_addr mcaddr;
+ struct net *net;
enum {
DAD_PROCESS,
@@ -4212,7 +4185,9 @@ static void addrconf_dad_work(struct work_struct *w)
DAD_ABORT,
} action = DAD_PROCESS;
- rtnl_lock();
+ net = dev_net(idev->dev);
+
+ rtnl_net_lock(net);
spin_lock_bh(&ifp->lock);
if (ifp->state == INET6_IFADDR_STATE_PREDAD) {
@@ -4222,7 +4197,7 @@ static void addrconf_dad_work(struct work_struct *w)
action = DAD_ABORT;
ifp->state = INET6_IFADDR_STATE_POSTDAD;
- if ((READ_ONCE(dev_net(idev->dev)->ipv6.devconf_all->accept_dad) > 1 ||
+ if ((READ_ONCE(net->ipv6.devconf_all->accept_dad) > 1 ||
READ_ONCE(idev->cnf.accept_dad) > 1) &&
!idev->cnf.disable_ipv6 &&
!(ifp->flags & IFA_F_STABLE_PRIVACY)) {
@@ -4304,7 +4279,7 @@ static void addrconf_dad_work(struct work_struct *w)
ifp->dad_nonce);
out:
in6_ifa_put(ifp);
- rtnl_unlock();
+ rtnl_net_unlock(net);
}
/* ifp->idev must be at least read locked */
@@ -4752,9 +4727,9 @@ static void addrconf_verify_work(struct work_struct *w)
struct net *net = container_of(to_delayed_work(w), struct net,
ipv6.addr_chk_work);
- rtnl_lock();
+ rtnl_net_lock(net);
addrconf_verify_rtnl(net);
- rtnl_unlock();
+ rtnl_net_unlock(net);
}
static void addrconf_verify(struct net *net)
@@ -4817,8 +4792,12 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh,
/* We ignore other flags so far. */
ifa_flags &= IFA_F_MANAGETEMPADDR;
- return inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
- ifm->ifa_prefixlen, extack);
+ rtnl_net_lock(net);
+ err = inet6_addr_del(net, ifm->ifa_index, ifa_flags, pfx,
+ ifm->ifa_prefixlen, extack);
+ rtnl_net_unlock(net);
+
+ return err;
}
static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp,
@@ -4867,19 +4846,14 @@ static int modify_prefix_route(struct net *net, struct inet6_ifaddr *ifp,
}
static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
- struct ifa6_config *cfg)
+ struct ifa6_config *cfg, clock_t expires,
+ u32 flags)
{
- u32 flags;
- clock_t expires;
- unsigned long timeout;
bool was_managetempaddr;
- bool had_prefixroute;
bool new_peer = false;
+ bool had_prefixroute;
- ASSERT_RTNL();
-
- if (!cfg->valid_lft || cfg->preferred_lft > cfg->valid_lft)
- return -EINVAL;
+ ASSERT_RTNL_NET(net);
if (cfg->ifa_flags & IFA_F_MANAGETEMPADDR &&
(ifp->flags & IFA_F_TEMPORARY || ifp->prefix_len != 64))
@@ -4888,24 +4862,6 @@ static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp,
if (!(ifp->flags & IFA_F_TENTATIVE) || ifp->flags & IFA_F_DADFAILED)
cfg->ifa_flags &= ~IFA_F_OPTIMISTIC;
- timeout = addrconf_timeout_fixup(cfg->valid_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- expires = jiffies_to_clock_t(timeout * HZ);
- cfg->valid_lft = timeout;
- flags = RTF_EXPIRES;
- } else {
- expires = 0;
- flags = 0;
- cfg->ifa_flags |= IFA_F_PERMANENT;
- }
-
- timeout = addrconf_timeout_fixup(cfg->preferred_lft, HZ);
- if (addrconf_finite_timeout(timeout)) {
- if (timeout == 0)
- cfg->ifa_flags |= IFA_F_DEPRECATED;
- cfg->preferred_lft = timeout;
- }
-
if (cfg->peer_pfx &&
memcmp(&ifp->peer_addr, cfg->peer_pfx, sizeof(struct in6_addr))) {
if (!ipv6_addr_any(&ifp->peer_addr))
@@ -4990,13 +4946,16 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
- struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
struct in6_addr *peer_pfx;
struct inet6_ifaddr *ifa;
struct net_device *dev;
struct inet6_dev *idev;
struct ifa6_config cfg;
+ struct ifaddrmsg *ifm;
+ unsigned long timeout;
+ clock_t expires;
+ u32 flags;
int err;
err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFA_MAX,
@@ -5019,8 +4978,18 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[IFA_PROTO])
cfg.ifa_proto = nla_get_u8(tb[IFA_PROTO]);
+ cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
+
+ /* We ignore other flags so far. */
+ cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
+ IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
+ IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
+
+ cfg.ifa_flags |= IFA_F_PERMANENT;
cfg.valid_lft = INFINITY_LIFE_TIME;
cfg.preferred_lft = INFINITY_LIFE_TIME;
+ expires = 0;
+ flags = 0;
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
@@ -5028,24 +4997,43 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
ci = nla_data(tb[IFA_CACHEINFO]);
cfg.valid_lft = ci->ifa_valid;
cfg.preferred_lft = ci->ifa_prefered;
+
+ if (!cfg.valid_lft || cfg.preferred_lft > cfg.valid_lft) {
+ NL_SET_ERR_MSG_MOD(extack, "address lifetime invalid");
+ return -EINVAL;
+ }
+
+ timeout = addrconf_timeout_fixup(cfg.valid_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ cfg.ifa_flags &= ~IFA_F_PERMANENT;
+ cfg.valid_lft = timeout;
+ expires = jiffies_to_clock_t(timeout * HZ);
+ flags = RTF_EXPIRES;
+ }
+
+ timeout = addrconf_timeout_fixup(cfg.preferred_lft, HZ);
+ if (addrconf_finite_timeout(timeout)) {
+ if (timeout == 0)
+ cfg.ifa_flags |= IFA_F_DEPRECATED;
+
+ cfg.preferred_lft = timeout;
+ }
}
+ rtnl_net_lock(net);
+
dev = __dev_get_by_index(net, ifm->ifa_index);
if (!dev) {
NL_SET_ERR_MSG_MOD(extack, "Unable to find the interface");
- return -ENODEV;
+ err = -ENODEV;
+ goto unlock;
}
- cfg.ifa_flags = nla_get_u32_default(tb[IFA_FLAGS], ifm->ifa_flags);
-
- /* We ignore other flags so far. */
- cfg.ifa_flags &= IFA_F_NODAD | IFA_F_HOMEADDRESS |
- IFA_F_MANAGETEMPADDR | IFA_F_NOPREFIXROUTE |
- IFA_F_MCAUTOJOIN | IFA_F_OPTIMISTIC;
-
idev = ipv6_find_idev(dev);
- if (IS_ERR(idev))
- return PTR_ERR(idev);
+ if (IS_ERR(idev)) {
+ err = PTR_ERR(idev);
+ goto unlock;
+ }
if (!ipv6_allow_optimistic_dad(net, idev))
cfg.ifa_flags &= ~IFA_F_OPTIMISTIC;
@@ -5053,7 +5041,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
if (cfg.ifa_flags & IFA_F_NODAD &&
cfg.ifa_flags & IFA_F_OPTIMISTIC) {
NL_SET_ERR_MSG(extack, "IFA_F_NODAD and IFA_F_OPTIMISTIC are mutually exclusive");
- return -EINVAL;
+ err = -EINVAL;
+ goto unlock;
}
ifa = ipv6_get_ifaddr(net, cfg.pfx, dev, 1);
@@ -5062,7 +5051,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
* It would be best to check for !NLM_F_CREATE here but
* userspace already relies on not having to provide this.
*/
- return inet6_addr_add(net, ifm->ifa_index, &cfg, extack);
+ err = inet6_addr_add(net, dev, &cfg, expires, flags, extack);
+ goto unlock;
}
if (nlh->nlmsg_flags & NLM_F_EXCL ||
@@ -5070,10 +5060,12 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
NL_SET_ERR_MSG_MOD(extack, "address already assigned");
err = -EEXIST;
} else {
- err = inet6_addr_modify(net, ifa, &cfg);
+ err = inet6_addr_modify(net, ifa, &cfg, expires, flags);
}
in6_ifa_put(ifa);
+unlock:
+ rtnl_net_unlock(net);
return err;
}
@@ -5127,22 +5119,6 @@ static inline int inet6_ifaddr_msgsize(void)
+ nla_total_size(4) /* IFA_RT_PRIORITY */;
}
-enum addr_type_t {
- UNICAST_ADDR,
- MULTICAST_ADDR,
- ANYCAST_ADDR,
-};
-
-struct inet6_fill_args {
- u32 portid;
- u32 seq;
- int event;
- unsigned int flags;
- int netnsid;
- int ifindex;
- enum addr_type_t type;
-};
-
static int inet6_fill_ifaddr(struct sk_buff *skb,
const struct inet6_ifaddr *ifa,
struct inet6_fill_args *args)
@@ -5221,15 +5197,16 @@ error:
return -EMSGSIZE;
}
-static int inet6_fill_ifmcaddr(struct sk_buff *skb,
- const struct ifmcaddr6 *ifmca,
- struct inet6_fill_args *args)
+int inet6_fill_ifmcaddr(struct sk_buff *skb,
+ const struct ifmcaddr6 *ifmca,
+ struct inet6_fill_args *args)
{
int ifindex = ifmca->idev->dev->ifindex;
u8 scope = RT_SCOPE_UNIVERSE;
struct nlmsghdr *nlh;
- if (ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
+ if (!args->force_rt_scope_universe &&
+ ipv6_addr_scope(&ifmca->mca_addr) & IFA_SITE)
scope = RT_SCOPE_SITE;
nlh = nlmsg_put(skb, args->portid, args->seq, args->event,
@@ -5255,9 +5232,9 @@ static int inet6_fill_ifmcaddr(struct sk_buff *skb,
return 0;
}
-static int inet6_fill_ifacaddr(struct sk_buff *skb,
- const struct ifacaddr6 *ifaca,
- struct inet6_fill_args *args)
+int inet6_fill_ifacaddr(struct sk_buff *skb,
+ const struct ifacaddr6 *ifaca,
+ struct inet6_fill_args *args)
{
struct net_device *dev = fib6_info_nh_dev(ifaca->aca_rt);
int ifindex = dev ? dev->ifindex : 1;
@@ -5418,6 +5395,7 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb,
.flags = NLM_F_MULTI,
.netnsid = -1,
.type = type,
+ .force_rt_scope_universe = false,
};
struct {
unsigned long ifindex;
@@ -5546,6 +5524,7 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
.event = RTM_NEWADDR,
.flags = 0,
.netnsid = -1,
+ .force_rt_scope_universe = false,
};
struct ifaddrmsg *ifm;
struct nlattr *tb[IFA_MAX+1];
@@ -5617,6 +5596,7 @@ static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa)
.event = event,
.flags = 0,
.netnsid = -1,
+ .force_rt_scope_universe = false,
};
int err = -ENOBUFS;
@@ -5804,6 +5784,27 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int attrtype,
}
}
+static int inet6_fill_ifla6_stats_attrs(struct sk_buff *skb,
+ struct inet6_dev *idev)
+{
+ struct nlattr *nla;
+
+ nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
+
+ nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
+ if (!nla)
+ goto nla_put_failure;
+ snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
u32 ext_filter_mask)
{
@@ -5826,18 +5827,10 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
/* XXX - MC not implemented */
- if (ext_filter_mask & RTEXT_FILTER_SKIP_STATS)
- return 0;
-
- nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
- if (!nla)
- goto nla_put_failure;
- snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_STATS, nla_len(nla));
-
- nla = nla_reserve(skb, IFLA_INET6_ICMP6STATS, ICMP6_MIB_MAX * sizeof(u64));
- if (!nla)
- goto nla_put_failure;
- snmp6_fill_stats(nla_data(nla), idev, IFLA_INET6_ICMP6STATS, nla_len(nla));
+ if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) {
+ if (inet6_fill_ifla6_stats_attrs(skb, idev) < 0)
+ goto nla_put_failure;
+ }
nla = nla_reserve(skb, IFLA_INET6_TOKEN, sizeof(struct in6_addr));
if (!nla)
@@ -6382,7 +6375,7 @@ static void addrconf_disable_change(struct net *net, __s32 newf)
struct inet6_dev *idev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
int changed = (!idev->cnf.disable_ipv6) ^ (!newf);
@@ -6403,7 +6396,7 @@ static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf
return 0;
}
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
old = *p;
@@ -6412,10 +6405,11 @@ static int addrconf_disable_ipv6(const struct ctl_table *table, int *p, int newf
if (p == &net->ipv6.devconf_all->disable_ipv6) {
WRITE_ONCE(net->ipv6.devconf_dflt->disable_ipv6, newf);
addrconf_disable_change(net, newf);
- } else if ((!newf) ^ (!old))
+ } else if ((!newf) ^ (!old)) {
dev_disable_change((struct inet6_dev *)table->extra1);
+ }
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
@@ -6458,20 +6452,20 @@ static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write,
if (write && old != new) {
struct net *net = ctl->extra2;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
- if (valp == &net->ipv6.devconf_dflt->proxy_ndp)
+ if (valp == &net->ipv6.devconf_dflt->proxy_ndp) {
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_PROXY_NEIGH,
NETCONFA_IFINDEX_DEFAULT,
net->ipv6.devconf_dflt);
- else if (valp == &net->ipv6.devconf_all->proxy_ndp)
+ } else if (valp == &net->ipv6.devconf_all->proxy_ndp) {
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
NETCONFA_PROXY_NEIGH,
NETCONFA_IFINDEX_ALL,
net->ipv6.devconf_all);
- else {
+ } else {
struct inet6_dev *idev = ctl->extra1;
inet6_netconf_notify_devconf(net, RTM_NEWNETCONF,
@@ -6479,7 +6473,7 @@ static int addrconf_sysctl_proxy_ndp(const struct ctl_table *ctl, int write,
idev->dev->ifindex,
&idev->cnf);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
}
return ret;
@@ -6499,7 +6493,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write,
.mode = ctl->mode,
};
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
new_val = *((u32 *)ctl->data);
@@ -6529,7 +6523,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write,
WRITE_ONCE(net->ipv6.devconf_dflt->addr_gen_mode, new_val);
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev &&
idev->cnf.addr_gen_mode != new_val) {
WRITE_ONCE(idev->cnf.addr_gen_mode,
@@ -6543,7 +6537,7 @@ static int addrconf_sysctl_addr_gen_mode(const struct ctl_table *ctl, int write,
}
out:
- rtnl_unlock();
+ rtnl_net_unlock(net);
return ret;
}
@@ -6565,7 +6559,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write,
lctl.maxlen = IPV6_MAX_STRLEN;
lctl.data = str;
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
if (!write && !secret->initialized) {
@@ -6595,7 +6589,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write,
struct net_device *dev;
for_each_netdev(net, dev) {
- struct inet6_dev *idev = __in6_dev_get(dev);
+ struct inet6_dev *idev = __in6_dev_get_rtnl_net(dev);
if (idev) {
WRITE_ONCE(idev->cnf.addr_gen_mode,
@@ -6610,7 +6604,7 @@ static int addrconf_sysctl_stable_secret(const struct ctl_table *ctl, int write,
}
out:
- rtnl_unlock();
+ rtnl_net_unlock(net);
return err;
}
@@ -6694,7 +6688,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val)
return 0;
}
- if (!rtnl_trylock())
+ if (!rtnl_net_trylock(net))
return restart_syscall();
WRITE_ONCE(*valp, val);
@@ -6703,7 +6697,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val)
struct net_device *dev;
for_each_netdev(net, dev) {
- idev = __in6_dev_get(dev);
+ idev = __in6_dev_get_rtnl_net(dev);
if (idev)
addrconf_disable_policy_idev(idev, val);
}
@@ -6712,7 +6706,7 @@ int addrconf_disable_policy(const struct ctl_table *ctl, int *valp, int val)
addrconf_disable_policy_idev(idev, val);
}
- rtnl_unlock();
+ rtnl_net_unlock(net);
return 0;
}
@@ -7425,9 +7419,9 @@ static const struct rtnl_msg_handler addrconf_rtnl_msg_handlers[] __initconst_or
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETLINK,
.dumpit = inet6_dump_ifinfo, .flags = RTNL_FLAG_DUMP_UNLOCKED},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWADDR,
- .doit = inet6_rtm_newaddr},
+ .doit = inet6_rtm_newaddr, .flags = RTNL_FLAG_DOIT_PERNET},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELADDR,
- .doit = inet6_rtm_deladdr},
+ .doit = inet6_rtm_deladdr, .flags = RTNL_FLAG_DOIT_PERNET},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETADDR,
.doit = inet6_rtm_getaddr, .dumpit = inet6_dump_ifaddr,
.flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED},
@@ -7469,9 +7463,9 @@ int __init addrconf_init(void)
goto out_nowq;
}
- rtnl_lock();
+ rtnl_net_lock(&init_net);
idev = ipv6_add_dev(blackhole_netdev);
- rtnl_unlock();
+ rtnl_net_unlock(&init_net);
if (IS_ERR(idev)) {
err = PTR_ERR(idev);
goto errlo;
@@ -7521,17 +7515,17 @@ void addrconf_cleanup(void)
rtnl_af_unregister(&inet6_ops);
- rtnl_lock();
+ rtnl_net_lock(&init_net);
/* clean dev list */
for_each_netdev(&init_net, dev) {
- if (__in6_dev_get(dev) == NULL)
+ if (!__in6_dev_get_rtnl_net(dev))
continue;
addrconf_ifdown(dev, true);
}
addrconf_ifdown(init_net.loopback_dev, true);
- rtnl_unlock();
+ rtnl_net_unlock(&init_net);
destroy_workqueue(addrconf_wq);
}
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index 562cace50ca9..21e01695b48c 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -278,6 +278,37 @@ static struct ifacaddr6 *aca_alloc(struct fib6_info *f6i,
return aca;
}
+static void inet6_ifacaddr_notify(struct net_device *dev,
+ const struct ifacaddr6 *ifaca, int event)
+{
+ struct inet6_fill_args fillargs = {
+ .event = event,
+ .netnsid = -1,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(struct in6_addr)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet6_fill_ifacaddr(skb, ifaca, &fillargs);
+ if (err < 0) {
+ pr_err("Failed to fill in anycast addresses (err %d)\n", err);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_ACADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_ACADDR, err);
+}
+
/*
* device anycast group inc (add if not found)
*/
@@ -333,6 +364,8 @@ int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr)
addrconf_join_solict(idev->dev, &aca->aca_addr);
+ inet6_ifacaddr_notify(idev->dev, aca, RTM_NEWANYCAST);
+
aca_put(aca);
return 0;
out:
@@ -375,6 +408,8 @@ int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr)
ip6_del_rt(dev_net(idev->dev), aca->aca_rt, false);
+ inet6_ifacaddr_notify(idev->dev, aca, RTM_DELANYCAST);
+
aca_put(aca);
return 0;
}
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index dbcea9fee626..62618a058b8f 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -1072,8 +1072,13 @@ static int calipso_sock_getattr(struct sock *sk,
struct ipv6_opt_hdr *hop;
int opt_len, len, ret_val = -ENOMSG, offset;
unsigned char *opt;
- struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return -EAFNOSUPPORT;
+ txopts = txopt_get(pinfo);
if (!txopts || !txopts->hopopt)
goto done;
@@ -1125,8 +1130,13 @@ static int calipso_sock_setattr(struct sock *sk,
{
int ret_val;
struct ipv6_opt_hdr *old, *new;
- struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return -EAFNOSUPPORT;
+ txopts = txopt_get(pinfo);
old = NULL;
if (txopts)
old = txopts->hopopt;
@@ -1153,8 +1163,13 @@ static int calipso_sock_setattr(struct sock *sk,
static void calipso_sock_delattr(struct sock *sk)
{
struct ipv6_opt_hdr *new_hop;
- struct ipv6_txoptions *txopts = txopt_get(inet6_sk(sk));
+ struct ipv6_pinfo *pinfo = inet6_sk(sk);
+ struct ipv6_txoptions *txopts;
+
+ if (!pinfo)
+ return;
+ txopts = txopt_get(pinfo);
if (!txopts || !txopts->hopopt)
goto done;
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index b2400c226a32..9e73944e3b53 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -315,7 +315,7 @@ static void esp_output_done(void *data, int err)
x->encap && x->encap->encap_type == TCP_ENCAP_ESPINTCP)
esp_output_tail_tcp(x, skb);
else
- xfrm_output_resume(skb->sk, skb, err);
+ xfrm_output_resume(skb_to_full_sk(skb), skb, err);
}
}
@@ -859,7 +859,8 @@ int esp6_input_done2(struct sk_buff *skb, int err)
skb_postpull_rcsum(skb, skb_network_header(skb),
skb_network_header_len(skb));
skb_pull_rcsum(skb, hlen);
- if (x->props.mode == XFRM_MODE_TUNNEL)
+ if (x->props.mode == XFRM_MODE_TUNNEL ||
+ x->props.mode == XFRM_MODE_IPTFS)
skb_reset_transport_header(skb);
else
skb_set_transport_header(skb, -hdr_len);
diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c
index c85c1627cb16..67d39114d9a6 100644
--- a/net/ipv6/fib6_rules.c
+++ b/net/ipv6/fib6_rules.c
@@ -26,6 +26,8 @@ struct fib6_rule {
struct fib_rule common;
struct rt6key src;
struct rt6key dst;
+ __be32 flowlabel;
+ __be32 flowlabel_mask;
dscp_t dscp;
u8 dscp_full:1; /* DSCP or TOS selector */
};
@@ -34,7 +36,7 @@ static bool fib6_rule_matchall(const struct fib_rule *rule)
{
struct fib6_rule *r = container_of(rule, struct fib6_rule, common);
- if (r->dst.plen || r->src.plen || r->dscp)
+ if (r->dst.plen || r->src.plen || r->dscp || r->flowlabel_mask)
return false;
return fib_rule_matchall(rule);
}
@@ -332,6 +334,9 @@ INDIRECT_CALLABLE_SCOPE int fib6_rule_match(struct fib_rule *rule,
if (r->dscp && r->dscp != ip6_dscp(fl6->flowlabel))
return 0;
+ if ((r->flowlabel ^ flowi6_get_flowlabel(fl6)) & r->flowlabel_mask)
+ return 0;
+
if (rule->ip_proto && (rule->ip_proto != fl6->flowi6_proto))
return 0;
@@ -360,6 +365,35 @@ static int fib6_nl2rule_dscp(const struct nlattr *nla, struct fib6_rule *rule6,
return 0;
}
+static int fib6_nl2rule_flowlabel(struct nlattr **tb, struct fib6_rule *rule6,
+ struct netlink_ext_ack *extack)
+{
+ __be32 flowlabel, flowlabel_mask;
+
+ if (NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL) ||
+ NL_REQ_ATTR_CHECK(extack, NULL, tb, FRA_FLOWLABEL_MASK))
+ return -EINVAL;
+
+ flowlabel = nla_get_be32(tb[FRA_FLOWLABEL]);
+ flowlabel_mask = nla_get_be32(tb[FRA_FLOWLABEL_MASK]);
+
+ if (flowlabel_mask & ~IPV6_FLOWLABEL_MASK) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[FRA_FLOWLABEL_MASK],
+ "Invalid flow label mask");
+ return -EINVAL;
+ }
+
+ if (flowlabel & ~flowlabel_mask) {
+ NL_SET_ERR_MSG(extack, "Flow label and mask do not match");
+ return -EINVAL;
+ }
+
+ rule6->flowlabel = flowlabel;
+ rule6->flowlabel_mask = flowlabel_mask;
+
+ return 0;
+}
+
static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
struct fib_rule_hdr *frh,
struct nlattr **tb,
@@ -379,6 +413,10 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
if (tb[FRA_DSCP] && fib6_nl2rule_dscp(tb[FRA_DSCP], rule6, extack) < 0)
goto errout;
+ if ((tb[FRA_FLOWLABEL] || tb[FRA_FLOWLABEL_MASK]) &&
+ fib6_nl2rule_flowlabel(tb, rule6, extack) < 0)
+ goto errout;
+
if (rule->action == FR_ACT_TO_TBL && !rule->l3mdev) {
if (rule->table == RT6_TABLE_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid table");
@@ -444,6 +482,14 @@ static int fib6_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
return 0;
}
+ if (tb[FRA_FLOWLABEL] &&
+ nla_get_be32(tb[FRA_FLOWLABEL]) != rule6->flowlabel)
+ return 0;
+
+ if (tb[FRA_FLOWLABEL_MASK] &&
+ nla_get_be32(tb[FRA_FLOWLABEL_MASK]) != rule6->flowlabel_mask)
+ return 0;
+
if (frh->src_len &&
nla_memcmp(tb[FRA_SRC], &rule6->src.addr, sizeof(struct in6_addr)))
return 0;
@@ -472,6 +518,11 @@ static int fib6_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
frh->tos = inet_dscp_to_dsfield(rule6->dscp);
}
+ if (rule6->flowlabel_mask &&
+ (nla_put_be32(skb, FRA_FLOWLABEL, rule6->flowlabel) ||
+ nla_put_be32(skb, FRA_FLOWLABEL_MASK, rule6->flowlabel_mask)))
+ goto nla_put_failure;
+
if ((rule6->dst.plen &&
nla_put_in6_addr(skb, FRA_DST, &rule6->dst.addr)) ||
(rule6->src.plen &&
@@ -487,7 +538,9 @@ static size_t fib6_rule_nlmsg_payload(struct fib_rule *rule)
{
return nla_total_size(16) /* dst */
+ nla_total_size(16) /* src */
- + nla_total_size(1); /* dscp */
+ + nla_total_size(1) /* dscp */
+ + nla_total_size(4) /* flowlabel */
+ + nla_total_size(4); /* flowlabel mask */
}
static void fib6_rule_flush_cache(struct fib_rules_ops *ops)
diff --git a/net/ipv6/ioam6_iptunnel.c b/net/ipv6/ioam6_iptunnel.c
index 2c383c12a431..09065187378e 100644
--- a/net/ipv6/ioam6_iptunnel.c
+++ b/net/ipv6/ioam6_iptunnel.c
@@ -337,7 +337,6 @@ static int ioam6_do_encap(struct net *net, struct sk_buff *skb,
static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb), *cache_dst = NULL;
- struct in6_addr orig_daddr;
struct ioam6_lwt *ilwt;
int err = -EINVAL;
u32 pkt_cnt;
@@ -352,8 +351,6 @@ static int ioam6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (pkt_cnt % ilwt->freq.n >= ilwt->freq.k)
goto out;
- orig_daddr = ipv6_hdr(skb)->daddr;
-
local_bh_disable();
cache_dst = dst_cache_get(&ilwt->cache);
local_bh_enable();
@@ -422,7 +419,10 @@ do_encap:
goto drop;
}
- if (!ipv6_addr_equal(&orig_daddr, &ipv6_hdr(skb)->daddr)) {
+ /* avoid lwtunnel_output() reentry loop when destination is the same
+ * after transformation (e.g., with the inline mode)
+ */
+ if (dst->lwtstate != cache_dst->lwtstate) {
skb_dst_drop(skb);
skb_dst_set(skb, cache_dst);
return dst_output(net, sk, skb);
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 70c0e16c0ae6..39da6a7ce5f1 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -477,9 +477,7 @@ discard:
static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
skb_clear_delivery_time(skb);
- rcu_read_lock();
ip6_protocol_deliver_rcu(net, skb, 0, false);
- rcu_read_unlock();
return 0;
}
@@ -487,9 +485,15 @@ static int ip6_input_finish(struct net *net, struct sock *sk, struct sk_buff *sk
int ip6_input(struct sk_buff *skb)
{
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
- ip6_input_finish);
+ int res;
+
+ rcu_read_lock();
+ res = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_IN,
+ dev_net_rcu(skb->dev), NULL, skb, skb->dev, NULL,
+ ip6_input_finish);
+ rcu_read_unlock();
+
+ return res;
}
EXPORT_SYMBOL_GPL(ip6_input);
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5a364b352115..d577bf2f3053 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1401,6 +1401,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
cork->base.gso_size = ipc6->gso_size;
cork->base.tx_flags = 0;
cork->base.mark = ipc6->sockc.mark;
+ cork->base.priority = ipc6->sockc.priority;
sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
cork->base.flags |= IPCORK_TS_OPT_ID;
@@ -1697,8 +1698,9 @@ alloc_new_skb:
pskb_trim_unique(skb_prev, maxfraglen);
}
if (copy > 0 &&
- getfrag(from, data + transhdrlen, offset,
- copy, fraggap, skb) < 0) {
+ INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, data + transhdrlen, offset,
+ copy, fraggap, skb) < 0) {
err = -EFAULT;
kfree_skb(skb);
goto error;
@@ -1742,8 +1744,9 @@ alloc_new_skb:
unsigned int off;
off = skb->len;
- if (getfrag(from, skb_put(skb, copy),
- offset, copy, off, skb) < 0) {
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from, skb_put(skb, copy),
+ offset, copy, off, skb) < 0) {
__skb_trim(skb, off);
err = -EFAULT;
goto error;
@@ -1781,7 +1784,8 @@ alloc_new_skb:
get_page(pfrag->page);
}
copy = min_t(int, copy, pfrag->size - pfrag->offset);
- if (getfrag(from,
+ if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
+ from,
page_address(pfrag->page) + pfrag->offset,
offset, copy, skb->len, skb) < 0)
goto error_efault;
@@ -1939,7 +1943,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
hdr->saddr = fl6->saddr;
hdr->daddr = *final_dst;
- skb->priority = READ_ONCE(sk->sk_priority);
+ skb->priority = cork->base.priority;
skb->mark = cork->base.mark;
if (sk_is_tcp(sk))
skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index b7b62e5a562e..65831b4fee1f 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -33,8 +33,10 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
+#include <linux/if_addr.h>
#include <linux/if_arp.h>
#include <linux/route.h>
+#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -47,6 +49,7 @@
#include <linux/netfilter_ipv6.h>
#include <net/net_namespace.h>
+#include <net/netlink.h>
#include <net/sock.h>
#include <net/snmp.h>
@@ -901,6 +904,41 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
return mc;
}
+static void inet6_ifmcaddr_notify(struct net_device *dev,
+ const struct ifmcaddr6 *ifmca, int event)
+{
+ struct inet6_fill_args fillargs = {
+ .portid = 0,
+ .seq = 0,
+ .event = event,
+ .flags = 0,
+ .netnsid = -1,
+ .force_rt_scope_universe = true,
+ };
+ struct net *net = dev_net(dev);
+ struct sk_buff *skb;
+ int err = -ENOMEM;
+
+ skb = nlmsg_new(NLMSG_ALIGN(sizeof(struct ifaddrmsg)) +
+ nla_total_size(sizeof(struct in6_addr)) +
+ nla_total_size(sizeof(struct ifa_cacheinfo)),
+ GFP_KERNEL);
+ if (!skb)
+ goto error;
+
+ err = inet6_fill_ifmcaddr(skb, ifmca, &fillargs);
+ if (err < 0) {
+ WARN_ON_ONCE(err == -EMSGSIZE);
+ nlmsg_free(skb);
+ goto error;
+ }
+
+ rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MCADDR, NULL, GFP_KERNEL);
+ return;
+error:
+ rtnl_set_sk_err(net, RTNLGRP_IPV6_MCADDR, err);
+}
+
/*
* device multicast group inc (add if not found)
*/
@@ -948,6 +986,7 @@ static int __ipv6_dev_mc_inc(struct net_device *dev,
mld_del_delrec(idev, mc);
igmp6_group_added(mc);
+ inet6_ifmcaddr_notify(dev, mc, RTM_NEWMULTICAST);
mutex_unlock(&idev->mc_lock);
ma_put(mc);
return 0;
@@ -977,6 +1016,8 @@ int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr)
*map = ma->next;
igmp6_group_dropped(ma);
+ inet6_ifmcaddr_notify(idev->dev, ma,
+ RTM_DELMULTICAST);
ip6_mc_clear_src(ma);
mutex_unlock(&idev->mc_lock);
@@ -1021,29 +1062,31 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group,
rcu_read_lock();
idev = __in6_dev_get(dev);
- if (idev) {
- for_each_mc_rcu(idev, mc) {
- if (ipv6_addr_equal(&mc->mca_addr, group))
- break;
- }
- if (mc) {
- if (src_addr && !ipv6_addr_any(src_addr)) {
- struct ip6_sf_list *psf;
+ if (!idev)
+ goto unlock;
+ for_each_mc_rcu(idev, mc) {
+ if (ipv6_addr_equal(&mc->mca_addr, group))
+ break;
+ }
+ if (!mc)
+ goto unlock;
+ if (src_addr && !ipv6_addr_any(src_addr)) {
+ struct ip6_sf_list *psf;
- for_each_psf_rcu(mc, psf) {
- if (ipv6_addr_equal(&psf->sf_addr, src_addr))
- break;
- }
- if (psf)
- rv = psf->sf_count[MCAST_INCLUDE] ||
- psf->sf_count[MCAST_EXCLUDE] !=
- mc->mca_sfcount[MCAST_EXCLUDE];
- else
- rv = mc->mca_sfcount[MCAST_EXCLUDE] != 0;
- } else
- rv = true; /* don't filter unspecified source */
+ for_each_psf_rcu(mc, psf) {
+ if (ipv6_addr_equal(&psf->sf_addr, src_addr))
+ break;
}
+ if (psf)
+ rv = READ_ONCE(psf->sf_count[MCAST_INCLUDE]) ||
+ READ_ONCE(psf->sf_count[MCAST_EXCLUDE]) !=
+ READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]);
+ else
+ rv = READ_ONCE(mc->mca_sfcount[MCAST_EXCLUDE]) != 0;
+ } else {
+ rv = true; /* don't filter unspecified source */
}
+unlock:
rcu_read_unlock();
return rv;
}
@@ -2290,7 +2333,7 @@ static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode,
/* source filter not found, or count wrong => bug */
return -ESRCH;
}
- psf->sf_count[sfmode]--;
+ WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] - 1);
if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
struct inet6_dev *idev = pmc->idev;
@@ -2396,7 +2439,7 @@ static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode,
rcu_assign_pointer(pmc->mca_sources, psf);
}
}
- psf->sf_count[sfmode]++;
+ WRITE_ONCE(psf->sf_count[sfmode], psf->sf_count[sfmode] + 1);
return 0;
}
@@ -2508,7 +2551,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
sf_markstate(pmc);
isexclude = pmc->mca_sfmode == MCAST_EXCLUDE;
if (!delta)
- pmc->mca_sfcount[sfmode]++;
+ WRITE_ONCE(pmc->mca_sfcount[sfmode],
+ pmc->mca_sfcount[sfmode] + 1);
err = 0;
for (i = 0; i < sfcount; i++) {
err = ip6_mc_add1_src(pmc, sfmode, &psfsrc[i]);
@@ -2519,7 +2563,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
int j;
if (!delta)
- pmc->mca_sfcount[sfmode]--;
+ WRITE_ONCE(pmc->mca_sfcount[sfmode],
+ pmc->mca_sfcount[sfmode] - 1);
for (j = 0; j < i; j++)
ip6_mc_del1_src(pmc, sfmode, &psfsrc[j]);
} else if (isexclude != (pmc->mca_sfcount[MCAST_EXCLUDE] != 0)) {
@@ -2564,7 +2609,8 @@ static void ip6_mc_clear_src(struct ifmcaddr6 *pmc)
RCU_INIT_POINTER(pmc->mca_sources, NULL);
pmc->mca_sfmode = MCAST_EXCLUDE;
pmc->mca_sfcount[MCAST_INCLUDE] = 0;
- pmc->mca_sfcount[MCAST_EXCLUDE] = 1;
+ /* Paired with the READ_ONCE() from ipv6_chk_mcast_addr() */
+ WRITE_ONCE(pmc->mca_sfcount[MCAST_EXCLUDE], 1);
}
/* called with mc_lock */
@@ -3079,8 +3125,8 @@ static int igmp6_mcf_seq_show(struct seq_file *seq, void *v)
state->dev->ifindex, state->dev->name,
&state->im->mca_addr,
&psf->sf_addr,
- psf->sf_count[MCAST_INCLUDE],
- psf->sf_count[MCAST_EXCLUDE]);
+ READ_ONCE(psf->sf_count[MCAST_INCLUDE]),
+ READ_ONCE(psf->sf_count[MCAST_EXCLUDE]));
}
return 0;
}
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index a7690ec62325..9ea5ef56cb27 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -103,6 +103,10 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
struct sk_buff *data_skb = NULL;
int doff = 0;
int thoff = 0, tproto;
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn const *ct;
+#endif
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
if (tproto < 0) {
@@ -136,6 +140,25 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
return NULL;
}
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ /* Do the lookup with the original socket address in
+ * case this is a reply packet of an established
+ * SNAT-ted connection.
+ */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct &&
+ ((tproto != IPPROTO_ICMPV6 &&
+ ctinfo == IP_CT_ESTABLISHED_REPLY) ||
+ (tproto == IPPROTO_ICMPV6 &&
+ ctinfo == IP_CT_RELATED_REPLY)) &&
+ (ct->status & IPS_SRC_NAT_DONE)) {
+ daddr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.in6;
+ dport = (tproto == IPPROTO_TCP) ?
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.tcp.port :
+ ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.udp.port;
+ }
+#endif
+
return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
sport, dport, indev);
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 88b3fcacd4f9..46b8adf6e7f8 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -119,6 +119,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return -EINVAL;
ipcm6_init_sk(&ipc6, sk);
+ ipc6.sockc.priority = READ_ONCE(sk->sk_priority);
ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 8476a3944a88..a45aba090aa4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -619,7 +619,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length,
skb_reserve(skb, hlen);
skb->protocol = htons(ETH_P_IPV6);
- skb->priority = READ_ONCE(sk->sk_priority);
+ skb->priority = sockc->priority;
skb->mark = sockc->mark;
skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid);
@@ -780,6 +780,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipcm6_init(&ipc6);
ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
ipc6.sockc.mark = fl6.flowi6_mark;
+ ipc6.sockc.priority = READ_ONCE(sk->sk_priority);
if (sin6) {
if (addr_len < SIN6_LEN_RFC2133)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 997e2e4f441d..21eca985a1fd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -412,12 +412,37 @@ static bool rt6_check_expired(const struct rt6_info *rt)
return false;
}
+static struct fib6_info *
+rt6_multipath_first_sibling_rcu(const struct fib6_info *rt)
+{
+ struct fib6_info *iter;
+ struct fib6_node *fn;
+
+ fn = rcu_dereference(rt->fib6_node);
+ if (!fn)
+ goto out;
+ iter = rcu_dereference(fn->leaf);
+ if (!iter)
+ goto out;
+
+ while (iter) {
+ if (iter->fib6_metric == rt->fib6_metric &&
+ rt6_qualify_for_ecmp(iter))
+ return iter;
+ iter = rcu_dereference(iter->fib6_next);
+ }
+
+out:
+ return NULL;
+}
+
void fib6_select_path(const struct net *net, struct fib6_result *res,
struct flowi6 *fl6, int oif, bool have_oif_match,
const struct sk_buff *skb, int strict)
{
- struct fib6_info *match = res->f6i;
+ struct fib6_info *first, *match = res->f6i;
struct fib6_info *sibling;
+ int hash;
if (!match->nh && (!match->fib6_nsiblings || have_oif_match))
goto out;
@@ -440,16 +465,25 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
return;
}
- if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
+ first = rt6_multipath_first_sibling_rcu(match);
+ if (!first)
goto out;
- list_for_each_entry_rcu(sibling, &match->fib6_siblings,
+ hash = fl6->mp_hash;
+ if (hash <= atomic_read(&first->fib6_nh->fib_nh_upper_bound)) {
+ if (rt6_score_route(first->fib6_nh, first->fib6_flags, oif,
+ strict) >= 0)
+ match = first;
+ goto out;
+ }
+
+ list_for_each_entry_rcu(sibling, &first->fib6_siblings,
fib6_siblings) {
const struct fib6_nh *nh = sibling->fib6_nh;
int nh_upper_bound;
nh_upper_bound = atomic_read(&nh->fib_nh_upper_bound);
- if (fl6->mp_hash > nh_upper_bound)
+ if (hash > nh_upper_bound)
continue;
if (rt6_score_route(nh, sibling->fib6_flags, oif, strict) < 0)
break;
@@ -1737,6 +1771,7 @@ out:
if (!err) {
spin_lock_bh(&f6i->fib6_table->tb6_lock);
fib6_update_sernum(net, f6i);
+ fib6_add_gc_list(f6i);
spin_unlock_bh(&f6i->fib6_table->tb6_lock);
fib6_force_start_gc(net);
}
@@ -3644,7 +3679,8 @@ out:
in6_dev_put(idev);
if (err) {
- lwtstate_put(fib6_nh->fib_nh_lws);
+ fib_nh_common_release(&fib6_nh->nh_common);
+ fib6_nh->nh_common.nhc_pcpu_rth_output = NULL;
fib6_nh->fib_nh_lws = NULL;
netdev_put(dev, dev_tracker);
}
@@ -3802,10 +3838,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (nh) {
if (rt->fib6_src.plen) {
NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ err = -EINVAL;
goto out_free;
}
if (!nexthop_get(nh)) {
NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -ENOENT;
goto out_free;
}
rt->nh = nh;
@@ -5010,6 +5048,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_SPORT] = { .type = NLA_U16 },
[RTA_DPORT] = { .type = NLA_U16 },
[RTA_NH_ID] = { .type = NLA_U32 },
+ [RTA_FLOWLABEL] = { .type = NLA_BE32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -5035,6 +5074,12 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
+ if (tb[RTA_FLOWLABEL]) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
+ "Flow label cannot be specified for this operation");
+ goto errout;
+ }
+
*cfg = (struct fib6_config){
.fc_table = rtm->rtm_table,
.fc_dst_len = rtm->rtm_dst_len,
@@ -6018,6 +6063,13 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
return -EINVAL;
}
+ if (tb[RTA_FLOWLABEL] &&
+ (nla_get_be32(tb[RTA_FLOWLABEL]) & ~IPV6_FLOWLABEL_MASK)) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[RTA_FLOWLABEL],
+ "Invalid flow label");
+ return -EINVAL;
+ }
+
for (i = 0; i <= RTA_MAX; i++) {
if (!tb[i])
continue;
@@ -6032,6 +6084,7 @@ static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
case RTA_SPORT:
case RTA_DPORT:
case RTA_IP_PROTO:
+ case RTA_FLOWLABEL:
break;
default:
NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
@@ -6054,6 +6107,7 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
struct sk_buff *skb;
struct rtmsg *rtm;
struct flowi6 fl6 = {};
+ __be32 flowlabel;
bool fibmatch;
err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
@@ -6062,7 +6116,6 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
err = -EINVAL;
rtm = nlmsg_data(nlh);
- fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
if (tb[RTA_SRC]) {
@@ -6108,6 +6161,9 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
goto errout;
}
+ flowlabel = nla_get_be32_default(tb[RTA_FLOWLABEL], 0);
+ fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, flowlabel);
+
if (iif) {
struct net_device *dev;
int flags = 0;
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index a45bf17cb2a1..ae2da28f9dfb 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -94,14 +94,23 @@ INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
}
static void __tcpv6_gso_segment_csum(struct sk_buff *seg,
+ struct in6_addr *oldip,
+ const struct in6_addr *newip,
__be16 *oldport, __be16 newport)
{
- struct tcphdr *th;
+ struct tcphdr *th = tcp_hdr(seg);
+
+ if (!ipv6_addr_equal(oldip, newip)) {
+ inet_proto_csum_replace16(&th->check, seg,
+ oldip->s6_addr32,
+ newip->s6_addr32,
+ true);
+ *oldip = *newip;
+ }
if (*oldport == newport)
return;
- th = tcp_hdr(seg);
inet_proto_csum_replace2(&th->check, seg, *oldport, newport, false);
*oldport = newport;
}
@@ -129,10 +138,10 @@ static struct sk_buff *__tcpv6_gso_segment_list_csum(struct sk_buff *segs)
th2 = tcp_hdr(seg);
iph2 = ipv6_hdr(seg);
- iph2->saddr = iph->saddr;
- iph2->daddr = iph->daddr;
- __tcpv6_gso_segment_csum(seg, &th2->source, th->source);
- __tcpv6_gso_segment_csum(seg, &th2->dest, th->dest);
+ __tcpv6_gso_segment_csum(seg, &iph2->saddr, &iph->saddr,
+ &th2->source, th->source);
+ __tcpv6_gso_segment_csum(seg, &iph2->daddr, &iph->daddr,
+ &th2->dest, th->dest);
}
return segs;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3a3c7639d1d6..c6ea438b5c75 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1498,6 +1498,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc6.gso_size = READ_ONCE(up->gso_size);
ipc6.sockc.tsflags = READ_ONCE(sk->sk_tsflags);
ipc6.sockc.mark = READ_ONCE(sk->sk_mark);
+ ipc6.sockc.priority = READ_ONCE(sk->sk_priority);
/* destination address check */
if (sin6) {
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index 5f7b1fdbffe6..b3d5d1f266ee 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -82,14 +82,14 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
toobig = skb->len > mtu && !skb_is_gso(skb);
- if (toobig && xfrm6_local_dontfrag(skb->sk)) {
+ if (toobig && xfrm6_local_dontfrag(sk)) {
xfrm6_local_rxpmtu(skb, mtu);
kfree_skb(skb);
return -EMSGSIZE;
} else if (toobig && xfrm6_noneed_fragment(skb)) {
skb->ignore_df = 1;
goto skip_frag;
- } else if (!skb->ignore_df && toobig && skb->sk) {
+ } else if (!skb->ignore_df && toobig && sk) {
xfrm_local_error(skb, mtu);
kfree_skb(skb);
return -EMSGSIZE;
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index d692b902e120..e83691073496 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -73,9 +73,9 @@ static netdev_tx_t l2tp_eth_dev_xmit(struct sk_buff *skb, struct net_device *dev
int ret = l2tp_xmit_skb(session, skb);
if (likely(ret == NET_XMIT_SUCCESS))
- dev_sw_netstats_tx_add(dev, 1, len);
+ dev_dstats_tx_add(dev, len);
else
- DEV_STATS_INC(dev, tx_dropped);
+ dev_dstats_tx_dropped(dev);
return NETDEV_TX_OK;
}
@@ -84,7 +84,6 @@ static const struct net_device_ops l2tp_eth_netdev_ops = {
.ndo_init = l2tp_eth_dev_init,
.ndo_uninit = l2tp_eth_dev_uninit,
.ndo_start_xmit = l2tp_eth_dev_xmit,
- .ndo_get_stats64 = dev_get_tstats64,
.ndo_set_mac_address = eth_mac_addr,
};
@@ -100,7 +99,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev)
dev->lltx = true;
dev->netdev_ops = &l2tp_eth_netdev_ops;
dev->needs_free_netdev = true;
- dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS;
+ dev->pcpu_stat_type = NETDEV_PCPU_STAT_DSTATS;
}
static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len)
@@ -128,7 +127,7 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb,
goto error_rcu;
if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS)
- dev_sw_netstats_rx_add(dev, data_len);
+ dev_dstats_rx_add(dev, data_len);
else
DEV_STATS_INC(dev, rx_errors);
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 4bc24fddfd52..29795d2839e8 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -425,7 +425,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int rc;
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = NULL;
- struct flowi4 *fl4;
int connected = 0;
__be32 daddr;
@@ -455,7 +454,6 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (sk->sk_state != TCP_ESTABLISHED)
goto out;
- daddr = inet->inet_daddr;
connected = 1;
}
@@ -482,29 +480,24 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto error;
}
- fl4 = &inet->cork.fl.u.ip4;
if (connected)
rt = dst_rtable(__sk_dst_check(sk, 0));
rcu_read_lock();
if (!rt) {
- const struct ip_options_rcu *inet_opt;
+ struct flowi4 *fl4 = &inet->cork.fl.u.ip4;
- inet_opt = rcu_dereference(inet->inet_opt);
+ inet_sk_init_flowi4(inet, fl4);
- /* Use correct destination address if we have options. */
- if (inet_opt && inet_opt->opt.srr)
- daddr = inet_opt->opt.faddr;
+ /* Overwrite ->daddr if msg->msg_name was provided */
+ if (!connected)
+ fl4->daddr = daddr;
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
- daddr, inet->inet_saddr,
- inet->inet_dport, inet->inet_sport,
- sk->sk_protocol, ip_sock_rt_tos(sk),
- sk->sk_bound_dev_if);
+ rt = ip_route_output_flow(sock_net(sk), fl4, sk);
if (IS_ERR(rt))
goto no_route;
if (connected) {
diff --git a/net/llc/sysctl_net_llc.c b/net/llc/sysctl_net_llc.c
index 72e101135f8c..c8d88e2508fc 100644
--- a/net/llc/sysctl_net_llc.c
+++ b/net/llc/sysctl_net_llc.c
@@ -11,10 +11,6 @@
#include <net/net_namespace.h>
#include <net/llc.h>
-#ifndef CONFIG_SYSCTL
-#error This file should not be compiled without CONFIG_SYSCTL defined
-#endif
-
static struct ctl_table llc2_timeout_table[] = {
{
.procname = "ack",
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index d3fc158ccaf6..b766472703b1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -503,6 +503,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
if (IS_ERR(link))
return PTR_ERR(link);
+ if (WARN_ON(pairwise && link_id >= 0))
+ return -EINVAL;
+
if (pairwise && params->mode == NL80211_KEY_SET_TX)
return ieee80211_set_tx(sdata, mac_addr, key_idx);
@@ -525,10 +528,12 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
if (IS_ERR(key))
return PTR_ERR(key);
- key->conf.link_id = link_id;
-
- if (pairwise)
+ if (pairwise) {
key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE;
+ key->conf.link_id = -1;
+ } else {
+ key->conf.link_id = link->link_id;
+ }
if (params->mode == NL80211_KEY_NO_TX)
key->conf.flags |= IEEE80211_KEY_FLAG_NO_AUTO_TX;
@@ -1671,7 +1676,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
link_conf->ema_ap = false;
link_conf->bssid_indicator = 0;
- __sta_info_flush(sdata, true, link_id);
+ __sta_info_flush(sdata, true, link_id, NULL);
ieee80211_remove_link_keys(link, &keys);
if (!list_empty(&keys)) {
@@ -1903,12 +1908,12 @@ static int sta_link_apply_parameters(struct ieee80211_local *local,
}
if (params->supported_rates &&
- params->supported_rates_len) {
- ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
- sband, params->supported_rates,
- params->supported_rates_len,
- &link_sta->pub->supp_rates[sband->band]);
- }
+ params->supported_rates_len &&
+ !ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
+ sband, params->supported_rates,
+ params->supported_rates_len,
+ &link_sta->pub->supp_rates[sband->band]))
+ return -EINVAL;
if (params->ht_capa)
ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
@@ -3190,19 +3195,27 @@ static int ieee80211_set_tx_power(struct wiphy *wiphy,
static int ieee80211_get_tx_power(struct wiphy *wiphy,
struct wireless_dev *wdev,
+ unsigned int link_id,
int *dbm)
{
struct ieee80211_local *local = wiphy_priv(wiphy);
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_link_data *link_data;
if (local->ops->get_txpower &&
(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
- return drv_get_txpower(local, sdata, dbm);
+ return drv_get_txpower(local, sdata, link_id, dbm);
- if (local->emulate_chanctx)
+ if (local->emulate_chanctx) {
*dbm = local->hw.conf.power_level;
- else
- *dbm = sdata->vif.bss_conf.txpower;
+ } else {
+ link_data = wiphy_dereference(wiphy, sdata->link[link_id]);
+
+ if (link_data)
+ *dbm = link_data->conf->txpower;
+ else
+ return -ENOLINK;
+ }
/* INT_MIN indicates no power level was set yet */
if (*dbm == INT_MIN)
@@ -5172,6 +5185,18 @@ ieee80211_set_ttlm(struct wiphy *wiphy, struct net_device *dev,
return ieee80211_req_neg_ttlm(sdata, params);
}
+static int
+ieee80211_assoc_ml_reconf(struct wiphy *wiphy, struct net_device *dev,
+ struct cfg80211_assoc_link *add_links,
+ u16 rem_links)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
+
+ lockdep_assert_wiphy(sdata->local->hw.wiphy);
+
+ return ieee80211_mgd_assoc_ml_reconf(sdata, add_links, rem_links);
+}
+
const struct cfg80211_ops mac80211_config_ops = {
.add_virtual_intf = ieee80211_add_iface,
.del_virtual_intf = ieee80211_del_iface,
@@ -5286,4 +5311,5 @@ const struct cfg80211_ops mac80211_config_ops = {
.set_hw_timestamp = ieee80211_set_hw_timestamp,
.set_ttlm = ieee80211_set_ttlm,
.get_radio_mask = ieee80211_get_radio_mask,
+ .assoc_ml_reconf = ieee80211_assoc_ml_reconf,
};
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index a442cb667520..dc28f2b0957a 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -247,6 +247,13 @@ static enum nl80211_chan_width ieee80211_get_sta_bw(struct sta_info *sta,
if (!link_sta)
return NL80211_CHAN_WIDTH_20_NOHT;
+ /*
+ * We assume that TX/RX might be asymmetric (so e.g. VHT operating
+ * mode notification changes what a STA wants to receive, but not
+ * necessarily what it will transmit to us), and therefore use the
+ * capabilities here. Calling it RX bandwidth capability is a bit
+ * wrong though, since capabilities are in fact symmetric.
+ */
width = ieee80211_sta_cap_rx_bw(link_sta);
switch (width) {
diff --git a/net/mac80211/debug.h b/net/mac80211/debug.h
index 35a8ba25fa57..5b81998cb0c9 100644
--- a/net/mac80211/debug.h
+++ b/net/mac80211/debug.h
@@ -1,7 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Portions
- * Copyright (C) 2022 - 2023 Intel Corporation
+ * Copyright (C) 2022 - 2024 Intel Corporation
*/
#ifndef __MAC80211_DEBUG_H
#define __MAC80211_DEBUG_H
@@ -152,6 +152,14 @@ do { \
else \
_sdata_err((link)->sdata, fmt, ##__VA_ARGS__); \
} while (0)
+#define link_id_info(sdata, link_id, fmt, ...) \
+ do { \
+ if (ieee80211_vif_is_mld(&sdata->vif)) \
+ _sdata_info(sdata, "[link %d] " fmt, link_id, \
+ ##__VA_ARGS__); \
+ else \
+ _sdata_info(sdata, fmt, ##__VA_ARGS__); \
+ } while (0)
#define _link_id_dbg(print, sdata, link_id, fmt, ...) \
do { \
if (ieee80211_vif_is_mld(&(sdata)->vif)) \
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index be2e486907f9..69e03630f64c 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -284,7 +284,8 @@ static ssize_t aql_txq_limit_write(struct file *file,
q_limit_low_old = local->aql_txq_limit_low[ac];
q_limit_high_old = local->aql_txq_limit_high[ac];
- wiphy_lock(local->hw.wiphy);
+ guard(wiphy)(local->hw.wiphy);
+
local->aql_txq_limit_low[ac] = q_limit_low;
local->aql_txq_limit_high[ac] = q_limit_high;
@@ -296,7 +297,6 @@ static ssize_t aql_txq_limit_write(struct file *file,
sta->airtime[ac].aql_limit_high = q_limit_high;
}
}
- wiphy_unlock(local->hw.wiphy);
return count;
}
@@ -492,6 +492,7 @@ static const char *hw_flag_names[] = {
FLAG(DISALLOW_PUNCTURING),
FLAG(DISALLOW_PUNCTURING_5GHZ),
FLAG(HANDLES_QUIET_CSA),
+ FLAG(STRICT),
#undef FLAG
};
@@ -524,6 +525,46 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
return rv;
}
+static ssize_t hwflags_write(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct ieee80211_local *local = file->private_data;
+ char buf[100];
+ int val;
+
+ if (count >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(buf, user_buf, count))
+ return -EFAULT;
+
+ if (count && buf[count - 1] == '\n')
+ buf[count - 1] = '\0';
+ else
+ buf[count] = '\0';
+
+ if (sscanf(buf, "strict=%d", &val) == 1) {
+ switch (val) {
+ case 0:
+ ieee80211_hw_set(&local->hw, STRICT);
+ return count;
+ case 1:
+ __clear_bit(IEEE80211_HW_STRICT, local->hw.flags);
+ return count;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ return -EINVAL;
+}
+
+static const struct file_operations hwflags_ops = {
+ .open = simple_open,
+ .read = hwflags_read,
+ .write = hwflags_write,
+};
+
static ssize_t misc_read(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
@@ -574,7 +615,6 @@ static ssize_t queues_read(struct file *file, char __user *user_buf,
return simple_read_from_buffer(user_buf, count, ppos, buf, res);
}
-DEBUGFS_READONLY_FILE_OPS(hwflags);
DEBUGFS_READONLY_FILE_OPS(queues);
DEBUGFS_READONLY_FILE_OPS(misc);
@@ -651,7 +691,7 @@ void debugfs_hw_add(struct ieee80211_local *local)
#ifdef CONFIG_PM
DEBUGFS_ADD_MODE(reset, 0200);
#endif
- DEBUGFS_ADD(hwflags);
+ DEBUGFS_ADD_MODE(hwflags, 0600);
DEBUGFS_ADD(user_power);
DEBUGFS_ADD(power);
DEBUGFS_ADD(hw_conf);
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index b3a64edea0f2..117f58af5ff9 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -402,25 +402,6 @@ void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata)
}
}
-void ieee80211_debugfs_key_add_mgmt_default(struct ieee80211_sub_if_data *sdata)
-{
- char buf[50];
- struct ieee80211_key *key;
-
- if (!sdata->vif.debugfs_dir)
- return;
-
- key = wiphy_dereference(sdata->local->hw.wiphy,
- sdata->deflink.default_mgmt_key);
- if (key) {
- sprintf(buf, "../keys/%d", key->debugfs.cnt);
- sdata->debugfs.default_mgmt_key =
- debugfs_create_symlink("default_mgmt_key",
- sdata->vif.debugfs_dir, buf);
- } else
- ieee80211_debugfs_key_remove_mgmt_default(sdata);
-}
-
void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sdata)
{
if (!sdata)
@@ -431,27 +412,6 @@ void ieee80211_debugfs_key_remove_mgmt_default(struct ieee80211_sub_if_data *sda
}
void
-ieee80211_debugfs_key_add_beacon_default(struct ieee80211_sub_if_data *sdata)
-{
- char buf[50];
- struct ieee80211_key *key;
-
- if (!sdata->vif.debugfs_dir)
- return;
-
- key = wiphy_dereference(sdata->local->hw.wiphy,
- sdata->deflink.default_beacon_key);
- if (key) {
- sprintf(buf, "../keys/%d", key->debugfs.cnt);
- sdata->debugfs.default_beacon_key =
- debugfs_create_symlink("default_beacon_key",
- sdata->vif.debugfs_dir, buf);
- } else {
- ieee80211_debugfs_key_remove_beacon_default(sdata);
- }
-}
-
-void
ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata)
{
if (!sdata)
@@ -460,10 +420,3 @@ ieee80211_debugfs_key_remove_beacon_default(struct ieee80211_sub_if_data *sdata)
debugfs_remove(sdata->debugfs.default_beacon_key);
sdata->debugfs.default_beacon_key = NULL;
}
-
-void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
- struct sta_info *sta)
-{
- debugfs_remove(key->debugfs.stalink);
- key->debugfs.stalink = NULL;
-}
diff --git a/net/mac80211/debugfs_key.h b/net/mac80211/debugfs_key.h
index af7cf495f8d1..e17a48d5c6cc 100644
--- a/net/mac80211/debugfs_key.h
+++ b/net/mac80211/debugfs_key.h
@@ -6,16 +6,10 @@
void ieee80211_debugfs_key_add(struct ieee80211_key *key);
void ieee80211_debugfs_key_remove(struct ieee80211_key *key);
void ieee80211_debugfs_key_update_default(struct ieee80211_sub_if_data *sdata);
-void ieee80211_debugfs_key_add_mgmt_default(
- struct ieee80211_sub_if_data *sdata);
void ieee80211_debugfs_key_remove_mgmt_default(
struct ieee80211_sub_if_data *sdata);
-void ieee80211_debugfs_key_add_beacon_default(
- struct ieee80211_sub_if_data *sdata);
void ieee80211_debugfs_key_remove_beacon_default(
struct ieee80211_sub_if_data *sdata);
-void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
- struct sta_info *sta);
#else
static inline void ieee80211_debugfs_key_add(struct ieee80211_key *key)
{}
@@ -24,21 +18,12 @@ static inline void ieee80211_debugfs_key_remove(struct ieee80211_key *key)
static inline void ieee80211_debugfs_key_update_default(
struct ieee80211_sub_if_data *sdata)
{}
-static inline void ieee80211_debugfs_key_add_mgmt_default(
- struct ieee80211_sub_if_data *sdata)
-{}
static inline void ieee80211_debugfs_key_remove_mgmt_default(
struct ieee80211_sub_if_data *sdata)
{}
-static inline void ieee80211_debugfs_key_add_beacon_default(
- struct ieee80211_sub_if_data *sdata)
-{}
static inline void ieee80211_debugfs_key_remove_beacon_default(
struct ieee80211_sub_if_data *sdata)
{}
-static inline void ieee80211_debugfs_key_sta_del(struct ieee80211_key *key,
- struct sta_info *sta)
-{}
#endif
#endif /* __MAC80211_DEBUGFS_KEY_H */
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index e7687a7b1683..54c479910d05 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -1025,16 +1025,7 @@ void ieee80211_debugfs_remove_netdev(struct ieee80211_sub_if_data *sdata)
void ieee80211_debugfs_rename_netdev(struct ieee80211_sub_if_data *sdata)
{
- struct dentry *dir;
- char buf[10 + IFNAMSIZ];
-
- dir = sdata->vif.debugfs_dir;
-
- if (IS_ERR_OR_NULL(dir))
- return;
-
- sprintf(buf, "netdev:%s", sdata->name);
- debugfs_rename(dir->d_parent, dir, dir->d_parent, buf);
+ debugfs_change_name(sdata->vif.debugfs_dir, "netdev:%s", sdata->name);
}
void ieee80211_debugfs_recreate_netdev(struct ieee80211_sub_if_data *sdata,
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index 299d38e9e863..35349a7f16cb 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -116,8 +116,14 @@ void drv_remove_interface(struct ieee80211_local *local,
sdata->flags &= ~IEEE80211_SDATA_IN_DRIVER;
- /* Remove driver debugfs entries */
- ieee80211_debugfs_recreate_netdev(sdata, sdata->vif.valid_links);
+ /*
+ * Remove driver debugfs entries.
+ * The virtual monitor interface doesn't get a debugfs
+ * entry, so it's exempt here.
+ */
+ if (sdata != rcu_access_pointer(local->monitor_sdata))
+ ieee80211_debugfs_recreate_netdev(sdata,
+ sdata->vif.valid_links);
trace_drv_remove_interface(local, sdata);
local->ops->remove_interface(&local->hw, &sdata->vif);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index ca04f2ff9f44..5acecc7bd4a9 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1276,7 +1276,8 @@ static inline u32 drv_get_expected_throughput(struct ieee80211_local *local,
}
static inline int drv_get_txpower(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata, int *dbm)
+ struct ieee80211_sub_if_data *sdata,
+ unsigned int link_id, int *dbm)
{
int ret;
@@ -1286,8 +1287,8 @@ static inline int drv_get_txpower(struct ieee80211_local *local,
if (!local->ops->get_txpower)
return -EOPNOTSUPP;
- ret = local->ops->get_txpower(&local->hw, &sdata->vif, dbm);
- trace_drv_get_txpower(local, sdata, *dbm, ret);
+ ret = local->ops->get_txpower(&local->hw, &sdata->vif, link_id, dbm);
+ trace_drv_get_txpower(local, sdata, link_id, *dbm, ret);
return ret;
}
diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c
index 7a3116c36df9..fd41046e3b68 100644
--- a/net/mac80211/eht.c
+++ b/net/mac80211/eht.c
@@ -2,7 +2,7 @@
/*
* EHT handling
*
- * Copyright(c) 2021-2024 Intel Corporation
+ * Copyright(c) 2021-2025 Intel Corporation
*/
#include "ieee80211_i.h"
@@ -76,6 +76,13 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
link_sta->cur_max_bandwidth = ieee80211_sta_cap_rx_bw(link_sta);
link_sta->pub->bandwidth = ieee80211_sta_cur_vht_bw(link_sta);
+ /*
+ * The MPDU length bits are reserved on all but 2.4 GHz and get set via
+ * VHT (5 GHz) or HE (6 GHz) capabilities.
+ */
+ if (sband->band != NL80211_BAND_2GHZ)
+ return;
+
switch (u8_get_bits(eht_cap->eht_cap_elem.mac_cap_info[0],
IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_MASK)) {
case IEEE80211_EHT_MAC_CAP0_MAX_MPDU_LEN_11454:
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 99f6174a9d69..42f7ee142ce3 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -19,16 +19,13 @@ static int ieee80211_set_ringparam(struct net_device *dev,
struct netlink_ext_ack *extack)
{
struct ieee80211_local *local = wiphy_priv(dev->ieee80211_ptr->wiphy);
- int ret;
if (rp->rx_mini_pending != 0 || rp->rx_jumbo_pending != 0)
return -EINVAL;
- wiphy_lock(local->hw.wiphy);
- ret = drv_set_ringparam(local, rp->tx_pending, rp->rx_pending);
- wiphy_unlock(local->hw.wiphy);
+ guard(wiphy)(local->hw.wiphy);
- return ret;
+ return drv_set_ringparam(local, rp->tx_pending, rp->rx_pending);
}
static void ieee80211_get_ringparam(struct net_device *dev,
@@ -40,10 +37,10 @@ static void ieee80211_get_ringparam(struct net_device *dev,
memset(rp, 0, sizeof(*rp));
- wiphy_lock(local->hw.wiphy);
+ guard(wiphy)(local->hw.wiphy);
+
drv_get_ringparam(local, &rp->tx_pending, &rp->tx_max_pending,
&rp->rx_pending, &rp->rx_max_pending);
- wiphy_unlock(local->hw.wiphy);
}
static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = {
@@ -109,7 +106,7 @@ static void ieee80211_get_stats(struct net_device *dev,
* network device.
*/
- wiphy_lock(local->hw.wiphy);
+ guard(wiphy)(local->hw.wiphy);
if (sdata->vif.type == NL80211_IFTYPE_STATION) {
sta = sta_info_get_bss(sdata, sdata->deflink.u.mgd.bssid);
@@ -160,6 +157,10 @@ do_survey:
chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
if (chanctx_conf)
channel = chanctx_conf->def.chan;
+ else if (local->open_count > 0 &&
+ local->open_count == local->monitors &&
+ sdata->vif.type == NL80211_IFTYPE_MONITOR)
+ channel = local->monitor_chanreq.oper.chan;
else
channel = NULL;
rcu_read_unlock();
@@ -205,13 +206,10 @@ do_survey:
else
data[i++] = -1LL;
- if (WARN_ON(i != STA_STATS_LEN)) {
- wiphy_unlock(local->hw.wiphy);
+ if (WARN_ON(i != STA_STATS_LEN))
return;
- }
drv_get_et_stats(sdata, stats, &(data[STA_STATS_LEN]));
- wiphy_unlock(local->hw.wiphy);
}
static void ieee80211_get_strings(struct net_device *dev, u32 sset, u8 *data)
diff --git a/net/mac80211/he.c b/net/mac80211/he.c
index ecbb042dd043..5792ef77e986 100644
--- a/net/mac80211/he.c
+++ b/net/mac80211/he.c
@@ -3,10 +3,11 @@
* HE handling
*
* Copyright(c) 2017 Intel Deutschland GmbH
- * Copyright(c) 2019 - 2023 Intel Corporation
+ * Copyright(c) 2019 - 2024 Intel Corporation
*/
#include "ieee80211_i.h"
+#include "rate.h"
static void
ieee80211_update_from_he_6ghz_capa(const struct ieee80211_he_6ghz_capa *he_6ghz_capa,
@@ -248,3 +249,119 @@ ieee80211_he_spr_ie_to_bss_conf(struct ieee80211_vif *vif,
he_obss_pd->enable = true;
}
}
+
+static void ieee80211_link_sta_rc_update_omi(struct ieee80211_link_data *link,
+ struct link_sta_info *link_sta)
+{
+ struct ieee80211_sub_if_data *sdata = link->sdata;
+ struct ieee80211_supported_band *sband;
+ enum ieee80211_sta_rx_bandwidth new_bw;
+ enum nl80211_band band;
+
+ band = link->conf->chanreq.oper.chan->band;
+ sband = sdata->local->hw.wiphy->bands[band];
+
+ new_bw = ieee80211_sta_cur_vht_bw(link_sta);
+ if (link_sta->pub->bandwidth == new_bw)
+ return;
+
+ link_sta->pub->bandwidth = new_bw;
+ rate_control_rate_update(sdata->local, sband, link_sta,
+ IEEE80211_RC_BW_CHANGED);
+}
+
+bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta,
+ enum ieee80211_sta_rx_bandwidth bw)
+{
+ struct sta_info *sta = container_of(pub_link_sta->sta,
+ struct sta_info, sta);
+ struct ieee80211_local *local = sta->sdata->local;
+ struct link_sta_info *link_sta =
+ sdata_dereference(sta->link[pub_link_sta->link_id], sta->sdata);
+ struct ieee80211_link_data *link =
+ sdata_dereference(sta->sdata->link[pub_link_sta->link_id],
+ sta->sdata);
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *chanctx;
+ bool ret;
+
+ if (WARN_ON(!link || !link_sta || link_sta->pub != pub_link_sta))
+ return false;
+
+ conf = sdata_dereference(link->conf->chanctx_conf, sta->sdata);
+ if (WARN_ON(!conf))
+ return false;
+
+ trace_api_prepare_rx_omi_bw(local, sta->sdata, link_sta, bw);
+
+ chanctx = container_of(conf, typeof(*chanctx), conf);
+
+ if (link_sta->rx_omi_bw_staging == bw) {
+ ret = false;
+ goto trace;
+ }
+
+ /* must call this API in pairs */
+ if (WARN_ON(link_sta->rx_omi_bw_tx != link_sta->rx_omi_bw_staging ||
+ link_sta->rx_omi_bw_rx != link_sta->rx_omi_bw_staging)) {
+ ret = false;
+ goto trace;
+ }
+
+ if (bw < link_sta->rx_omi_bw_staging) {
+ link_sta->rx_omi_bw_tx = bw;
+ ieee80211_link_sta_rc_update_omi(link, link_sta);
+ } else {
+ link_sta->rx_omi_bw_rx = bw;
+ ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false);
+ }
+
+ link_sta->rx_omi_bw_staging = bw;
+ ret = true;
+trace:
+ trace_api_return_bool(local, ret);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(ieee80211_prepare_rx_omi_bw);
+
+void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta)
+{
+ struct sta_info *sta = container_of(pub_link_sta->sta,
+ struct sta_info, sta);
+ struct ieee80211_local *local = sta->sdata->local;
+ struct link_sta_info *link_sta =
+ sdata_dereference(sta->link[pub_link_sta->link_id], sta->sdata);
+ struct ieee80211_link_data *link =
+ sdata_dereference(sta->sdata->link[pub_link_sta->link_id],
+ sta->sdata);
+ struct ieee80211_chanctx_conf *conf;
+ struct ieee80211_chanctx *chanctx;
+
+ if (WARN_ON(!link || !link_sta || link_sta->pub != pub_link_sta))
+ return;
+
+ conf = sdata_dereference(link->conf->chanctx_conf, sta->sdata);
+ if (WARN_ON(!conf))
+ return;
+
+ trace_api_finalize_rx_omi_bw(local, sta->sdata, link_sta);
+
+ chanctx = container_of(conf, typeof(*chanctx), conf);
+
+ if (link_sta->rx_omi_bw_tx != link_sta->rx_omi_bw_staging) {
+ /* rate control in finalize only when widening bandwidth */
+ WARN_ON(link_sta->rx_omi_bw_tx > link_sta->rx_omi_bw_staging);
+ link_sta->rx_omi_bw_tx = link_sta->rx_omi_bw_staging;
+ ieee80211_link_sta_rc_update_omi(link, link_sta);
+ }
+
+ if (link_sta->rx_omi_bw_rx != link_sta->rx_omi_bw_staging) {
+ /* channel context in finalize only when narrowing bandwidth */
+ WARN_ON(link_sta->rx_omi_bw_rx < link_sta->rx_omi_bw_staging);
+ link_sta->rx_omi_bw_rx = link_sta->rx_omi_bw_staging;
+ ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false);
+ }
+
+ trace_api_return_void(local);
+}
+EXPORT_SYMBOL_GPL(ieee80211_finalize_rx_omi_bw);
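A minimal sketch of how a driver might pair the two OMI calls exported above; drv_update_rx_omi_bw() and drv_send_omi_to_fw() are hypothetical names, only ieee80211_prepare_rx_omi_bw()/ieee80211_finalize_rx_omi_bw() come from the code above.

static void drv_update_rx_omi_bw(struct ieee80211_link_sta *link_sta,
				 enum ieee80211_sta_rx_bandwidth bw)
{
	/* nothing to do if the OM bandwidth did not actually change */
	if (!ieee80211_prepare_rx_omi_bw(link_sta, bw))
		return;

	/* apply the new operating-mode RX bandwidth in firmware/hardware */
	drv_send_omi_to_fw(link_sta, bw);

	/* complete the pair: mac80211 now performs whichever half was
	 * deferred - widening TX rate control or narrowing the chanctx
	 */
	ieee80211_finalize_rx_omi_bw(link_sta);
}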
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index a1b4178deccf..05a945df3259 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -245,6 +245,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
sdata->vif.cfg.ibss_creator = false;
sdata->vif.bss_conf.enable_beacon = false;
netif_carrier_off(sdata->dev);
+ synchronize_net();
ieee80211_bss_info_change_notify(sdata,
BSS_CHANGED_IBSS |
BSS_CHANGED_BEACON_ENABLED);
@@ -1826,8 +1827,8 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_if_ibss *ifibss = &sdata->u.ibss;
- ieee80211_ibss_disconnect(sdata);
ifibss->ssid_len = 0;
+ ieee80211_ibss_disconnect(sdata);
eth_zero_addr(ifibss->bssid);
/* remove beacon */
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index c39b813a8199..e7dc3f0cfc9a 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -404,6 +404,8 @@ struct ieee80211_mgd_auth_data {
int tries;
u16 algorithm, expected_transaction;
+ unsigned long userspace_selectors[BITS_TO_LONGS(128)];
+
u8 key[WLAN_KEY_LEN_WEP104];
u8 key_len, key_idx;
bool done, waiting;
@@ -444,6 +446,8 @@ struct ieee80211_mgd_assoc_data {
const u8 *supp_rates;
u8 supp_rates_len;
+ unsigned long userspace_selectors[BITS_TO_LONGS(128)];
+
unsigned long timeout;
int tries;
@@ -602,6 +606,15 @@ struct ieee80211_if_managed {
/* dialog token enumerator for neg TTLM request */
u8 dialog_token_alloc;
struct wiphy_delayed_work neg_ttlm_timeout_work;
+
+ /* Locally initiated multi-link reconfiguration */
+ struct {
+ struct ieee80211_mgd_assoc_data *add_links_data;
+ struct wiphy_delayed_work wk;
+ u16 removed_links;
+ u16 added_links;
+ u8 dialog_token;
+ } reconf;
};
struct ieee80211_if_ibss {
@@ -1204,7 +1217,7 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p)
for (int ___link_id = 0; \
___link_id < ARRAY_SIZE(___sdata->link); \
___link_id++) \
- if ((_link = wiphy_dereference((local)->hw.wiphy, \
+ if ((_link = wiphy_dereference((_local)->hw.wiphy, \
___sdata->link[___link_id])))
static inline int
@@ -2111,8 +2124,6 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata,
int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata,
enum ieee80211_smps_mode smps, const u8 *da,
const u8 *bssid, int link_id);
-bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
- enum ieee80211_smps_mode smps_mode_new);
void ieee80211_add_addbaext(struct sk_buff *skb,
const u8 req_addba_ext_data,
u16 buf_size);
@@ -2768,6 +2779,12 @@ void ieee80211_check_wbrf_support(struct ieee80211_local *local);
void ieee80211_add_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef);
void ieee80211_remove_wbrf(struct ieee80211_local *local, struct cfg80211_chan_def *chandef);
+int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_assoc_link *add_links,
+ u16 rem_links);
+
+void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len);
#if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST)
#define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym)
#define VISIBLE_IF_MAC80211_KUNIT
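A minimal sketch (with an assumed helper name; the selector constants are from ieee80211.h) of how the userspace_selectors bitmaps added above are consumed later in this diff: BSS membership selectors arrive in the (Extended) Supported Rates element as 0x80 | value, are collected into a 128-bit bitmap, and the connection is only allowed when they form a subset of what is supported locally.

static bool selectors_supported(const unsigned long *required)
{
	unsigned long supported[BITS_TO_LONGS(128)] = {};

	/* e.g. SAE-H2E is selector 123, carried on air as the octet 0xfb */
	set_bit(BSS_MEMBERSHIP_SELECTOR_SAE_H2E, supported);
	set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, supported);

	/* reject the BSS if it requires a selector we do not support */
	return bitmap_subset(required, supported, 128);
}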
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 806dffa48ef9..d299bdbca6b3 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -8,7 +8,7 @@
* Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (c) 2016 Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
*/
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -295,7 +295,6 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
- int ret;
/*
* This happens during unregistration if there's a bond device
@@ -305,11 +304,9 @@ static int ieee80211_change_mac(struct net_device *dev, void *addr)
if (!dev->ieee80211_ptr->registered)
return 0;
- wiphy_lock(local->hw.wiphy);
- ret = _ieee80211_change_mac(sdata, addr);
- wiphy_unlock(local->hw.wiphy);
+ guard(wiphy)(local->hw.wiphy);
- return ret;
+ return _ieee80211_change_mac(sdata, addr);
}
static inline int identical_mac_addr_allowed(int type1, int type2)
@@ -445,16 +442,13 @@ static int ieee80211_open(struct net_device *dev)
if (!is_valid_ether_addr(dev->dev_addr))
return -EADDRNOTAVAIL;
- wiphy_lock(sdata->local->hw.wiphy);
+ guard(wiphy)(sdata->local->hw.wiphy);
+
err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type);
if (err)
- goto out;
-
- err = ieee80211_do_open(&sdata->wdev, true);
-out:
- wiphy_unlock(sdata->local->hw.wiphy);
+ return err;
- return err;
+ return ieee80211_do_open(&sdata->wdev, true);
}
static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_down)
@@ -666,6 +660,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
ieee80211_txq_remove_vlan(local, sdata);
+ if (sdata->vif.txq)
+ ieee80211_txq_purge(sdata->local, to_txq_info(sdata->vif.txq));
+
sdata->bss = NULL;
if (local->open_count == 0)
@@ -775,11 +772,11 @@ static int ieee80211_stop(struct net_device *dev)
ieee80211_stop_mbssid(sdata);
}
- wiphy_lock(sdata->local->hw.wiphy);
+ guard(wiphy)(sdata->local->hw.wiphy);
+
wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->activate_links_work);
ieee80211_do_stop(sdata, true);
- wiphy_unlock(sdata->local->hw.wiphy);
return 0;
}
@@ -813,6 +810,9 @@ static void ieee80211_set_multicast_list(struct net_device *dev)
*/
static void ieee80211_teardown_sdata(struct ieee80211_sub_if_data *sdata)
{
+ if (WARN_ON(!list_empty(&sdata->work.entry)))
+ wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work);
+
/* free extra data */
ieee80211_free_keys(sdata, false);
@@ -1212,16 +1212,17 @@ void ieee80211_del_virtual_monitor(struct ieee80211_local *local)
return;
}
- RCU_INIT_POINTER(local->monitor_sdata, NULL);
- mutex_unlock(&local->iflist_mtx);
-
- synchronize_net();
-
+ clear_bit(SDATA_STATE_RUNNING, &sdata->state);
ieee80211_link_release_channel(&sdata->deflink);
if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF))
drv_remove_interface(local, sdata);
+ RCU_INIT_POINTER(local->monitor_sdata, NULL);
+ mutex_unlock(&local->iflist_mtx);
+
+ synchronize_net();
+
kfree(sdata);
}
@@ -1566,6 +1567,10 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
ieee80211_process_neg_ttlm_res(sdata, mgmt,
skb->len);
break;
+ case WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP:
+ ieee80211_process_ml_reconf_resp(sdata, mgmt,
+ skb->len);
+ break;
default:
break;
}
@@ -2279,7 +2284,7 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
*/
cfg80211_shutdown_all_interfaces(local->hw.wiphy);
- wiphy_lock(local->hw.wiphy);
+ guard(wiphy)(local->hw.wiphy);
WARN(local->open_count, "%s: open count remains %d\n",
wiphy_name(local->hw.wiphy), local->open_count);
@@ -2309,7 +2314,6 @@ void ieee80211_remove_interfaces(struct ieee80211_local *local)
if (!netdev)
kfree(sdata);
}
- wiphy_unlock(local->hw.wiphy);
}
static int netdev_notify(struct notifier_block *nb,
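For reference, a minimal sketch of the scope-based guard(wiphy) pattern the conversions above rely on; example_op() and its callees are hypothetical. The guard drops the wiphy mutex on every return path, which is what allows the unlock/error-label unwinding to be deleted.

static int example_op(struct wiphy *wiphy)
{
	guard(wiphy)(wiphy);		/* wiphy_lock(wiphy) */

	if (!example_precondition())	/* hypothetical check */
		return -EBUSY;		/* mutex is released here too */

	return example_do_locked();	/* ... and on this path */
}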
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 67ecfea22982..dcf8643a0baa 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -1409,7 +1409,7 @@ ieee80211_gtk_rekey_add(struct ieee80211_vif *vif,
if (sdata->u.mgd.mfp != IEEE80211_MFP_DISABLED)
key->conf.flags |= IEEE80211_KEY_FLAG_RX_MGMT;
- key->conf.link_id = link_id;
+ key->conf.link_id = link_data->link_id;
err = ieee80211_key_link(key, link_data, NULL);
if (err)
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index ee1211a213d7..53e5aee46885 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
*/
#include <net/mac80211.h>
@@ -726,8 +726,13 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
},
[NL80211_IFTYPE_P2P_DEVICE] = {
.tx = 0xffff,
+ /*
+ * To support P2P PASN pairing, let user space also register to
+ * receive AUTH frames on the P2P Device interface.
+ */
.rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
- BIT(IEEE80211_STYPE_PROBE_REQ >> 4),
+ BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4),
},
};
@@ -1305,6 +1310,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
sband->ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 &&
!(iftd->he_cap.he_cap_elem.phy_cap_info[0] & he_40_mhz_cap))
return -EINVAL;
+
+ /* no support for per-band vendor elems with MLO */
+ if (WARN_ON(iftd->vendor_elems.len &&
+ hw->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO))
+ return -EINVAL;
}
/* HT, VHT, HE require QoS, thus >= 4 queues */
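Worked out with the subtype values from ieee80211.h, the new P2P-Device RX registration mask above evaluates to:

/*
 *   IEEE80211_STYPE_ACTION    = 0x00d0  ->  bit 13  ->  0x2000
 *   IEEE80211_STYPE_PROBE_REQ = 0x0040  ->  bit  4  ->  0x0010
 *   IEEE80211_STYPE_AUTH      = 0x00b0  ->  bit 11  ->  0x0800
 *
 *   .rx = 0x2000 | 0x0010 | 0x0800 = 0x2810
 */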
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index 4e9546e998b6..c94a9c7ca960 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -367,6 +367,12 @@ u32 airtime_link_metric_get(struct ieee80211_local *local,
return (u32)result;
}
+/* Check that the first metric is at least 10% better than the second one */
+static bool is_metric_better(u32 x, u32 y)
+{
+ return (x < y) && (x < (y - x / 10));
+}
+
/**
* hwmp_route_info_get - Update routing info to originator and transmitter
*
@@ -458,8 +464,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
(mpath->sn == orig_sn &&
(rcu_access_pointer(mpath->next_hop) !=
sta ?
- mult_frac(new_metric, 10, 9) :
- new_metric) >= mpath->metric)) {
+ !is_metric_better(new_metric, mpath->metric) :
+ new_metric >= mpath->metric))) {
process = false;
fresh_info = false;
}
@@ -533,8 +539,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata,
if ((mpath->flags & MESH_PATH_FIXED) ||
((mpath->flags & MESH_PATH_ACTIVE) &&
((rcu_access_pointer(mpath->next_hop) != sta ?
- mult_frac(last_hop_metric, 10, 9) :
- last_hop_metric) > mpath->metric)))
+ !is_metric_better(last_hop_metric, mpath->metric) :
+ last_hop_metric > mpath->metric))))
fresh_info = false;
} else {
mpath = mesh_path_add(sdata, ta);
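Two concrete evaluations of is_metric_better() above, which replaces the old mult_frac(new_metric, 10, 9) comparison:

/*
 *   is_metric_better(90, 100):  90 < 100 && 90 < 100 - 90/10 (= 91)  -> true
 *   is_metric_better(95, 100):  95 < 100 but 95 >= 100 - 95/10 (= 91) -> false
 *
 * i.e. with integer division the new metric has to undercut the old one by
 * roughly 10% before the path is preferred.
 */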
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index 6ea35c88dc48..5a0156e11c91 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -432,15 +432,14 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata,
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
- u32 rates, basic_rates = 0, changed = 0;
+ u32 rates, changed = 0;
enum ieee80211_sta_rx_bandwidth bw = sta->sta.deflink.bandwidth;
sband = ieee80211_get_sband(sdata);
if (!sband)
return;
- rates = ieee80211_sta_get_rates(sdata, elems, sband->band,
- &basic_rates);
+ rates = ieee80211_sta_get_rates(sdata, elems, sband->band, NULL);
spin_lock_bh(&sta->mesh->plink_lock);
sta->deflink.rx_stats.last_rx = jiffies;
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 0e3db0c2920b..99e9b03d7fe1 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -168,6 +168,9 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
bool no_vht = false;
u32 ht_cfreq;
+ if (ieee80211_hw_check(&sdata->local->hw, STRICT))
+ ignore_ht_channel_mismatch = false;
+
*chandef = (struct cfg80211_chan_def) {
.chan = channel,
.width = NL80211_CHAN_WIDTH_20_NOHT,
@@ -344,6 +347,7 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
static bool
ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata,
+ int link_id,
const struct ieee80211_he_cap_elem *he_cap,
const struct ieee80211_he_operation *he_op)
{
@@ -371,9 +375,9 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata,
*/
if ((mcs_80_map_tx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED ||
(mcs_80_map_rx & 0x3) == IEEE80211_HE_MCS_NOT_SUPPORTED) {
- sdata_info(sdata,
- "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n",
- mcs_80_map_tx, mcs_80_map_rx);
+ link_id_info(sdata, link_id,
+ "Missing mandatory rates for 1 Nss, rx 0x%x, tx 0x%x, disable HE\n",
+ mcs_80_map_tx, mcs_80_map_rx);
return false;
}
@@ -387,7 +391,7 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata,
* zeroes, which is nonsense, and completely inconsistent with itself
* (it doesn't have 8 streams). Accept the settings in this case anyway.
*/
- if (!ap_min_req_set)
+ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set)
return true;
/* make sure the AP is consistent with itself
@@ -417,9 +421,9 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata,
if (ap_rx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
ap_tx_val == IEEE80211_HE_MCS_NOT_SUPPORTED ||
ap_rx_val < ap_op_val || ap_tx_val < ap_op_val) {
- sdata_info(sdata,
- "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n",
- nss, ap_rx_val, ap_rx_val, ap_op_val);
+ link_id_info(sdata, link_id,
+ "Invalid rates for %d Nss, rx %d, tx %d oper %d, disable HE\n",
+ nss, ap_rx_val, ap_tx_val, ap_op_val);
return false;
}
}
@@ -447,7 +451,7 @@ ieee80211_verify_sta_he_mcs_support(struct ieee80211_sub_if_data *sdata,
* zeroes, which is nonsense, and completely inconsistent with itself
* (it doesn't have 8 streams). Accept the settings in this case anyway.
*/
- if (!ap_min_req_set)
+ if (!ieee80211_hw_check(&sdata->local->hw, STRICT) && !ap_min_req_set)
return true;
/* Need to go over for 80MHz, 160MHz and for 80+80 */
@@ -589,6 +593,68 @@ ieee80211_verify_sta_eht_mcs_support(struct ieee80211_sub_if_data *sdata,
return true;
}
+static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
+ const u8 *supp_rates,
+ unsigned int supp_rates_len,
+ const u8 *ext_supp_rates,
+ unsigned int ext_supp_rates_len,
+ u32 *rates, u32 *basic_rates,
+ unsigned long *unknown_rates_selectors,
+ bool *have_higher_than_11mbit,
+ int *min_rate, int *min_rate_index)
+{
+ int i, j;
+
+ for (i = 0; i < supp_rates_len + ext_supp_rates_len; i++) {
+ u8 supp_rate = i < supp_rates_len ?
+ supp_rates[i] :
+ ext_supp_rates[i - supp_rates_len];
+ int rate = supp_rate & 0x7f;
+ bool is_basic = !!(supp_rate & 0x80);
+
+ if ((rate * 5) > 110 && have_higher_than_11mbit)
+ *have_higher_than_11mbit = true;
+
+ /*
+ * Skip membership selectors since they're not rates.
+ *
+ * Note: Even though the membership selector and the basic
+ * rate flag share the same bit, they are not exactly
+ * the same.
+ */
+ if (is_basic && rate >= BSS_MEMBERSHIP_SELECTOR_MIN) {
+ if (unknown_rates_selectors)
+ set_bit(rate, unknown_rates_selectors);
+ continue;
+ }
+
+ for (j = 0; j < sband->n_bitrates; j++) {
+ struct ieee80211_rate *br;
+ int brate;
+
+ br = &sband->bitrates[j];
+
+ brate = DIV_ROUND_UP(br->bitrate, 5);
+ if (brate == rate) {
+ if (rates)
+ *rates |= BIT(j);
+ if (is_basic && basic_rates)
+ *basic_rates |= BIT(j);
+ if (min_rate && (rate * 5) < *min_rate) {
+ *min_rate = rate * 5;
+ if (min_rate_index)
+ *min_rate_index = j;
+ }
+ break;
+ }
+ }
+
+ /* Handle an unknown entry as if it is an unknown selector */
+ if (is_basic && unknown_rates_selectors && j == sband->n_bitrates)
+ set_bit(rate, unknown_rates_selectors);
+ }
+}
+
static bool ieee80211_chandef_usable(struct ieee80211_sub_if_data *sdata,
const struct cfg80211_chan_def *chandef,
u32 prohibited_flags)
@@ -819,7 +885,8 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata,
struct ieee80211_conn_settings *conn,
struct cfg80211_bss *cbss, int link_id,
struct ieee80211_chan_req *chanreq,
- struct cfg80211_chan_def *ap_chandef)
+ struct cfg80211_chan_def *ap_chandef,
+ unsigned long *userspace_selectors)
{
const struct cfg80211_bss_ies *ies = rcu_dereference(cbss->ies);
struct ieee80211_bss *bss = (void *)cbss->priv;
@@ -833,6 +900,8 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata,
struct ieee802_11_elems *elems;
struct ieee80211_supported_band *sband;
enum ieee80211_conn_mode ap_mode;
+ unsigned long unknown_rates_selectors[BITS_TO_LONGS(128)] = {};
+ unsigned long sta_selectors[BITS_TO_LONGS(128)] = {};
int ret;
again:
@@ -861,6 +930,11 @@ again:
sband = sdata->local->hw.wiphy->bands[channel->band];
+ ieee80211_get_rates(sband, elems->supp_rates, elems->supp_rates_len,
+ elems->ext_supp_rates, elems->ext_supp_rates_len,
+ NULL, NULL, unknown_rates_selectors, NULL, NULL,
+ NULL);
+
switch (channel->band) {
case NL80211_BAND_S1GHZ:
if (WARN_ON(ap_mode != IEEE80211_CONN_MODE_S1G)) {
@@ -870,8 +944,8 @@ again:
return elems;
case NL80211_BAND_6GHZ:
if (ap_mode < IEEE80211_CONN_MODE_HE) {
- sdata_info(sdata,
- "Rejecting non-HE 6/7 GHz connection");
+ link_id_info(sdata, link_id,
+ "Rejecting non-HE 6/7 GHz connection");
ret = -EINVAL;
goto free;
}
@@ -911,6 +985,29 @@ again:
chanreq->oper = *ap_chandef;
+ bitmap_copy(sta_selectors, userspace_selectors, 128);
+ if (conn->mode >= IEEE80211_CONN_MODE_HT)
+ set_bit(BSS_MEMBERSHIP_SELECTOR_HT_PHY, sta_selectors);
+ if (conn->mode >= IEEE80211_CONN_MODE_VHT)
+ set_bit(BSS_MEMBERSHIP_SELECTOR_VHT_PHY, sta_selectors);
+ if (conn->mode >= IEEE80211_CONN_MODE_HE)
+ set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors);
+ if (conn->mode >= IEEE80211_CONN_MODE_EHT)
+ set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors);
+
+ /*
+ * We do not support EPD or GLK so never add them.
+ * SAE_H2E is handled through userspace_selectors.
+ */
+
+ /* Check if we support all required features */
+ if (!bitmap_subset(unknown_rates_selectors, sta_selectors, 128)) {
+ link_id_info(sdata, link_id,
+ "required basic rate or BSS membership selectors not supported or disabled, rejecting connection\n");
+ ret = -EINVAL;
+ goto free;
+ }
+
ieee80211_set_chanreq_ap(sdata, chanreq, conn, ap_chandef);
while (!ieee80211_chandef_usable(sdata, &chanreq->oper,
@@ -942,16 +1039,18 @@ again:
}
if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode)
- sdata_info(sdata,
- "regulatory prevented using AP config, downgraded\n");
+ link_id_info(sdata, link_id,
+ "regulatory prevented using AP config, downgraded\n");
if (conn->mode >= IEEE80211_CONN_MODE_HE &&
- (!ieee80211_verify_peer_he_mcs_support(sdata, (void *)elems->he_cap,
+ (!ieee80211_verify_peer_he_mcs_support(sdata, link_id,
+ (void *)elems->he_cap,
elems->he_operation) ||
!ieee80211_verify_sta_he_mcs_support(sdata, sband,
elems->he_operation))) {
conn->mode = IEEE80211_CONN_MODE_VHT;
- sdata_info(sdata, "required MCSes not supported, disabling HE\n");
+ link_id_info(sdata, link_id,
+ "required MCSes not supported, disabling HE\n");
}
if (conn->mode >= IEEE80211_CONN_MODE_EHT &&
@@ -961,7 +1060,8 @@ again:
conn->bw_limit = min_t(enum ieee80211_conn_bw_limit,
conn->bw_limit,
IEEE80211_CONN_BW_LIMIT_160);
- sdata_info(sdata, "required MCSes not supported, disabling EHT\n");
+ link_id_info(sdata, link_id,
+ "required MCSes not supported, disabling EHT\n");
}
/* the mode can only decrease, so this must terminate */
@@ -988,7 +1088,8 @@ free:
static int ieee80211_config_bw(struct ieee80211_link_data *link,
struct ieee802_11_elems *elems,
- bool update, u64 *changed)
+ bool update, u64 *changed,
+ const char *frame)
{
struct ieee80211_channel *channel = link->conf->chanreq.oper.chan;
struct ieee80211_sub_if_data *sdata = link->sdata;
@@ -1013,9 +1114,10 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
if (ap_mode != link->u.mgd.conn.mode) {
link_info(link,
- "AP appears to change mode (expected %s, found %s), disconnect\n",
+ "AP %pM appears to change mode (expected %s, found %s) in %s, disconnect\n",
+ link->u.mgd.bssid,
ieee80211_conn_mode_str(link->u.mgd.conn.mode),
- ieee80211_conn_mode_str(ap_mode));
+ ieee80211_conn_mode_str(ap_mode), frame);
return -EINVAL;
}
@@ -1060,16 +1162,16 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
return 0;
link_info(link,
- "AP %pM changed bandwidth, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n",
- link->u.mgd.bssid, chanreq.oper.chan->center_freq,
+ "AP %pM changed bandwidth in %s, new used config is %d.%03d MHz, width %d (%d.%03d/%d MHz)\n",
+ link->u.mgd.bssid, frame, chanreq.oper.chan->center_freq,
chanreq.oper.chan->freq_offset, chanreq.oper.width,
chanreq.oper.center_freq1, chanreq.oper.freq1_offset,
chanreq.oper.center_freq2);
if (!cfg80211_chandef_valid(&chanreq.oper)) {
sdata_info(sdata,
- "AP %pM changed caps/bw in a way we can't support - disconnect\n",
- link->u.mgd.bssid);
+ "AP %pM changed caps/bw in %s in a way we can't support - disconnect\n",
+ link->u.mgd.bssid, frame);
return -EINVAL;
}
@@ -1098,8 +1200,8 @@ static int ieee80211_config_bw(struct ieee80211_link_data *link,
ret = ieee80211_link_change_chanreq(link, &chanreq, changed);
if (ret) {
sdata_info(sdata,
- "AP %pM changed bandwidth to incompatible one - disconnect\n",
- link->u.mgd.bssid);
+ "AP %pM changed bandwidth in %s to incompatible one - disconnect\n",
+ link->u.mgd.bssid, frame);
return ret;
}
@@ -1214,13 +1316,15 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
* Some APs apparently get confused if our capabilities are better
* than theirs, so restrict what we advertise in the assoc request.
*/
- if (!(ap_vht_cap->vht_cap_info &
- cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE)))
- cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE |
- IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE);
- else if (!(ap_vht_cap->vht_cap_info &
- cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE)))
- cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
+ if (!ieee80211_hw_check(&local->hw, STRICT)) {
+ if (!(ap_vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE)))
+ cap &= ~(IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE |
+ IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE);
+ else if (!(ap_vht_cap->vht_cap_info &
+ cpu_to_le32(IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE)))
+ cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
+ }
/*
* If some other vif is using the MU-MIMO capability we cannot associate
@@ -1262,14 +1366,16 @@ static bool ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
return mu_mimo_owner;
}
-static void ieee80211_assoc_add_rates(struct sk_buff *skb,
+static void ieee80211_assoc_add_rates(struct ieee80211_local *local,
+ struct sk_buff *skb,
enum nl80211_chan_width width,
struct ieee80211_supported_band *sband,
struct ieee80211_mgd_assoc_data *assoc_data)
{
u32 rates;
- if (assoc_data->supp_rates_len) {
+ if (assoc_data->supp_rates_len &&
+ !ieee80211_hw_check(&local->hw, STRICT)) {
/*
* Get all rates supported by the device and the AP as
* some APs don't like getting a superset of their rates
@@ -1412,23 +1518,25 @@ static size_t ieee80211_add_before_he_elems(struct sk_buff *skb,
#define PRESENT_ELEMS_MAX 8
#define PRESENT_ELEM_EXT_OFFS 0x100
-static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb, u16 capab,
- const struct element *ext_capa,
- const u16 *present_elems);
-
-static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb, u16 *capab,
- const struct element *ext_capa,
- const u8 *extra_elems,
- size_t extra_elems_len,
- unsigned int link_id,
- struct ieee80211_link_data *link,
- u16 *present_elems)
+static void
+ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u16 capab,
+ const struct element *ext_capa,
+ const u16 *present_elems,
+ struct ieee80211_mgd_assoc_data *assoc_data);
+
+static size_t
+ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u16 *capab,
+ const struct element *ext_capa,
+ const u8 *extra_elems,
+ size_t extra_elems_len,
+ unsigned int link_id,
+ struct ieee80211_link_data *link,
+ u16 *present_elems,
+ struct ieee80211_mgd_assoc_data *assoc_data)
{
enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif);
- struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
struct cfg80211_bss *cbss = assoc_data->link[link_id].bss;
struct ieee80211_channel *chan = cbss->channel;
const struct ieee80211_sband_iftype_data *iftd;
@@ -1483,7 +1591,7 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata,
*capab |= WLAN_CAPABILITY_SPECTRUM_MGMT;
if (sband->band != NL80211_BAND_S1GHZ)
- ieee80211_assoc_add_rates(skb, width, sband, assoc_data);
+ ieee80211_assoc_add_rates(local, skb, width, sband, assoc_data);
if (*capab & WLAN_CAPABILITY_SPECTRUM_MGMT ||
*capab & WLAN_CAPABILITY_RADIO_MEASURE) {
@@ -1577,7 +1685,7 @@ static size_t ieee80211_assoc_link_elems(struct ieee80211_sub_if_data *sdata,
if (link_id == assoc_data->assoc_link_id)
ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa,
- present_elems);
+ present_elems, assoc_data);
/* crash if somebody gets it wrong */
present_elems = NULL;
@@ -1656,14 +1764,14 @@ static void ieee80211_add_non_inheritance_elem(struct sk_buff *skb,
*len = skb->len - skb_len - 2;
}
-static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata,
- struct sk_buff *skb, u16 capab,
- const struct element *ext_capa,
- const u16 *outer_present_elems)
+static void
+ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb, u16 capab,
+ const struct element *ext_capa,
+ const u16 *outer_present_elems,
+ struct ieee80211_mgd_assoc_data *assoc_data)
{
struct ieee80211_local *local = sdata->local;
- struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
- struct ieee80211_mgd_assoc_data *assoc_data = ifmgd->assoc_data;
struct ieee80211_multi_link_elem *ml_elem;
struct ieee80211_mle_basic_common_info *common;
const struct wiphy_iftype_ext_capab *ift_ext_capa;
@@ -1736,16 +1844,17 @@ static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata,
* (if applicable) are skipped. So we only have
* the capability field (remember the position and fill
* later), followed by the elements added below by
- * calling ieee80211_assoc_link_elems().
+ * calling ieee80211_add_link_elems().
*/
capab_pos = skb_put(skb, 2);
- extra_used = ieee80211_assoc_link_elems(sdata, skb, &capab,
- ext_capa,
- extra_elems,
- extra_elems_len,
- link_id, NULL,
- link_present_elems);
+ extra_used = ieee80211_add_link_elems(sdata, skb, &capab,
+ ext_capa,
+ extra_elems,
+ extra_elems_len,
+ link_id, NULL,
+ link_present_elems,
+ assoc_data);
if (extra_elems)
skb_put_data(skb, extra_elems + extra_used,
extra_elems_len - extra_used);
@@ -1762,6 +1871,55 @@ static void ieee80211_assoc_add_ml_elem(struct ieee80211_sub_if_data *sdata,
ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT);
}
+static int
+ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata,
+ enum nl80211_iftype iftype,
+ struct cfg80211_bss *cbss,
+ size_t elems_len)
+{
+ struct ieee80211_local *local = sdata->local;
+ const struct ieee80211_sband_iftype_data *iftd;
+ struct ieee80211_supported_band *sband;
+ size_t size = 0;
+
+ if (!cbss)
+ return size;
+
+ sband = local->hw.wiphy->bands[cbss->channel->band];
+
+ /* add STA profile elements length */
+ size += elems_len;
+
+ /* and supported rates length */
+ size += 4 + sband->n_bitrates;
+
+ /* supported channels */
+ size += 2 + 2 * sband->n_channels;
+
+ iftd = ieee80211_get_sband_iftype_data(sband, iftype);
+ if (iftd)
+ size += iftd->vendor_elems.len;
+
+ /* power capability */
+ size += 4;
+
+ /* HT, VHT, HE, EHT */
+ size += 2 + sizeof(struct ieee80211_ht_cap);
+ size += 2 + sizeof(struct ieee80211_vht_cap);
+ size += 2 + 1 + sizeof(struct ieee80211_he_cap_elem) +
+ sizeof(struct ieee80211_he_mcs_nss_supp) +
+ IEEE80211_HE_PPE_THRES_MAX_LEN;
+
+ if (sband->band == NL80211_BAND_6GHZ)
+ size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa);
+
+ size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) +
+ sizeof(struct ieee80211_eht_mcs_nss_supp) +
+ IEEE80211_EHT_PPE_THRES_MAX_LEN;
+
+ return size;
+}
+
static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
{
struct ieee80211_local *local = sdata->local;
@@ -1800,42 +1958,15 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
struct cfg80211_bss *cbss = assoc_data->link[link_id].bss;
- const struct ieee80211_sband_iftype_data *iftd;
- struct ieee80211_supported_band *sband;
+ size_t elems_len = assoc_data->link[link_id].elems_len;
if (!cbss)
continue;
- sband = local->hw.wiphy->bands[cbss->channel->band];
-
n_links++;
- /* add STA profile elements length */
- size += assoc_data->link[link_id].elems_len;
- /* and supported rates length */
- size += 4 + sband->n_bitrates;
- /* supported channels */
- size += 2 + 2 * sband->n_channels;
-
- iftd = ieee80211_get_sband_iftype_data(sband, iftype);
- if (iftd)
- size += iftd->vendor_elems.len;
-
- /* power capability */
- size += 4;
-
- /* HT, VHT, HE, EHT */
- size += 2 + sizeof(struct ieee80211_ht_cap);
- size += 2 + sizeof(struct ieee80211_vht_cap);
- size += 2 + 1 + sizeof(struct ieee80211_he_cap_elem) +
- sizeof(struct ieee80211_he_mcs_nss_supp) +
- IEEE80211_HE_PPE_THRES_MAX_LEN;
-
- if (sband->band == NL80211_BAND_6GHZ)
- size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa);
-
- size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) +
- sizeof(struct ieee80211_eht_mcs_nss_supp) +
- IEEE80211_EHT_PPE_THRES_MAX_LEN;
+
+ size += ieee80211_link_common_elems_size(sdata, iftype, cbss,
+ elems_len);
/* non-inheritance element */
size += 2 + 2 + PRESENT_ELEMS_MAX;
@@ -1927,17 +2058,18 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
* for some reason check it and want it to be set, set the bit for all
* pre-EHT connections as we used to do.
*/
- if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT)
+ if (link->u.mgd.conn.mode < IEEE80211_CONN_MODE_EHT &&
+ !ieee80211_hw_check(&local->hw, STRICT))
capab |= WLAN_CAPABILITY_ESS;
/* add the elements for the assoc (main) link */
link_capab = capab;
- offset = ieee80211_assoc_link_elems(sdata, skb, &link_capab,
- ext_capa,
- assoc_data->ie,
- assoc_data->ie_len,
- assoc_data->assoc_link_id, link,
- present_elems);
+ offset = ieee80211_add_link_elems(sdata, skb, &link_capab,
+ ext_capa,
+ assoc_data->ie,
+ assoc_data->ie_len,
+ assoc_data->assoc_link_id, link,
+ present_elems, assoc_data);
put_unaligned_le16(link_capab, capab_pos);
/* if present, add any custom non-vendor IEs */
@@ -3593,12 +3725,45 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
netif_carrier_on(sdata->dev);
}
+static void ieee80211_ml_reconf_reset(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_mgd_assoc_data *add_links_data =
+ sdata->u.mgd.reconf.add_links_data;
+
+ if (!ieee80211_vif_is_mld(&sdata->vif) ||
+ !(sdata->u.mgd.reconf.added_links |
+ sdata->u.mgd.reconf.removed_links))
+ return;
+
+ wiphy_delayed_work_cancel(sdata->local->hw.wiphy,
+ &sdata->u.mgd.reconf.wk);
+ sdata->u.mgd.reconf.added_links = 0;
+ sdata->u.mgd.reconf.removed_links = 0;
+ sdata->u.mgd.reconf.dialog_token = 0;
+
+ if (add_links_data) {
+ struct cfg80211_mlo_reconf_done_data done_data = {};
+ u8 link_id;
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+ link_id++)
+ done_data.links[link_id].bss =
+ add_links_data->link[link_id].bss;
+
+ cfg80211_mlo_reconf_add_done(sdata->dev, &done_data);
+
+ kfree(sdata->u.mgd.reconf.add_links_data);
+ sdata->u.mgd.reconf.add_links_data = NULL;
+ }
+}
+
static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
u16 stype, u16 reason, bool tx,
u8 *frame_buf)
{
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
struct ieee80211_local *local = sdata->local;
+ struct sta_info *ap_sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr);
unsigned int link_id;
u64 changed = 0;
struct ieee80211_prep_tx_info info = {
@@ -3609,6 +3774,9 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
lockdep_assert_wiphy(local->hw.wiphy);
+ if (WARN_ON(!ap_sta))
+ return;
+
if (WARN_ON_ONCE(tx && !frame_buf))
return;
@@ -3672,8 +3840,16 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
sdata->vif.cfg.ssid_len = 0;
- /* remove AP and TDLS peers */
- sta_info_flush(sdata, -1);
+ /* Remove TDLS peers */
+ __sta_info_flush(sdata, false, -1, ap_sta);
+
+ if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) {
+ /* Only move the AP state */
+ sta_info_move_state(ap_sta, IEEE80211_STA_NONE);
+ } else {
+ /* Remove AP peer */
+ sta_info_flush(sdata, -1);
+ }
/* finally reset all BSS / config parameters */
if (!ieee80211_vif_is_mld(&sdata->vif))
@@ -3724,6 +3900,14 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
ieee80211_vif_cfg_change_notify(sdata, changed);
}
+ if (sdata->vif.driver_flags & IEEE80211_VIF_REMOVE_AP_AFTER_DISASSOC) {
+ /*
+ * After notifying the driver about the disassoc,
+ * remove the ap sta.
+ */
+ sta_info_flush(sdata, -1);
+ }
+
/* disassociated - set to defaults now */
ieee80211_set_wmm_default(&sdata->deflink, false, false);
@@ -3785,6 +3969,12 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
ieee80211_vif_set_links(sdata, 0, 0);
ifmgd->mcast_seq_last = IEEE80211_SN_MODULO;
+
+ /* if disconnection happens in the middle of the ML reconfiguration
+ * flow, cfg80211 must be called to release the BSS references obtained
+ * when the flow started.
+ */
+ ieee80211_ml_reconf_reset(sdata);
}
static void ieee80211_reset_ap_probe(struct ieee80211_sub_if_data *sdata)
@@ -4220,6 +4410,8 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
lockdep_assert_wiphy(sdata->local->hw.wiphy);
+ sdata->u.mgd.auth_data = NULL;
+
if (!assoc) {
/*
* we are not authenticated yet, the only timer that could be
@@ -4241,7 +4433,6 @@ static void ieee80211_destroy_auth_data(struct ieee80211_sub_if_data *sdata,
cfg80211_put_bss(sdata->local->hw.wiphy, auth_data->bss);
kfree(auth_data);
- sdata->u.mgd.auth_data = NULL;
}
enum assoc_status {
@@ -4258,6 +4449,8 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
lockdep_assert_wiphy(sdata->local->hw.wiphy);
+ sdata->u.mgd.assoc_data = NULL;
+
if (status != ASSOC_SUCCESS) {
/*
* we are not associated yet, the only timer that could be
@@ -4296,7 +4489,6 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
}
kfree(assoc_data);
- sdata->u.mgd.assoc_data = NULL;
}
static void ieee80211_auth_challenge(struct ieee80211_sub_if_data *sdata,
@@ -4599,57 +4791,6 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata,
false);
}
-static void ieee80211_get_rates(struct ieee80211_supported_band *sband,
- u8 *supp_rates, unsigned int supp_rates_len,
- u32 *rates, u32 *basic_rates,
- bool *have_higher_than_11mbit,
- int *min_rate, int *min_rate_index)
-{
- int i, j;
-
- for (i = 0; i < supp_rates_len; i++) {
- int rate = supp_rates[i] & 0x7f;
- bool is_basic = !!(supp_rates[i] & 0x80);
-
- if ((rate * 5) > 110)
- *have_higher_than_11mbit = true;
-
- /*
- * Skip HT, VHT, HE, EHT and SAE H2E only BSS membership
- * selectors since they're not rates.
- *
- * Note: Even though the membership selector and the basic
- * rate flag share the same bit, they are not exactly
- * the same.
- */
- if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) ||
- supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY) ||
- supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HE_PHY) ||
- supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_EHT_PHY) ||
- supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_SAE_H2E))
- continue;
-
- for (j = 0; j < sband->n_bitrates; j++) {
- struct ieee80211_rate *br;
- int brate;
-
- br = &sband->bitrates[j];
-
- brate = DIV_ROUND_UP(br->bitrate, 5);
- if (brate == rate) {
- *rates |= BIT(j);
- if (is_basic)
- *basic_rates |= BIT(j);
- if ((rate * 5) < *min_rate) {
- *min_rate = rate * 5;
- *min_rate_index = j;
- }
- break;
- }
- }
- }
-}
-
static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata,
struct ieee80211_supported_band *sband,
const struct link_sta_info *link_sta,
@@ -4711,7 +4852,8 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
u64 *changed)
{
struct ieee80211_sub_if_data *sdata = link->sdata;
- struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data;
+ struct ieee80211_mgd_assoc_data *assoc_data =
+ sdata->u.mgd.assoc_data ?: sdata->u.mgd.reconf.add_links_data;
struct ieee80211_bss_conf *bss_conf = link->conf;
struct ieee80211_local *local = sdata->local;
unsigned int link_id = link->link_id;
@@ -4802,7 +4944,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
* 2G/3G/4G wifi routers, reported models include the "Onda PN51T",
* "Vodafone PocketWiFi 2", "ZTE MF60" and a similar T-Mobile device.
*/
- if (!is_6ghz &&
+ if (!ieee80211_hw_check(&local->hw, STRICT) && !is_6ghz &&
((assoc_data->wmm && !elems->wmm_param) ||
(link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HT &&
(!elems->ht_cap_elem || !elems->ht_operation)) ||
@@ -4899,7 +5041,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
/* check/update if AP changed anything in assoc response vs. scan */
if (ieee80211_config_bw(link, elems,
link_id == assoc_data->assoc_link_id,
- changed)) {
+ changed, "assoc response")) {
ret = false;
goto out;
}
@@ -4938,6 +5080,15 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
bss_vht_cap = (const void *)elem->data;
}
+ if (ieee80211_hw_check(&local->hw, STRICT) &&
+ (!bss_vht_cap || memcmp(bss_vht_cap, elems->vht_cap_elem,
+ sizeof(*bss_vht_cap)))) {
+ rcu_read_unlock();
+ ret = false;
+ link_info(link, "VHT capabilities mismatch\n");
+ goto out;
+ }
+
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
elems->vht_cap_elem,
bss_vht_cap, link_sta);
@@ -5122,7 +5273,9 @@ static int ieee80211_mgd_setup_link_sta(struct ieee80211_link_data *link,
sband = local->hw.wiphy->bands[cbss->channel->band];
ieee80211_get_rates(sband, bss->supp_rates, bss->supp_rates_len,
- &rates, &basic_rates, &have_higher_than_11mbit,
+ NULL, 0,
+ &rates, &basic_rates, NULL,
+ &have_higher_than_11mbit,
&min_rate, &min_rate_index);
/*
@@ -5515,7 +5668,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
struct ieee80211_link_data *link,
int link_id,
struct cfg80211_bss *cbss, bool mlo,
- struct ieee80211_conn_settings *conn)
+ struct ieee80211_conn_settings *conn,
+ unsigned long *userspace_selectors)
{
struct ieee80211_local *local = sdata->local;
bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ;
@@ -5528,7 +5682,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata,
rcu_read_lock();
elems = ieee80211_determine_chan_mode(sdata, conn, cbss, link_id,
- &chanreq, &ap_chandef);
+ &chanreq, &ap_chandef,
+ userspace_selectors);
if (IS_ERR(elems)) {
rcu_read_unlock();
@@ -5722,7 +5877,8 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata,
link->u.mgd.conn = assoc_data->link[link_id].conn;
err = ieee80211_prep_channel(sdata, link, link_id, cbss,
- true, &link->u.mgd.conn);
+ true, &link->u.mgd.conn,
+ assoc_data->userspace_selectors);
if (err) {
link_info(link, "prep_channel failed\n");
goto out_err;
@@ -5840,6 +5996,8 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
if (!assoc_data)
return;
+ info.link_id = assoc_data->assoc_link_id;
+
parse_params.mode =
assoc_data->link[assoc_data->assoc_link_id].conn.mode;
@@ -7057,7 +7215,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems);
- if (ieee80211_config_bw(link, elems, true, &changed)) {
+ if (ieee80211_config_bw(link, elems, true, &changed, "beacon")) {
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
WLAN_REASON_DEAUTH_LEAVING,
true, deauth_buf);
@@ -8136,6 +8294,25 @@ static void ieee80211_request_smps_mgd_work(struct wiphy *wiphy,
link->u.mgd.driver_smps_mode);
}
+static void ieee80211_ml_sta_reconf_timeout(struct wiphy *wiphy,
+ struct wiphy_work *work)
+{
+ struct ieee80211_sub_if_data *sdata =
+ container_of(work, struct ieee80211_sub_if_data,
+ u.mgd.reconf.wk.work);
+
+ if (!sdata->u.mgd.reconf.added_links &&
+ !sdata->u.mgd.reconf.removed_links)
+ return;
+
+ sdata_info(sdata,
+ "mlo: reconf: timeout: added=0x%x, removed=0x%x\n",
+ sdata->u.mgd.reconf.added_links,
+ sdata->u.mgd.reconf.removed_links);
+
+ __ieee80211_disconnect(sdata);
+}
+
/* interface setup */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
{
@@ -8150,6 +8327,8 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata)
ieee80211_tdls_peer_del_work);
wiphy_delayed_work_init(&ifmgd->ml_reconf_work,
ieee80211_ml_reconf_work);
+ wiphy_delayed_work_init(&ifmgd->reconf.wk,
+ ieee80211_ml_sta_reconf_timeout);
timer_setup(&ifmgd->timer, ieee80211_sta_timer, 0);
timer_setup(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, 0);
timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0);
@@ -8210,6 +8389,9 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link)
if (sdata->u.mgd.assoc_data)
ether_addr_copy(link->conf->addr,
sdata->u.mgd.assoc_data->link[link_id].addr);
+ else if (sdata->u.mgd.reconf.add_links_data)
+ ether_addr_copy(link->conf->addr,
+ sdata->u.mgd.reconf.add_links_data->link[link_id].addr);
else if (!is_valid_ether_addr(link->conf->addr))
eth_random_addr(link->conf->addr);
}
@@ -8232,7 +8414,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
struct cfg80211_bss *cbss, s8 link_id,
const u8 *ap_mld_addr, bool assoc,
struct ieee80211_conn_settings *conn,
- bool override)
+ bool override,
+ unsigned long *userspace_selectors)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
@@ -8371,7 +8554,8 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
*/
link->u.mgd.conn = *conn;
err = ieee80211_prep_channel(sdata, link, link->link_id, cbss,
- mlo, &link->u.mgd.conn);
+ mlo, &link->u.mgd.conn,
+ userspace_selectors);
if (err) {
if (new_sta)
sta_info_free(local, new_sta);
@@ -8487,6 +8671,22 @@ out:
return ret;
}
+static void ieee80211_parse_cfg_selectors(unsigned long *userspace_selectors,
+ const u8 *supported_selectors,
+ u8 supported_selectors_len)
+{
+ if (supported_selectors) {
+ for (int i = 0; i < supported_selectors_len; i++) {
+ set_bit(supported_selectors[i],
+ userspace_selectors);
+ }
+ } else {
+ /* Assume SAE_H2E support for backward compatibility. */
+ set_bit(BSS_MEMBERSHIP_SELECTOR_SAE_H2E,
+ userspace_selectors);
+ }
+}
+
/* config hooks */
int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
struct cfg80211_auth_request *req)
@@ -8588,6 +8788,10 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
memcpy(auth_data->key, req->key, req->key_len);
}
+ ieee80211_parse_cfg_selectors(auth_data->userspace_selectors,
+ req->supported_selectors,
+ req->supported_selectors_len);
+
auth_data->algorithm = auth_alg;
/* try to authenticate/probe */
@@ -8641,7 +8845,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
err = ieee80211_prep_connection(sdata, req->bss, req->link_id,
req->ap_mld_addr, cont_auth,
- &conn, false);
+ &conn, false,
+ auth_data->userspace_selectors);
if (err)
goto err_clear;
@@ -8928,6 +9133,10 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
false);
}
+ ieee80211_parse_cfg_selectors(assoc_data->userspace_selectors,
+ req->supported_selectors,
+ req->supported_selectors_len);
+
memcpy(&ifmgd->ht_capa, &req->ht_capa, sizeof(ifmgd->ht_capa));
memcpy(&ifmgd->ht_capa_mask, &req->ht_capa_mask,
sizeof(ifmgd->ht_capa_mask));
@@ -9174,7 +9383,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
/* only calculate the mode, hence link == NULL */
err = ieee80211_prep_channel(sdata, NULL, i,
assoc_data->link[i].bss, true,
- &assoc_data->link[i].conn);
+ &assoc_data->link[i].conn,
+ assoc_data->userspace_selectors);
if (err) {
req->links[i].error = err;
goto err_clear;
@@ -9190,7 +9400,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
err = ieee80211_prep_connection(sdata, cbss, req->link_id,
req->ap_mld_addr, true,
&assoc_data->link[assoc_link_id].conn,
- override);
+ override,
+ assoc_data->userspace_selectors);
if (err)
goto err_clear;
@@ -9434,3 +9645,690 @@ void ieee80211_disable_rssi_reports(struct ieee80211_vif *vif)
_ieee80211_enable_rssi_reports(sdata, 0, 0);
}
EXPORT_SYMBOL(ieee80211_disable_rssi_reports);
+
+static void ieee80211_ml_reconf_selectors(unsigned long *userspace_selectors)
+{
+ /* these selectors are mandatory for ML reconfiguration */
+ set_bit(BSS_MEMBERSHIP_SELECTOR_SAE_H2E, userspace_selectors);
+ set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, userspace_selectors);
+ set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, userspace_selectors);
+}
+
+void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt, size_t len)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_if_managed *ifmgd = &sdata->u.mgd;
+ struct ieee80211_mgd_assoc_data *add_links_data =
+ ifmgd->reconf.add_links_data;
+ struct sta_info *sta;
+ struct cfg80211_mlo_reconf_done_data done_data = {};
+ u16 sta_changed_links = sdata->u.mgd.reconf.added_links |
+ sdata->u.mgd.reconf.removed_links;
+ u16 link_mask, valid_links;
+ unsigned int link_id;
+ unsigned long userspace_selectors[BITS_TO_LONGS(128)] = {};
+ size_t orig_len = len;
+ u8 i, group_key_data_len;
+ u8 *pos;
+
+ if (!ieee80211_vif_is_mld(&sdata->vif) ||
+ len < offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp) ||
+ mgmt->u.action.u.ml_reconf_resp.dialog_token !=
+ sdata->u.mgd.reconf.dialog_token ||
+ !sta_changed_links)
+ return;
+
+ pos = mgmt->u.action.u.ml_reconf_resp.variable;
+ len -= offsetofend(typeof(*mgmt), u.action.u.ml_reconf_resp);
+
+ /* each status duple is 3 octets */
+ if (len < mgmt->u.action.u.ml_reconf_resp.count * 3) {
+ sdata_info(sdata,
+ "mlo: reconf: unexpected len=%zu, count=%u\n",
+ len, mgmt->u.action.u.ml_reconf_resp.count);
+ goto disconnect;
+ }
+
+ link_mask = sta_changed_links;
+ for (i = 0; i < mgmt->u.action.u.ml_reconf_resp.count; i++) {
+ u16 status = get_unaligned_le16(pos + 1);
+
+ link_id = *pos;
+
+ if (!(link_mask & BIT(link_id))) {
+ sdata_info(sdata,
+ "mlo: reconf: unexpected link: %u, changed=0x%x\n",
+ link_id, sta_changed_links);
+ goto disconnect;
+ }
+
+ /* clear the corresponding link, to detect the case that
+ * the same link was included more than one time
+ */
+ link_mask &= ~BIT(link_id);
+
+ /* Handle failure to remove links here. Failure to remove added
+ * links will be done later in the flow.
+ */
+ if (status != WLAN_STATUS_SUCCESS) {
+ sdata_info(sdata,
+ "mlo: reconf: failed on link=%u, status=%u\n",
+ link_id, status);
+
+ /* The AP MLD failed to remove a link that was already
+ * removed locally. As this is not expected behavior,
+ * disconnect
+ */
+ if (sdata->u.mgd.reconf.removed_links & BIT(link_id))
+ goto disconnect;
+
+ /* The AP MLD failed to add a link. Remove it from the
+ * added links.
+ */
+ sdata->u.mgd.reconf.added_links &= ~BIT(link_id);
+ }
+
+ pos += 3;
+ len -= 3;
+ }
+
+ if (link_mask) {
+ sdata_info(sdata,
+ "mlo: reconf: no response for links=0x%x\n",
+ link_mask);
+ goto disconnect;
+ }
+
+ if (!sdata->u.mgd.reconf.added_links)
+ goto out;
+
+ if (len < 1 || len < 1 + *pos) {
+ sdata_info(sdata,
+ "mlo: reconf: invalid group key data length");
+ goto disconnect;
+ }
+
+ /* The Group Key Data field must be present when links are added. This
+ * field should be processed by userland.
+ */
+ group_key_data_len = *pos++;
+
+ pos += group_key_data_len;
+ len -= group_key_data_len + 1;
+
+ /* Process the information for the added links */
+ sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr);
+ if (WARN_ON(!sta))
+ goto disconnect;
+
+ valid_links = sdata->vif.valid_links;
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ if (!add_links_data->link[link_id].bss ||
+ !(sdata->u.mgd.reconf.added_links & BIT(link_id)))
+
+ continue;
+
+ valid_links |= BIT(link_id);
+ if (ieee80211_sta_allocate_link(sta, link_id))
+ goto disconnect;
+ }
+
+ ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links);
+ ieee80211_ml_reconf_selectors(userspace_selectors);
+ link_mask = 0;
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ struct cfg80211_bss *cbss = add_links_data->link[link_id].bss;
+ struct ieee80211_link_data *link;
+ struct link_sta_info *link_sta;
+ u64 changed = 0;
+
+ if (!cbss)
+ continue;
+
+ link = sdata_dereference(sdata->link[link_id], sdata);
+ if (WARN_ON(!link))
+ goto disconnect;
+
+ link_info(link,
+ "mlo: reconf: local address %pM, AP link address %pM\n",
+ add_links_data->link[link_id].addr,
+ add_links_data->link[link_id].bss->bssid);
+
+ link_sta = rcu_dereference_protected(sta->link[link_id],
+ lockdep_is_held(&local->hw.wiphy->mtx));
+ if (WARN_ON(!link_sta))
+ goto disconnect;
+
+ if (!link->u.mgd.have_beacon) {
+ const struct cfg80211_bss_ies *ies;
+
+ rcu_read_lock();
+ ies = rcu_dereference(cbss->beacon_ies);
+ if (ies)
+ link->u.mgd.have_beacon = true;
+ else
+ ies = rcu_dereference(cbss->ies);
+ ieee80211_get_dtim(ies,
+ &link->conf->sync_dtim_count,
+ &link->u.mgd.dtim_period);
+ link->conf->beacon_int = cbss->beacon_interval;
+ rcu_read_unlock();
+ }
+
+ link->conf->dtim_period = link->u.mgd.dtim_period ?: 1;
+
+ link->u.mgd.conn = add_links_data->link[link_id].conn;
+ if (ieee80211_prep_channel(sdata, link, link_id, cbss,
+ true, &link->u.mgd.conn,
+ userspace_selectors)) {
+ link_info(link, "mlo: reconf: prep_channel failed\n");
+ goto disconnect;
+ }
+
+ if (ieee80211_mgd_setup_link_sta(link, sta, link_sta,
+ add_links_data->link[link_id].bss))
+ goto disconnect;
+
+ if (!ieee80211_assoc_config_link(link, link_sta,
+ add_links_data->link[link_id].bss,
+ mgmt, pos, len,
+ &changed))
+ goto disconnect;
+
+ /* The AP MLD indicated success for this link, but the station
+ * profile status indicated otherwise. Since there is an
+ * inconsistency in the ML reconfiguration response, disconnect
+ */
+ if (add_links_data->link[link_id].status != WLAN_STATUS_SUCCESS)
+ goto disconnect;
+
+ ieee80211_sta_init_nss(link_sta);
+ if (ieee80211_sta_activate_link(sta, link_id))
+ goto disconnect;
+
+ changed |= ieee80211_link_set_associated(link, cbss);
+ ieee80211_link_info_change_notify(sdata, link, changed);
+
+ ieee80211_recalc_smps(sdata, link);
+ link_mask |= BIT(link_id);
+ }
+
+ sdata_info(sdata,
+ "mlo: reconf: current valid_links=0x%x, added=0x%x\n",
+ valid_links, link_mask);
+
+ /* links might have changed due to rejected ones, set them again */
+ ieee80211_vif_set_links(sdata, valid_links, sdata->vif.dormant_links);
+ ieee80211_vif_cfg_change_notify(sdata, BSS_CHANGED_MLD_VALID_LINKS);
+
+ ieee80211_recalc_ps(local);
+ ieee80211_recalc_ps_vif(sdata);
+
+ done_data.buf = (const u8 *)mgmt;
+ done_data.len = orig_len;
+ done_data.added_links = link_mask;
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++)
+ done_data.links[link_id].bss = add_links_data->link[link_id].bss;
+
+ cfg80211_mlo_reconf_add_done(sdata->dev, &done_data);
+ kfree(sdata->u.mgd.reconf.add_links_data);
+ sdata->u.mgd.reconf.add_links_data = NULL;
+out:
+ ieee80211_ml_reconf_reset(sdata);
+ return;
+
+disconnect:
+ __ieee80211_disconnect(sdata);
+}
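/*
 * Illustrative layout only (not part of the patch): each status duple
 * parsed in the loop above is 3 octets, effectively
 * { u8 link_id; __le16 status }, so a duple reporting WLAN_STATUS_SUCCESS
 * (0) for link 1 is the bytes 01 00 00 and get_unaligned_le16(pos + 1)
 * yields 0 for it.
 */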
+
+static struct sk_buff *
+ieee80211_build_ml_reconf_req(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgd_assoc_data *add_links_data,
+ u16 removed_links)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_mgmt *mgmt;
+ struct ieee80211_multi_link_elem *ml_elem;
+ struct ieee80211_mle_basic_common_info *common;
+ enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif);
+ struct sk_buff *skb;
+ size_t size;
+ unsigned int link_id;
+ __le16 eml_capa = 0, mld_capa_ops = 0;
+ struct ieee80211_tx_info *info;
+ u8 common_size, var_common_size;
+ u8 *ml_elem_len;
+ u16 capab = 0;
+
+ size = local->hw.extra_tx_headroom + sizeof(*mgmt);
+
+ /* Consider the maximal length of the reconfiguration ML element */
+ size += sizeof(struct ieee80211_multi_link_elem);
+
+ /* The Basic ML element and the Reconfiguration ML element have the same
+ * fixed common information fields in the context of the ML reconfiguration
+ * action frame. The AP MLD MAC address must always be present
+ */
+ common_size = sizeof(*common);
+
+ /* when adding links, the MLD capabilities must be present */
+ var_common_size = 0;
+ if (add_links_data) {
+ const struct wiphy_iftype_ext_capab *ift_ext_capa =
+ cfg80211_get_iftype_ext_capa(local->hw.wiphy,
+ ieee80211_vif_type_p2p(&sdata->vif));
+
+ if (ift_ext_capa) {
+ eml_capa = cpu_to_le16(ift_ext_capa->eml_capabilities);
+ mld_capa_ops =
+ cpu_to_le16(ift_ext_capa->mld_capa_and_ops);
+ }
+
+ /* MLD capabilities and operation */
+ var_common_size += 2;
+
+ /* EML capabilities */
+ if (eml_capa & cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP |
+ IEEE80211_EML_CAP_EMLMR_SUPPORT)))
+ var_common_size += 2;
+ }
+
+ /* Add the common information length */
+ size += common_size + var_common_size;
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ struct cfg80211_bss *cbss;
+ size_t elems_len;
+
+ if (removed_links & BIT(link_id)) {
+ size += sizeof(struct ieee80211_mle_per_sta_profile) +
+ ETH_ALEN;
+ continue;
+ }
+
+ if (!add_links_data || !add_links_data->link[link_id].bss)
+ continue;
+
+ elems_len = add_links_data->link[link_id].elems_len;
+ cbss = add_links_data->link[link_id].bss;
+
+ /* should be the same across all BSSes */
+ if (cbss->capability & WLAN_CAPABILITY_PRIVACY)
+ capab |= WLAN_CAPABILITY_PRIVACY;
+
+ size += 2 + sizeof(struct ieee80211_mle_per_sta_profile) +
+ ETH_ALEN;
+
+ /* WMM */
+ size += 9;
+ size += ieee80211_link_common_elems_size(sdata, iftype, cbss,
+ elems_len);
+ }
+
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+ mgmt = skb_put_zero(skb, offsetofend(struct ieee80211_mgmt,
+ u.action.u.ml_reconf_req));
+
+ /* Add the MAC header */
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+ memcpy(mgmt->da, sdata->vif.cfg.ap_addr, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
+
+ /* Add the action frame fixed fields */
+ mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT;
+ mgmt->u.action.u.ml_reconf_req.action_code =
+ WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_REQ;
+
+ /* allocate a dialog token and store it */
+ sdata->u.mgd.reconf.dialog_token = ++sdata->u.mgd.dialog_token_alloc;
+ mgmt->u.action.u.ml_reconf_req.dialog_token =
+ sdata->u.mgd.reconf.dialog_token;
+
+ /* Add the ML reconfiguration element and the common information */
+ skb_put_u8(skb, WLAN_EID_EXTENSION);
+ ml_elem_len = skb_put(skb, 1);
+ skb_put_u8(skb, WLAN_EID_EXT_EHT_MULTI_LINK);
+ ml_elem = skb_put(skb, sizeof(*ml_elem));
+ ml_elem->control =
+ cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_RECONF |
+ IEEE80211_MLC_RECONF_PRES_MLD_MAC_ADDR);
+ common = skb_put(skb, common_size);
+ common->len = common_size + var_common_size;
+ memcpy(common->mld_mac_addr, sdata->vif.addr, ETH_ALEN);
+
+ if (add_links_data) {
+ if (eml_capa &
+ cpu_to_le16((IEEE80211_EML_CAP_EMLSR_SUPP |
+ IEEE80211_EML_CAP_EMLMR_SUPPORT))) {
+ ml_elem->control |=
+ cpu_to_le16(IEEE80211_MLC_RECONF_PRES_EML_CAPA);
+ skb_put_data(skb, &eml_capa, sizeof(eml_capa));
+ }
+
+ ml_elem->control |=
+ cpu_to_le16(IEEE80211_MLC_RECONF_PRES_MLD_CAPA_OP);
+
+ skb_put_data(skb, &mld_capa_ops, sizeof(mld_capa_ops));
+ }
+
+ if (sdata->u.mgd.flags & IEEE80211_STA_ENABLE_RRM)
+ capab |= WLAN_CAPABILITY_RADIO_MEASURE;
+
+ /* Add the per station profile */
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) {
+ u8 *subelem_len = NULL;
+ u16 ctrl;
+ const u8 *addr;
+
+ /* Skip links that are not changing */
+ if (!(removed_links & BIT(link_id)) &&
+ (!add_links_data || !add_links_data->link[link_id].bss))
+ continue;
+
+ ctrl = link_id |
+ IEEE80211_MLE_STA_RECONF_CONTROL_STA_MAC_ADDR_PRESENT;
+
+ if (removed_links & BIT(link_id)) {
+ struct ieee80211_bss_conf *conf =
+ sdata_dereference(sdata->vif.link_conf[link_id],
+ sdata);
+ if (!conf)
+ continue;
+
+ addr = conf->addr;
+ ctrl |= u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_DEL_LINK,
+ IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE);
+ } else {
+ addr = add_links_data->link[link_id].addr;
+ ctrl |= IEEE80211_MLE_STA_RECONF_CONTROL_COMPLETE_PROFILE |
+ u16_encode_bits(IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE_ADD_LINK,
+ IEEE80211_MLE_STA_RECONF_CONTROL_OPERATION_TYPE);
+ }
+
+ skb_put_u8(skb, IEEE80211_MLE_SUBELEM_PER_STA_PROFILE);
+ subelem_len = skb_put(skb, 1);
+
+ put_unaligned_le16(ctrl, skb_put(skb, sizeof(ctrl)));
+ skb_put_u8(skb, 1 + ETH_ALEN);
+ skb_put_data(skb, addr, ETH_ALEN);
+
+ if (!(removed_links & BIT(link_id))) {
+ u16 link_present_elems[PRESENT_ELEMS_MAX] = {};
+ size_t extra_used;
+ void *capab_pos;
+ u8 qos_info;
+
+ capab_pos = skb_put(skb, 2);
+
+ extra_used =
+ ieee80211_add_link_elems(sdata, skb, &capab, NULL,
+ add_links_data->link[link_id].elems,
+ add_links_data->link[link_id].elems_len,
+ link_id, NULL,
+ link_present_elems,
+ add_links_data);
+
+ if (add_links_data->link[link_id].elems)
+ skb_put_data(skb,
+ add_links_data->link[link_id].elems +
+ extra_used,
+ add_links_data->link[link_id].elems_len -
+ extra_used);
+ if (sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED) {
+ qos_info = sdata->u.mgd.uapsd_queues;
+ qos_info |= (sdata->u.mgd.uapsd_max_sp_len <<
+ IEEE80211_WMM_IE_STA_QOSINFO_SP_SHIFT);
+ } else {
+ qos_info = 0;
+ }
+
+ ieee80211_add_wmm_info_ie(skb_put(skb, 9), qos_info);
+ put_unaligned_le16(capab, capab_pos);
+ }
+
+ ieee80211_fragment_element(skb, subelem_len,
+ IEEE80211_MLE_SUBELEM_FRAGMENT);
+ }
+
+ ieee80211_fragment_element(skb, ml_elem_len, WLAN_EID_FRAGMENT);
+
+ info = IEEE80211_SKB_CB(skb);
+ info->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+ return skb;
+}
+
+int ieee80211_mgd_assoc_ml_reconf(struct ieee80211_sub_if_data *sdata,
+ struct cfg80211_assoc_link *add_links,
+ u16 rem_links)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_mgd_assoc_data *data = NULL;
+ struct sta_info *sta;
+ struct sk_buff *skb;
+ u16 added_links, new_valid_links;
+ int link_id, err;
+
+ if (!ieee80211_vif_is_mld(&sdata->vif) ||
+ !(sdata->vif.cfg.mld_capa_op &
+ IEEE80211_MLD_CAP_OP_LINK_RECONF_SUPPORT))
+ return -EINVAL;
+
+ /* No support for concurrent ML reconfiguration operation */
+ if (sdata->u.mgd.reconf.added_links ||
+ sdata->u.mgd.reconf.removed_links)
+ return -EBUSY;
+
+ added_links = 0;
+ for (link_id = 0; add_links && link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+ link_id++) {
+ if (!add_links[link_id].bss)
+ continue;
+
+ added_links |= BIT(link_id);
+ }
+
+ sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr);
+ if (WARN_ON(!sta))
+ return -ENOLINK;
+
+ if (rem_links & BIT(sta->sta.deflink.link_id))
+ return -EINVAL;
+
+ /* Adding links to the set of valid links is done only after a successful
+ * ML reconfiguration frame exchange. Here, prepare the data for the ML
+ * reconfiguration frame construction and allocate the required
+ * resources.
+ */
+ if (added_links) {
+ bool uapsd_supported;
+ unsigned long userspace_selectors[BITS_TO_LONGS(128)] = {};
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ uapsd_supported = true;
+ ieee80211_ml_reconf_selectors(userspace_selectors);
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+ link_id++) {
+ struct ieee80211_supported_band *sband;
+ struct cfg80211_bss *link_cbss = add_links[link_id].bss;
+ struct ieee80211_bss *bss;
+
+ if (!link_cbss)
+ continue;
+
+ bss = (void *)link_cbss->priv;
+
+ if (!bss->wmm_used) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (link_cbss->channel->band == NL80211_BAND_S1GHZ) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ eth_random_addr(data->link[link_id].addr);
+ data->link[link_id].conn =
+ ieee80211_conn_settings_unlimited;
+ sband =
+ local->hw.wiphy->bands[link_cbss->channel->band];
+
+ ieee80211_determine_our_sta_mode(sdata, sband,
+ NULL, true, link_id,
+ &data->link[link_id].conn);
+
+ data->link[link_id].bss = link_cbss;
+ data->link[link_id].disabled =
+ add_links[link_id].disabled;
+ data->link[link_id].elems =
+ (u8 *)add_links[link_id].elems;
+ data->link[link_id].elems_len =
+ add_links[link_id].elems_len;
+
+ if (!bss->uapsd_supported)
+ uapsd_supported = false;
+
+ if (data->link[link_id].conn.mode <
+ IEEE80211_CONN_MODE_EHT) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ err = ieee80211_mgd_get_ap_ht_vht_capa(sdata, data,
+ link_id);
+ if (err) {
+ err = -EINVAL;
+ goto err_free;
+ }
+ }
+
+ /* Require U-APSD support to match that of the current valid
+ * links
+ */
+ if (uapsd_supported !=
+ !!(sdata->u.mgd.flags & IEEE80211_STA_UAPSD_ENABLED)) {
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+ link_id++) {
+ if (!data->link[link_id].bss)
+ continue;
+
+ /* only used to verify the mode, nothing is allocated */
+ err = ieee80211_prep_channel(sdata, NULL, link_id,
+ data->link[link_id].bss,
+ true,
+ &data->link[link_id].conn,
+ userspace_selectors);
+ if (err)
+ goto err_free;
+ }
+ }
+
+ /* Link removal is done before the ML reconfiguration frame exchange so
+ * that these links will not be used between their removal by the AP MLD
+ * and the station receiving the ML reconfiguration response. Based on
+ * Section 35.3.6.4 in Draft P802.11be_D7.0, the AP MLD should accept the
+ * link removal request.
+ */
+ if (rem_links) {
+ u16 new_active_links = sdata->vif.active_links & ~rem_links;
+
+ new_valid_links = sdata->vif.valid_links & ~rem_links;
+
+ /* Should not be left with no valid links to perform the
+ * ML reconfiguration
+ */
+ if (!new_valid_links ||
+ !(new_valid_links & ~sdata->vif.dormant_links)) {
+ sdata_info(sdata, "mlo: reconf: no valid links\n");
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ if (new_active_links != sdata->vif.active_links) {
+ if (!new_active_links)
+ new_active_links =
+ BIT(__ffs(new_valid_links &
+ ~sdata->vif.dormant_links));
+
+ err = ieee80211_set_active_links(&sdata->vif,
+ new_active_links);
+ if (err) {
+ sdata_info(sdata,
+ "mlo: reconf: failed set active links\n");
+ goto err_free;
+ }
+ }
+ }
+
+ /* Build the SKB before the link removal as the construction of the
+ * station info for removed links requires the local address.
+ * Invalidate the removed links, so that the ML reconfiguration request
+ * frame is not transmitted on them, as the AP is expected to send the
+ * ML reconfiguration response frame on the link on which the request
+ * was received.
+ */
+ skb = ieee80211_build_ml_reconf_req(sdata, data, rem_links);
+ if (!skb) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ if (rem_links) {
+ u16 new_dormant_links = sdata->vif.dormant_links & ~rem_links;
+
+ err = ieee80211_vif_set_links(sdata, new_valid_links,
+ new_dormant_links);
+ if (err) {
+ sdata_info(sdata,
+ "mlo: reconf: failed set valid links\n");
+ kfree_skb(skb);
+ goto err_free;
+ }
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+ link_id++) {
+ if (!(rem_links & BIT(link_id)))
+ continue;
+
+ ieee80211_sta_remove_link(sta, link_id);
+ }
+
+ /* notify the driver and upper layers */
+ ieee80211_vif_cfg_change_notify(sdata,
+ BSS_CHANGED_MLD_VALID_LINKS);
+ cfg80211_links_removed(sdata->dev, rem_links);
+ }
+
+ sdata_info(sdata, "mlo: reconf: adding=0x%x, removed=0x%x\n",
+ added_links, rem_links);
+
+ ieee80211_tx_skb(sdata, skb);
+
+ sdata->u.mgd.reconf.added_links = added_links;
+ sdata->u.mgd.reconf.add_links_data = data;
+ sdata->u.mgd.reconf.removed_links = rem_links;
+ wiphy_delayed_work_queue(sdata->local->hw.wiphy,
+ &sdata->u.mgd.reconf.wk,
+ IEEE80211_ASSOC_TIMEOUT_SHORT);
+ return 0;
+
+ err_free:
+ kfree(data);
+ return err;
+}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index c4a28ccbd064..0659ec892ec6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -6,7 +6,7 @@
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
*/
#include <linux/jiffies.h>
@@ -3035,8 +3035,7 @@ __ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx, u8 data_offset)
check_da = NULL;
break;
case NL80211_IFTYPE_STATION:
- if (!rx->sta ||
- !test_sta_flag(rx->sta, WLAN_STA_TDLS_PEER))
+ if (!test_sta_flag(rx->sta, WLAN_STA_TDLS_PEER))
check_sa = NULL;
break;
case NL80211_IFTYPE_MESH_POINT:
@@ -3330,8 +3329,8 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
return;
}
- if (!ether_addr_equal(mgmt->sa, sdata->deflink.u.mgd.bssid) ||
- !ether_addr_equal(mgmt->bssid, sdata->deflink.u.mgd.bssid)) {
+ if (!ether_addr_equal(mgmt->sa, sdata->vif.cfg.ap_addr) ||
+ !ether_addr_equal(mgmt->bssid, sdata->vif.cfg.ap_addr)) {
/* Not from the current AP or not associated yet. */
return;
}
@@ -3347,9 +3346,9 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
skb_reserve(skb, local->hw.extra_tx_headroom);
resp = skb_put_zero(skb, 24);
- memcpy(resp->da, mgmt->sa, ETH_ALEN);
+ memcpy(resp->da, sdata->vif.cfg.ap_addr, ETH_ALEN);
memcpy(resp->sa, sdata->vif.addr, ETH_ALEN);
- memcpy(resp->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN);
+ memcpy(resp->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN);
resp->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
IEEE80211_STYPE_ACTION);
skb_put(skb, 1 + sizeof(resp->u.action.u.sa_query));
@@ -3820,6 +3819,18 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
u.action.u.ttlm_res))
goto invalid;
goto queue;
+ case WLAN_PROTECTED_EHT_ACTION_LINK_RECONFIG_RESP:
+ if (sdata->vif.type != NL80211_IFTYPE_STATION)
+ break;
+
+ /* The reconfiguration response action frame must contain
+ * at least one 'Status Duple' entry (3 octets)
+ */
+ if (len <
+ offsetofend(typeof(*mgmt),
+ u.action.u.ml_reconf_resp) + 3)
+ goto invalid;
+ goto queue;
default:
break;
}
@@ -4563,7 +4574,9 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
return ieee80211_is_public_action(hdr, skb->len) ||
ieee80211_is_probe_req(hdr->frame_control) ||
ieee80211_is_probe_resp(hdr->frame_control) ||
- ieee80211_is_beacon(hdr->frame_control);
+ ieee80211_is_beacon(hdr->frame_control) ||
+ (ieee80211_is_auth(hdr->frame_control) &&
+ ether_addr_equal(sdata->vif.addr, hdr->addr1));
case NL80211_IFTYPE_NAN:
/* Currently no frames on NAN interface are allowed */
return false;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index aa22f09e6d14..caa3d0236b5e 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -4,7 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2023 Intel Corporation
+ * Copyright (C) 2018-2024 Intel Corporation
*/
#include <linux/module.h>
@@ -509,6 +509,24 @@ static int sta_info_alloc_link(struct ieee80211_local *local,
for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++)
ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]);
+ link_info->rx_omi_bw_rx = IEEE80211_STA_RX_BW_MAX;
+ link_info->rx_omi_bw_tx = IEEE80211_STA_RX_BW_MAX;
+ link_info->rx_omi_bw_staging = IEEE80211_STA_RX_BW_MAX;
+
+ /*
+ * Cause (a) warning(s) if IEEE80211_STA_RX_BW_MAX != 320
+ * or if new values are added to the enum.
+ */
+ switch (link_info->cur_max_bandwidth) {
+ case IEEE80211_STA_RX_BW_20:
+ case IEEE80211_STA_RX_BW_40:
+ case IEEE80211_STA_RX_BW_80:
+ case IEEE80211_STA_RX_BW_160:
+ case IEEE80211_STA_RX_BW_MAX:
+ /* intentionally nothing */
+ break;
+ }
+
return 0;
}
@@ -1317,9 +1335,13 @@ static int _sta_info_move_state(struct sta_info *sta,
sta->sta.addr, new_state);
/* notify the driver before the actual changes so it can
- * fail the transition
+ * fail the transition if the state is increasing.
+ * The driver is required not to fail when the transition
+ * is decreasing the state, so first do all the preparation
+ * work and only then notify the driver.
*/
- if (test_sta_flag(sta, WLAN_STA_INSERTED)) {
+ if (new_state > sta->sta_state &&
+ test_sta_flag(sta, WLAN_STA_INSERTED)) {
int err = drv_sta_state(sta->local, sta->sdata, sta,
sta->sta_state, new_state);
if (err)
@@ -1395,6 +1417,16 @@ static int _sta_info_move_state(struct sta_info *sta,
break;
}
+ if (new_state < sta->sta_state &&
+ test_sta_flag(sta, WLAN_STA_INSERTED)) {
+ int err = drv_sta_state(sta->local, sta->sdata, sta,
+ sta->sta_state, new_state);
+
+ WARN_ONCE(err,
+ "Driver is not allowed to fail if the sta_state is transitioning down the list: %d\n",
+ err);
+ }
+
sta->sta_state = new_state;
return 0;
@@ -1567,7 +1599,7 @@ void sta_info_stop(struct ieee80211_local *local)
int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans,
- int link_id)
+ int link_id, struct sta_info *do_not_flush_sta)
{
struct ieee80211_local *local = sdata->local;
struct sta_info *sta, *tmp;
@@ -1585,6 +1617,9 @@ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans,
(!vlans || sdata->bss != sta->sdata->bss))
continue;
+ if (sta == do_not_flush_sta)
+ continue;
+
if (link_id >= 0 && sta->sta.valid_links &&
!(sta->sta.valid_links & BIT(link_id)))
continue;
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 9f89fb5bee37..07b7ec39a52f 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -512,6 +512,10 @@ struct ieee80211_fragment_cache {
* @status_stats.avg_ack_signal: average ACK signal
* @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
* taken from HT/VHT capabilities or VHT operating mode notification
+ * @rx_omi_bw_rx: RX OMI bandwidth restriction to apply for RX
+ * @rx_omi_bw_tx: RX OMI bandwidth restriction to apply for TX
+ * @rx_omi_bw_staging: RX OMI bandwidth restriction to apply later
+ * during finalize
* @debugfs_dir: debug filesystem directory dentry
* @pub: public (driver visible) link STA data
* TODO Move other link params from sta_info as required for MLD operation
@@ -561,6 +565,9 @@ struct link_sta_info {
} tx_stats;
enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;
+ enum ieee80211_sta_rx_bandwidth rx_omi_bw_rx,
+ rx_omi_bw_tx,
+ rx_omi_bw_staging;
#ifdef CONFIG_MAC80211_DEBUGFS
struct dentry *debugfs_dir;
@@ -899,9 +906,10 @@ void sta_info_stop(struct ieee80211_local *local);
* @link_id: if given (>=0), all those STA entries using @link_id only
* will be removed. If -1 is passed, all STA entries will be
* removed.
+ * @do_not_flush_sta: a station that shouldn't be flushed.
*/
int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans,
- int link_id);
+ int link_id, struct sta_info *do_not_flush_sta);
/**
* sta_info_flush - flush matching STA entries from the STA table
@@ -916,7 +924,7 @@ int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans,
static inline int sta_info_flush(struct ieee80211_sub_if_data *sdata,
int link_id)
{
- return __sta_info_flush(sdata, false, link_id);
+ return __sta_info_flush(sdata, false, link_id, NULL);
}
void sta_set_rate_info_tx(struct sta_info *sta,
diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile
index 511dfa226699..0f5336bc7314 100644
--- a/net/mac80211/tests/Makefile
+++ b/net/mac80211/tests/Makefile
@@ -1,3 +1,3 @@
-mac80211-tests-y += module.o elems.o mfp.o tpe.o
+mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o
obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o
diff --git a/net/mac80211/tests/util.c b/net/mac80211/tests/util.c
new file mode 100644
index 000000000000..0936a73e3617
--- /dev/null
+++ b/net/mac80211/tests/util.c
@@ -0,0 +1,313 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Utilities for mac80211 unit testing
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+#include <linux/ieee80211.h>
+#include <net/mac80211.h>
+#include <kunit/test.h>
+#include <kunit/test-bug.h>
+#include "util.h"
+
+#define CHAN2G(_freq) { \
+ .band = NL80211_BAND_2GHZ, \
+ .center_freq = (_freq), \
+ .hw_value = (_freq), \
+}
+
+static const struct ieee80211_channel channels_2ghz[] = {
+ CHAN2G(2412), /* Channel 1 */
+ CHAN2G(2417), /* Channel 2 */
+ CHAN2G(2422), /* Channel 3 */
+ CHAN2G(2427), /* Channel 4 */
+ CHAN2G(2432), /* Channel 5 */
+ CHAN2G(2437), /* Channel 6 */
+ CHAN2G(2442), /* Channel 7 */
+ CHAN2G(2447), /* Channel 8 */
+ CHAN2G(2452), /* Channel 9 */
+ CHAN2G(2457), /* Channel 10 */
+ CHAN2G(2462), /* Channel 11 */
+ CHAN2G(2467), /* Channel 12 */
+ CHAN2G(2472), /* Channel 13 */
+ CHAN2G(2484), /* Channel 14 */
+};
+
+#define CHAN5G(_freq) { \
+ .band = NL80211_BAND_5GHZ, \
+ .center_freq = (_freq), \
+ .hw_value = (_freq), \
+}
+
+static const struct ieee80211_channel channels_5ghz[] = {
+ CHAN5G(5180), /* Channel 36 */
+ CHAN5G(5200), /* Channel 40 */
+ CHAN5G(5220), /* Channel 44 */
+ CHAN5G(5240), /* Channel 48 */
+};
+
+static const struct ieee80211_rate bitrates[] = {
+ { .bitrate = 10 },
+ { .bitrate = 20, .flags = IEEE80211_RATE_SHORT_PREAMBLE },
+ { .bitrate = 55, .flags = IEEE80211_RATE_SHORT_PREAMBLE },
+ { .bitrate = 110, .flags = IEEE80211_RATE_SHORT_PREAMBLE },
+ { .bitrate = 60 },
+ { .bitrate = 90 },
+ { .bitrate = 120 },
+ { .bitrate = 180 },
+ { .bitrate = 240 },
+ { .bitrate = 360 },
+ { .bitrate = 480 },
+ { .bitrate = 540 }
+};
+
+/* Copied from hwsim except that it only supports 4 EHT streams and STA/P2P mode */
+static const struct ieee80211_sband_iftype_data sband_capa_5ghz[] = {
+ {
+ .types_mask = BIT(NL80211_IFTYPE_STATION) |
+ BIT(NL80211_IFTYPE_P2P_CLIENT),
+ .he_cap = {
+ .has_he = true,
+ .he_cap_elem = {
+ .mac_cap_info[0] =
+ IEEE80211_HE_MAC_CAP0_HTC_HE,
+ .mac_cap_info[1] =
+ IEEE80211_HE_MAC_CAP1_TF_MAC_PAD_DUR_16US |
+ IEEE80211_HE_MAC_CAP1_MULTI_TID_AGG_RX_QOS_8,
+ .mac_cap_info[2] =
+ IEEE80211_HE_MAC_CAP2_BSR |
+ IEEE80211_HE_MAC_CAP2_MU_CASCADING |
+ IEEE80211_HE_MAC_CAP2_ACK_EN,
+ .mac_cap_info[3] =
+ IEEE80211_HE_MAC_CAP3_OMI_CONTROL |
+ IEEE80211_HE_MAC_CAP3_MAX_AMPDU_LEN_EXP_EXT_3,
+ .mac_cap_info[4] = IEEE80211_HE_MAC_CAP4_AMSDU_IN_AMPDU,
+ .phy_cap_info[0] =
+ IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G |
+ IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G |
+ IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G,
+ .phy_cap_info[1] =
+ IEEE80211_HE_PHY_CAP1_PREAMBLE_PUNC_RX_MASK |
+ IEEE80211_HE_PHY_CAP1_DEVICE_CLASS_A |
+ IEEE80211_HE_PHY_CAP1_LDPC_CODING_IN_PAYLOAD |
+ IEEE80211_HE_PHY_CAP1_MIDAMBLE_RX_TX_MAX_NSTS,
+ .phy_cap_info[2] =
+ IEEE80211_HE_PHY_CAP2_NDP_4x_LTF_AND_3_2US |
+ IEEE80211_HE_PHY_CAP2_STBC_TX_UNDER_80MHZ |
+ IEEE80211_HE_PHY_CAP2_STBC_RX_UNDER_80MHZ |
+ IEEE80211_HE_PHY_CAP2_UL_MU_FULL_MU_MIMO |
+ IEEE80211_HE_PHY_CAP2_UL_MU_PARTIAL_MU_MIMO,
+
+ /* Leave all the other PHY capability bytes
+ * unset, as DCM, beam forming, RU and PPE
+ * threshold information are not supported
+ */
+ },
+ .he_mcs_nss_supp = {
+ .rx_mcs_80 = cpu_to_le16(0xfffa),
+ .tx_mcs_80 = cpu_to_le16(0xfffa),
+ .rx_mcs_160 = cpu_to_le16(0xfffa),
+ .tx_mcs_160 = cpu_to_le16(0xfffa),
+ .rx_mcs_80p80 = cpu_to_le16(0xfffa),
+ .tx_mcs_80p80 = cpu_to_le16(0xfffa),
+ },
+ },
+ .eht_cap = {
+ .has_eht = true,
+ .eht_cap_elem = {
+ .mac_cap_info[0] =
+ IEEE80211_EHT_MAC_CAP0_EPCS_PRIO_ACCESS |
+ IEEE80211_EHT_MAC_CAP0_OM_CONTROL |
+ IEEE80211_EHT_MAC_CAP0_TRIG_TXOP_SHARING_MODE1,
+ .phy_cap_info[0] =
+ IEEE80211_EHT_PHY_CAP0_242_TONE_RU_GT20MHZ |
+ IEEE80211_EHT_PHY_CAP0_NDP_4_EHT_LFT_32_GI |
+ IEEE80211_EHT_PHY_CAP0_PARTIAL_BW_UL_MU_MIMO |
+ IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMER |
+ IEEE80211_EHT_PHY_CAP0_SU_BEAMFORMEE |
+ IEEE80211_EHT_PHY_CAP0_BEAMFORMEE_SS_80MHZ_MASK,
+ .phy_cap_info[1] =
+ IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_80MHZ_MASK |
+ IEEE80211_EHT_PHY_CAP1_BEAMFORMEE_SS_160MHZ_MASK,
+ .phy_cap_info[2] =
+ IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_80MHZ_MASK |
+ IEEE80211_EHT_PHY_CAP2_SOUNDING_DIM_160MHZ_MASK,
+ .phy_cap_info[3] =
+ IEEE80211_EHT_PHY_CAP3_NG_16_SU_FEEDBACK |
+ IEEE80211_EHT_PHY_CAP3_NG_16_MU_FEEDBACK |
+ IEEE80211_EHT_PHY_CAP3_CODEBOOK_4_2_SU_FDBK |
+ IEEE80211_EHT_PHY_CAP3_CODEBOOK_7_5_MU_FDBK |
+ IEEE80211_EHT_PHY_CAP3_TRIG_SU_BF_FDBK |
+ IEEE80211_EHT_PHY_CAP3_TRIG_MU_BF_PART_BW_FDBK |
+ IEEE80211_EHT_PHY_CAP3_TRIG_CQI_FDBK,
+ .phy_cap_info[4] =
+ IEEE80211_EHT_PHY_CAP4_PART_BW_DL_MU_MIMO |
+ IEEE80211_EHT_PHY_CAP4_PSR_SR_SUPP |
+ IEEE80211_EHT_PHY_CAP4_POWER_BOOST_FACT_SUPP |
+ IEEE80211_EHT_PHY_CAP4_EHT_MU_PPDU_4_EHT_LTF_08_GI |
+ IEEE80211_EHT_PHY_CAP4_MAX_NC_MASK,
+ .phy_cap_info[5] =
+ IEEE80211_EHT_PHY_CAP5_NON_TRIG_CQI_FEEDBACK |
+ IEEE80211_EHT_PHY_CAP5_TX_LESS_242_TONE_RU_SUPP |
+ IEEE80211_EHT_PHY_CAP5_RX_LESS_242_TONE_RU_SUPP |
+ IEEE80211_EHT_PHY_CAP5_PPE_THRESHOLD_PRESENT |
+ IEEE80211_EHT_PHY_CAP5_COMMON_NOMINAL_PKT_PAD_MASK |
+ IEEE80211_EHT_PHY_CAP5_MAX_NUM_SUPP_EHT_LTF_MASK,
+ .phy_cap_info[6] =
+ IEEE80211_EHT_PHY_CAP6_MAX_NUM_SUPP_EHT_LTF_MASK |
+ IEEE80211_EHT_PHY_CAP6_MCS15_SUPP_MASK,
+ .phy_cap_info[7] =
+ IEEE80211_EHT_PHY_CAP7_20MHZ_STA_RX_NDP_WIDER_BW |
+ IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_80MHZ |
+ IEEE80211_EHT_PHY_CAP7_NON_OFDMA_UL_MU_MIMO_160MHZ |
+ IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_80MHZ |
+ IEEE80211_EHT_PHY_CAP7_MU_BEAMFORMER_160MHZ,
+ },
+
+ /* For all MCS and bandwidth, set 4 NSS for both Tx and
+ * Rx
+ */
+ .eht_mcs_nss_supp = {
+ /*
+ * As B1 and B2 are set in the supported
+ * channel width set field in the HE PHY
+ * capabilities information field, include all
+ * the following MCS/NSS.
+ */
+ .bw._80 = {
+ .rx_tx_mcs9_max_nss = 0x44,
+ .rx_tx_mcs11_max_nss = 0x44,
+ .rx_tx_mcs13_max_nss = 0x44,
+ },
+ .bw._160 = {
+ .rx_tx_mcs9_max_nss = 0x44,
+ .rx_tx_mcs11_max_nss = 0x44,
+ .rx_tx_mcs13_max_nss = 0x44,
+ },
+ },
+ /* PPE threshold information is not supported */
+ },
+ },
+};
+
+int t_sdata_init(struct kunit_resource *resource, void *ctx)
+{
+ struct kunit *test = kunit_get_current_test();
+ struct t_sdata *t_sdata;
+
+ t_sdata = kzalloc(sizeof(*t_sdata), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, t_sdata);
+
+ resource->data = t_sdata;
+ resource->name = "sdata";
+
+ t_sdata->sdata = kzalloc(sizeof(*t_sdata->sdata), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, t_sdata->sdata);
+
+ t_sdata->wiphy = kzalloc(sizeof(*t_sdata->wiphy), GFP_KERNEL);
+ KUNIT_ASSERT_NOT_NULL(test, t_sdata->wiphy);
+
+ strscpy(t_sdata->sdata->name, "kunit");
+
+ t_sdata->sdata->local = &t_sdata->local;
+ t_sdata->sdata->local->hw.wiphy = t_sdata->wiphy;
+ t_sdata->sdata->wdev.wiphy = t_sdata->wiphy;
+ t_sdata->sdata->vif.type = NL80211_IFTYPE_STATION;
+
+ t_sdata->sdata->deflink.sdata = t_sdata->sdata;
+ t_sdata->sdata->deflink.link_id = 0;
+
+ t_sdata->wiphy->bands[NL80211_BAND_2GHZ] = &t_sdata->band_2ghz;
+ t_sdata->wiphy->bands[NL80211_BAND_5GHZ] = &t_sdata->band_5ghz;
+
+ for (int band = NL80211_BAND_2GHZ; band <= NL80211_BAND_5GHZ; band++) {
+ struct ieee80211_supported_band *sband;
+
+ sband = t_sdata->wiphy->bands[band];
+ sband->band = band;
+
+ sband->bitrates =
+ kmemdup(bitrates, sizeof(bitrates), GFP_KERNEL);
+ sband->n_bitrates = ARRAY_SIZE(bitrates);
+
+ /* Initialize channels; feel free to add more channels/bands */
+ switch (band) {
+ case NL80211_BAND_2GHZ:
+ sband->channels = kmemdup(channels_2ghz,
+ sizeof(channels_2ghz),
+ GFP_KERNEL);
+ sband->n_channels = ARRAY_SIZE(channels_2ghz);
+ sband->bitrates = kmemdup(bitrates,
+ sizeof(bitrates),
+ GFP_KERNEL);
+ sband->n_bitrates = ARRAY_SIZE(bitrates);
+ break;
+ case NL80211_BAND_5GHZ:
+ sband->channels = kmemdup(channels_5ghz,
+ sizeof(channels_5ghz),
+ GFP_KERNEL);
+ sband->n_channels = ARRAY_SIZE(channels_5ghz);
+ sband->bitrates = kmemdup(bitrates,
+ sizeof(bitrates),
+ GFP_KERNEL);
+ sband->n_bitrates = ARRAY_SIZE(bitrates);
+
+ sband->vht_cap.vht_supported = true;
+ sband->vht_cap.cap =
+ IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 |
+ IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ |
+ IEEE80211_VHT_CAP_RXLDPC |
+ IEEE80211_VHT_CAP_SHORT_GI_80 |
+ IEEE80211_VHT_CAP_SHORT_GI_160 |
+ IEEE80211_VHT_CAP_TXSTBC |
+ IEEE80211_VHT_CAP_RXSTBC_4 |
+ IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK;
+ sband->vht_cap.vht_mcs.rx_mcs_map =
+ cpu_to_le16(IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 |
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 2 |
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 4 |
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 6 |
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 8 |
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 10 |
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 12 |
+ IEEE80211_VHT_MCS_SUPPORT_0_9 << 14);
+ sband->vht_cap.vht_mcs.tx_mcs_map =
+ sband->vht_cap.vht_mcs.rx_mcs_map;
+ break;
+ default:
+ continue;
+ }
+
+ sband->ht_cap.ht_supported = band != NL80211_BAND_6GHZ;
+ sband->ht_cap.cap = IEEE80211_HT_CAP_SUP_WIDTH_20_40 |
+ IEEE80211_HT_CAP_GRN_FLD |
+ IEEE80211_HT_CAP_SGI_20 |
+ IEEE80211_HT_CAP_SGI_40 |
+ IEEE80211_HT_CAP_DSSSCCK40;
+ sband->ht_cap.ampdu_factor = 0x3;
+ sband->ht_cap.ampdu_density = 0x6;
+ memset(&sband->ht_cap.mcs, 0, sizeof(sband->ht_cap.mcs));
+ sband->ht_cap.mcs.rx_mask[0] = 0xff;
+ sband->ht_cap.mcs.rx_mask[1] = 0xff;
+ sband->ht_cap.mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED;
+ }
+
+ ieee80211_set_sband_iftype_data(&t_sdata->band_5ghz, sband_capa_5ghz);
+
+ return 0;
+}
+
+void t_sdata_exit(struct kunit_resource *resource)
+{
+ struct t_sdata *t_sdata = resource->data;
+
+ kfree(t_sdata->band_2ghz.channels);
+ kfree(t_sdata->band_2ghz.bitrates);
+ kfree(t_sdata->band_5ghz.channels);
+ kfree(t_sdata->band_5ghz.bitrates);
+
+ kfree(t_sdata->sdata);
+ kfree(t_sdata->wiphy);
+
+ kfree(t_sdata);
+}
diff --git a/net/mac80211/tests/util.h b/net/mac80211/tests/util.h
new file mode 100644
index 000000000000..6615880c123f
--- /dev/null
+++ b/net/mac80211/tests/util.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Utilities for mac80211 unit testing
+ *
+ * Copyright (C) 2024 Intel Corporation
+ */
+#ifndef __MAC80211_UTILS_H
+#define __MAC80211_UTILS_H
+
+#include "../ieee80211_i.h"
+
+struct t_sdata {
+ struct ieee80211_sub_if_data *sdata;
+ struct wiphy *wiphy;
+ struct ieee80211_local local;
+
+ void *ctx;
+
+ struct ieee80211_supported_band band_2ghz;
+ struct ieee80211_supported_band band_5ghz;
+};
+
+#define T_SDATA(test) ({ \
+ struct t_sdata *__t_sdata = \
+ kunit_alloc_resource(test, t_sdata_init, \
+ t_sdata_exit, \
+ GFP_KERNEL, NULL); \
+ \
+ KUNIT_ASSERT_NOT_NULL(test, __t_sdata); \
+ __t_sdata; \
+ })
+
+int t_sdata_init(struct kunit_resource *resource, void *data);
+void t_sdata_exit(struct kunit_resource *resource);
+
+#endif /* __MAC80211_UTILS_H */
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 7a4985fc2b16..72fad8ea8bb9 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -2173,13 +2173,14 @@ DEFINE_EVENT(chanswitch_evt, drv_channel_switch_rx_beacon,
TRACE_EVENT(drv_get_txpower,
TP_PROTO(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
- int dbm, int ret),
+ unsigned int link_id, int dbm, int ret),
- TP_ARGS(local, sdata, dbm, ret),
+ TP_ARGS(local, sdata, link_id, dbm, ret),
TP_STRUCT__entry(
LOCAL_ENTRY
VIF_ENTRY
+ __field(unsigned int, link_id)
__field(int, dbm)
__field(int, ret)
),
@@ -2187,13 +2188,14 @@ TRACE_EVENT(drv_get_txpower,
TP_fast_assign(
LOCAL_ASSIGN;
VIF_ASSIGN;
+ __entry->link_id = link_id;
__entry->dbm = dbm;
__entry->ret = ret;
),
TP_printk(
- LOCAL_PR_FMT VIF_PR_FMT " dbm:%d ret:%d",
- LOCAL_PR_ARG, VIF_PR_ARG, __entry->dbm, __entry->ret
+ LOCAL_PR_FMT VIF_PR_FMT " link_id:%d dbm:%d ret:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, __entry->link_id, __entry->dbm, __entry->ret
)
);
@@ -2588,6 +2590,45 @@ TRACE_EVENT(drv_change_sta_links,
* Tracing for API calls that drivers call.
*/
+TRACE_EVENT(api_return_bool,
+ TP_PROTO(struct ieee80211_local *local, bool result),
+
+ TP_ARGS(local, result),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ __field(bool, result)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ __entry->result = result;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " result=%d",
+ LOCAL_PR_ARG, __entry->result
+ )
+);
+
+TRACE_EVENT(api_return_void,
+ TP_PROTO(struct ieee80211_local *local),
+
+ TP_ARGS(local),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT, LOCAL_PR_ARG
+ )
+);
+
TRACE_EVENT(api_start_tx_ba_session,
TP_PROTO(struct ieee80211_sta *sta, u16 tid),
@@ -3052,6 +3093,65 @@ TRACE_EVENT(api_request_smps,
)
);
+TRACE_EVENT(api_prepare_rx_omi_bw,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct link_sta_info *link_sta,
+ enum ieee80211_sta_rx_bandwidth bw),
+
+ TP_ARGS(local, sdata, link_sta, bw),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(int, link_id)
+ __field(u32, bw)
+ __field(bool, result)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_NAMED_ASSIGN(link_sta->sta);
+ __entry->link_id = link_sta->link_id;
+ __entry->bw = bw;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " " VIF_PR_FMT " " STA_PR_FMT " link:%d, bw:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG,
+ __entry->link_id, __entry->bw
+ )
+);
+
+TRACE_EVENT(api_finalize_rx_omi_bw,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct link_sta_info *link_sta),
+
+ TP_ARGS(local, sdata, link_sta),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(int, link_id)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_NAMED_ASSIGN(link_sta->sta);
+ __entry->link_id = link_sta->link_id;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT " " VIF_PR_FMT " " STA_PR_FMT " link:%d",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id
+ )
+);
+
/*
* Tracing for internal functions
* (which may also be called in response to driver calls)
@@ -3059,49 +3159,55 @@ TRACE_EVENT(api_request_smps,
TRACE_EVENT(wake_queue,
TP_PROTO(struct ieee80211_local *local, u16 queue,
- enum queue_stop_reason reason),
+ enum queue_stop_reason reason, int refcount),
- TP_ARGS(local, queue, reason),
+ TP_ARGS(local, queue, reason, refcount),
TP_STRUCT__entry(
LOCAL_ENTRY
__field(u16, queue)
__field(u32, reason)
+ __field(int, refcount)
),
TP_fast_assign(
LOCAL_ASSIGN;
__entry->queue = queue;
__entry->reason = reason;
+ __entry->refcount = refcount;
),
TP_printk(
- LOCAL_PR_FMT " queue:%d, reason:%d",
- LOCAL_PR_ARG, __entry->queue, __entry->reason
+ LOCAL_PR_FMT " queue:%d, reason:%d, refcount: %d",
+ LOCAL_PR_ARG, __entry->queue, __entry->reason,
+ __entry->refcount
)
);
TRACE_EVENT(stop_queue,
TP_PROTO(struct ieee80211_local *local, u16 queue,
- enum queue_stop_reason reason),
+ enum queue_stop_reason reason, int refcount),
- TP_ARGS(local, queue, reason),
+ TP_ARGS(local, queue, reason, refcount),
TP_STRUCT__entry(
LOCAL_ENTRY
__field(u16, queue)
__field(u32, reason)
+ __field(int, refcount)
),
TP_fast_assign(
LOCAL_ASSIGN;
__entry->queue = queue;
__entry->reason = reason;
+ __entry->refcount = refcount;
),
TP_printk(
- LOCAL_PR_FMT " queue:%d, reason:%d",
- LOCAL_PR_ARG, __entry->queue, __entry->reason
+ LOCAL_PR_FMT " queue:%d, reason:%d, refcount: %d",
+ LOCAL_PR_ARG, __entry->queue, __entry->reason,
+ __entry->refcount
)
);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index dc0b74443c8d..fdda14c08e2b 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -6,7 +6,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2024 Intel Corporation
+ * Copyright (C) 2018-2025 Intel Corporation
*
* utilities for mac80211
*/
@@ -437,8 +437,6 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
{
struct ieee80211_local *local = hw_to_local(hw);
- trace_wake_queue(local, queue, reason);
-
if (WARN_ON(queue >= hw->queues))
return;
@@ -456,6 +454,9 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
if (local->q_stop_reasons[queue][reason] == 0)
__clear_bit(reason, &local->queue_stop_reasons[queue]);
+ trace_wake_queue(local, queue, reason,
+ local->q_stop_reasons[queue][reason]);
+
if (local->queue_stop_reasons[queue] != 0)
/* someone still has this queue stopped */
return;
@@ -502,8 +503,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
{
struct ieee80211_local *local = hw_to_local(hw);
- trace_stop_queue(local, queue, reason);
-
if (WARN_ON(queue >= hw->queues))
return;
@@ -512,6 +511,9 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue,
else
local->q_stop_reasons[queue][reason]++;
+ trace_stop_queue(local, queue, reason,
+ local->q_stop_reasons[queue][reason]);
+
set_bit(reason, &local->queue_stop_reasons[queue]);
}
@@ -685,7 +687,7 @@ void __ieee80211_flush_queues(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
unsigned int queues, bool drop)
{
- if (!local->ops->flush)
+ if (!local->ops->flush && !drop)
return;
/*
@@ -712,7 +714,8 @@ void __ieee80211_flush_queues(struct ieee80211_local *local,
}
}
- drv_flush(local, sdata, queues, drop);
+ if (local->ops->flush)
+ drv_flush(local, sdata, queues, drop);
ieee80211_wake_queues_by_reason(&local->hw, queues,
IEEE80211_QUEUE_STOP_REASON_FLUSH,
@@ -2190,8 +2193,10 @@ int ieee80211_reconfig(struct ieee80211_local *local)
ieee80211_reconfig_roc(local);
/* Requeue all works */
- list_for_each_entry(sdata, &local->interfaces, list)
- wiphy_work_queue(local->hw.wiphy, &sdata->work);
+ list_for_each_entry(sdata, &local->interfaces, list) {
+ if (ieee80211_sdata_running(sdata))
+ wiphy_work_queue(local->hw.wiphy, &sdata->work);
+ }
}
ieee80211_wake_queues_by_reason(hw, IEEE80211_MAX_QUEUE_MAP,
@@ -2748,6 +2753,7 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef)
{
struct ieee80211_he_operation *he_oper;
struct ieee80211_he_6ghz_oper *he_6ghz_op;
+ struct cfg80211_chan_def he_chandef;
u32 he_oper_params;
u8 ie_len = 1 + sizeof(struct ieee80211_he_operation);
@@ -2779,27 +2785,33 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef)
if (chandef->chan->band != NL80211_BAND_6GHZ)
goto out;
+ cfg80211_chandef_create(&he_chandef, chandef->chan, NL80211_CHAN_NO_HT);
+ he_chandef.center_freq1 = chandef->center_freq1;
+ he_chandef.center_freq2 = chandef->center_freq2;
+ he_chandef.width = chandef->width;
+
/* TODO add VHT operational */
he_6ghz_op = (struct ieee80211_he_6ghz_oper *)pos;
he_6ghz_op->minrate = 6; /* 6 Mbps */
he_6ghz_op->primary =
- ieee80211_frequency_to_channel(chandef->chan->center_freq);
+ ieee80211_frequency_to_channel(he_chandef.chan->center_freq);
he_6ghz_op->ccfs0 =
- ieee80211_frequency_to_channel(chandef->center_freq1);
- if (chandef->center_freq2)
+ ieee80211_frequency_to_channel(he_chandef.center_freq1);
+ if (he_chandef.center_freq2)
he_6ghz_op->ccfs1 =
- ieee80211_frequency_to_channel(chandef->center_freq2);
+ ieee80211_frequency_to_channel(he_chandef.center_freq2);
else
he_6ghz_op->ccfs1 = 0;
- switch (chandef->width) {
+ switch (he_chandef.width) {
case NL80211_CHAN_WIDTH_320:
- /*
- * TODO: mesh operation is not defined over 6GHz 320 MHz
- * channels.
+ /* Downgrade EHT 320 MHz BW to 160 MHz for HE and set new
+ * center_freq1
*/
- WARN_ON(1);
- break;
+ ieee80211_chandef_downgrade(&he_chandef, NULL);
+ he_6ghz_op->ccfs0 =
+ ieee80211_frequency_to_channel(he_chandef.center_freq1);
+ fallthrough;
case NL80211_CHAN_WIDTH_160:
/* Convert 160 MHz channel width to new style as interop
* workaround.
@@ -2807,7 +2819,7 @@ u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef)
he_6ghz_op->control =
IEEE80211_HE_6GHZ_OPER_CTRL_CHANWIDTH_160MHZ;
he_6ghz_op->ccfs1 = he_6ghz_op->ccfs0;
- if (chandef->chan->center_freq < chandef->center_freq1)
+ if (he_chandef.chan->center_freq < he_chandef.center_freq1)
he_6ghz_op->ccfs0 -= 8;
else
he_6ghz_op->ccfs0 += 8;
@@ -3647,31 +3659,6 @@ again:
WARN_ON_ONCE(!cfg80211_chandef_valid(c));
}
-/*
- * Returns true if smps_mode_new is strictly more restrictive than
- * smps_mode_old.
- */
-bool ieee80211_smps_is_restrictive(enum ieee80211_smps_mode smps_mode_old,
- enum ieee80211_smps_mode smps_mode_new)
-{
- if (WARN_ON_ONCE(smps_mode_old == IEEE80211_SMPS_AUTOMATIC ||
- smps_mode_new == IEEE80211_SMPS_AUTOMATIC))
- return false;
-
- switch (smps_mode_old) {
- case IEEE80211_SMPS_STATIC:
- return false;
- case IEEE80211_SMPS_DYNAMIC:
- return smps_mode_new == IEEE80211_SMPS_STATIC;
- case IEEE80211_SMPS_OFF:
- return smps_mode_new != IEEE80211_SMPS_OFF;
- default:
- WARN_ON(1);
- }
-
- return false;
-}
-
int ieee80211_send_action_csa(struct ieee80211_sub_if_data *sdata,
struct cfg80211_csa_settings *csa_settings)
{
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index 6a20fa099190..c5c5d16ed6c8 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -350,9 +350,9 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
}
/* FIXME: move this to some better location - parses HE/EHT now */
-enum ieee80211_sta_rx_bandwidth
-_ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta,
- struct cfg80211_chan_def *chandef)
+static enum ieee80211_sta_rx_bandwidth
+__ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta,
+ struct cfg80211_chan_def *chandef)
{
unsigned int link_id = link_sta->link_id;
struct ieee80211_sub_if_data *sdata = link_sta->sta->sdata;
@@ -423,6 +423,28 @@ _ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta,
return IEEE80211_STA_RX_BW_80;
}
+enum ieee80211_sta_rx_bandwidth
+_ieee80211_sta_cap_rx_bw(struct link_sta_info *link_sta,
+ struct cfg80211_chan_def *chandef)
+{
+ /*
+ * With RX OMI, also pretend that the STA's capability changed.
+ * Of course this isn't really true; it didn't change, only our
+ * RX capability was changed by notifying RX OMI to the STA.
+ * The purpose, however, is to save power, and that requires
+ * also changing transmissions to the AP and the chanctx. The
+ * transmissions depend on link_sta->bandwidth which is set in
+ * _ieee80211_sta_cur_vht_bw() below, but the chanctx depends
+ * on the result of this function which is also called by
+ * _ieee80211_sta_cur_vht_bw(), so we need to do that here as
+ * well. This is sufficient for the steady state, but during
+ * the transition we already need to change TX/RX separately,
+ * so _ieee80211_sta_cur_vht_bw() below applies the _tx one.
+ */
+ return min(__ieee80211_sta_cap_rx_bw(link_sta, chandef),
+ link_sta->rx_omi_bw_rx);
+}
+
enum nl80211_chan_width
ieee80211_sta_cap_chan_bw(struct link_sta_info *link_sta)
{
@@ -503,8 +525,11 @@ _ieee80211_sta_cur_vht_bw(struct link_sta_info *link_sta,
rcu_read_unlock();
}
- bw = _ieee80211_sta_cap_rx_bw(link_sta, chandef);
+ /* intentionally do not take rx_omi_bw_rx into account */
+ bw = __ieee80211_sta_cap_rx_bw(link_sta, chandef);
bw = min(bw, link_sta->cur_max_bandwidth);
+ /* but do apply rx_omi_bw_tx */
+ bw = min(bw, link_sta->rx_omi_bw_tx);
/* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of
* IEEE80211-2016 specification makes higher bandwidth operation
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index 08dd521a51a5..8f2bff268392 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -194,9 +194,6 @@ int ieee802154_mlme_tx_locked(struct ieee802154_local *local,
struct ieee802154_sub_if_data *sdata,
struct sk_buff *skb);
void ieee802154_mlme_op_post(struct ieee802154_local *local);
-int ieee802154_mlme_tx_one(struct ieee802154_local *local,
- struct ieee802154_sub_if_data *sdata,
- struct sk_buff *skb);
int ieee802154_mlme_tx_one_locked(struct ieee802154_local *local,
struct ieee802154_sub_if_data *sdata,
struct sk_buff *skb);
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 337d6faf0d2a..4d13f18f6f2c 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -178,19 +178,6 @@ void ieee802154_mlme_op_post(struct ieee802154_local *local)
ieee802154_release_queue(local);
}
-int ieee802154_mlme_tx_one(struct ieee802154_local *local,
- struct ieee802154_sub_if_data *sdata,
- struct sk_buff *skb)
-{
- int ret;
-
- ieee802154_mlme_op_pre(local);
- ret = ieee802154_mlme_tx(local, sdata, skb);
- ieee802154_mlme_op_post(local);
-
- return ret;
-}
-
int ieee802154_mlme_tx_one_locked(struct ieee802154_local *local,
struct ieee802154_sub_if_data *sdata,
struct sk_buff *skb)
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index f6de136008f6..57850d4dac5d 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -630,6 +630,9 @@ static int mctp_sk_hash(struct sock *sk)
{
struct net *net = sock_net(sk);
+ /* Bind lookup runs under RCU; the sock must remain live during that. */
+ sock_set_flag(sk, SOCK_RCU_FREE);
+
mutex_lock(&net->mctp.bind_lock);
sk_add_node_rcu(sk, &net->mctp.binds);
mutex_unlock(&net->mctp.bind_lock);
diff --git a/net/mctp/device.c b/net/mctp/device.c
index 26ce34b7e88e..8e0724c56723 100644
--- a/net/mctp/device.c
+++ b/net/mctp/device.c
@@ -20,8 +20,7 @@
#include <net/sock.h>
struct mctp_dump_cb {
- int h;
- int idx;
+ unsigned long ifindex;
size_t a_idx;
};
@@ -115,43 +114,29 @@ static int mctp_dump_addrinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
struct mctp_dump_cb *mcb = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
- struct hlist_head *head;
struct net_device *dev;
struct ifaddrmsg *hdr;
struct mctp_dev *mdev;
- int ifindex;
- int idx = 0, rc;
+ int ifindex, rc;
hdr = nlmsg_data(cb->nlh);
// filter by ifindex if requested
ifindex = hdr->ifa_index;
rcu_read_lock();
- for (; mcb->h < NETDEV_HASHENTRIES; mcb->h++, mcb->idx = 0) {
- idx = 0;
- head = &net->dev_index_head[mcb->h];
- hlist_for_each_entry_rcu(dev, head, index_hlist) {
- if (idx >= mcb->idx &&
- (ifindex == 0 || ifindex == dev->ifindex)) {
- mdev = __mctp_dev_get(dev);
- if (mdev) {
- rc = mctp_dump_dev_addrinfo(mdev,
- skb, cb);
- mctp_dev_put(mdev);
- // Error indicates full buffer, this
- // callback will get retried.
- if (rc < 0)
- goto out;
- }
- }
- idx++;
- // reset for next iteration
- mcb->a_idx = 0;
- }
+ for_each_netdev_dump(net, dev, mcb->ifindex) {
+ if (ifindex && ifindex != dev->ifindex)
+ continue;
+ mdev = __mctp_dev_get(dev);
+ if (!mdev)
+ continue;
+ rc = mctp_dump_dev_addrinfo(mdev, skb, cb);
+ mctp_dev_put(mdev);
+ if (rc < 0)
+ break;
+ mcb->a_idx = 0;
}
-out:
rcu_read_unlock();
- mcb->idx = idx;
return skb->len;
}
@@ -531,9 +516,12 @@ static struct notifier_block mctp_dev_nb = {
};
static const struct rtnl_msg_handler mctp_device_rtnl_msg_handlers[] = {
- {THIS_MODULE, PF_MCTP, RTM_NEWADDR, mctp_rtm_newaddr, NULL, 0},
- {THIS_MODULE, PF_MCTP, RTM_DELADDR, mctp_rtm_deladdr, NULL, 0},
- {THIS_MODULE, PF_MCTP, RTM_GETADDR, NULL, mctp_dump_addrinfo, 0},
+ {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_NEWADDR,
+ .doit = mctp_rtm_newaddr},
+ {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_DELADDR,
+ .doit = mctp_rtm_deladdr},
+ {.owner = THIS_MODULE, .protocol = PF_MCTP, .msgtype = RTM_GETADDR,
+ .dumpit = mctp_dump_addrinfo},
};
int __init mctp_device_init(void)
diff --git a/net/mctp/route.c b/net/mctp/route.c
index 3f2bd65ff5e3..4c460160914f 100644
--- a/net/mctp/route.c
+++ b/net/mctp/route.c
@@ -332,8 +332,14 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
& MCTP_HDR_SEQ_MASK;
if (!key->reasm_head) {
- key->reasm_head = skb;
- key->reasm_tailp = &(skb_shinfo(skb)->frag_list);
+ /* Since we're manipulating the shared frag_list, ensure it isn't
+ * shared with any other SKBs.
+ */
+ key->reasm_head = skb_unshare(skb, GFP_ATOMIC);
+ if (!key->reasm_head)
+ return -ENOMEM;
+
+ key->reasm_tailp = &(skb_shinfo(key->reasm_head)->frag_list);
key->last_seq = this_seq;
return 0;
}
diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
index 17165b86ce22..06c1897b685a 100644
--- a/net/mctp/test/route-test.c
+++ b/net/mctp/test/route-test.c
@@ -921,6 +921,114 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test)
__mctp_route_test_fini(test, dev, rt, sock);
}
+/* Input route to socket, using a fragmented message created from clones.
+ */
+static void mctp_test_route_input_cloned_frag(struct kunit *test)
+{
+ /* 5 packet fragments, forming 2 complete messages */
+ const struct mctp_hdr hdrs[5] = {
+ RX_FRAG(FL_S, 0),
+ RX_FRAG(0, 1),
+ RX_FRAG(FL_E, 2),
+ RX_FRAG(FL_S, 0),
+ RX_FRAG(FL_E, 1),
+ };
+ struct mctp_test_route *rt;
+ struct mctp_test_dev *dev;
+ struct sk_buff *skb[5];
+ struct sk_buff *rx_skb;
+ struct socket *sock;
+ size_t data_len;
+ u8 compare[100];
+ u8 flat[100];
+ size_t total;
+ void *p;
+ int rc;
+
+ /* Arbitrary length */
+ data_len = 3;
+ total = data_len + sizeof(struct mctp_hdr);
+
+ __mctp_route_test_init(test, &dev, &rt, &sock, MCTP_NET_ANY);
+
+ /* Create a single skb initially with concatenated packets */
+ skb[0] = mctp_test_create_skb(&hdrs[0], 5 * total);
+ mctp_test_skb_set_dev(skb[0], dev);
+ memset(skb[0]->data, 0 * 0x11, skb[0]->len);
+ memcpy(skb[0]->data, &hdrs[0], sizeof(struct mctp_hdr));
+
+ /* Extract and populate packets */
+ for (int i = 1; i < 5; i++) {
+ skb[i] = skb_clone(skb[i - 1], GFP_ATOMIC);
+ KUNIT_ASSERT_TRUE(test, skb[i]);
+ p = skb_pull(skb[i], total);
+ KUNIT_ASSERT_TRUE(test, p);
+ skb_reset_network_header(skb[i]);
+ memcpy(skb[i]->data, &hdrs[i], sizeof(struct mctp_hdr));
+ memset(&skb[i]->data[sizeof(struct mctp_hdr)], i * 0x11, data_len);
+ }
+ for (int i = 0; i < 5; i++)
+ skb_trim(skb[i], total);
+
+ /* SOM packets have a type byte to match the socket */
+ skb[0]->data[4] = 0;
+ skb[3]->data[4] = 0;
+
+ skb_dump("pkt1 ", skb[0], false);
+ skb_dump("pkt2 ", skb[1], false);
+ skb_dump("pkt3 ", skb[2], false);
+ skb_dump("pkt4 ", skb[3], false);
+ skb_dump("pkt5 ", skb[4], false);
+
+ for (int i = 0; i < 5; i++) {
+ KUNIT_EXPECT_EQ(test, refcount_read(&skb[i]->users), 1);
+ /* Take a reference so we can check refcounts at the end */
+ skb_get(skb[i]);
+ }
+
+ /* Feed the fragments into MCTP core */
+ for (int i = 0; i < 5; i++) {
+ rc = mctp_route_input(&rt->rt, skb[i]);
+ KUNIT_EXPECT_EQ(test, rc, 0);
+ }
+
+ /* Receive first reassembled message */
+ rx_skb = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc);
+ KUNIT_EXPECT_EQ(test, rc, 0);
+ KUNIT_EXPECT_EQ(test, rx_skb->len, 3 * data_len);
+ rc = skb_copy_bits(rx_skb, 0, flat, rx_skb->len);
+ for (int i = 0; i < rx_skb->len; i++)
+ compare[i] = (i / data_len) * 0x11;
+ /* Set type byte */
+ compare[0] = 0;
+
+ KUNIT_EXPECT_MEMEQ(test, flat, compare, rx_skb->len);
+ KUNIT_EXPECT_EQ(test, refcount_read(&rx_skb->users), 1);
+ kfree_skb(rx_skb);
+
+ /* Receive second reassembled message */
+ rx_skb = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc);
+ KUNIT_EXPECT_EQ(test, rc, 0);
+ KUNIT_EXPECT_EQ(test, rx_skb->len, 2 * data_len);
+ rc = skb_copy_bits(rx_skb, 0, flat, rx_skb->len);
+ for (int i = 0; i < rx_skb->len; i++)
+ compare[i] = (i / data_len + 3) * 0x11;
+ /* Set type byte */
+ compare[0] = 0;
+
+ KUNIT_EXPECT_MEMEQ(test, flat, compare, rx_skb->len);
+ KUNIT_EXPECT_EQ(test, refcount_read(&rx_skb->users), 1);
+ kfree_skb(rx_skb);
+
+ /* Check input skb refcounts */
+ for (int i = 0; i < 5; i++) {
+ KUNIT_EXPECT_EQ(test, refcount_read(&skb[i]->users), 1);
+ kfree_skb(skb[i]);
+ }
+
+ __mctp_route_test_fini(test, dev, rt, sock);
+}
+
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
static void mctp_test_flow_init(struct kunit *test,
@@ -1144,6 +1252,7 @@ static struct kunit_case mctp_test_cases[] = {
KUNIT_CASE(mctp_test_packet_flow),
KUNIT_CASE(mctp_test_fragment_flow),
KUNIT_CASE(mctp_test_route_output_key_create),
+ KUNIT_CASE(mctp_test_route_input_cloned_frag),
{}
};
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index dd595d9b5e50..2dd81e6c26bd 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -32,6 +32,7 @@ struct mptcp_pernet {
unsigned int close_timeout;
unsigned int stale_loss_cnt;
atomic_t active_disable_times;
+ u8 syn_retrans_before_tcp_fallback;
unsigned long active_disable_stamp;
u8 mptcp_enabled;
u8 checksum_enabled;
@@ -92,6 +93,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->mptcp_enabled = 1;
pernet->add_addr_timeout = TCP_RTO_MAX;
pernet->blackhole_timeout = 3600;
+ pernet->syn_retrans_before_tcp_fallback = 2;
atomic_set(&pernet->active_disable_times, 0);
pernet->close_timeout = TCP_TIMEWAIT_LEN;
pernet->checksum_enabled = 0;
@@ -245,6 +247,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.proc_handler = proc_blackhole_detect_timeout,
.extra1 = SYSCTL_ZERO,
},
+ {
+ .procname = "syn_retrans_before_tcp_fallback",
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ },
};
static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
@@ -269,6 +277,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
/* table[7] is for available_schedulers which is read-only info */
table[8].data = &pernet->close_timeout;
table[9].data = &pernet->blackhole_timeout;
+ table[10].data = &pernet->syn_retrans_before_tcp_fallback;
hdr = register_net_sysctl_sz(net, MPTCP_SYSCTL_PATH, table,
ARRAY_SIZE(mptcp_sysctl_table));
@@ -392,17 +401,21 @@ void mptcp_active_enable(struct sock *sk)
void mptcp_active_detect_blackhole(struct sock *ssk, bool expired)
{
struct mptcp_subflow_context *subflow;
- u32 timeouts;
if (!sk_is_mptcp(ssk))
return;
- timeouts = inet_csk(ssk)->icsk_retransmits;
subflow = mptcp_subflow_ctx(ssk);
if (subflow->request_mptcp && ssk->sk_state == TCP_SYN_SENT) {
- if (timeouts == 2 || (timeouts < 2 && expired)) {
- MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_MPCAPABLEACTIVEDROP);
+ struct net *net = sock_net(ssk);
+ u8 timeouts, to_max;
+
+ timeouts = inet_csk(ssk)->icsk_retransmits;
+ to_max = mptcp_get_pernet(net)->syn_retrans_before_tcp_fallback;
+
+ if (timeouts == to_max || (timeouts < to_max && expired)) {
+ MPTCP_INC_STATS(net, MPTCP_MIB_MPCAPABLEACTIVEDROP);
subflow->mpc_drop = 1;
mptcp_subflow_early_fallback(mptcp_sk(subflow->conn), subflow);
}
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index fd2de185bc93..23949ae2a3a8 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -651,6 +651,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
struct mptcp_sock *msk = mptcp_sk(subflow->conn);
bool drop_other_suboptions = false;
unsigned int opt_size = *size;
+ struct mptcp_addr_info addr;
bool echo;
int len;
@@ -659,7 +660,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
*/
if (!mptcp_pm_should_add_signal(msk) ||
(opts->suboptions & (OPTION_MPTCP_MPJ_ACK | OPTION_MPTCP_MPC_ACK)) ||
- !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &opts->addr,
+ !mptcp_pm_add_addr_signal(msk, skb, opt_size, remaining, &addr,
&echo, &drop_other_suboptions))
return false;
@@ -672,7 +673,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
else if (opts->suboptions & OPTION_MPTCP_DSS)
return false;
- len = mptcp_add_addr_len(opts->addr.family, echo, !!opts->addr.port);
+ len = mptcp_add_addr_len(addr.family, echo, !!addr.port);
if (remaining < len)
return false;
@@ -689,6 +690,7 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *
opts->ahmac = 0;
*size -= opt_size;
}
+ opts->addr = addr;
opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
if (!echo) {
MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX);
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 3f93ddab9408..7868207c4e9d 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -107,8 +107,8 @@ static void remote_address(const struct sock_common *skc,
#endif
}
-static bool lookup_subflow_by_saddr(const struct list_head *list,
- const struct mptcp_addr_info *saddr)
+bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
+ const struct mptcp_addr_info *saddr)
{
struct mptcp_subflow_context *subflow;
struct mptcp_addr_info cur;
@@ -1459,8 +1459,8 @@ out_free:
return ret;
}
-static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr)
+bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
{
struct mptcp_pm_add_entry *entry;
@@ -1488,7 +1488,7 @@ static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
- ret = remove_anno_list_by_saddr(msk, addr);
+ ret = mptcp_remove_anno_list_by_saddr(msk, addr);
if (ret || force) {
spin_lock_bh(&msk->pm.lock);
if (ret) {
@@ -1527,7 +1527,7 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
goto next;
lock_sock(sk);
- remove_subflow = lookup_subflow_by_saddr(&msk->conn_list, addr);
+ remove_subflow = mptcp_lookup_subflow_by_saddr(&msk->conn_list, addr);
mptcp_pm_remove_anno_addr(msk, addr, remove_subflow &&
!(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
@@ -1640,36 +1640,6 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
return ret;
}
-/* Called from the userspace PM only */
-void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list)
-{
- struct mptcp_rm_list alist = { .nr = 0 };
- struct mptcp_pm_addr_entry *entry;
- int anno_nr = 0;
-
- list_for_each_entry(entry, rm_list, list) {
- if (alist.nr >= MPTCP_RM_IDS_MAX)
- break;
-
- /* only delete if either announced or matching a subflow */
- if (remove_anno_list_by_saddr(msk, &entry->addr))
- anno_nr++;
- else if (!lookup_subflow_by_saddr(&msk->conn_list,
- &entry->addr))
- continue;
-
- alist.ids[alist.nr++] = entry->addr.id;
- }
-
- if (alist.nr) {
- spin_lock_bh(&msk->pm.lock);
- msk->pm.add_addr_signaled -= anno_nr;
- mptcp_pm_remove_addr(msk, &alist);
- spin_unlock_bh(&msk->pm.lock);
- }
-}
-
-/* Called from the in-kernel PM only */
static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk,
struct list_head *rm_list)
{
@@ -1678,11 +1648,11 @@ static void mptcp_pm_flush_addrs_and_subflows(struct mptcp_sock *msk,
list_for_each_entry(entry, rm_list, list) {
if (slist.nr < MPTCP_RM_IDS_MAX &&
- lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
+ mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
slist.ids[slist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
if (alist.nr < MPTCP_RM_IDS_MAX &&
- remove_anno_list_by_saddr(msk, &entry->addr))
+ mptcp_remove_anno_list_by_saddr(msk, &entry->addr))
alist.ids[alist.nr++] = mptcp_endp_get_local_id(msk, &entry->addr);
}
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index e35178f5205f..a3d477059b11 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -8,6 +8,10 @@
#include "mib.h"
#include "mptcp_pm_gen.h"
+#define mptcp_for_each_userspace_pm_addr(__msk, __entry) \
+ list_for_each_entry(__entry, \
+ &((__msk)->pm.userspace_pm_local_addr_list), list)
+
void mptcp_free_local_addr_list(struct mptcp_sock *msk)
{
struct mptcp_pm_addr_entry *entry, *tmp;
@@ -26,6 +30,19 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk)
}
}
+static struct mptcp_pm_addr_entry *
+mptcp_userspace_pm_lookup_addr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
+{
+ struct mptcp_pm_addr_entry *entry;
+
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
+ if (mptcp_addresses_equal(&entry->addr, addr, false))
+ return entry;
+ }
+ return NULL;
+}
+
static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
struct mptcp_pm_addr_entry *entry,
bool needs_id)
@@ -41,7 +58,7 @@ static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk,
bitmap_zero(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) {
+ mptcp_for_each_userspace_pm_addr(msk, e) {
addr_match = mptcp_addresses_equal(&e->addr, &entry->addr, true);
if (addr_match && entry->addr.id == 0 && needs_id)
entry->addr.id = e->addr.id;
@@ -90,22 +107,20 @@ append_err:
static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk,
struct mptcp_pm_addr_entry *addr)
{
- struct mptcp_pm_addr_entry *entry, *tmp;
struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *entry;
- list_for_each_entry_safe(entry, tmp, &msk->pm.userspace_pm_local_addr_list, list) {
- if (mptcp_addresses_equal(&entry->addr, &addr->addr, false)) {
- /* TODO: a refcount is needed because the entry can
- * be used multiple times (e.g. fullmesh mode).
- */
- list_del_rcu(&entry->list);
- sock_kfree_s(sk, entry, sizeof(*entry));
- msk->pm.local_addr_used--;
- return 0;
- }
- }
-
- return -EINVAL;
+ entry = mptcp_userspace_pm_lookup_addr(msk, &addr->addr);
+ if (!entry)
+ return -EINVAL;
+
+ /* TODO: a refcount is needed because the entry can
+ * be used multiple times (e.g. fullmesh mode).
+ */
+ list_del_rcu(&entry->list);
+ sock_kfree_s(sk, entry, sizeof(*entry));
+ msk->pm.local_addr_used--;
+ return 0;
}
static struct mptcp_pm_addr_entry *
@@ -113,7 +128,7 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
if (entry->addr.id == id)
return entry;
}
@@ -123,17 +138,12 @@ mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id)
int mptcp_userspace_pm_get_local_id(struct mptcp_sock *msk,
struct mptcp_addr_info *skc)
{
- struct mptcp_pm_addr_entry *entry = NULL, *e, new_entry;
+ struct mptcp_pm_addr_entry *entry = NULL, new_entry;
__be16 msk_sport = ((struct inet_sock *)
inet_sk((struct sock *)msk))->inet_sport;
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(e, &msk->pm.userspace_pm_local_addr_list, list) {
- if (mptcp_addresses_equal(&e->addr, skc, false)) {
- entry = e;
- break;
- }
- }
+ entry = mptcp_userspace_pm_lookup_addr(msk, skc);
spin_unlock_bh(&msk->pm.lock);
if (entry)
return entry->addr.id;
@@ -153,50 +163,60 @@ bool mptcp_userspace_pm_is_backup(struct mptcp_sock *msk,
struct mptcp_addr_info *skc)
{
struct mptcp_pm_addr_entry *entry;
- bool backup = false;
+ bool backup;
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
- if (mptcp_addresses_equal(&entry->addr, skc, false)) {
- backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
- break;
- }
- }
+ entry = mptcp_userspace_pm_lookup_addr(msk, skc);
+ backup = entry && !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
spin_unlock_bh(&msk->pm.lock);
return backup;
}
-int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info)
+static struct mptcp_sock *mptcp_userspace_pm_get_sock(const struct genl_info *info)
{
struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
+ struct mptcp_sock *msk;
+
+ if (!token) {
+ GENL_SET_ERR_MSG(info, "missing required token");
+ return NULL;
+ }
+
+ msk = mptcp_token_get_sock(genl_info_net(info), nla_get_u32(token));
+ if (!msk) {
+ NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ return NULL;
+ }
+
+ if (!mptcp_pm_is_userspace(msk)) {
+ GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
+ sock_put((struct sock *)msk);
+ return NULL;
+ }
+
+ return msk;
+}
+
+int mptcp_pm_nl_announce_doit(struct sk_buff *skb, struct genl_info *info)
+{
struct nlattr *addr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct mptcp_pm_addr_entry addr_val;
struct mptcp_sock *msk;
int err = -EINVAL;
struct sock *sk;
- u32 token_val;
- if (!addr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!addr) {
+ GENL_SET_ERR_MSG(info, "missing required address");
return err;
}
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto announce_err;
- }
-
err = mptcp_pm_parse_entry(addr, info, true, &addr_val);
if (err < 0) {
GENL_SET_ERR_MSG(info, "error parsing local address");
@@ -267,40 +287,48 @@ remove_err:
return err;
}
+void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry)
+{
+ struct mptcp_rm_list alist = { .nr = 0 };
+ int anno_nr = 0;
+
+ /* only delete if either announced or matching a subflow */
+ if (mptcp_remove_anno_list_by_saddr(msk, &entry->addr))
+ anno_nr++;
+ else if (!mptcp_lookup_subflow_by_saddr(&msk->conn_list, &entry->addr))
+ return;
+
+ alist.ids[alist.nr++] = entry->addr.id;
+
+ spin_lock_bh(&msk->pm.lock);
+ msk->pm.add_addr_signaled -= anno_nr;
+ mptcp_pm_remove_addr(msk, &alist);
+ spin_unlock_bh(&msk->pm.lock);
+}
+
int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info)
{
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID];
struct mptcp_pm_addr_entry *match;
- struct mptcp_pm_addr_entry *entry;
struct mptcp_sock *msk;
- LIST_HEAD(free_list);
int err = -EINVAL;
struct sock *sk;
- u32 token_val;
u8 id_val;
- if (!id || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!id) {
+ GENL_SET_ERR_MSG(info, "missing required ID");
return err;
}
id_val = nla_get_u8(id);
- token_val = nla_get_u32(token);
- msk = mptcp_token_get_sock(sock_net(skb->sk), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto out;
- }
-
if (id_val == 0) {
err = mptcp_userspace_pm_remove_id_zero_address(msk, info);
goto out;
@@ -317,16 +345,14 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info)
goto out;
}
- list_move(&match->list, &free_list);
+ list_del_rcu(&match->list);
spin_unlock_bh(&msk->pm.lock);
- mptcp_pm_remove_addrs(msk, &free_list);
+ mptcp_pm_remove_addr_entry(msk, match);
release_sock(sk);
- list_for_each_entry_safe(match, entry, &free_list, list) {
- sock_kfree_s(sk, match, sizeof(*match));
- }
+ sock_kfree_s(sk, match, sizeof(*match));
err = 0;
out:
@@ -337,7 +363,6 @@ out:
int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
struct mptcp_pm_addr_entry entry = { 0 };
struct mptcp_addr_info addr_r;
@@ -345,28 +370,18 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
struct mptcp_sock *msk;
int err = -EINVAL;
struct sock *sk;
- u32 token_val;
- if (!laddr || !raddr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!laddr || !raddr) {
+ GENL_SET_ERR_MSG(info, "missing required address(es)");
return err;
}
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(genl_info_net(info), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto create_err;
- }
-
err = mptcp_pm_parse_entry(laddr, info, true, &entry);
if (err < 0) {
NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
@@ -469,36 +484,25 @@ static struct sock *mptcp_nl_find_ssk(struct mptcp_sock *msk,
int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr *raddr = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *laddr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct mptcp_addr_info addr_l;
+ struct mptcp_pm_addr_entry addr_l;
struct mptcp_addr_info addr_r;
struct mptcp_sock *msk;
struct sock *sk, *ssk;
int err = -EINVAL;
- u32 token_val;
- if (!laddr || !raddr || !token) {
- GENL_SET_ERR_MSG(info, "missing required inputs");
+ if (!laddr || !raddr) {
+ GENL_SET_ERR_MSG(info, "missing required address(es)");
return err;
}
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(genl_info_net(info), token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return err;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto destroy_err;
- }
-
- err = mptcp_pm_parse_addr(laddr, info, &addr_l);
+ err = mptcp_pm_parse_entry(laddr, info, true, &addr_l);
if (err < 0) {
NL_SET_ERR_MSG_ATTR(info->extack, laddr, "error parsing local addr");
goto destroy_err;
@@ -511,43 +515,41 @@ int mptcp_pm_nl_subflow_destroy_doit(struct sk_buff *skb, struct genl_info *info
}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (addr_l.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) {
- ipv6_addr_set_v4mapped(addr_l.addr.s_addr, &addr_l.addr6);
- addr_l.family = AF_INET6;
+ if (addr_l.addr.family == AF_INET && ipv6_addr_v4mapped(&addr_r.addr6)) {
+ ipv6_addr_set_v4mapped(addr_l.addr.addr.s_addr, &addr_l.addr.addr6);
+ addr_l.addr.family = AF_INET6;
}
- if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr6)) {
+ if (addr_r.family == AF_INET && ipv6_addr_v4mapped(&addr_l.addr.addr6)) {
ipv6_addr_set_v4mapped(addr_r.addr.s_addr, &addr_r.addr6);
addr_r.family = AF_INET6;
}
#endif
- if (addr_l.family != addr_r.family) {
+ if (addr_l.addr.family != addr_r.family) {
GENL_SET_ERR_MSG(info, "address families do not match");
err = -EINVAL;
goto destroy_err;
}
- if (!addr_l.port || !addr_r.port) {
+ if (!addr_l.addr.port || !addr_r.port) {
GENL_SET_ERR_MSG(info, "missing local or remote port");
err = -EINVAL;
goto destroy_err;
}
lock_sock(sk);
- ssk = mptcp_nl_find_ssk(msk, &addr_l, &addr_r);
- if (ssk) {
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- struct mptcp_pm_addr_entry entry = { .addr = addr_l };
-
- spin_lock_bh(&msk->pm.lock);
- mptcp_userspace_pm_delete_local_addr(msk, &entry);
- spin_unlock_bh(&msk->pm.lock);
- mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
- mptcp_close_ssk(sk, ssk, subflow);
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
- err = 0;
- } else {
+ ssk = mptcp_nl_find_ssk(msk, &addr_l.addr, &addr_r);
+ if (!ssk) {
err = -ESRCH;
+ goto release_sock;
}
+
+ spin_lock_bh(&msk->pm.lock);
+ mptcp_userspace_pm_delete_local_addr(msk, &addr_l);
+ spin_unlock_bh(&msk->pm.lock);
+ mptcp_subflow_shutdown(sk, ssk, RCV_SHUTDOWN | SEND_SHUTDOWN);
+ mptcp_close_ssk(sk, ssk, mptcp_subflow_ctx(ssk));
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RMSUBFLOW);
+release_sock:
release_sock(sk);
destroy_err:
@@ -560,31 +562,19 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info)
struct mptcp_pm_addr_entry loc = { .addr = { .family = AF_UNSPEC }, };
struct mptcp_pm_addr_entry rem = { .addr = { .family = AF_UNSPEC }, };
struct nlattr *attr_rem = info->attrs[MPTCP_PM_ATTR_ADDR_REMOTE];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR];
- struct net *net = sock_net(skb->sk);
struct mptcp_pm_addr_entry *entry;
struct mptcp_sock *msk;
int ret = -EINVAL;
struct sock *sk;
- u32 token_val;
u8 bkup = 0;
- token_val = nla_get_u32(token);
-
- msk = mptcp_token_get_sock(net, token_val);
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return ret;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "userspace PM not selected");
- goto set_flags_err;
- }
-
ret = mptcp_pm_parse_entry(attr, info, false, &loc);
if (ret < 0)
goto set_flags_err;
@@ -606,13 +596,12 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info)
bkup = 1;
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
- if (mptcp_addresses_equal(&entry->addr, &loc.addr, false)) {
- if (bkup)
- entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
- else
- entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
- }
+ entry = mptcp_userspace_pm_lookup_addr(msk, &loc.addr);
+ if (entry) {
+ if (bkup)
+ entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP;
+ else
+ entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP;
}
spin_unlock_bh(&msk->pm.lock);
@@ -632,33 +621,23 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
DECLARE_BITMAP(map, MPTCP_PM_MAX_ADDR_ID + 1);
} *bitmap;
const struct genl_info *info = genl_info_dump(cb);
- struct net *net = sock_net(msg->sk);
struct mptcp_pm_addr_entry *entry;
struct mptcp_sock *msk;
- struct nlattr *token;
int ret = -EINVAL;
struct sock *sk;
void *hdr;
bitmap = (struct id_bitmap *)cb->ctx;
- token = info->attrs[MPTCP_PM_ATTR_TOKEN];
- msk = mptcp_token_get_sock(net, nla_get_u32(token));
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return ret;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto out;
- }
-
lock_sock(sk);
spin_lock_bh(&msk->pm.lock);
- list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) {
+ mptcp_for_each_userspace_pm_addr(msk, entry) {
if (test_bit(entry->addr.id, bitmap->map))
continue;
@@ -680,7 +659,6 @@ int mptcp_userspace_pm_dump_addr(struct sk_buff *msg,
release_sock(sk);
ret = msg->len;
-out:
sock_put(sk);
return ret;
}
@@ -689,28 +667,19 @@ int mptcp_userspace_pm_get_addr(struct sk_buff *skb,
struct genl_info *info)
{
struct nlattr *attr = info->attrs[MPTCP_PM_ENDPOINT_ADDR];
- struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN];
struct mptcp_pm_addr_entry addr, *entry;
- struct net *net = sock_net(skb->sk);
struct mptcp_sock *msk;
struct sk_buff *msg;
int ret = -EINVAL;
struct sock *sk;
void *reply;
- msk = mptcp_token_get_sock(net, nla_get_u32(token));
- if (!msk) {
- NL_SET_ERR_MSG_ATTR(info->extack, token, "invalid token");
+ msk = mptcp_userspace_pm_get_sock(info);
+ if (!msk)
return ret;
- }
sk = (struct sock *)msk;
- if (!mptcp_pm_is_userspace(msk)) {
- GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected");
- goto out;
- }
-
ret = mptcp_pm_parse_entry(attr, info, false, &addr);
if (ret < 0)
goto out;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index f910e840aa8f..6bd819047470 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -47,7 +47,7 @@ static void __mptcp_destroy_sock(struct sock *sk);
static void mptcp_check_send_data_fin(struct sock *sk);
DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
-static struct net_device mptcp_napi_dev;
+static struct net_device *mptcp_napi_dev;
/* Returns end sequence number of the receiver's advertised window */
static u64 mptcp_wnd_end(const struct mptcp_sock *msk)
@@ -4149,11 +4149,13 @@ void __init mptcp_proto_init(void)
if (percpu_counter_init(&mptcp_sockets_allocated, 0, GFP_KERNEL))
panic("Failed to allocate MPTCP pcpu counter\n");
- init_dummy_netdev(&mptcp_napi_dev);
+ mptcp_napi_dev = alloc_netdev_dummy(0);
+ if (!mptcp_napi_dev)
+ panic("Failed to allocate MPTCP dummy netdev\n");
for_each_possible_cpu(cpu) {
delegated = per_cpu_ptr(&mptcp_delegated_actions, cpu);
INIT_LIST_HEAD(&delegated->head);
- netif_napi_add_tx(&mptcp_napi_dev, &delegated->napi,
+ netif_napi_add_tx(mptcp_napi_dev, &delegated->napi,
mptcp_napi_poll);
napi_enable(&delegated->napi);
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index b70a303e0828..ad21925af061 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -1034,6 +1034,10 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
struct mptcp_pm_add_entry *
mptcp_lookup_anno_list_by_saddr(const struct mptcp_sock *msk,
const struct mptcp_addr_info *addr);
+bool mptcp_lookup_subflow_by_saddr(const struct list_head *list,
+ const struct mptcp_addr_info *saddr);
+bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr);
int mptcp_pm_set_flags(struct sk_buff *skb, struct genl_info *info);
int mptcp_pm_nl_set_flags(struct sk_buff *skb, struct genl_info *info);
int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info);
@@ -1041,7 +1045,8 @@ int mptcp_pm_announce_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool echo);
int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list);
-void mptcp_pm_remove_addrs(struct mptcp_sock *msk, struct list_head *rm_list);
+void mptcp_pm_remove_addr_entry(struct mptcp_sock *msk,
+ struct mptcp_pm_addr_entry *entry);
void mptcp_free_local_addr_list(struct mptcp_sock *msk);
@@ -1194,6 +1199,8 @@ static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
pr_debug("TCP fallback already done (msk=%p)\n", msk);
return;
}
+ if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback)))
+ return;
set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
}
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 505445a9598f..3caa0a9d3b38 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -1419,6 +1419,12 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
switch (optname) {
case IP_TOS:
return mptcp_put_int_option(msk, optval, optlen, READ_ONCE(inet_sk(sk)->tos));
+ case IP_FREEBIND:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(FREEBIND, sk));
+ case IP_TRANSPARENT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(TRANSPARENT, sk));
case IP_BIND_ADDRESS_NO_PORT:
return mptcp_put_int_option(msk, optval, optlen,
inet_test_bit(BIND_ADDRESS_NO_PORT, sk));
@@ -1430,6 +1436,26 @@ static int mptcp_getsockopt_v4(struct mptcp_sock *msk, int optname,
return -EOPNOTSUPP;
}
+static int mptcp_getsockopt_v6(struct mptcp_sock *msk, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct sock *sk = (void *)msk;
+
+ switch (optname) {
+ case IPV6_V6ONLY:
+ return mptcp_put_int_option(msk, optval, optlen,
+ sk->sk_ipv6only);
+ case IPV6_TRANSPARENT:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(TRANSPARENT, sk));
+ case IPV6_FREEBIND:
+ return mptcp_put_int_option(msk, optval, optlen,
+ inet_test_bit(FREEBIND, sk));
+ }
+
+ return -EOPNOTSUPP;
+}
+
static int mptcp_getsockopt_sol_mptcp(struct mptcp_sock *msk, int optname,
char __user *optval, int __user *optlen)
{
@@ -1469,6 +1495,8 @@ int mptcp_getsockopt(struct sock *sk, int level, int optname,
if (level == SOL_IP)
return mptcp_getsockopt_v4(msk, optname, optval, option);
+ if (level == SOL_IPV6)
+ return mptcp_getsockopt_v6(msk, optname, optval, option);
if (level == SOL_TCP)
return mptcp_getsockopt_sol_tcp(msk, optname, optval, option);
if (level == SOL_MPTCP)
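
With this change, SOL_IPV6 getsockopt() calls on an MPTCP socket are forwarded to the new mptcp_getsockopt_v6() helper instead of falling through to -EOPNOTSUPP. A minimal userspace sketch of exercising the three newly reachable options (assumes a kernel carrying this patch; the fallback #defines mirror the Linux uapi values in case the libc headers predate them):

/* Query the SOL_IPV6 options now supported on MPTCP sockets. */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif
#ifndef IPV6_TRANSPARENT
#define IPV6_TRANSPARENT 75	/* Linux uapi value */
#endif
#ifndef IPV6_FREEBIND
#define IPV6_FREEBIND 78	/* Linux uapi value */
#endif

int main(void)
{
	int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_MPTCP);
	int v6only = -1, freebind = -1, transparent = -1;
	socklen_t len = sizeof(int);

	if (fd < 0) {
		perror("socket");	/* kernel without MPTCP support */
		return 1;
	}

	if (getsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &v6only, &len))
		perror("IPV6_V6ONLY");
	len = sizeof(int);
	if (getsockopt(fd, SOL_IPV6, IPV6_FREEBIND, &freebind, &len))
		perror("IPV6_FREEBIND");
	len = sizeof(int);
	if (getsockopt(fd, SOL_IPV6, IPV6_TRANSPARENT, &transparent, &len))
		perror("IPV6_TRANSPARENT");

	printf("v6only=%d freebind=%d transparent=%d\n",
	       v6only, freebind, transparent);
	return 0;
}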
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 9f18217dddc8..f2e82599dd0a 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -754,8 +754,6 @@ static bool subflow_hmac_valid(const struct request_sock *req,
subflow_req = mptcp_subflow_rsk(req);
msk = subflow_req->msk;
- if (!msk)
- return false;
subflow_generate_hmac(READ_ONCE(msk->remote_key),
READ_ONCE(msk->local_key),
@@ -853,12 +851,8 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk,
} else if (subflow_req->mp_join) {
mptcp_get_options(skb, &mp_opt);
- if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK) ||
- !subflow_hmac_valid(req, &mp_opt) ||
- !mptcp_can_accept_new_subflow(subflow_req->msk)) {
- SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ if (!(mp_opt.suboptions & OPTION_MPTCP_MPJ_ACK))
fallback = true;
- }
}
create_child:
@@ -908,6 +902,17 @@ create_child:
goto dispose_child;
}
+ if (!subflow_hmac_valid(req, &mp_opt)) {
+ SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ goto dispose_child;
+ }
+
+ if (!mptcp_can_accept_new_subflow(owner)) {
+ subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+ goto dispose_child;
+ }
+
/* move the msk reference ownership to the subflow */
subflow_req->msk = NULL;
ctx->conn = (struct sock *)owner;
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index c0289f83f96d..20a1727e2457 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1046,28 +1046,35 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
#ifdef CONFIG_PROC_FS
struct ip_vs_iter_state {
struct seq_net_private p;
- struct hlist_head *l;
+ unsigned int bucket;
+ unsigned int skip_elems;
};
-static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos)
+static void *ip_vs_conn_array(struct ip_vs_iter_state *iter)
{
int idx;
struct ip_vs_conn *cp;
- struct ip_vs_iter_state *iter = seq->private;
- for (idx = 0; idx < ip_vs_conn_tab_size; idx++) {
+ for (idx = iter->bucket; idx < ip_vs_conn_tab_size; idx++) {
+ unsigned int skip = 0;
+
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
/* __ip_vs_conn_get() is not needed by
* ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
*/
- if (pos-- == 0) {
- iter->l = &ip_vs_conn_tab[idx];
+ if (skip >= iter->skip_elems) {
+ iter->bucket = idx;
return cp;
}
+
+ ++skip;
}
+
+ iter->skip_elems = 0;
cond_resched_rcu();
}
+ iter->bucket = idx;
return NULL;
}
@@ -1076,9 +1083,14 @@ static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
{
struct ip_vs_iter_state *iter = seq->private;
- iter->l = NULL;
rcu_read_lock();
- return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN;
+ if (*pos == 0) {
+ iter->skip_elems = 0;
+ iter->bucket = 0;
+ return SEQ_START_TOKEN;
+ }
+
+ return ip_vs_conn_array(iter);
}
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -1086,28 +1098,22 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
struct ip_vs_conn *cp = v;
struct ip_vs_iter_state *iter = seq->private;
struct hlist_node *e;
- struct hlist_head *l = iter->l;
- int idx;
++*pos;
if (v == SEQ_START_TOKEN)
- return ip_vs_conn_array(seq, 0);
+ return ip_vs_conn_array(iter);
/* more on same hash chain? */
e = rcu_dereference(hlist_next_rcu(&cp->c_list));
- if (e)
+ if (e) {
+ iter->skip_elems++;
return hlist_entry(e, struct ip_vs_conn, c_list);
-
- idx = l - ip_vs_conn_tab;
- while (++idx < ip_vs_conn_tab_size) {
- hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
- iter->l = &ip_vs_conn_tab[idx];
- return cp;
- }
- cond_resched_rcu();
}
- iter->l = NULL;
- return NULL;
+
+ iter->skip_elems = 0;
+ iter->bucket++;
+
+ return ip_vs_conn_array(iter);
}
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
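
The iterator rework above replaces the cached hlist_head pointer and the count-from-zero position walk with a (bucket, skip_elems) cursor, so resuming a dump only re-walks the current bucket rather than every bucket before it. A self-contained userspace sketch of the same cursor pattern, with purely illustrative names:

#include <stdio.h>

#define NBUCKETS 4

struct node { int val; struct node *next; };
struct iter { unsigned int bucket; unsigned int skip_elems; };

static struct node *tab[NBUCKETS];

static struct node *iter_resume(struct iter *it)
{
	for (unsigned int idx = it->bucket; idx < NBUCKETS; idx++) {
		unsigned int skip = 0;

		for (struct node *n = tab[idx]; n; n = n->next) {
			if (skip >= it->skip_elems) {
				it->bucket = idx;
				return n;	/* caller bumps skip_elems */
			}
			skip++;
		}
		it->skip_elems = 0;		/* next bucket starts fresh */
	}
	it->bucket = NBUCKETS;
	return NULL;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node z = { 9, NULL };
	struct iter it = { 0, 0 };
	struct node *n;

	tab[0] = &a;
	tab[2] = &z;

	while ((n = iter_resume(&it))) {
		printf("%d\n", n->val);
		it.skip_elems++;	/* advance within the current bucket */
	}
	return 0;
}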
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 7d13110ce188..0633276d96bf 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3091,12 +3091,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_SERVICES:
{
struct ip_vs_get_services *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_services *)arg;
size = struct_size(get, entrytable, get->num_services);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
@@ -3132,12 +3132,12 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
case IP_VS_SO_GET_DESTS:
{
struct ip_vs_get_dests *get;
- int size;
+ size_t size;
get = (struct ip_vs_get_dests *)arg;
size = struct_size(get, entrytable, get->num_dests);
if (*len != size) {
- pr_err("length: %u != %u\n", *len, size);
+ pr_err("length: %u != %zu\n", *len, size);
ret = -EINVAL;
goto out;
}
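
Both hunks widen the computed length to size_t, matching what struct_size() returns, and switch the pr_err() format to %zu. A standalone userspace illustration (not the kernel code) of why keeping such a size in an int is unreliable once the user-supplied element count grows large:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct entry { uint32_t a, b; };
struct get_req { uint32_t num; struct entry table[]; };

int main(void)
{
	uint32_t num = 0x20000000;	/* absurd user-supplied count */
	size_t size = sizeof(struct get_req) +
		      (size_t)num * sizeof(struct entry);
	int truncated = (int)size;	/* what the old code kept: typically
					 * wraps to a small value
					 * (implementation-defined) */

	printf("size=%zu truncated=%d\n", size, truncated);
	return 0;
}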
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 4890af4dc263..913ede2f57f9 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -132,7 +132,7 @@ static int __nf_conncount_add(struct net *net,
struct nf_conn *found_ct;
unsigned int collect = 0;
- if (time_is_after_eq_jiffies((unsigned long)list->last_gc))
+ if ((u32)jiffies == list->last_gc)
goto add_new_node;
/* check the saved connections */
@@ -234,7 +234,7 @@ bool nf_conncount_gc_list(struct net *net,
bool ret = false;
/* don't bother if we just did GC */
- if (time_is_after_eq_jiffies((unsigned long)READ_ONCE(list->last_gc)))
+ if ((u32)jiffies == READ_ONCE(list->last_gc))
return false;
/* don't bother if other cpu is already doing GC */
@@ -377,6 +377,8 @@ restart:
conn->tuple = *tuple;
conn->zone = *zone;
+ conn->cpu = raw_smp_processor_id();
+ conn->jiffies32 = (u32)jiffies;
memcpy(rbconn->key, key, sizeof(u32) * data->keylen);
nf_conncount_list_init(&rbconn->list);
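
list->last_gc holds a 32-bit snapshot of jiffies, so once the 64-bit jiffies counter passes 2^32 the old time_is_after_eq_jiffies() test can no longer act as a "GC just ran" throttle; the replacement simply asks whether we are still in the same truncated jiffy. A tiny userspace illustration of the difference (plain C, not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* 64-bit jiffies well past 2^32, as happens soon after boot. */
	uint64_t jiffies = 0x100000123ULL;
	/* list->last_gc stored right now, truncated to 32 bits. */
	uint32_t last_gc = (uint32_t)jiffies;

	/* Old check: time_after_eq(widened last_gc, jiffies). */
	int old_skip = (int64_t)((uint64_t)last_gc - jiffies) >= 0;

	/* New check: same truncated jiffy -> GC just ran, skip it. */
	int new_skip = (uint32_t)jiffies == last_gc;

	/* old_skip is 0 even though GC ran this very tick; new_skip is 1. */
	printf("old_skip=%d new_skip=%d\n", old_skip, new_skip);
	return 0;
}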
diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c
index d011d2eb0848..7be4c35e4795 100644
--- a/net/netfilter/nf_conntrack_amanda.c
+++ b/net/netfilter/nf_conntrack_amanda.c
@@ -106,7 +106,7 @@ static int amanda_help(struct sk_buff *skb,
/* increase the UDP timeout of the master connection as replies from
* Amanda clients to the server can be quite delayed */
- nf_ct_refresh(ct, skb, master_timeout * HZ);
+ nf_ct_refresh(ct, master_timeout * HZ);
/* No data? */
dataoff = protoff + sizeof(struct udphdr);
diff --git a/net/netfilter/nf_conntrack_broadcast.c b/net/netfilter/nf_conntrack_broadcast.c
index cfa0fe0356de..a7552a46d6ac 100644
--- a/net/netfilter/nf_conntrack_broadcast.c
+++ b/net/netfilter/nf_conntrack_broadcast.c
@@ -75,7 +75,7 @@ int nf_conntrack_broadcast_help(struct sk_buff *skb,
nf_ct_expect_related(exp, 0);
nf_ct_expect_put(exp);
- nf_ct_refresh(ct, skb, timeout * HZ);
+ nf_ct_refresh(ct, timeout * HZ);
out:
return NF_ACCEPT;
}
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 456446d7af20..7f8b245e287a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1544,12 +1544,6 @@ static void gc_worker(struct work_struct *work)
tmp = nf_ct_tuplehash_to_ctrack(h);
- if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) {
- nf_ct_offload_timeout(tmp);
- if (!nf_conntrack_max95)
- continue;
- }
-
if (expired_count > GC_SCAN_EXPIRED_MAX) {
rcu_read_unlock();
@@ -2089,9 +2083,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_in);
/* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
void __nf_ct_refresh_acct(struct nf_conn *ct,
enum ip_conntrack_info ctinfo,
- const struct sk_buff *skb,
u32 extra_jiffies,
- bool do_acct)
+ unsigned int bytes)
{
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
@@ -2104,8 +2097,8 @@ void __nf_ct_refresh_acct(struct nf_conn *ct,
if (READ_ONCE(ct->timeout) != extra_jiffies)
WRITE_ONCE(ct->timeout, extra_jiffies);
acct:
- if (do_acct)
- nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), skb->len);
+ if (bytes)
+ nf_ct_acct_update(ct, CTINFO2DIR(ctinfo), bytes);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 69948e1d6974..af68c64acaab 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -162,6 +162,14 @@ static int __nf_conntrack_eventmask_report(struct nf_conntrack_ecache *e,
return ret;
}
+static void nf_ct_ecache_tstamp_refresh(struct nf_conntrack_ecache *e)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ if (local64_read(&e->timestamp))
+ local64_set(&e->timestamp, ktime_get_real_ns());
+#endif
+}
+
int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
u32 portid, int report)
{
@@ -186,6 +194,8 @@ int nf_conntrack_eventmask_report(unsigned int events, struct nf_conn *ct,
/* This is a resent of a destroy event? If so, skip missed */
missed = e->portid ? 0 : e->missed;
+ nf_ct_ecache_tstamp_refresh(e);
+
ret = __nf_conntrack_eventmask_report(e, events, missed, &item);
if (unlikely(ret < 0 && (events & (1 << IPCT_DESTROY)))) {
/* This is a destroy event that has been triggered by a process,
@@ -297,6 +307,18 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
}
}
+static void nf_ct_ecache_tstamp_new(const struct nf_conn *ct, struct nf_conntrack_ecache *e)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ u64 ts = 0;
+
+ if (nf_ct_ext_exist(ct, NF_CT_EXT_TSTAMP))
+ ts = ktime_get_real_ns();
+
+ local64_set(&e->timestamp, ts);
+#endif
+}
+
bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp)
{
struct net *net = nf_ct_net(ct);
@@ -326,6 +348,7 @@ bool nf_ct_ecache_ext_add(struct nf_conn *ct, u16 ctmask, u16 expmask, gfp_t gfp
e = nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
if (e) {
+ nf_ct_ecache_tstamp_new(ct, e);
e->ctmask = ctmask;
e->expmask = expmask;
}
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 5a9bce24f3c3..14f73872f647 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -1385,7 +1385,7 @@ static int process_rcf(struct sk_buff *skb, struct nf_conn *ct,
if (info->timeout > 0) {
pr_debug("nf_ct_ras: set RAS connection timeout to "
"%u seconds\n", info->timeout);
- nf_ct_refresh(ct, skb, info->timeout * HZ);
+ nf_ct_refresh(ct, info->timeout * HZ);
/* Set expect timeout */
spin_lock_bh(&nf_conntrack_expect_lock);
@@ -1433,7 +1433,7 @@ static int process_urq(struct sk_buff *skb, struct nf_conn *ct,
info->sig_port[!dir] = 0;
/* Give it 30 seconds for UCF or URJ */
- nf_ct_refresh(ct, skb, 30 * HZ);
+ nf_ct_refresh(ct, 30 * HZ);
return 0;
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 36168f8b6efa..db23876a6016 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -357,11 +357,11 @@ nla_put_failure:
static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
{
struct nlattr *nest_secctx;
- int len, ret;
- char *secctx;
+ struct lsm_context ctx;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return 0;
ret = -1;
@@ -369,13 +369,13 @@ static int ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
if (!nest_secctx)
goto nla_put_failure;
- if (nla_put_string(skb, CTA_SECCTX_NAME, secctx))
+ if (nla_put_string(skb, CTA_SECCTX_NAME, ctx.context))
goto nla_put_failure;
nla_nest_end(skb, nest_secctx);
ret = 0;
nla_put_failure:
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
return ret;
}
#else
@@ -383,6 +383,23 @@ nla_put_failure:
#endif
#ifdef CONFIG_NF_CONNTRACK_EVENTS
+static int
+ctnetlink_dump_event_timestamp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ const struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct);
+
+ if (e) {
+ u64 ts = local64_read(&e->timestamp);
+
+ if (ts)
+ return nla_put_be64(skb, CTA_TIMESTAMP_EVENT,
+ cpu_to_be64(ts), CTA_TIMESTAMP_PAD);
+ }
+#endif
+ return 0;
+}
+
static inline int ctnetlink_label_size(const struct nf_conn *ct)
{
struct nf_conn_labels *labels = nf_ct_labels_find(ct);
@@ -663,14 +680,14 @@ static inline size_t ctnetlink_acct_size(const struct nf_conn *ct)
static inline int ctnetlink_secctx_size(const struct nf_conn *ct)
{
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- int len, ret;
+ int ret;
- ret = security_secid_to_secctx(ct->secmark, NULL, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, NULL);
+ if (ret < 0)
return 0;
return nla_total_size(0) /* CTA_SECCTX */
- + nla_total_size(sizeof(char) * len); /* CTA_SECCTX_NAME */
+ + nla_total_size(sizeof(char) * ret); /* CTA_SECCTX_NAME */
#else
return 0;
#endif
@@ -717,6 +734,9 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct)
#endif
+ ctnetlink_proto_size(ct)
+ ctnetlink_label_size(ct)
+#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP
+ + nla_total_size(sizeof(u64)) /* CTA_TIMESTAMP_EVENT */
+#endif
;
}
@@ -838,6 +858,10 @@ ctnetlink_conntrack_event(unsigned int events, const struct nf_ct_event *item)
if (ctnetlink_dump_mark(skb, ct, events & (1 << IPCT_MARK)))
goto nla_put_failure;
#endif
+
+ if (ctnetlink_dump_event_timestamp(skb, ct))
+ goto nla_put_failure;
+
nlmsg_end(skb, nlh);
err = nfnetlink_send(skb, net, item->portid, group, item->report,
GFP_ATOMIC);
@@ -1557,6 +1581,7 @@ static const struct nla_policy ct_nla_policy[CTA_MAX+1] = {
.len = NF_CT_LABELS_MAX_SIZE },
[CTA_FILTER] = { .type = NLA_NESTED },
[CTA_STATUS_MASK] = { .type = NLA_U32 },
+ [CTA_TIMESTAMP_EVENT] = { .type = NLA_REJECT },
};
static int ctnetlink_flush_iterate(struct nf_conn *ct, void *data)
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index 4cc97f971264..7c6f7c9f7332 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -39,20 +39,15 @@ static const char *const sctp_conntrack_names[] = {
[SCTP_CONNTRACK_HEARTBEAT_SENT] = "HEARTBEAT_SENT",
};
-#define SECS * HZ
-#define MINS * 60 SECS
-#define HOURS * 60 MINS
-#define DAYS * 24 HOURS
-
static const unsigned int sctp_timeouts[SCTP_CONNTRACK_MAX] = {
- [SCTP_CONNTRACK_CLOSED] = 10 SECS,
- [SCTP_CONNTRACK_COOKIE_WAIT] = 3 SECS,
- [SCTP_CONNTRACK_COOKIE_ECHOED] = 3 SECS,
- [SCTP_CONNTRACK_ESTABLISHED] = 210 SECS,
- [SCTP_CONNTRACK_SHUTDOWN_SENT] = 3 SECS,
- [SCTP_CONNTRACK_SHUTDOWN_RECD] = 3 SECS,
- [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = 3 SECS,
- [SCTP_CONNTRACK_HEARTBEAT_SENT] = 30 SECS,
+ [SCTP_CONNTRACK_CLOSED] = secs_to_jiffies(10),
+ [SCTP_CONNTRACK_COOKIE_WAIT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_COOKIE_ECHOED] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_ESTABLISHED] = secs_to_jiffies(210),
+ [SCTP_CONNTRACK_SHUTDOWN_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_RECD] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_SHUTDOWN_ACK_SENT] = secs_to_jiffies(3),
+ [SCTP_CONNTRACK_HEARTBEAT_SENT] = secs_to_jiffies(30),
};
#define SCTP_FLAG_HEARTBEAT_VTAG_FAILED 1
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index d0eac27f6ba0..ca748f8dbff1 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -1553,7 +1553,7 @@ static int sip_help_tcp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
@@ -1624,7 +1624,7 @@ static int sip_help_udp(struct sk_buff *skb, unsigned int protoff,
if (dataoff >= skb->len)
return NF_ACCEPT;
- nf_ct_refresh(ct, skb, sip_timeout * HZ);
+ nf_ct_refresh(ct, sip_timeout * HZ);
if (unlikely(skb_linearize(skb)))
return NF_DROP;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 7d4f0fa8b609..502cf10aab41 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -172,17 +172,16 @@ static void ct_seq_stop(struct seq_file *s, void *v)
#ifdef CONFIG_NF_CONNTRACK_SECMARK
static void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
{
+ struct lsm_context ctx;
int ret;
- u32 len;
- char *secctx;
- ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
- if (ret)
+ ret = security_secid_to_secctx(ct->secmark, &ctx);
+ if (ret < 0)
return;
- seq_printf(s, "secctx=%s ", secctx);
+ seq_printf(s, "secctx=%s ", ctx.context);
- security_release_secctx(secctx, len);
+ security_release_secctx(&ctx);
}
#else
static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index df72b0376970..9441ac3d8c1a 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -161,42 +161,86 @@ void flow_offload_route_init(struct flow_offload *flow,
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);
-static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
+static inline bool nf_flow_has_expired(const struct flow_offload *flow)
+{
+ return nf_flow_timeout_delta(flow->timeout) <= 0;
+}
+
+static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
{
+ struct ip_ct_tcp *tcp = &ct->proto.tcp;
+
+ spin_lock_bh(&ct->lock);
+ if (tcp->state != tcp_state)
+ tcp->state = tcp_state;
+
+ /* syn packet triggers the TCP reopen case from conntrack. */
+ if (tcp->state == TCP_CONNTRACK_CLOSE)
+ ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+
+ /* Conntrack state is outdated due to offload bypass.
+ * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntrack's
+ * TCP reset validation will fail.
+ */
tcp->seen[0].td_maxwin = 0;
+ tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
tcp->seen[1].td_maxwin = 0;
+ tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
+ spin_unlock_bh(&ct->lock);
}
-static void flow_offload_fixup_ct(struct nf_conn *ct)
+static void flow_offload_fixup_ct(struct flow_offload *flow)
{
+ struct nf_conn *ct = flow->ct;
struct net *net = nf_ct_net(ct);
int l4num = nf_ct_protonum(ct);
+ bool expired, closing = false;
+ u32 offload_timeout = 0;
s32 timeout;
if (l4num == IPPROTO_TCP) {
- struct nf_tcp_net *tn = nf_tcp_pernet(net);
-
- flow_offload_fixup_tcp(&ct->proto.tcp);
+ const struct nf_tcp_net *tn = nf_tcp_pernet(net);
+ u8 tcp_state;
+
+ /* Enter CLOSE state if fin/rst packet has been seen, this
+ * allows TCP reopen from conntrack. Otherwise, pick up from
+ * the last seen TCP state.
+ */
+ closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
+ if (closing) {
+ flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
+ timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
+ expired = false;
+ } else {
+ tcp_state = READ_ONCE(ct->proto.tcp.state);
+ flow_offload_fixup_tcp(ct, tcp_state);
+ timeout = READ_ONCE(tn->timeouts[tcp_state]);
+ expired = nf_flow_has_expired(flow);
+ }
+ offload_timeout = READ_ONCE(tn->offload_timeout);
- timeout = tn->timeouts[ct->proto.tcp.state];
- timeout -= tn->offload_timeout;
} else if (l4num == IPPROTO_UDP) {
- struct nf_udp_net *tn = nf_udp_pernet(net);
+ const struct nf_udp_net *tn = nf_udp_pernet(net);
enum udp_conntrack state =
test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
UDP_CT_REPLIED : UDP_CT_UNREPLIED;
- timeout = tn->timeouts[state];
- timeout -= tn->offload_timeout;
+ timeout = READ_ONCE(tn->timeouts[state]);
+ expired = nf_flow_has_expired(flow);
+ offload_timeout = READ_ONCE(tn->offload_timeout);
} else {
return;
}
+ if (expired)
+ timeout -= offload_timeout;
+
if (timeout < 0)
timeout = 0;
- if (nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
- WRITE_ONCE(ct->timeout, nfct_time_stamp + timeout);
+ if (closing ||
+ nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
+ nf_ct_refresh(ct, timeout);
}
static void flow_offload_route_release(struct flow_offload *flow)
@@ -294,7 +338,7 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
return err;
}
- nf_ct_offload_timeout(flow->ct);
+ nf_ct_refresh(flow->ct, NF_CT_DAY);
if (nf_flowtable_hw_offload(flow_table)) {
__set_bit(NF_FLOW_HW, &flow->flags);
@@ -316,18 +360,14 @@ void flow_offload_refresh(struct nf_flowtable *flow_table,
else
return;
- if (likely(!nf_flowtable_hw_offload(flow_table)))
+ if (likely(!nf_flowtable_hw_offload(flow_table)) ||
+ test_bit(NF_FLOW_CLOSING, &flow->flags))
return;
nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);
-static inline bool nf_flow_has_expired(const struct flow_offload *flow)
-{
- return nf_flow_timeout_delta(flow->timeout) <= 0;
-}
-
static void flow_offload_del(struct nf_flowtable *flow_table,
struct flow_offload *flow)
{
@@ -343,8 +383,8 @@ static void flow_offload_del(struct nf_flowtable *flow_table,
void flow_offload_teardown(struct flow_offload *flow)
{
clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
- set_bit(NF_FLOW_TEARDOWN, &flow->flags);
- flow_offload_fixup_ct(flow->ct);
+ if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
+ flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
@@ -414,15 +454,118 @@ static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
return flow_table->type->gc && flow_table->type->gc(flow);
}
+/**
+ * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
+ * @ct: Flowtable offloaded tcp ct
+ *
+ * Return: timeout in jiffies after which the ct entry should expire.
+ */
+static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
+{
+ u8 state = READ_ONCE(ct->proto.tcp.state);
+
+ switch (state) {
+ case TCP_CONNTRACK_SYN_SENT:
+ case TCP_CONNTRACK_SYN_RECV:
+ return 0;
+ case TCP_CONNTRACK_ESTABLISHED:
+ return NF_CT_DAY;
+ case TCP_CONNTRACK_FIN_WAIT:
+ case TCP_CONNTRACK_CLOSE_WAIT:
+ case TCP_CONNTRACK_LAST_ACK:
+ case TCP_CONNTRACK_TIME_WAIT:
+ return 5 * 60 * HZ;
+ case TCP_CONNTRACK_CLOSE:
+ return 0;
+ }
+
+ return 0;
+}
+
+/**
+ * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
+ * @ct: Flowtable offloaded ct
+ *
+ * Datapath lookups in the conntrack table will evict nf_conn entries
+ * if they have expired.
+ *
+ * Once nf_conn entries have been offloaded, nf_conntrack might not see any
+ * packets anymore. Thus ct->timeout is no longer refreshed and ct can
+ * be evicted.
+ *
+ * To avoid the need for an additional check on the offload bit for every
+ * packet processed via nf_conntrack_in(), set an arbitrary timeout large
+ * enough not to ever expire; this saves us a check for the IPS_OFFLOAD_BIT
+ * from the packet path via nf_ct_is_expired().
+ */
+static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
+{
+ static const u32 min_timeout = 5 * 60 * HZ;
+ u32 expires = nf_ct_expires(ct);
+
+ /* normal case: large enough timeout, nothing to do. */
+ if (likely(expires >= min_timeout))
+ return;
+
+ /* must check offload bit after this, we do not hold any locks.
+ * flowtable and ct entries could have been removed on another CPU.
+ */
+ if (!refcount_inc_not_zero(&ct->ct_general.use))
+ return;
+
+ /* load ct->status after refcount increase */
+ smp_acquire__after_ctrl_dep();
+
+ if (nf_ct_is_confirmed(ct) &&
+ test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
+ u8 l4proto = nf_ct_protonum(ct);
+ u32 new_timeout = true;
+
+ switch (l4proto) {
+ case IPPROTO_UDP:
+ new_timeout = NF_CT_DAY;
+ break;
+ case IPPROTO_TCP:
+ new_timeout = nf_flow_table_tcp_timeout(ct);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ /* Update to ct->timeout from nf_conntrack happens
+ * without holding ct->lock.
+ *
+ * Use cmpxchg to ensure timeout extension doesn't
+ * happen when we race with conntrack datapath.
+ *
+ * The inverse -- datapath updating ->timeout right
+ * after this -- is fine, datapath is authoritative.
+ */
+ if (new_timeout) {
+ new_timeout += nfct_time_stamp;
+ cmpxchg(&ct->timeout, expires, new_timeout);
+ }
+ }
+
+ nf_ct_put(ct);
+}
+
static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
struct flow_offload *flow, void *data)
{
+ bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
+
if (nf_flow_has_expired(flow) ||
nf_ct_is_dying(flow->ct) ||
- nf_flow_custom_gc(flow_table, flow))
+ nf_flow_custom_gc(flow_table, flow)) {
flow_offload_teardown(flow);
+ teardown = true;
+ } else if (!teardown) {
+ nf_flow_table_extend_ct_timeout(flow->ct);
+ }
- if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) {
+ if (teardown) {
if (test_bit(NF_FLOW_HW, &flow->flags)) {
if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
nf_flow_offload_del(flow_table, flow);
@@ -431,6 +574,10 @@ static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
} else {
flow_offload_del(flow_table, flow);
}
+ } else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
+ test_bit(NF_FLOW_HW, &flow->flags) &&
+ !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
+ nf_flow_offload_del(flow_table, flow);
} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
nf_flow_offload_stats(flow_table, flow);
}
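
The comment in nf_flow_table_extend_ct_timeout() spells out the race being tolerated: the conntrack datapath may rewrite ct->timeout at any time, so the extension is only published if the value read earlier is still in place. A standalone sketch of that compare-and-swap pattern in plain C11 atomics (illustrative only, not the kernel helper):

#include <stdatomic.h>
#include <stdio.h>
#include <stdint.h>

static _Atomic uint32_t timeout;

static void maybe_extend(uint32_t now, uint32_t min_left, uint32_t extend_to)
{
	uint32_t expires = atomic_load(&timeout);

	if ((int32_t)(expires - now) >= (int32_t)min_left)
		return;			/* plenty of time left, nothing to do */

	uint32_t want = now + extend_to;

	/* Fails (and that's fine) if the datapath updated it meanwhile. */
	atomic_compare_exchange_strong(&timeout, &expires, want);
}

int main(void)
{
	atomic_store(&timeout, 1000);
	maybe_extend(990, 300, 86400);	/* only 10 ticks left -> extend */
	printf("timeout=%u\n", atomic_load(&timeout));
	return 0;
}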
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 98edcaa37b38..8cd4cf7ae211 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -28,11 +28,15 @@ static int nf_flow_state_check(struct flow_offload *flow, int proto,
return 0;
tcph = (void *)(skb_network_header(skb) + thoff);
- if (unlikely(tcph->fin || tcph->rst)) {
+ if (tcph->syn && test_bit(NF_FLOW_CLOSING, &flow->flags)) {
flow_offload_teardown(flow);
return -1;
}
+ if ((tcph->fin || tcph->rst) &&
+ !test_bit(NF_FLOW_CLOSING, &flow->flags))
+ set_bit(NF_FLOW_CLOSING, &flow->flags);
+
return 0;
}
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 73e37861ff11..a133e1c175ce 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -34,7 +34,6 @@ unsigned int nf_tables_net_id __read_mostly;
static LIST_HEAD(nf_tables_expressions);
static LIST_HEAD(nf_tables_objects);
static LIST_HEAD(nf_tables_flowtables);
-static LIST_HEAD(nf_tables_destroy_list);
static LIST_HEAD(nf_tables_gc_list);
static DEFINE_SPINLOCK(nf_tables_destroy_list_lock);
static DEFINE_SPINLOCK(nf_tables_gc_list_lock);
@@ -125,7 +124,6 @@ static void nft_validate_state_update(struct nft_table *table, u8 new_validate_s
table->validate_state = new_validate_state;
}
static void nf_tables_trans_destroy_work(struct work_struct *w);
-static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work);
static void nft_trans_gc_work(struct work_struct *work);
static DECLARE_WORK(trans_gc_work, nft_trans_gc_work);
@@ -1956,15 +1954,16 @@ static int nft_dump_basechain_hook(struct sk_buff *skb,
if (!first)
first = hook;
- if (nla_put_string(skb, NFTA_DEVICE_NAME,
- hook->ops.dev->name))
+ if (nla_put(skb, NFTA_DEVICE_NAME,
+ hook->ifnamelen, hook->ifname))
goto nla_put_failure;
n++;
}
nla_nest_end(skb, nest_devs);
if (n == 1 &&
- nla_put_string(skb, NFTA_HOOK_DEV, first->ops.dev->name))
+ nla_put(skb, NFTA_HOOK_DEV,
+ first->ifnamelen, first->ifname))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -2276,7 +2275,6 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
const struct nlattr *attr)
{
struct net_device *dev;
- char ifname[IFNAMSIZ];
struct nft_hook *hook;
int err;
@@ -2286,12 +2284,17 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net,
goto err_hook_alloc;
}
- nla_strscpy(ifname, attr, IFNAMSIZ);
+ err = nla_strscpy(hook->ifname, attr, IFNAMSIZ);
+ if (err < 0)
+ goto err_hook_dev;
+
+ hook->ifnamelen = nla_len(attr);
+
/* nf_tables_netdev_event() is called under rtnl_mutex, this is
* indirectly serializing all the other holders of the commit_mutex with
* the rtnl_mutex.
*/
- dev = __dev_get_by_name(net, ifname);
+ dev = __dev_get_by_name(net, hook->ifname);
if (!dev) {
err = -ENOENT;
goto err_hook_dev;
@@ -2312,7 +2315,7 @@ static struct nft_hook *nft_hook_list_find(struct list_head *hook_list,
struct nft_hook *hook;
list_for_each_entry(hook, hook_list, list) {
- if (this->ops.dev == hook->ops.dev)
+ if (!strcmp(hook->ifname, this->ifname))
return hook;
}
@@ -2598,9 +2601,8 @@ int nft_chain_add(struct nft_table *table, struct nft_chain *chain)
static u64 chain_id;
-static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy, u32 flags,
- struct netlink_ext_ack *extack)
+static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 policy,
+ u32 flags, struct netlink_ext_ack *extack)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -2837,11 +2839,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
err = nft_netdev_register_hooks(ctx->net, &hook.list);
if (err < 0)
goto err_hooks;
+
+ unregister = true;
}
}
- unregister = true;
-
if (nla[NFTA_CHAIN_COUNTERS]) {
if (!nft_is_base_chain(chain)) {
err = -EOPNOTSUPP;
@@ -3038,7 +3040,7 @@ static int nf_tables_newchain(struct sk_buff *skb, const struct nfnl_info *info,
extack);
}
- return nf_tables_addchain(&ctx, family, genmask, policy, flags, extack);
+ return nf_tables_addchain(&ctx, family, policy, flags, extack);
}
static int nft_delchain_hook(struct nft_ctx *ctx,
@@ -8896,7 +8898,7 @@ static int nft_register_flowtable_net_hooks(struct net *net,
struct list_head *hook_list,
struct nft_flowtable *flowtable)
{
- struct nft_hook *hook, *hook2, *next;
+ struct nft_hook *hook, *next;
struct nft_flowtable *ft;
int err, i = 0;
@@ -8905,12 +8907,9 @@ static int nft_register_flowtable_net_hooks(struct net *net,
if (!nft_is_active_next(net, ft))
continue;
- list_for_each_entry(hook2, &ft->hook_list, list) {
- if (hook->ops.dev == hook2->ops.dev &&
- hook->ops.pf == hook2->ops.pf) {
- err = -EEXIST;
- goto err_unregister_net_hooks;
- }
+ if (nft_hook_list_find(&ft->hook_list, hook)) {
+ err = -EEXIST;
+ goto err_unregister_net_hooks;
}
}
@@ -9324,7 +9323,8 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
list_for_each_entry_rcu(hook, hook_list, list,
lockdep_commit_lock_is_held(net)) {
- if (nla_put_string(skb, NFTA_DEVICE_NAME, hook->ops.dev->name))
+ if (nla_put(skb, NFTA_DEVICE_NAME,
+ hook->ifnamelen, hook->ifname))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -10004,11 +10004,12 @@ static void nft_commit_release(struct nft_trans *trans)
static void nf_tables_trans_destroy_work(struct work_struct *w)
{
+ struct nftables_pernet *nft_net = container_of(w, struct nftables_pernet, destroy_work);
struct nft_trans *trans, *next;
LIST_HEAD(head);
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_init(&nf_tables_destroy_list, &head);
+ list_splice_init(&nft_net->destroy_list, &head);
spin_unlock(&nf_tables_destroy_list_lock);
if (list_empty(&head))
@@ -10022,9 +10023,11 @@ static void nf_tables_trans_destroy_work(struct work_struct *w)
}
}
-void nf_tables_trans_destroy_flush_work(void)
+void nf_tables_trans_destroy_flush_work(struct net *net)
{
- flush_work(&trans_destroy_work);
+ struct nftables_pernet *nft_net = nft_pernet(net);
+
+ flush_work(&nft_net->destroy_work);
}
EXPORT_SYMBOL_GPL(nf_tables_trans_destroy_flush_work);
@@ -10482,11 +10485,11 @@ static void nf_tables_commit_release(struct net *net)
trans->put_net = true;
spin_lock(&nf_tables_destroy_list_lock);
- list_splice_tail_init(&nft_net->commit_list, &nf_tables_destroy_list);
+ list_splice_tail_init(&nft_net->commit_list, &nft_net->destroy_list);
spin_unlock(&nf_tables_destroy_list_lock);
nf_tables_module_autoload_cleanup(net);
- schedule_work(&trans_destroy_work);
+ schedule_work(&nft_net->destroy_work);
mutex_unlock(&nft_net->commit_mutex);
}
@@ -11739,47 +11742,6 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
}
EXPORT_SYMBOL_GPL(nft_data_dump);
-static void __nft_release_basechain_now(struct nft_ctx *ctx)
-{
- struct nft_rule *rule, *nr;
-
- list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
- list_del(&rule->list);
- nf_tables_rule_release(ctx, rule);
- }
- nf_tables_chain_destroy(ctx->chain);
-}
-
-int __nft_release_basechain(struct nft_ctx *ctx)
-{
- struct nft_rule *rule;
-
- if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain)))
- return 0;
-
- nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain);
- list_for_each_entry(rule, &ctx->chain->rules, list)
- nft_use_dec(&ctx->chain->use);
-
- nft_chain_del(ctx->chain);
- nft_use_dec(&ctx->table->use);
-
- if (!maybe_get_net(ctx->net)) {
- __nft_release_basechain_now(ctx);
- return 0;
- }
-
- /* wait for ruleset dumps to complete. Owning chain is no longer in
- * lists, so new dumps can't find any of these rules anymore.
- */
- synchronize_rcu();
-
- __nft_release_basechain_now(ctx);
- put_net(ctx->net);
- return 0;
-}
-EXPORT_SYMBOL_GPL(__nft_release_basechain);
-
static void __nft_release_hook(struct net *net, struct nft_table *table)
{
struct nft_flowtable *flowtable;
@@ -11892,7 +11854,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event,
gc_seq = nft_gc_seq_begin(nft_net);
- nf_tables_trans_destroy_flush_work();
+ nf_tables_trans_destroy_flush_work(net);
again:
list_for_each_entry(table, &nft_net->tables, list) {
if (nft_table_has_owner(table) &&
@@ -11934,6 +11896,7 @@ static int __net_init nf_tables_init_net(struct net *net)
INIT_LIST_HEAD(&nft_net->tables);
INIT_LIST_HEAD(&nft_net->commit_list);
+ INIT_LIST_HEAD(&nft_net->destroy_list);
INIT_LIST_HEAD(&nft_net->commit_set_list);
INIT_LIST_HEAD(&nft_net->binding_list);
INIT_LIST_HEAD(&nft_net->module_list);
@@ -11942,6 +11905,7 @@ static int __net_init nf_tables_init_net(struct net *net)
nft_net->base_seq = 1;
nft_net->gc_seq = 0;
nft_net->validate_state = NFT_VALIDATE_SKIP;
+ INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work);
return 0;
}
@@ -11970,14 +11934,17 @@ static void __net_exit nf_tables_exit_net(struct net *net)
if (!list_empty(&nft_net->module_list))
nf_tables_module_autoload_cleanup(net);
+ cancel_work_sync(&nft_net->destroy_work);
__nft_release_tables(net);
nft_gc_seq_end(nft_net, gc_seq);
mutex_unlock(&nft_net->commit_mutex);
+
WARN_ON_ONCE(!list_empty(&nft_net->tables));
WARN_ON_ONCE(!list_empty(&nft_net->module_list));
WARN_ON_ONCE(!list_empty(&nft_net->notify_list));
+ WARN_ON_ONCE(!list_empty(&nft_net->destroy_list));
}
static void nf_tables_exit_batch(struct list_head *net_exit_list)
@@ -12068,10 +12035,8 @@ static void __exit nf_tables_module_exit(void)
unregister_netdevice_notifier(&nf_tables_flowtable_notifier);
nft_chain_filter_fini();
nft_chain_route_fini();
- nf_tables_trans_destroy_flush_work();
unregister_pernet_subsys(&nf_tables_net_ops);
cancel_work_sync(&trans_gc_work);
- cancel_work_sync(&trans_destroy_work);
rcu_barrier();
rhltable_destroy(&nft_objname_ht);
nf_tables_core_module_exit();
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 75598520b0fa..6557a4018c09 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -21,25 +21,22 @@
#include <net/netfilter/nf_log.h>
#include <net/netfilter/nft_meta.h>
-#if defined(CONFIG_MITIGATION_RETPOLINE) && defined(CONFIG_X86)
-
+#ifdef CONFIG_MITIGATION_RETPOLINE
static struct static_key_false nf_tables_skip_direct_calls;
-static bool nf_skip_indirect_calls(void)
+static inline bool nf_skip_indirect_calls(void)
{
return static_branch_likely(&nf_tables_skip_direct_calls);
}
-static void __init nf_skip_indirect_calls_enable(void)
+static inline void __init nf_skip_indirect_calls_enable(void)
{
if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE))
static_branch_enable(&nf_tables_skip_direct_calls);
}
#else
-static inline bool nf_skip_indirect_calls(void) { return false; }
-
static inline void nf_skip_indirect_calls_enable(void) { }
-#endif
+#endif /* CONFIG_MITIGATION_RETPOLINE */
static noinline void __nft_trace_packet(const struct nft_pktinfo *pkt,
const struct nft_verdict *verdict,
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index d2773ce9b585..8b7b39d8a109 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -470,18 +470,18 @@ static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk)
return 0;
}
-static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata)
+static int nfqnl_get_sk_secctx(struct sk_buff *skb, struct lsm_context *ctx)
{
- u32 seclen = 0;
+ int seclen = 0;
#if IS_ENABLED(CONFIG_NETWORK_SECMARK)
+
if (!skb || !sk_fullsock(skb->sk))
return 0;
read_lock_bh(&skb->sk->sk_callback_lock);
if (skb->secmark)
- security_secid_to_secctx(skb->secmark, secdata, &seclen);
-
+ seclen = security_secid_to_secctx(skb->secmark, ctx);
read_unlock_bh(&skb->sk->sk_callback_lock);
#endif
return seclen;
@@ -567,8 +567,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
enum ip_conntrack_info ctinfo = 0;
const struct nfnl_ct_hook *nfnl_ct;
bool csum_verify;
- char *secdata = NULL;
- u32 seclen = 0;
+ struct lsm_context ctx = { NULL, 0, 0 };
+ int seclen = 0;
ktime_t tstamp;
size = nlmsg_total_size(sizeof(struct nfgenmsg))
@@ -642,7 +642,9 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) {
- seclen = nfqnl_get_sk_secctx(entskb, &secdata);
+ seclen = nfqnl_get_sk_secctx(entskb, &ctx);
+ if (seclen < 0)
+ return NULL;
if (seclen)
size += nla_total_size(seclen);
}
@@ -782,7 +784,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_sk_classid(skb, entskb->sk) < 0)
goto nla_put_failure;
- if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata))
+ if (seclen > 0 && nla_put(skb, NFQA_SECCTX, ctx.len, ctx.context))
goto nla_put_failure;
if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
@@ -810,8 +812,8 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
}
nlh->nlmsg_len = skb->len;
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return skb;
nla_put_failure:
@@ -819,8 +821,8 @@ nla_put_failure:
kfree_skb(skb);
net_err_ratelimited("nf_queue: error creating packet message\n");
nlmsg_failure:
- if (seclen)
- security_release_secctx(secdata, seclen);
+ if (seclen >= 0)
+ security_release_secctx(&ctx);
return NULL;
}
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index 7010541fcca6..19a553550c76 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -319,37 +319,21 @@ static const struct nft_chain_type nft_chain_filter_netdev = {
};
static void nft_netdev_event(unsigned long event, struct net_device *dev,
- struct nft_ctx *ctx)
+ struct nft_base_chain *basechain)
{
- struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
- struct nft_hook *hook, *found = NULL;
- int n = 0;
+ struct nft_hook *hook;
list_for_each_entry(hook, &basechain->hook_list, list) {
- if (hook->ops.dev == dev)
- found = hook;
-
- n++;
- }
- if (!found)
- return;
+ if (hook->ops.dev != dev)
+ continue;
- if (n > 1) {
- if (!(ctx->chain->table->flags & NFT_TABLE_F_DORMANT))
- nf_unregister_net_hook(ctx->net, &found->ops);
+ if (!(basechain->chain.table->flags & NFT_TABLE_F_DORMANT))
+ nf_unregister_net_hook(dev_net(dev), &hook->ops);
- list_del_rcu(&found->list);
- kfree_rcu(found, rcu);
- return;
+ list_del_rcu(&hook->list);
+ kfree_rcu(hook, rcu);
+ break;
}
-
- /* UNREGISTER events are also happening on netns exit.
- *
- * Although nf_tables core releases all tables/chains, only this event
- * handler provides guarantee that hook->ops.dev is still accessible,
- * so we cannot skip exiting net namespaces.
- */
- __nft_release_basechain(ctx);
}
static int nf_tables_netdev_event(struct notifier_block *this,
@@ -358,25 +342,20 @@ static int nf_tables_netdev_event(struct notifier_block *this,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct nft_base_chain *basechain;
struct nftables_pernet *nft_net;
- struct nft_chain *chain, *nr;
+ struct nft_chain *chain;
struct nft_table *table;
- struct nft_ctx ctx = {
- .net = dev_net(dev),
- };
if (event != NETDEV_UNREGISTER)
return NOTIFY_DONE;
- nft_net = nft_pernet(ctx.net);
+ nft_net = nft_pernet(dev_net(dev));
mutex_lock(&nft_net->commit_mutex);
list_for_each_entry(table, &nft_net->tables, list) {
if (table->family != NFPROTO_NETDEV &&
table->family != NFPROTO_INET)
continue;
- ctx.family = table->family;
- ctx.table = table;
- list_for_each_entry_safe(chain, nr, &table->chains, list) {
+ list_for_each_entry(chain, &table->chains, list) {
if (!nft_is_base_chain(chain))
continue;
@@ -385,8 +364,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
basechain->ops.hooknum != NF_INET_INGRESS)
continue;
- ctx.chain = chain;
- nft_netdev_event(event, dev, &ctx);
+ nft_netdev_event(event, dev, basechain);
}
}
mutex_unlock(&nft_net->commit_mutex);
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 7ca4f0d21fe2..72711d62fddf 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -228,7 +228,7 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv)
return 0;
}
-static void nft_compat_wait_for_destructors(void)
+static void nft_compat_wait_for_destructors(struct net *net)
{
/* xtables matches or targets can have side effects, e.g.
* creation/destruction of /proc files.
@@ -236,7 +236,7 @@ static void nft_compat_wait_for_destructors(void)
* work queue. If we have pending invocations we thus
* need to wait for those to finish.
*/
- nf_tables_trans_destroy_flush_work();
+ nf_tables_trans_destroy_flush_work(net);
}
static int
@@ -262,7 +262,7 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_target_set_tgchk_param(&par, ctx, target, info, &e, proto, inv);
- nft_compat_wait_for_destructors();
+ nft_compat_wait_for_destructors(ctx->net);
ret = xt_check_target(&par, size, proto, inv);
if (ret < 0) {
@@ -515,7 +515,7 @@ __nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
nft_match_set_mtchk_param(&par, ctx, match, info, &e, proto, inv);
- nft_compat_wait_for_destructors();
+ nft_compat_wait_for_destructors(ctx->net);
return xt_check_match(&par, size, proto, inv);
}
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 67a41cd2baaf..d526e69a2a2b 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -230,6 +230,7 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
enum ip_conntrack_info ctinfo;
u16 value = nft_reg_load16(&regs->data[priv->sreg]);
struct nf_conn *ct;
+ int oldcnt;
ct = nf_ct_get(skb, &ctinfo);
if (ct) /* already tracked */
@@ -250,10 +251,11 @@ static void nft_ct_set_zone_eval(const struct nft_expr *expr,
ct = this_cpu_read(nft_ct_pcpu_template);
- if (likely(refcount_read(&ct->ct_general.use) == 1)) {
- refcount_inc(&ct->ct_general.use);
+ __refcount_inc(&ct->ct_general.use, &oldcnt);
+ if (likely(oldcnt == 1)) {
nf_ct_zone_add(ct, &zone);
} else {
+ refcount_dec(&ct->ct_general.use);
/* previous skb got queued to userspace, allocate temporary
* one until percpu template can be reused.
*/
@@ -929,7 +931,7 @@ static void nft_ct_timeout_obj_eval(struct nft_object *obj,
*/
values = nf_ct_timeout_data(timeout);
if (values)
- nf_ct_refresh(ct, pkt->skb, values[0]);
+ nf_ct_refresh(ct, values[0]);
}
static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx,
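
The zone-template hunk above swaps a check-then-increment on the template's refcount for __refcount_inc(), which performs the increment and reports the pre-increment value in one step, so the "am I the sole holder?" decision is made on the same count that was incremented. The shape of the pattern, reduced to a sketch with a hypothetical object:

        int oldcnt;

        __refcount_inc(&obj->ref, &oldcnt);
        if (oldcnt == 1) {
                /* We were the only holder: safe to reinitialise the object. */
        } else {
                /* Another user appeared: drop the reference we just took and fall back. */
                refcount_dec(&obj->ref);
        }
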
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c
index b8d03364566c..c74012c99125 100644
--- a/net/netfilter/nft_exthdr.c
+++ b/net/netfilter/nft_exthdr.c
@@ -85,7 +85,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
unsigned char optbuf[sizeof(struct ip_options) + 40];
struct ip_options *opt = (struct ip_options *)optbuf;
struct iphdr *iph, _iph;
- unsigned int start;
bool found = false;
__be32 info;
int optlen;
@@ -93,7 +92,6 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
iph = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
if (!iph)
return -EBADMSG;
- start = sizeof(struct iphdr);
optlen = iph->ihl * 4 - (int)sizeof(struct iphdr);
if (optlen <= 0)
@@ -103,7 +101,7 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
/* Copy the options since __ip_options_compile() modifies
* the options.
*/
- if (skb_copy_bits(skb, start, opt->__data, optlen))
+ if (skb_copy_bits(skb, sizeof(struct iphdr), opt->__data, optlen))
return -EBADMSG;
opt->optlen = optlen;
@@ -118,18 +116,18 @@ static int ipv4_find_option(struct net *net, struct sk_buff *skb,
found = target == IPOPT_SSRR ? opt->is_strictroute :
!opt->is_strictroute;
if (found)
- *offset = opt->srr + start;
+ *offset = opt->srr;
break;
case IPOPT_RR:
if (!opt->rr)
break;
- *offset = opt->rr + start;
+ *offset = opt->rr;
found = true;
break;
case IPOPT_RA:
if (!opt->router_alert)
break;
- *offset = opt->router_alert + start;
+ *offset = opt->router_alert;
found = true;
break;
default:
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 8bfac4185ac7..abb0c8ec6371 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -309,7 +309,8 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
nft_setelem_expr_foreach(expr, elem_expr, size) {
if (expr->ops->gc &&
- expr->ops->gc(read_pnet(&set->net), expr))
+ expr->ops->gc(read_pnet(&set->net), expr) &&
+ set->flags & NFT_SET_EVAL)
return true;
}
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index b8d3c3213efe..c15db28c5ebc 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -994,8 +994,9 @@ static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 8, pkt[8], bsize);
NFT_PIPAPO_AVX2_AND(6, 2, 3);
+ NFT_PIPAPO_AVX2_AND(3, 4, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 9, pkt[9], bsize);
- NFT_PIPAPO_AVX2_AND(0, 4, 5);
+ NFT_PIPAPO_AVX2_AND(0, 3, 5);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
NFT_PIPAPO_AVX2_AND(2, 6, 7);
NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
index 681301b46aa4..0c63d1367cf7 100644
--- a/net/netfilter/nft_tunnel.c
+++ b/net/netfilter/nft_tunnel.c
@@ -335,13 +335,13 @@ static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
static const struct nla_policy nft_tunnel_opts_geneve_policy[NFTA_TUNNEL_KEY_GENEVE_MAX + 1] = {
[NFTA_TUNNEL_KEY_GENEVE_CLASS] = { .type = NLA_U16 },
[NFTA_TUNNEL_KEY_GENEVE_TYPE] = { .type = NLA_U8 },
- [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 128 },
+ [NFTA_TUNNEL_KEY_GENEVE_DATA] = { .type = NLA_BINARY, .len = 127 },
};
static int nft_tunnel_obj_geneve_init(const struct nlattr *attr,
struct nft_tunnel_opts *opts)
{
- struct geneve_opt *opt = (struct geneve_opt *)opts->u.data + opts->len;
+ struct geneve_opt *opt = (struct geneve_opt *)(opts->u.data + opts->len);
struct nlattr *tb[NFTA_TUNNEL_KEY_GENEVE_MAX + 1];
int err, data_len;
@@ -625,7 +625,7 @@ static int nft_tunnel_opts_dump(struct sk_buff *skb,
if (!inner)
goto failure;
while (opts->len > offset) {
- opt = (struct geneve_opt *)opts->u.data + offset;
+ opt = (struct geneve_opt *)(opts->u.data + offset);
if (nla_put_be16(skb, NFTA_TUNNEL_KEY_GENEVE_CLASS,
opt->opt_class) ||
nla_put_u8(skb, NFTA_TUNNEL_KEY_GENEVE_TYPE,
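
Both geneve hunks above fix the same operator-precedence slip: a cast binds more tightly than '+', so the old expressions advanced the pointer in units of sizeof(struct geneve_opt) rather than bytes. A standalone illustration with a hypothetical byte buffer:

        u8 buf[256];
        size_t off = 8;

        struct geneve_opt *a = (struct geneve_opt *)buf + off;   /* buf + off * sizeof(*a) */
        struct geneve_opt *b = (struct geneve_opt *)(buf + off); /* buf + off bytes */

Since opts->len is a byte count here, only the second form lands on the next option.
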
diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
index 8a07b46cc8fb..3210cfc966ab 100644
--- a/net/netfilter/nft_xfrm.c
+++ b/net/netfilter/nft_xfrm.c
@@ -112,7 +112,8 @@ static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode)
return true;
}
- return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL;
+ return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL ||
+ mode == XFRM_MODE_IPTFS;
}
static void nft_xfrm_state_get_key(const struct nft_xfrm *priv,
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 0859b8f76764..fa02aab56724 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -363,11 +363,15 @@ static void htable_selective_cleanup(struct xt_hashlimit_htable *ht, bool select
unsigned int i;
for (i = 0; i < ht->cfg.size; i++) {
+ struct hlist_head *head = &ht->hash[i];
struct dsthash_ent *dh;
struct hlist_node *n;
+ if (hlist_empty(head))
+ continue;
+
spin_lock_bh(&ht->lock);
- hlist_for_each_entry_safe(dh, n, &ht->hash[i], node) {
+ hlist_for_each_entry_safe(dh, n, head, node) {
if (time_after_eq(jiffies, dh->expires) || select_all)
dsthash_free(ht, dh);
}
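
The hashlimit hunk only takes the bucket lock when the chain is non-empty; the unlocked hlist_empty() peek is presumably acceptable because a bucket that becomes occupied just after the check is simply handled on a later cleanup pass. Condensed, the per-bucket loop now reads:

        if (hlist_empty(head))          /* unlocked peek; a racing insert is caught next pass */
                continue;

        spin_lock_bh(&ht->lock);
        hlist_for_each_entry_safe(dh, n, head, node)
                if (time_after_eq(jiffies, dh->expires) || select_all)
                        dsthash_free(ht, dh);
        spin_unlock_bh(&ht->lock);
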
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index 1bc2d0890a9f..dfda9ea61971 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -374,8 +374,7 @@ int netlbl_unlhsh_add(struct net *net,
struct net_device *dev;
struct netlbl_unlhsh_iface *iface;
struct audit_buffer *audit_buf = NULL;
- char *secctx = NULL;
- u32 secctx_len;
+ struct lsm_context ctx;
if (addr_len != sizeof(struct in_addr) &&
addr_len != sizeof(struct in6_addr))
@@ -438,11 +437,9 @@ int netlbl_unlhsh_add(struct net *net,
unlhsh_add_return:
rcu_read_unlock();
if (audit_buf != NULL) {
- if (security_secid_to_secctx(secid,
- &secctx,
- &secctx_len) == 0) {
- audit_log_format(audit_buf, " sec_obj=%s", secctx);
- security_release_secctx(secctx, secctx_len);
+ if (security_secid_to_secctx(secid, &ctx) >= 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", ctx.context);
+ security_release_secctx(&ctx);
}
audit_log_format(audit_buf, " res=%u", ret_val == 0 ? 1 : 0);
audit_log_end(audit_buf);
@@ -473,8 +470,7 @@ static int netlbl_unlhsh_remove_addr4(struct net *net,
struct netlbl_unlhsh_addr4 *entry;
struct audit_buffer *audit_buf;
struct net_device *dev;
- char *secctx;
- u32 secctx_len;
+ struct lsm_context ctx;
spin_lock(&netlbl_unlhsh_lock);
list_entry = netlbl_af4list_remove(addr->s_addr, mask->s_addr,
@@ -494,10 +490,9 @@ static int netlbl_unlhsh_remove_addr4(struct net *net,
addr->s_addr, mask->s_addr);
dev_put(dev);
if (entry != NULL &&
- security_secid_to_secctx(entry->secid,
- &secctx, &secctx_len) == 0) {
- audit_log_format(audit_buf, " sec_obj=%s", secctx);
- security_release_secctx(secctx, secctx_len);
+ security_secid_to_secctx(entry->secid, &ctx) >= 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", ctx.context);
+ security_release_secctx(&ctx);
}
audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
audit_log_end(audit_buf);
@@ -534,8 +529,7 @@ static int netlbl_unlhsh_remove_addr6(struct net *net,
struct netlbl_unlhsh_addr6 *entry;
struct audit_buffer *audit_buf;
struct net_device *dev;
- char *secctx;
- u32 secctx_len;
+ struct lsm_context ctx;
spin_lock(&netlbl_unlhsh_lock);
list_entry = netlbl_af6list_remove(addr, mask, &iface->addr6_list);
@@ -554,10 +548,9 @@ static int netlbl_unlhsh_remove_addr6(struct net *net,
addr, mask);
dev_put(dev);
if (entry != NULL &&
- security_secid_to_secctx(entry->secid,
- &secctx, &secctx_len) == 0) {
- audit_log_format(audit_buf, " sec_obj=%s", secctx);
- security_release_secctx(secctx, secctx_len);
+ security_secid_to_secctx(entry->secid, &ctx) >= 0) {
+ audit_log_format(audit_buf, " sec_obj=%s", ctx.context);
+ security_release_secctx(&ctx);
}
audit_log_format(audit_buf, " res=%u", entry != NULL ? 1 : 0);
audit_log_end(audit_buf);
@@ -1069,10 +1062,9 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd,
int ret_val = -ENOMEM;
struct netlbl_unlhsh_walk_arg *cb_arg = arg;
struct net_device *dev;
+ struct lsm_context ctx;
void *data;
u32 secid;
- char *secctx;
- u32 secctx_len;
data = genlmsg_put(cb_arg->skb, NETLINK_CB(cb_arg->nl_cb->skb).portid,
cb_arg->seq, &netlbl_unlabel_gnl_family,
@@ -1127,14 +1119,14 @@ static int netlbl_unlabel_staticlist_gen(u32 cmd,
secid = addr6->secid;
}
- ret_val = security_secid_to_secctx(secid, &secctx, &secctx_len);
- if (ret_val != 0)
+ ret_val = security_secid_to_secctx(secid, &ctx);
+ if (ret_val < 0)
goto list_cb_failure;
ret_val = nla_put(cb_arg->skb,
NLBL_UNLABEL_A_SECCTX,
- secctx_len,
- secctx);
- security_release_secctx(secctx, secctx_len);
+ ctx.len,
+ ctx.context);
+ security_release_secctx(&ctx);
if (ret_val != 0)
goto list_cb_failure;
diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c
index 81635a13987b..0d04d23aafe7 100644
--- a/net/netlabel/netlabel_user.c
+++ b/net/netlabel/netlabel_user.c
@@ -84,8 +84,7 @@ struct audit_buffer *netlbl_audit_start_common(int type,
struct netlbl_audit *audit_info)
{
struct audit_buffer *audit_buf;
- char *secctx;
- u32 secctx_len;
+ struct lsm_context ctx;
if (audit_enabled == AUDIT_OFF)
return NULL;
@@ -99,10 +98,9 @@ struct audit_buffer *netlbl_audit_start_common(int type,
audit_info->sessionid);
if (lsmprop_is_set(&audit_info->prop) &&
- security_lsmprop_to_secctx(&audit_info->prop, &secctx,
- &secctx_len) == 0) {
- audit_log_format(audit_buf, " subj=%s", secctx);
- security_release_secctx(secctx, secctx_len);
+ security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) {
+ audit_log_format(audit_buf, " subj=%s", ctx.context);
+ security_release_secctx(&ctx);
}
return audit_buf;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e88a1ac160bc..a53ea60d0a78 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1277,6 +1277,7 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
int delta;
+ skb_assert_len(skb);
WARN_ON(skb->sk != NULL);
delta = skb->end - skb->tail;
if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 704c858cf209..61fea7baae5d 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -947,12 +947,6 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
pskb_trim(skb, ovs_mac_header_len(key));
}
- /* Need to set the pkt_type to involve the routing layer. The
- * packet movement through the OVS datapath doesn't generally
- * use routing, but this is needed for tunnel cases.
- */
- skb->pkt_type = PACKET_OUTGOING;
-
if (likely(!mru ||
(skb->len <= mru + vport->dev->hard_header_len))) {
ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 3bb4810234aa..e573e9221302 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1368,8 +1368,11 @@ bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr)
attr == OVS_KEY_ATTR_CT_MARK)
return true;
if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
- attr == OVS_KEY_ATTR_CT_LABELS)
- return true;
+ attr == OVS_KEY_ATTR_CT_LABELS) {
+ struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+
+ return ovs_net->xt_label;
+ }
return false;
}
@@ -1378,7 +1381,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
const struct sw_flow_key *key,
struct sw_flow_actions **sfa, bool log)
{
- unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
struct ovs_conntrack_info ct_info;
const char *helper = NULL;
u16 family;
@@ -1407,12 +1409,6 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
return -ENOMEM;
}
- if (nf_connlabels_get(net, n_bits - 1)) {
- nf_ct_tmpl_free(ct_info.ct);
- OVS_NLERR(log, "Failed to set connlabel length");
- return -EOPNOTSUPP;
- }
-
if (ct_info.timeout[0]) {
if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto,
ct_info.timeout))
@@ -1581,7 +1577,6 @@ static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
if (ct_info->ct) {
if (ct_info->timeout[0])
nf_ct_destroy_timeout(ct_info->ct);
- nf_connlabels_put(nf_ct_net(ct_info->ct));
nf_ct_tmpl_free(ct_info->ct);
}
}
@@ -2006,9 +2001,17 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = {
int ovs_ct_init(struct net *net)
{
-#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
+ unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE;
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+ if (nf_connlabels_get(net, n_bits - 1)) {
+ ovs_net->xt_label = false;
+ OVS_NLERR(true, "Failed to set connlabel length");
+ } else {
+ ovs_net->xt_label = true;
+ }
+
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
return ovs_ct_limit_init(net, ovs_net);
#else
return 0;
@@ -2017,9 +2020,12 @@ int ovs_ct_init(struct net *net)
void ovs_ct_exit(struct net *net)
{
-#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
+#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
ovs_ct_limit_exit(net, ovs_net);
#endif
+
+ if (ovs_net->xt_label)
+ nf_connlabels_put(net);
}
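
Taken together, the conntrack hunks move the connlabel reservation from per-flow setup to namespace lifetime: ovs_ct_init() sizes the label area once and records the outcome in ovs_net->xt_label, ovs_ct_exit() drops it, and ovs_ct_verify() now refuses OVS_KEY_ATTR_CT_LABELS when the reservation failed. Condensed recap of the patch (not new behaviour):

        /* ovs_ct_init() */
        ovs_net->xt_label = !nf_connlabels_get(net, n_bits - 1);

        /* ovs_ct_exit() */
        if (ovs_net->xt_label)
                nf_connlabels_put(net);
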
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 365b9bb7f546..9ca6231ea647 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -160,6 +160,9 @@ struct ovs_net {
#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
struct ovs_ct_limit_info *ct_limit_info;
#endif
+
+ /* Module reference for configuring conntrack. */
+ bool xt_label;
};
/**
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 881ddd3696d5..518be23e48ea 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -2317,14 +2317,10 @@ int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb)
OVS_FLOW_ATTR_MASK, true, skb);
}
-#define MAX_ACTIONS_BUFSIZE (32 * 1024)
-
static struct sw_flow_actions *nla_alloc_flow_actions(int size)
{
struct sw_flow_actions *sfa;
- WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE);
-
sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL);
if (!sfa)
return ERR_PTR(-ENOMEM);
@@ -2480,15 +2476,6 @@ static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa,
new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2);
- if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
- if ((next_offset + req_size) > MAX_ACTIONS_BUFSIZE) {
- OVS_NLERR(log, "Flow action size exceeds max %u",
- MAX_ACTIONS_BUFSIZE);
- return ERR_PTR(-EMSGSIZE);
- }
- new_acts_size = MAX_ACTIONS_BUFSIZE;
- }
-
acts = nla_alloc_flow_actions(new_acts_size);
if (IS_ERR(acts))
return ERR_CAST(acts);
@@ -2889,7 +2876,8 @@ static int validate_set(const struct nlattr *a,
size_t key_len;
/* There can be only one key in a action */
- if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
+ if (!nla_ok(ovs_key, nla_len(a)) ||
+ nla_total_size(nla_len(ovs_key)) != nla_len(a))
return -EINVAL;
key_len = nla_len(ovs_key);
@@ -3545,7 +3533,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr,
int err;
u32 mpls_label_count = 0;
- *sfa = nla_alloc_flow_actions(min(nla_len(attr), MAX_ACTIONS_BUFSIZE));
+ *sfa = nla_alloc_flow_actions(nla_len(attr));
if (IS_ERR(*sfa))
return PTR_ERR(*sfa);
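
The validate_set() hunk adds an nla_ok() check so the nested key's own header is known to fit inside the enclosing attribute before nla_len(ovs_key) is trusted. The resulting validation reads roughly as:

        const struct nlattr *ovs_key = nla_data(a);

        /* The nested header must be present and its nla_len must fit in nla_len(a)... */
        if (!nla_ok(ovs_key, nla_len(a)))
                return -EINVAL;
        /* ...and that single nested key must account for the whole payload. */
        if (nla_total_size(nla_len(ovs_key)) != nla_len(a))
                return -EINVAL;
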
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 2d73769d67f4..c131e5ceea37 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -3112,7 +3112,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
skb->protocol = proto;
skb->dev = dev;
- skb->priority = READ_ONCE(sk->sk_priority);
+ skb->priority = sockc.priority;
skb->mark = sockc.mark;
skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid);
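
The af_packet change takes the skb priority from the sockcm cookie instead of reading sk->sk_priority directly; assuming the SO_PRIORITY control-message support this builds on, the value still defaults to the socket's priority but can be overridden per sendmsg(). A hypothetical userspace sketch of such an override (not part of the patch):

        char cbuf[CMSG_SPACE(sizeof(__u32))] = {};
        struct msghdr msg = { .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
        struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
        __u32 prio = 6;

        cm->cmsg_level = SOL_SOCKET;
        cm->cmsg_type  = SO_PRIORITY;
        cm->cmsg_len   = CMSG_LEN(sizeof(prio));
        memcpy(CMSG_DATA(cm), &prio, sizeof(prio));
        /* fill msg_name/msg_iov as usual, then sendmsg(fd, &msg, 0); */
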
diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile
index ac5caf5a48e1..210b75e3179e 100644
--- a/net/rxrpc/Makefile
+++ b/net/rxrpc/Makefile
@@ -16,6 +16,7 @@ rxrpc-y := \
conn_object.o \
conn_service.o \
input.o \
+ input_rack.o \
insecure.o \
io_thread.o \
key.o \
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 9d8bd0b37e41..86873399f7d5 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -408,9 +408,9 @@ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call)
/* Make sure we're not going to call back into a kernel service */
if (call->notify_rx) {
- spin_lock(&call->notify_lock);
+ spin_lock_irq(&call->notify_lock);
call->notify_rx = rxrpc_dummy_notify_rx;
- spin_unlock(&call->notify_lock);
+ spin_unlock_irq(&call->notify_lock);
}
}
mutex_unlock(&call->user_mutex);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 6b036c0564c7..a64a0cab1bf7 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -30,6 +30,7 @@ struct rxrpc_crypt {
struct key_preparsed_payload;
struct rxrpc_connection;
struct rxrpc_txbuf;
+struct rxrpc_txqueue;
/*
* Mark applied to socket buffers in skb->mark. skb->priority is used
@@ -98,6 +99,7 @@ struct rxrpc_net {
atomic_t stat_tx_data_send;
atomic_t stat_tx_data_send_frag;
atomic_t stat_tx_data_send_fail;
+ atomic_t stat_tx_data_send_msgsize;
atomic_t stat_tx_data_underflow;
atomic_t stat_tx_data_cwnd_reset;
atomic_t stat_rx_data;
@@ -109,6 +111,8 @@ struct rxrpc_net {
atomic_t stat_tx_ack_skip;
atomic_t stat_tx_acks[256];
atomic_t stat_rx_acks[256];
+ atomic_t stat_tx_jumbo[10];
+ atomic_t stat_rx_jumbo[10];
atomic_t stat_why_req_ack[8];
@@ -210,9 +214,8 @@ struct rxrpc_skb_priv {
rxrpc_seq_t first_ack; /* First packet in acks table */
rxrpc_seq_t prev_ack; /* Highest seq seen */
rxrpc_serial_t acked_serial; /* Packet in response to (or 0) */
+ u16 nr_acks; /* Number of acks+nacks */
u8 reason; /* Reason for ack */
- u8 nr_acks; /* Number of acks+nacks */
- u8 nr_nacks; /* Number of nacks */
} ack;
};
struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
@@ -320,6 +323,12 @@ struct rxrpc_local {
struct list_head new_client_calls; /* Newly created client calls need connection */
spinlock_t client_call_lock; /* Lock for ->new_client_calls */
struct sockaddr_rxrpc srx; /* local address */
+ /* Provide a kvec table sufficiently large to manage either a DATA
+ * packet with a maximum set of jumbo subpackets or a PING ACK padded
+ * out to 64K with zeropages for PMTUD.
+ */
+ struct kvec kvec[1 + RXRPC_MAX_NR_JUMBO > 3 + 16 ?
+ 1 + RXRPC_MAX_NR_JUMBO : 3 + 16];
};
/*
@@ -338,25 +347,27 @@ struct rxrpc_peer {
time64_t last_tx_at; /* Last time packet sent here */
seqlock_t service_conn_lock;
spinlock_t lock; /* access lock */
- unsigned int if_mtu; /* interface MTU for this peer */
- unsigned int mtu; /* network MTU for this peer */
- unsigned int maxdata; /* data size (MTU - hdrsize) */
- unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
int debug_id; /* debug ID for printks */
struct sockaddr_rxrpc srx; /* remote address */
- /* calculated RTT cache */
-#define RXRPC_RTT_CACHE_SIZE 32
- spinlock_t rtt_input_lock; /* RTT lock for input routine */
- ktime_t rtt_last_req; /* Time of last RTT request */
- unsigned int rtt_count; /* Number of samples we've got */
+ /* Path MTU discovery [RFC8899] */
+ unsigned int pmtud_trial; /* Current MTU probe size */
+ unsigned int pmtud_good; /* Largest working MTU probe we've tried */
+ unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */
+ bool pmtud_lost; /* T if MTU probe was lost */
+ bool pmtud_probing; /* T if we have an active probe outstanding */
+ bool pmtud_pending; /* T if a call to this peer should send a probe */
+ u8 pmtud_jumbo; /* Max jumbo packets for the MTU */
+ bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */
+ unsigned int ackr_max_data; /* Maximum data advertised by peer */
+ unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */
+ unsigned int max_data; /* Maximum packet data capacity for this peer */
+ unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
+ unsigned short tx_seg_max; /* Maximum number of transmissible segments */
- u32 srtt_us; /* smoothed round trip time << 3 in usecs */
- u32 mdev_us; /* medium deviation */
- u32 mdev_max_us; /* maximal mdev for the last rtt period */
- u32 rttvar_us; /* smoothed mdev_max */
- u32 rto_us; /* Retransmission timeout in usec */
- u8 backoff; /* Backoff timeout (as shift) */
+ /* Calculated RTT cache */
+ unsigned int recent_srtt_us;
+ unsigned int recent_rto_us;
u8 cong_ssthresh; /* Congestion slow-start threshold */
};
@@ -525,6 +536,8 @@ struct rxrpc_connection {
int debug_id; /* debug ID for printks */
rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */
unsigned int hi_serial; /* highest serial number received */
+ rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */
+ unsigned int pmtud_call; /* ID of call used for probe */
u32 service_id; /* Service ID, possibly upgraded */
u32 security_level; /* Security level selected */
u8 security_ix; /* security type */
@@ -557,6 +570,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_RX_LAST, /* Received the last packet (at rxtx_top) */
RXRPC_CALL_TX_LAST, /* Last packet in Tx buffer (at rxtx_top) */
RXRPC_CALL_TX_ALL_ACKED, /* Last packet has been hard-acked */
+ RXRPC_CALL_TX_NO_MORE, /* No more data to transmit (MSG_MORE deasserted) */
RXRPC_CALL_SEND_PING, /* A ping will need to be sent */
RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
@@ -599,13 +613,25 @@ enum rxrpc_call_state {
/*
* Call Tx congestion management modes.
*/
-enum rxrpc_congest_mode {
- RXRPC_CALL_SLOW_START,
- RXRPC_CALL_CONGEST_AVOIDANCE,
- RXRPC_CALL_PACKET_LOSS,
- RXRPC_CALL_FAST_RETRANSMIT,
- NR__RXRPC_CONGEST_MODES
-};
+enum rxrpc_ca_state {
+ RXRPC_CA_SLOW_START,
+ RXRPC_CA_CONGEST_AVOIDANCE,
+ RXRPC_CA_PACKET_LOSS,
+ RXRPC_CA_FAST_RETRANSMIT,
+ NR__RXRPC_CA_STATES
+} __mode(byte);
+
+/*
+ * Current purpose of call RACK timer. According to the RACK-TLP protocol
+ * [RFC8985], the transmission timer (call->rack_timo_at) may only be used for
+ * one of these at once.
+ */
+enum rxrpc_rack_timer_mode {
+ RXRPC_CALL_RACKTIMER_OFF, /* Timer not running */
+ RXRPC_CALL_RACKTIMER_RACK_REORDER, /* RACK reordering timer */
+ RXRPC_CALL_RACKTIMER_TLP_PTO, /* TLP timeout */
+ RXRPC_CALL_RACKTIMER_RTO, /* Retransmission timeout */
+} __mode(byte);
/*
* RxRPC call definition
@@ -624,8 +650,7 @@ struct rxrpc_call {
struct mutex user_mutex; /* User access mutex */
struct sockaddr_rxrpc dest_srx; /* Destination address */
ktime_t delay_ack_at; /* When DELAY ACK needs to happen */
- ktime_t ack_lost_at; /* When ACK is figured as lost */
- ktime_t resend_at; /* When next resend needs to happen */
+ ktime_t rack_timo_at; /* When the RACK-TLP timer is due to expire */

ktime_t ping_at; /* When next to send a ping */
ktime_t keepalive_at; /* When next to send a keepalive ping */
ktime_t expect_rx_by; /* When we expect to get a packet by */
@@ -670,21 +695,30 @@ struct rxrpc_call {
unsigned short rx_pkt_offset; /* Current recvmsg packet offset */
unsigned short rx_pkt_len; /* Current recvmsg packet len */
+ /* Sendmsg data tracking. */
+ rxrpc_seq_t send_top; /* Highest Tx slot filled by sendmsg. */
+ struct rxrpc_txqueue *send_queue; /* Queue that sendmsg is writing into */
+
/* Transmitted data tracking. */
- spinlock_t tx_lock; /* Transmit queue lock */
- struct list_head tx_sendmsg; /* Sendmsg prepared packets */
- struct list_head tx_buffer; /* Buffer of transmissible packets */
+ struct rxrpc_txqueue *tx_queue; /* Start of transmission buffers */
+ struct rxrpc_txqueue *tx_qtail; /* End of transmission buffers */
+ rxrpc_seq_t tx_qbase; /* First slot in tx_queue */
rxrpc_seq_t tx_bottom; /* First packet in buffer */
rxrpc_seq_t tx_transmitted; /* Highest packet transmitted */
- rxrpc_seq_t tx_prepared; /* Highest Tx slot prepared. */
rxrpc_seq_t tx_top; /* Highest Tx slot allocated. */
+ rxrpc_serial_t tx_last_serial; /* Serial of last DATA transmitted */
u16 tx_backoff; /* Delay to insert due to Tx failure (ms) */
- u8 tx_winsize; /* Maximum size of Tx window */
+ u16 tx_nr_sent; /* Number of packets sent, but unacked */
+ u16 tx_nr_lost; /* Number of packets marked lost */
+ u16 tx_nr_resent; /* Number of packets resent, but unacked */
+ u16 tx_winsize; /* Maximum size of Tx window */
#define RXRPC_TX_MAX_WINDOW 128
+ u8 tx_jumbo_max; /* Maximum subpkts peer will accept */
ktime_t tx_last_sent; /* Last time a transmission occurred */
/* Received data tracking */
struct sk_buff_head recvmsg_queue; /* Queue of packets ready for recvmsg() */
+ struct sk_buff_head rx_queue; /* Queue of packets for this call to receive */
struct sk_buff_head rx_oos_queue; /* Queue of out of sequence packets */
rxrpc_seq_t rx_highest_seq; /* Highest sequence number received */
@@ -698,14 +732,32 @@ struct rxrpc_call {
*/
#define RXRPC_TX_SMSS RXRPC_JUMBO_DATALEN
#define RXRPC_MIN_CWND 4
- u8 cong_cwnd; /* Congestion window size */
+ enum rxrpc_ca_state cong_ca_state; /* Congestion control state */
u8 cong_extra; /* Extra to send for congestion management */
- u8 cong_ssthresh; /* Slow-start threshold */
- enum rxrpc_congest_mode cong_mode:8; /* Congestion management mode */
- u8 cong_dup_acks; /* Count of ACKs showing missing packets */
- u8 cong_cumul_acks; /* Cumulative ACK count */
+ u16 cong_cwnd; /* Congestion window size */
+ u16 cong_ssthresh; /* Slow-start threshold */
+ u16 cong_dup_acks; /* Count of ACKs showing missing packets */
+ u16 cong_cumul_acks; /* Cumulative ACK count */
ktime_t cong_tstamp; /* Last time cwnd was changed */
- struct sk_buff *cong_last_nack; /* Last ACK with nacks received */
+
+ /* RACK-TLP [RFC8985] state. */
+ ktime_t rack_xmit_ts; /* Latest transmission timestamp */
+ ktime_t rack_rtt; /* RTT of most recently ACK'd segment */
+ ktime_t rack_rtt_ts; /* Timestamp of rack_rtt */
+ ktime_t rack_reo_wnd; /* Reordering window */
+ unsigned int rack_reo_wnd_mult; /* Multiplier applied to rack_reo_wnd */
+ int rack_reo_wnd_persist; /* Num loss recoveries before reset reo_wnd */
+ rxrpc_seq_t rack_fack; /* Highest sequence so far ACK'd */
+ rxrpc_seq_t rack_end_seq; /* Highest sequence seen */
+ rxrpc_seq_t rack_dsack_round; /* DSACK opt recv'd in latest roundtrip */
+ bool rack_dsack_round_none; /* T if dsack_round is "None" */
+ bool rack_reordering_seen; /* T if detected reordering event */
+ enum rxrpc_rack_timer_mode rack_timer_mode; /* Current mode of RACK timer */
+ bool tlp_is_retrans; /* T if unacked TLP retransmission */
+ rxrpc_serial_t tlp_serial; /* Serial of TLP probe (or 0 if none in progress) */
+ rxrpc_seq_t tlp_seq; /* Sequence of TLP probe */
+ unsigned int tlp_rtt_taken; /* Last time RTT taken */
+ ktime_t tlp_max_ack_delay; /* Sender budget for max delayed ACK interval */
/* Receive-phase ACK management (ACKs we send). */
u8 ackr_reason; /* reason to ACK */
@@ -730,32 +782,45 @@ struct rxrpc_call {
/* Transmission-phase ACK management (ACKs we've received). */
ktime_t acks_latest_ts; /* Timestamp of latest ACK received */
- rxrpc_seq_t acks_first_seq; /* first sequence number received */
+ rxrpc_seq_t acks_hard_ack; /* Highest sequence hard acked */
rxrpc_seq_t acks_prev_seq; /* Highest previousPacket received */
- rxrpc_seq_t acks_hard_ack; /* Latest hard-ack point */
rxrpc_seq_t acks_lowest_nak; /* Lowest NACK in the buffer (or ==tx_hard_ack) */
rxrpc_serial_t acks_highest_serial; /* Highest serial number ACK'd */
+ unsigned short acks_nr_sacks; /* Number of soft acks recorded */
+ unsigned short acks_nr_snacks; /* Number of soft nacks recorded */
+
+ /* Calculated RTT cache */
+ ktime_t rtt_last_req; /* Time of last RTT request */
+ unsigned int rtt_count; /* Number of samples we've got */
+ unsigned int rtt_taken; /* Number of samples taken (wrapping) */
+ struct minmax min_rtt; /* Estimated minimum RTT */
+ u32 srtt_us; /* smoothed round trip time << 3 in usecs */
+ u32 mdev_us; /* medium deviation */
+ u32 mdev_max_us; /* maximal mdev for the last rtt period */
+ u32 rttvar_us; /* smoothed mdev_max */
+ u32 rto_us; /* Retransmission timeout in usec */
+ u8 backoff; /* Backoff timeout (as shift) */
};
/*
* Summary of a new ACK and the changes it made to the Tx buffer packet states.
*/
struct rxrpc_ack_summary {
- u16 nr_acks; /* Number of ACKs in packet */
- u16 nr_new_acks; /* Number of new ACKs in packet */
- u16 nr_new_nacks; /* Number of new nacks in packet */
- u16 nr_retained_nacks; /* Number of nacks retained between ACKs */
- u8 ack_reason;
- bool saw_nacks; /* Saw NACKs in packet */
- bool new_low_nack; /* T if new low NACK found */
- bool retrans_timeo; /* T if reTx due to timeout happened */
- u8 flight_size; /* Number of unreceived transmissions */
- /* Place to stash values for tracing */
- enum rxrpc_congest_mode mode:8;
- u8 cwnd;
- u8 ssthresh;
- u8 dup_acks;
- u8 cumulative_acks;
+ rxrpc_serial_t ack_serial; /* Serial number of ACK */
+ rxrpc_serial_t acked_serial; /* Serial number ACK'd */
+ u16 in_flight; /* Number of unreceived transmissions */
+ u16 nr_new_hacks; /* Number of rotated new ACKs */
+ u16 nr_new_sacks; /* Number of new soft ACKs in packet */
+ u16 nr_new_snacks; /* Number of new soft nacks in packet */
+ u8 ack_reason;
+ bool new_low_snack:1; /* T if new low soft NACK found */
+ bool retrans_timeo:1; /* T if reTx due to timeout happened */
+ bool need_retransmit:1; /* T if we need transmission */
+ bool rtt_sample_avail:1; /* T if RTT sample available */
+ bool in_fast_or_rto_recovery:1;
+ bool exiting_fast_or_rto_recovery:1;
+ bool tlp_probe_acked:1; /* T if the TLP probe seq was acked */
+ u8 /*enum rxrpc_congest_change*/ change;
};
/*
@@ -793,25 +858,22 @@ struct rxrpc_send_params {
* Buffer of data to be output as a packet.
*/
struct rxrpc_txbuf {
- struct list_head call_link; /* Link in call->tx_sendmsg/tx_buffer */
- struct list_head tx_link; /* Link in live Enc queue or Tx queue */
- ktime_t last_sent; /* Time at which last transmitted */
refcount_t ref;
rxrpc_seq_t seq; /* Sequence number of this packet */
rxrpc_serial_t serial; /* Last serial number transmitted with */
unsigned int call_debug_id;
unsigned int debug_id;
- unsigned int len; /* Amount of data in buffer */
- unsigned int space; /* Remaining data space */
- unsigned int offset; /* Offset of fill point */
+ unsigned short len; /* Amount of data in buffer */
+ unsigned short space; /* Remaining data space */
+ unsigned short offset; /* Offset of fill point */
+ unsigned short pkt_len; /* Size of packet content */
+ unsigned short alloc_size; /* Amount of bufferage allocated */
unsigned int flags;
#define RXRPC_TXBUF_WIRE_FLAGS 0xff /* The wire protocol flags */
#define RXRPC_TXBUF_RESENT 0x100 /* Set if has been resent */
__be16 cksum; /* Checksum to go in header */
- unsigned short ack_rwind; /* ACK receive window */
- u8 /*enum rxrpc_propose_ack_trace*/ ack_why; /* If ack, why */
- u8 nr_kvec; /* Amount of kvec[] used */
- struct kvec kvec[3];
+ bool jumboable; /* Can be non-terminal jumbo subpacket */
+ void *data; /* Data with preceding jumbo header */
};
static inline bool rxrpc_sending_to_server(const struct rxrpc_txbuf *txb)
@@ -824,6 +886,46 @@ static inline bool rxrpc_sending_to_client(const struct rxrpc_txbuf *txb)
return !rxrpc_sending_to_server(txb);
}
+/*
+ * Transmit queue element, including RACK [RFC8985] per-segment metadata. The
+ * transmission timestamp is in usec from the base.
+ */
+struct rxrpc_txqueue {
+ /* Start with the members we want to prefetch. */
+ struct rxrpc_txqueue *next;
+ ktime_t xmit_ts_base;
+ rxrpc_seq_t qbase;
+ u8 nr_reported_acks; /* Number of segments explicitly acked/nacked */
+ unsigned long segment_acked; /* Bit-per-buf: Set if ACK'd */
+ unsigned long segment_lost; /* Bit-per-buf: Set if declared lost */
+ unsigned long segment_retransmitted; /* Bit-per-buf: Set if retransmitted */
+ unsigned long rtt_samples; /* Bit-per-buf: Set if available for RTT */
+ unsigned long ever_retransmitted; /* Bit-per-buf: Set if ever retransmitted */
+
+ /* The arrays we want to pack into as few cache lines as possible. */
+ struct {
+#define RXRPC_NR_TXQUEUE BITS_PER_LONG
+#define RXRPC_TXQ_MASK (RXRPC_NR_TXQUEUE - 1)
+ struct rxrpc_txbuf *bufs[RXRPC_NR_TXQUEUE];
+ unsigned int segment_serial[RXRPC_NR_TXQUEUE];
+ unsigned int segment_xmit_ts[RXRPC_NR_TXQUEUE];
+ } ____cacheline_aligned;
+};
+
+/*
+ * Data transmission request.
+ */
+struct rxrpc_send_data_req {
+ ktime_t now; /* Current time */
+ struct rxrpc_txqueue *tq; /* Tx queue segment holding first DATA */
+ rxrpc_seq_t seq; /* Sequence of first data */
+ int n; /* Number of DATA packets to glue into jumbo */
+ bool retrans; /* T if this is a retransmission */
+ bool did_send; /* T if did actually send */
+ bool tlp_probe; /* T if this is a TLP probe */
+ int /* enum rxrpc_txdata_trace */ trace;
+};
+
#include <trace/events/rxrpc.h>
/*
@@ -841,6 +943,21 @@ static inline rxrpc_serial_t rxrpc_get_next_serial(struct rxrpc_connection *conn
}
/*
+ * Allocate the next n serial numbers on a connection. 0 must be skipped.
+ */
+static inline rxrpc_serial_t rxrpc_get_next_serials(struct rxrpc_connection *conn,
+ unsigned int n)
+{
+ rxrpc_serial_t serial;
+
+ serial = conn->tx_serial;
+ if (serial + n <= n)
+ serial = 1;
+ conn->tx_serial = serial + n;
+ return serial;
+}
+
+/*
* af_rxrpc.c
*/
extern atomic_t rxrpc_n_rx_skbs;
@@ -865,10 +982,10 @@ void rxrpc_propose_ping(struct rxrpc_call *call, u32 serial,
enum rxrpc_propose_ack_trace why);
void rxrpc_propose_delay_ACK(struct rxrpc_call *, rxrpc_serial_t,
enum rxrpc_propose_ack_trace);
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *);
-void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb);
-
-bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb);
+void rxrpc_resend_tlp(struct rxrpc_call *call);
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace);
+bool rxrpc_input_call_event(struct rxrpc_call *call);
/*
* call_object.c
@@ -1047,6 +1164,32 @@ void rxrpc_input_call_packet(struct rxrpc_call *, struct sk_buff *);
void rxrpc_implicit_end_call(struct rxrpc_call *, struct sk_buff *);
/*
+ * input_rack.c
+ */
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix);
+void rxrpc_input_rack(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long new_acks);
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary);
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now);
+void rxrpc_tlp_send_probe(struct rxrpc_call *call);
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary);
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by);
+
+/* Initialise TLP state [RFC8985 7.1]. */
+static inline void rxrpc_tlp_init(struct rxrpc_call *call)
+{
+ call->tlp_serial = 0;
+ call->tlp_seq = call->acks_hard_ack;
+ call->tlp_is_retrans = false;
+}
+
+/*
* io_thread.c
*/
int rxrpc_encap_rcv(struct sock *, struct sk_buff *);
@@ -1149,17 +1292,20 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
*/
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why);
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call);
int rxrpc_send_abort_packet(struct rxrpc_call *);
+void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req);
void rxrpc_send_conn_abort(struct rxrpc_connection *conn);
void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
void rxrpc_send_keepalive(struct rxrpc_peer *);
-void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
/*
* peer_event.c
*/
void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *);
void rxrpc_peer_keepalive_worker(struct work_struct *);
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+ bool sendmsg_fail);
/*
* peer_object.c
@@ -1208,10 +1354,12 @@ static inline int rxrpc_abort_eproto(struct rxrpc_call *call,
/*
* rtt.c
*/
-void rxrpc_peer_add_rtt(struct rxrpc_call *, enum rxrpc_rtt_rx_trace, int,
- rxrpc_serial_t, rxrpc_serial_t, ktime_t, ktime_t);
-ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans);
-void rxrpc_peer_init_rtt(struct rxrpc_peer *);
+void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+ int rtt_slot,
+ rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
+ ktime_t send_time, ktime_t resp_time);
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans);
+void rxrpc_call_init_rtt(struct rxrpc_call *call);
/*
* rxkad.c
@@ -1284,7 +1432,6 @@ static inline void rxrpc_sysctl_exit(void) {}
extern atomic_t rxrpc_nr_txbuf;
struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size,
size_t data_align, gfp_t gfp);
-struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size);
void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what);
@@ -1311,6 +1458,53 @@ static inline bool after_eq(u32 seq1, u32 seq2)
return (s32)(seq1 - seq2) >= 0;
}
+static inline u32 earliest(u32 seq1, u32 seq2)
+{
+ return before(seq1, seq2) ? seq1 : seq2;
+}
+
+static inline u32 latest(u32 seq1, u32 seq2)
+{
+ return after(seq1, seq2) ? seq1 : seq2;
+}
+
+static inline bool rxrpc_seq_in_txq(const struct rxrpc_txqueue *tq, rxrpc_seq_t seq)
+{
+ return (seq & (RXRPC_NR_TXQUEUE - 1)) == tq->qbase;
+}
+
+static inline void rxrpc_queue_rx_call_packet(struct rxrpc_call *call, struct sk_buff *skb)
+{
+ rxrpc_get_skb(skb, rxrpc_skb_get_call_rx);
+ __skb_queue_tail(&call->rx_queue, skb);
+ rxrpc_poke_call(call, rxrpc_call_poke_rx_packet);
+}
+
+/*
+ * Calculate how much space there is for transmitting more DATA packets.
+ */
+static inline unsigned int rxrpc_tx_window_space(const struct rxrpc_call *call)
+{
+ int winsize = umin(call->tx_winsize, call->cong_cwnd + call->cong_extra);
+ int transmitted = call->tx_top - call->tx_bottom;
+
+ return max(winsize - transmitted, 0);
+}
+
+static inline unsigned int rxrpc_left_out(const struct rxrpc_call *call)
+{
+ return call->acks_nr_sacks + call->tx_nr_lost;
+}
+
+/*
+ * Calculate the number of transmitted DATA packets assumed to be in flight
+ * [approx RFC6675].
+ */
+static inline unsigned int rxrpc_tx_in_flight(const struct rxrpc_call *call)
+{
+ return call->tx_nr_sent - rxrpc_left_out(call) + call->tx_nr_resent;
+}
+
/*
* debug tracing
*/
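
Many of the new ar-internal.h helpers lean on the wrap-safe sequence comparisons in this header (after_eq() shown above, and the before()/after() pair it complements): they subtract and interpret the difference as s32, so ordering stays correct across the 32-bit wrap provided the two values are within 2^31 of each other. A small worked example with hypothetical sequence numbers:

        u32 a = 0xfffffffeU;    /* just before the wrap */
        u32 b = 0x00000003U;    /* five packets later, after the wrap */

        /* after(b, a): (s32)(b - a) == (s32)5 > 0, so b is correctly "later".  */
        /* earliest(a, b) therefore returns a, and latest(a, b) returns b.      */
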
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index 0f5a1d77b890..e685034ce4f7 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -188,8 +188,8 @@ void rxrpc_discard_prealloc(struct rxrpc_sock *rx)
/* Make sure that there aren't any incoming calls in progress before we
* clear the preallocation buffers.
*/
- spin_lock(&rx->incoming_lock);
- spin_unlock(&rx->incoming_lock);
+ spin_lock_irq(&rx->incoming_lock);
+ spin_unlock_irq(&rx->incoming_lock);
head = b->peer_backlog_head;
tail = b->peer_backlog_tail;
@@ -343,7 +343,7 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
if (sp->hdr.type != RXRPC_PACKET_TYPE_DATA)
return rxrpc_protocol_error(skb, rxrpc_eproto_no_service_call);
- read_lock(&local->services_lock);
+ read_lock_irq(&local->services_lock);
/* Weed out packets to services we're not offering. Packets that would
* begin a call are explicitly rejected and the rest are just
@@ -399,34 +399,34 @@ bool rxrpc_new_incoming_call(struct rxrpc_local *local,
spin_unlock(&conn->state_lock);
spin_unlock(&rx->incoming_lock);
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
if (hlist_unhashed(&call->error_link)) {
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_add_head(&call->error_link, &call->peer->error_targets);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
_leave(" = %p{%d}", call, call->debug_id);
- rxrpc_input_call_event(call, skb);
+ rxrpc_queue_rx_call_packet(call, skb);
rxrpc_put_call(call, rxrpc_call_put_input);
return true;
unsupported_service:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered,
RX_INVALID_OPERATION, -EOPNOTSUPP);
unsupported_security:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return rxrpc_direct_abort(skb, rxrpc_abort_service_not_offered,
RX_INVALID_OPERATION, -EKEYREJECTED);
no_call:
spin_unlock(&rx->incoming_lock);
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
_leave(" = f [%u]", skb->mark);
return false;
discard:
- read_unlock(&local->services_lock);
+ read_unlock_irq(&local->services_lock);
return true;
}
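
Cross-reference for the call_accept.c change: packets for a live call are now queued with rxrpc_queue_rx_call_packet() (added to ar-internal.h above) and drained by the call event loop rather than fed synchronously into rxrpc_input_call_event(). In outline, combining the two hunks (details such as the error/complete checks omitted):

        /* Producer (I/O side): take a ref, queue, poke the call. */
        rxrpc_queue_rx_call_packet(call, skb);

        /* Consumer (rxrpc_input_call_event(), below): */
        while ((skb = __skb_dequeue(&call->rx_queue))) {
                rxrpc_input_call_packet(call, skb);
                rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
        }
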
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 7bbb68504766..8e477f7f8850 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -44,8 +44,8 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
trace_rxrpc_propose_ack(call, why, RXRPC_ACK_DELAY, serial);
- if (call->peer->srtt_us)
- delay = (call->peer->srtt_us >> 3) * NSEC_PER_USEC;
+ if (call->srtt_us)
+ delay = (call->srtt_us >> 3) * NSEC_PER_USEC;
else
delay = ms_to_ktime(READ_ONCE(rxrpc_soft_ack_delay));
ktime_add_ms(delay, call->tx_backoff);
@@ -55,147 +55,104 @@ void rxrpc_propose_delay_ACK(struct rxrpc_call *call, rxrpc_serial_t serial,
}
/*
- * Handle congestion being detected by the retransmit timeout.
+ * Retransmit one or more packets.
*/
-static void rxrpc_congestion_timeout(struct rxrpc_call *call)
+static bool rxrpc_retransmit_data(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req)
{
- set_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags);
+ struct rxrpc_txqueue *tq = req->tq;
+ unsigned int ix = req->seq & RXRPC_TXQ_MASK;
+ struct rxrpc_txbuf *txb = tq->bufs[ix];
+
+ _enter("%x,%x,%x,%x", tq->qbase, req->seq, ix, txb->debug_id);
+
+ req->retrans = true;
+ trace_rxrpc_retransmit(call, req, txb);
+
+ txb->flags |= RXRPC_TXBUF_RESENT;
+ rxrpc_send_data_packet(call, req);
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
+
+ req->tq = NULL;
+ req->n = 0;
+ req->did_send = true;
+ req->now = ktime_get_real();
+ return true;
}
/*
* Perform retransmission of NAK'd and unack'd packets.
*/
-void rxrpc_resend(struct rxrpc_call *call, struct sk_buff *ack_skb)
+static void rxrpc_resend(struct rxrpc_call *call)
{
- struct rxrpc_ackpacket *ack = NULL;
- struct rxrpc_skb_priv *sp;
- struct rxrpc_txbuf *txb;
- rxrpc_seq_t transmitted = call->tx_transmitted;
- ktime_t next_resend = KTIME_MAX, rto = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC);
- ktime_t resend_at = KTIME_MAX, now, delay;
- bool unacked = false, did_send = false;
- unsigned int i;
-
- _enter("{%d,%d}", call->acks_hard_ack, call->tx_top);
-
- now = ktime_get_real();
-
- if (list_empty(&call->tx_buffer))
- goto no_resend;
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .trace = rxrpc_txdata_retransmit,
+ };
+ struct rxrpc_txqueue *tq;
- trace_rxrpc_resend(call, ack_skb);
- txb = list_first_entry(&call->tx_buffer, struct rxrpc_txbuf, call_link);
+ _enter("{%d,%d}", call->tx_bottom, call->tx_top);
- /* Scan the soft ACK table without dropping the lock and resend any
- * explicitly NAK'd packets.
- */
- if (ack_skb) {
- sp = rxrpc_skb(ack_skb);
- ack = (void *)ack_skb->data + sizeof(struct rxrpc_wire_header);
+ trace_rxrpc_resend(call, call->acks_highest_serial);
- for (i = 0; i < sp->ack.nr_acks; i++) {
- rxrpc_seq_t seq;
+ /* Scan the transmission queue, looking for lost packets. */
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long lost = tq->segment_lost;
- if (ack->acks[i] & 1)
- continue;
- seq = sp->ack.first_ack + i;
- if (after(txb->seq, transmitted))
- break;
- if (after(txb->seq, seq))
- continue; /* A new hard ACK probably came in */
- list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
- if (txb->seq == seq)
- goto found_txb;
- }
- goto no_further_resend;
-
- found_txb:
- resend_at = ktime_add(txb->last_sent, rto);
- if (after(txb->serial, call->acks_highest_serial)) {
- if (ktime_after(resend_at, now) &&
- ktime_before(resend_at, next_resend))
- next_resend = resend_at;
- continue; /* Ack point not yet reached */
- }
+ if (after(tq->qbase, call->tx_transmitted))
+ break;
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_unacked);
+ _debug("retr %16lx %u c=%08x [%x]",
+ tq->segment_acked, tq->nr_reported_acks, call->debug_id, tq->qbase);
+ _debug("lost %16lx", lost);
- trace_rxrpc_retransmit(call, txb->seq, txb->serial,
- ktime_sub(resend_at, now));
+ trace_rxrpc_resend_lost(call, tq, lost);
+ while (lost) {
+ unsigned int ix = __ffs(lost);
+ struct rxrpc_txbuf *txb = tq->bufs[ix];
- txb->flags |= RXRPC_TXBUF_RESENT;
- rxrpc_transmit_one(call, txb);
- did_send = true;
- now = ktime_get_real();
+ __clear_bit(ix, &lost);
+ rxrpc_see_txbuf(txb, rxrpc_txbuf_see_lost);
- if (list_is_last(&txb->call_link, &call->tx_buffer))
- goto no_further_resend;
- txb = list_next_entry(txb, call_link);
+ req.tq = tq;
+ req.seq = tq->qbase + ix;
+ req.n = 1;
+ rxrpc_retransmit_data(call, &req);
}
}
- /* Fast-forward through the Tx queue to the point the peer says it has
- * seen. Anything between the soft-ACK table and that point will get
- * ACK'd or NACK'd in due course, so don't worry about it here; here we
- * need to consider retransmitting anything beyond that point.
- */
- if (after_eq(call->acks_prev_seq, call->tx_transmitted))
- goto no_further_resend;
-
- list_for_each_entry_from(txb, &call->tx_buffer, call_link) {
- resend_at = ktime_add(txb->last_sent, rto);
-
- if (before_eq(txb->seq, call->acks_prev_seq))
- continue;
- if (after(txb->seq, call->tx_transmitted))
- break; /* Not transmitted yet */
-
- if (ack && ack->reason == RXRPC_ACK_PING_RESPONSE &&
- before(txb->serial, ntohl(ack->serial)))
- goto do_resend; /* Wasn't accounted for by a more recent ping. */
-
- if (ktime_after(resend_at, now)) {
- if (ktime_before(resend_at, next_resend))
- next_resend = resend_at;
- continue;
- }
-
- do_resend:
- unacked = true;
-
- txb->flags |= RXRPC_TXBUF_RESENT;
- rxrpc_transmit_one(call, txb);
- did_send = true;
- rxrpc_inc_stat(call->rxnet, stat_tx_data_retrans);
- now = ktime_get_real();
- }
+ rxrpc_get_rto_backoff(call, req.did_send);
+ _leave("");
+}
-no_further_resend:
-no_resend:
- if (resend_at < KTIME_MAX) {
- delay = rxrpc_get_rto_backoff(call->peer, did_send);
- resend_at = ktime_add(resend_at, delay);
- trace_rxrpc_timer_set(call, resend_at - now, rxrpc_timer_trace_resend_reset);
+/*
+ * Resend the highest-seq DATA packet so far transmitted for RACK-TLP [RFC8985 7.3].
+ */
+void rxrpc_resend_tlp(struct rxrpc_call *call)
+{
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .seq = call->tx_transmitted,
+ .n = 1,
+ .tlp_probe = true,
+ .trace = rxrpc_txdata_tlp_retransmit,
+ };
+
+ /* There's a chance it'll be on the tail segment of the queue. */
+ req.tq = READ_ONCE(call->tx_qtail);
+ if (req.tq &&
+ before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+ rxrpc_retransmit_data(call, &req);
+ return;
}
- call->resend_at = resend_at;
-
- if (unacked)
- rxrpc_congestion_timeout(call);
-
- /* If there was nothing that needed retransmission then it's likely
- * that an ACK got lost somewhere. Send a ping to find out instead of
- * retransmitting data.
- */
- if (!did_send) {
- ktime_t next_ping = ktime_add_us(call->acks_latest_ts,
- call->peer->srtt_us >> 3);
- if (ktime_sub(next_ping, now) <= 0)
- rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
- rxrpc_propose_ack_ping_for_0_retrans);
+ for (req.tq = call->tx_queue; req.tq; req.tq = req.tq->next) {
+ if (after_eq(call->tx_transmitted, req.tq->qbase) &&
+ before(call->tx_transmitted, req.tq->qbase + RXRPC_NR_TXQUEUE)) {
+ rxrpc_retransmit_data(call, &req);
+ return;
+ }
}
-
- _leave("");
}
/*
@@ -231,68 +188,93 @@ static void rxrpc_close_tx_phase(struct rxrpc_call *call)
}
}
-static bool rxrpc_tx_window_has_space(struct rxrpc_call *call)
-{
- unsigned int winsize = min_t(unsigned int, call->tx_winsize,
- call->cong_cwnd + call->cong_extra);
- rxrpc_seq_t window = call->acks_hard_ack, wtop = window + winsize;
- rxrpc_seq_t tx_top = call->tx_top;
- int space;
-
- space = wtop - tx_top;
- return space > 0;
-}
-
/*
- * Decant some if the sendmsg prepared queue into the transmission buffer.
+ * Transmit some as-yet untransmitted data, to a maximum of the supplied limit.
*/
-static void rxrpc_decant_prepared_tx(struct rxrpc_call *call)
+static void rxrpc_transmit_fresh_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace)
{
- struct rxrpc_txbuf *txb;
+ int space = rxrpc_tx_window_space(call);
if (!test_bit(RXRPC_CALL_EXPOSED, &call->flags)) {
- if (list_empty(&call->tx_sendmsg))
+ if (call->send_top == call->tx_top)
return;
rxrpc_expose_client_call(call);
}
- while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
- struct rxrpc_txbuf, call_link))) {
- spin_lock(&call->tx_lock);
- list_del(&txb->call_link);
- spin_unlock(&call->tx_lock);
+ while (space > 0) {
+ struct rxrpc_send_data_req req = {
+ .now = ktime_get_real(),
+ .seq = call->tx_transmitted + 1,
+ .n = 0,
+ .trace = trace,
+ };
+ struct rxrpc_txqueue *tq;
+ struct rxrpc_txbuf *txb;
+ rxrpc_seq_t send_top, seq;
+ int limit = min(space, max(call->peer->pmtud_jumbo, 1));
+
+ /* Order send_top before the contents of the new txbufs and
+ * txqueue pointers
+ */
+ send_top = smp_load_acquire(&call->send_top);
+ if (call->tx_top == send_top)
+ break;
- call->tx_top = txb->seq;
- list_add_tail(&txb->call_link, &call->tx_buffer);
+ trace_rxrpc_transmit(call, send_top, space);
- if (txb->flags & RXRPC_LAST_PACKET)
- rxrpc_close_tx_phase(call);
+ tq = call->tx_qtail;
+ seq = call->tx_top;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant);
- rxrpc_transmit_one(call, txb);
+ do {
+ int ix;
- if (!rxrpc_tx_window_has_space(call))
- break;
+ seq++;
+ ix = seq & RXRPC_TXQ_MASK;
+ if (!ix) {
+ tq = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_decant_advance);
+ }
+ if (!req.tq)
+ req.tq = tq;
+ txb = tq->bufs[ix];
+ req.n++;
+ if (!txb->jumboable)
+ break;
+ } while (req.n < limit && before(seq, send_top));
+
+ if (txb->flags & RXRPC_LAST_PACKET) {
+ rxrpc_close_tx_phase(call);
+ tq = NULL;
+ }
+ call->tx_qtail = tq;
+ call->tx_top = seq;
+
+ space -= req.n;
+ rxrpc_send_data_packet(call, &req);
}
}
-static void rxrpc_transmit_some_data(struct rxrpc_call *call)
+void rxrpc_transmit_some_data(struct rxrpc_call *call, unsigned int limit,
+ enum rxrpc_txdata_trace trace)
{
switch (__rxrpc_call_state(call)) {
case RXRPC_CALL_SERVER_ACK_REQUEST:
- if (list_empty(&call->tx_sendmsg))
+ if (call->tx_bottom == READ_ONCE(call->send_top))
return;
rxrpc_begin_service_reply(call);
fallthrough;
case RXRPC_CALL_SERVER_SEND_REPLY:
case RXRPC_CALL_CLIENT_SEND_REQUEST:
- if (!rxrpc_tx_window_has_space(call))
+ if (!rxrpc_tx_window_space(call))
return;
- if (list_empty(&call->tx_sendmsg)) {
+ if (call->tx_bottom == READ_ONCE(call->send_top)) {
rxrpc_inc_stat(call->rxnet, stat_tx_data_underflow);
return;
}
- rxrpc_decant_prepared_tx(call);
+ rxrpc_transmit_fresh_data(call, limit, trace);
break;
default:
return;
@@ -305,8 +287,8 @@ static void rxrpc_transmit_some_data(struct rxrpc_call *call)
*/
static void rxrpc_send_initial_ping(struct rxrpc_call *call)
{
- if (call->peer->rtt_count < 3 ||
- ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ if (call->rtt_count < 3 ||
+ ktime_before(ktime_add_ms(call->rtt_last_req, 1000),
ktime_get_real()))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_params);
@@ -315,10 +297,11 @@ static void rxrpc_send_initial_ping(struct rxrpc_call *call)
/*
* Handle retransmission and deferred ACK/abort generation.
*/
-bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
+bool rxrpc_input_call_event(struct rxrpc_call *call)
{
+ struct sk_buff *skb;
ktime_t now, t;
- bool resend = false;
+ bool did_receive = false, saw_ack = false;
s32 abort_code;
rxrpc_see_call(call, rxrpc_call_see_input);
@@ -328,9 +311,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
call->debug_id, rxrpc_call_states[__rxrpc_call_state(call)],
call->events);
- if (__rxrpc_call_is_complete(call))
- goto out;
-
/* Handle abort request locklessly, vs rxrpc_propose_abort(). */
abort_code = smp_load_acquire(&call->send_abort);
if (abort_code) {
@@ -339,11 +319,33 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
goto out;
}
- if (skb && skb->mark == RXRPC_SKB_MARK_ERROR)
- goto out;
+ do {
+ skb = __skb_dequeue(&call->rx_queue);
+ if (skb) {
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ if (__rxrpc_call_is_complete(call) ||
+ skb->mark == RXRPC_SKB_MARK_ERROR) {
+ rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
+ goto out;
+ }
+
+ saw_ack |= sp->hdr.type == RXRPC_PACKET_TYPE_ACK;
+
+ rxrpc_input_call_packet(call, skb);
+ rxrpc_free_skb(skb, rxrpc_skb_put_call_rx);
+ did_receive = true;
+ }
- if (skb)
- rxrpc_input_call_packet(call, skb);
+ t = ktime_sub(call->rack_timo_at, ktime_get_real());
+ if (t <= 0) {
+ trace_rxrpc_timer_exp(call, t,
+ rxrpc_timer_trace_rack_off + call->rack_timer_mode);
+ call->rack_timo_at = KTIME_MAX;
+ rxrpc_rack_timer_expired(call, t);
+ }
+
+ } while (!skb_queue_empty(&call->rx_queue));
/* If we see our async-event poke, check for timeout trippage. */
now = ktime_get_real();
@@ -376,13 +378,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_delayed_ack);
}
- t = ktime_sub(call->ack_lost_at, now);
- if (t <= 0) {
- trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_lost_ack);
- call->ack_lost_at = KTIME_MAX;
- set_bit(RXRPC_CALL_EV_ACK_LOST, &call->events);
- }
-
t = ktime_sub(call->ping_at, now);
if (t <= 0) {
trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_ping);
@@ -391,15 +386,6 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_ping_for_keepalive);
}
- t = ktime_sub(call->resend_at, now);
- if (t <= 0) {
- trace_rxrpc_timer_exp(call, t, rxrpc_timer_trace_resend);
- call->resend_at = KTIME_MAX;
- resend = true;
- }
-
- rxrpc_transmit_some_data(call);
-
now = ktime_get_real();
t = ktime_sub(call->keepalive_at, now);
if (t <= 0) {
@@ -409,35 +395,40 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
rxrpc_propose_ack_ping_for_keepalive);
}
- if (skb) {
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
-
- if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK)
- rxrpc_congestion_degrade(call);
- }
-
if (test_and_clear_bit(RXRPC_CALL_EV_INITIAL_PING, &call->events))
rxrpc_send_initial_ping(call);
+ rxrpc_transmit_some_data(call, UINT_MAX, rxrpc_txdata_new_data);
+
+ if (saw_ack)
+ rxrpc_congestion_degrade(call);
+
+ if (did_receive &&
+ (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_SEND_REQUEST ||
+ __rxrpc_call_state(call) == RXRPC_CALL_SERVER_SEND_REPLY)) {
+ t = ktime_sub(call->rack_timo_at, ktime_get_real());
+ trace_rxrpc_rack(call, t);
+ }
+
/* Process events */
if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_lost_ack);
- if (resend &&
+ if (call->tx_nr_lost > 0 &&
__rxrpc_call_state(call) != RXRPC_CALL_CLIENT_RECV_REPLY &&
!test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags))
- rxrpc_resend(call, NULL);
+ rxrpc_resend(call);
if (test_and_clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags))
rxrpc_send_ACK(call, RXRPC_ACK_IDLE, 0,
rxrpc_propose_ack_rx_idle);
if (call->ackr_nr_unacked > 2) {
- if (call->peer->rtt_count < 3)
+ if (call->rtt_count < 3)
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_rtt);
- else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000),
+ else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000),
ktime_get_real()))
rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
rxrpc_propose_ack_ping_for_old_rtt);
@@ -455,8 +446,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
set(call->expect_req_by);
set(call->expect_rx_by);
set(call->delay_ack_at);
- set(call->ack_lost_at);
- set(call->resend_at);
+ set(call->rack_timo_at);
set(call->keepalive_at);
set(call->ping_at);
@@ -467,7 +457,7 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
} else {
unsigned long nowj = jiffies, delayj, nextj;
- delayj = max(nsecs_to_jiffies(delay), 1);
+ delayj = umax(nsecs_to_jiffies(delay), 1);
nextj = nowj + delayj;
if (time_before(nextj, call->timer.expires) ||
!timer_pending(&call->timer)) {
@@ -484,9 +474,12 @@ out:
rxrpc_disconnect_call(call);
if (call->security)
call->security->free_call_crypto(call);
+ } else {
+ if (did_receive &&
+ call->peer->ackr_adv_pmtud &&
+ call->peer->pmtud_pending)
+ rxrpc_send_probe_for_pmtud(call);
}
- if (call->acks_hard_ack != call->tx_bottom)
- rxrpc_shrink_call_tx_buffer(call);
_leave("");
return true;
diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
index e379a2a9375a..c4c8b46a68c6 100644
--- a/net/rxrpc/call_object.c
+++ b/net/rxrpc/call_object.c
@@ -48,7 +48,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
bool busy;
if (!test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) {
- spin_lock_bh(&local->lock);
+ spin_lock_irq(&local->lock);
busy = !list_empty(&call->attend_link);
trace_rxrpc_poke_call(call, busy, what);
if (!busy && !rxrpc_try_get_call(call, rxrpc_call_get_poke))
@@ -56,7 +56,7 @@ void rxrpc_poke_call(struct rxrpc_call *call, enum rxrpc_call_poke_trace what)
if (!busy) {
list_add_tail(&call->attend_link, &local->call_attend_q);
}
- spin_unlock_bh(&local->lock);
+ spin_unlock_irq(&local->lock);
if (!busy)
rxrpc_wake_up_io_thread(local);
}
@@ -145,23 +145,21 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
INIT_LIST_HEAD(&call->recvmsg_link);
INIT_LIST_HEAD(&call->sock_link);
INIT_LIST_HEAD(&call->attend_link);
- INIT_LIST_HEAD(&call->tx_sendmsg);
- INIT_LIST_HEAD(&call->tx_buffer);
+ skb_queue_head_init(&call->rx_queue);
skb_queue_head_init(&call->recvmsg_queue);
skb_queue_head_init(&call->rx_oos_queue);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->notify_lock);
- spin_lock_init(&call->tx_lock);
refcount_set(&call->ref, 1);
call->debug_id = debug_id;
call->tx_total_len = -1;
+ call->tx_jumbo_max = 1;
call->next_rx_timo = 20 * HZ;
call->next_req_timo = 1 * HZ;
call->ackr_window = 1;
call->ackr_wtop = 1;
call->delay_ack_at = KTIME_MAX;
- call->ack_lost_at = KTIME_MAX;
- call->resend_at = KTIME_MAX;
+ call->rack_timo_at = KTIME_MAX;
call->ping_at = KTIME_MAX;
call->keepalive_at = KTIME_MAX;
call->expect_rx_by = KTIME_MAX;
@@ -176,6 +174,8 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
call->cong_cwnd = RXRPC_MIN_CWND;
call->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
+ rxrpc_call_init_rtt(call);
+
call->rxnet = rxnet;
call->rtt_avail = RXRPC_CALL_RTT_AVAIL_MASK;
atomic_inc(&rxnet->nr_calls);
@@ -219,9 +219,9 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
__set_bit(RXRPC_CALL_EXCLUSIVE, &call->flags);
if (p->timeouts.normal)
- call->next_rx_timo = min(p->timeouts.normal, 1);
+ call->next_rx_timo = umin(p->timeouts.normal, 1);
if (p->timeouts.idle)
- call->next_req_timo = min(p->timeouts.idle, 1);
+ call->next_req_timo = umin(p->timeouts.idle, 1);
if (p->timeouts.hard)
call->hard_timo = p->timeouts.hard;
@@ -301,9 +301,9 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp)
trace_rxrpc_client(NULL, -1, rxrpc_client_queue_new_call);
rxrpc_get_call(call, rxrpc_call_get_io_thread);
- spin_lock(&local->client_call_lock);
+ spin_lock_irq(&local->client_call_lock);
list_add_tail(&call->wait_link, &local->new_client_calls);
- spin_unlock(&local->client_call_lock);
+ spin_unlock_irq(&local->client_call_lock);
rxrpc_wake_up_io_thread(local);
return 0;
@@ -433,7 +433,7 @@ error_attached_to_socket:
/*
* Set up an incoming call. call->conn points to the connection.
- * This is called in BH context and isn't allowed to fail.
+ * This is called with interrupts disabled and isn't allowed to fail.
*/
void rxrpc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_call *call,
@@ -529,11 +529,29 @@ void rxrpc_get_call(struct rxrpc_call *call, enum rxrpc_call_trace why)
}
/*
- * Clean up the Rx skb ring.
+ * Clean up the transmission buffers.
+ */
+static void rxrpc_cleanup_tx_buffers(struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq, *next;
+
+ for (tq = call->tx_queue; tq; tq = next) {
+ next = tq->next;
+ for (int i = 0; i < RXRPC_NR_TXQUEUE; i++)
+ if (tq->bufs[i])
+ rxrpc_put_txbuf(tq->bufs[i], rxrpc_txbuf_put_cleaned);
+ trace_rxrpc_tq(call, tq, 0, rxrpc_tq_cleaned);
+ kfree(tq);
+ }
+}
+
+/*
+ * Clean up the receive buffers.
*/
-static void rxrpc_cleanup_ring(struct rxrpc_call *call)
+static void rxrpc_cleanup_rx_buffers(struct rxrpc_call *call)
{
rxrpc_purge_queue(&call->recvmsg_queue);
+ rxrpc_purge_queue(&call->rx_queue);
rxrpc_purge_queue(&call->rx_oos_queue);
}
@@ -556,7 +574,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
rxrpc_put_call_slot(call);
/* Make sure we don't get any more notifications */
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
if (!list_empty(&call->recvmsg_link)) {
_debug("unlinking once-pending call %p { e=%lx f=%lx }",
@@ -569,7 +587,7 @@ void rxrpc_release_call(struct rxrpc_sock *rx, struct rxrpc_call *call)
call->recvmsg_link.next = NULL;
call->recvmsg_link.prev = NULL;
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
if (put)
rxrpc_put_call(call, rxrpc_call_put_unnotify);
@@ -669,23 +687,11 @@ static void rxrpc_rcu_free_call(struct rcu_head *rcu)
static void rxrpc_destroy_call(struct work_struct *work)
{
struct rxrpc_call *call = container_of(work, struct rxrpc_call, destroyer);
- struct rxrpc_txbuf *txb;
del_timer_sync(&call->timer);
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- rxrpc_cleanup_ring(call);
- while ((txb = list_first_entry_or_null(&call->tx_sendmsg,
- struct rxrpc_txbuf, call_link))) {
- list_del(&txb->call_link);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
- }
- while ((txb = list_first_entry_or_null(&call->tx_buffer,
- struct rxrpc_txbuf, call_link))) {
- list_del(&txb->call_link);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_cleaned);
- }
-
+ rxrpc_cleanup_tx_buffers(call);
+ rxrpc_cleanup_rx_buffers(call);
rxrpc_put_txbuf(call->tx_pending, rxrpc_txbuf_put_cleaned);
rxrpc_put_connection(call->conn, rxrpc_conn_put_call);
rxrpc_deactivate_bundle(call->bundle);
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index bb11e8289d6d..db0099197890 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -231,7 +231,7 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn)
distance = id - id_cursor;
if (distance < 0)
distance = -distance;
- limit = max_t(unsigned long, atomic_read(&rxnet->nr_conns) * 4, 1024);
+ limit = umax(atomic_read(&rxnet->nr_conns) * 4, 1024);
if (distance > limit)
goto mark_dont_reuse;
@@ -437,9 +437,9 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
call->dest_srx.srx_service = conn->service_id;
call->cong_ssthresh = call->peer->cong_ssthresh;
if (call->cong_cwnd >= call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
else
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
chan->call_id = call_id;
chan->call_debug_id = call->debug_id;
@@ -508,16 +508,18 @@ static void rxrpc_activate_channels(struct rxrpc_bundle *bundle)
void rxrpc_connect_client_calls(struct rxrpc_local *local)
{
struct rxrpc_call *call;
+ LIST_HEAD(new_client_calls);
- while ((call = list_first_entry_or_null(&local->new_client_calls,
- struct rxrpc_call, wait_link))
- ) {
+ spin_lock_irq(&local->client_call_lock);
+ list_splice_tail_init(&local->new_client_calls, &new_client_calls);
+ spin_unlock_irq(&local->client_call_lock);
+
+ while ((call = list_first_entry_or_null(&new_client_calls,
+ struct rxrpc_call, wait_link))) {
struct rxrpc_bundle *bundle = call->bundle;
- spin_lock(&local->client_call_lock);
list_move_tail(&call->wait_link, &bundle->waiting_calls);
rxrpc_see_call(call, rxrpc_call_see_waiting_call);
- spin_unlock(&local->client_call_lock);
if (rxrpc_bundle_has_space(bundle))
rxrpc_activate_channels(bundle);
@@ -545,9 +547,9 @@ void rxrpc_expose_client_call(struct rxrpc_call *call)
set_bit(RXRPC_CONN_DONT_REUSE, &conn->flags);
trace_rxrpc_client(conn, channel, rxrpc_client_exposed);
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_add_head(&call->error_link, &call->peer->error_targets);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
}
@@ -588,9 +590,9 @@ void rxrpc_disconnect_client_call(struct rxrpc_bundle *bundle, struct rxrpc_call
ASSERTCMP(call->call_id, ==, 0);
ASSERT(!test_bit(RXRPC_CALL_EXPOSED, &call->flags));
/* May still be on ->new_client_calls. */
- spin_lock(&local->client_call_lock);
+ spin_lock_irq(&local->client_call_lock);
list_del_init(&call->wait_link);
- spin_unlock(&local->client_call_lock);
+ spin_unlock_irq(&local->client_call_lock);
return;
}
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index c4eb7986efdd..4d9c5e21ba78 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -26,7 +26,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff
bool aborted = false;
if (conn->state != RXRPC_CONN_ABORTED) {
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state != RXRPC_CONN_ABORTED) {
conn->abort_code = abort_code;
conn->error = err;
@@ -37,7 +37,7 @@ static bool rxrpc_set_conn_aborted(struct rxrpc_connection *conn, struct sk_buff
set_bit(RXRPC_CONN_EV_ABORT_CALLS, &conn->events);
aborted = true;
}
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
}
return aborted;
@@ -92,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
struct rxrpc_acktrailer trailer;
size_t len;
int ret, ioc;
- u32 serial, mtu, call_id, padding;
+ u32 serial, max_mtu, if_mtu, call_id, padding;
_enter("%d", conn->debug_id);
@@ -150,8 +150,13 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
break;
case RXRPC_PACKET_TYPE_ACK:
- mtu = conn->peer->if_mtu;
- mtu -= conn->peer->hdrsize;
+ if_mtu = conn->peer->if_mtu - conn->peer->hdrsize;
+ if (conn->peer->ackr_adv_pmtud) {
+ max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu);
+ } else {
+ if_mtu = umin(1444, if_mtu);
+ max_mtu = if_mtu;
+ }
pkt.ack.bufferSpace = 0;
pkt.ack.maxSkew = htons(skb ? skb->priority : 0);
pkt.ack.firstPacket = htonl(chan->last_seq + 1);
@@ -159,10 +164,10 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0);
pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
pkt.ack.nAcks = 0;
- trailer.maxMTU = htonl(rxrpc_rx_mtu);
- trailer.ifMTU = htonl(mtu);
+ trailer.maxMTU = htonl(max_mtu);
+ trailer.ifMTU = htonl(if_mtu);
trailer.rwind = htonl(rxrpc_rx_window_size);
- trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max);
+ trailer.jumbo_max = 0;
pkt.whdr.flags |= RXRPC_SLOW_START_OK;
padding = 0;
iov[0].iov_len += sizeof(pkt.ack);
@@ -172,7 +177,8 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
trace_rxrpc_tx_ack(chan->call_debug_id, serial,
ntohl(pkt.ack.firstPacket),
ntohl(pkt.ack.serial),
- pkt.ack.reason, 0, rxrpc_rx_window_size);
+ pkt.ack.reason, 0, rxrpc_rx_window_size,
+ rxrpc_propose_ack_retransmit);
break;
default:
@@ -254,10 +260,10 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
if (ret < 0)
return ret;
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE_CHALLENGING)
conn->state = RXRPC_CONN_SERVICE;
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_SERVICE) {
/* Offload call state flipping to the I/O thread. As
diff --git a/net/rxrpc/conn_object.c b/net/rxrpc/conn_object.c
index 88b4aab5a091..2f1fd1e2e7e4 100644
--- a/net/rxrpc/conn_object.c
+++ b/net/rxrpc/conn_object.c
@@ -31,13 +31,13 @@ void rxrpc_poke_conn(struct rxrpc_connection *conn, enum rxrpc_conn_trace why)
if (WARN_ON_ONCE(!local))
return;
- spin_lock_bh(&local->lock);
+ spin_lock_irq(&local->lock);
busy = !list_empty(&conn->attend_link);
if (!busy) {
rxrpc_get_connection(conn, why);
list_add_tail(&conn->attend_link, &local->conn_attend_q);
}
- spin_unlock_bh(&local->lock);
+ spin_unlock_irq(&local->lock);
rxrpc_wake_up_io_thread(local);
}
@@ -197,9 +197,9 @@ void rxrpc_disconnect_call(struct rxrpc_call *call)
call->peer->cong_ssthresh = call->cong_ssthresh;
if (!hlist_unhashed(&call->error_link)) {
- spin_lock(&call->peer->lock);
+ spin_lock_irq(&call->peer->lock);
hlist_del_init(&call->error_link);
- spin_unlock(&call->peer->lock);
+ spin_unlock_irq(&call->peer->lock);
}
if (rxrpc_is_client_call(call)) {
@@ -322,6 +322,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
list_del_init(&conn->proc_link);
write_unlock(&rxnet->conn_lock);
+ if (conn->pmtud_probe) {
+ trace_rxrpc_pmtud_lost(conn, 0);
+ conn->peer->pmtud_probing = false;
+ conn->peer->pmtud_pending = true;
+ }
+
rxrpc_purge_queue(&conn->rx_queue);
rxrpc_kill_client_conn(conn);
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 6a075a7c190d..24aceb183c2c 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -27,80 +27,68 @@ static void rxrpc_proto_abort(struct rxrpc_call *call, rxrpc_seq_t seq,
}
/*
- * Do TCP-style congestion management [RFC 5681].
+ * Do TCP-style congestion management [RFC5681].
*/
static void rxrpc_congestion_management(struct rxrpc_call *call,
- struct sk_buff *skb,
- struct rxrpc_ack_summary *summary,
- rxrpc_serial_t acked_serial)
+ struct rxrpc_ack_summary *summary)
{
- enum rxrpc_congest_change change = rxrpc_cong_no_change;
- unsigned int cumulative_acks = call->cong_cumul_acks;
- unsigned int cwnd = call->cong_cwnd;
- bool resend = false;
-
- summary->flight_size =
- (call->tx_top - call->acks_hard_ack) - summary->nr_acks;
+ summary->change = rxrpc_cong_no_change;
+ summary->in_flight = rxrpc_tx_in_flight(call);
if (test_and_clear_bit(RXRPC_CALL_RETRANS_TIMEOUT, &call->flags)) {
summary->retrans_timeo = true;
- call->cong_ssthresh = max_t(unsigned int,
- summary->flight_size / 2, 2);
- cwnd = 1;
- if (cwnd >= call->cong_ssthresh &&
- call->cong_mode == RXRPC_CALL_SLOW_START) {
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- call->cong_tstamp = skb->tstamp;
- cumulative_acks = 0;
+ call->cong_ssthresh = umax(summary->in_flight / 2, 2);
+ call->cong_cwnd = 1;
+ if (call->cong_cwnd >= call->cong_ssthresh &&
+ call->cong_ca_state == RXRPC_CA_SLOW_START) {
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
+ call->cong_tstamp = call->acks_latest_ts;
+ call->cong_cumul_acks = 0;
}
}
- cumulative_acks += summary->nr_new_acks;
- if (cumulative_acks > 255)
- cumulative_acks = 255;
-
- summary->cwnd = call->cong_cwnd;
- summary->ssthresh = call->cong_ssthresh;
- summary->cumulative_acks = cumulative_acks;
- summary->dup_acks = call->cong_dup_acks;
+ call->cong_cumul_acks += summary->nr_new_sacks;
+ call->cong_cumul_acks += summary->nr_new_hacks;
+ if (call->cong_cumul_acks > 255)
+ call->cong_cumul_acks = 255;
- switch (call->cong_mode) {
- case RXRPC_CALL_SLOW_START:
- if (summary->saw_nacks)
+ switch (call->cong_ca_state) {
+ case RXRPC_CA_SLOW_START:
+ if (call->acks_nr_snacks > 0)
goto packet_loss_detected;
- if (summary->cumulative_acks > 0)
- cwnd += 1;
- if (cwnd >= call->cong_ssthresh) {
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
- call->cong_tstamp = skb->tstamp;
+ if (call->cong_cumul_acks > 0)
+ call->cong_cwnd += 1;
+ if (call->cong_cwnd >= call->cong_ssthresh) {
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
+ call->cong_tstamp = call->acks_latest_ts;
}
goto out;
- case RXRPC_CALL_CONGEST_AVOIDANCE:
- if (summary->saw_nacks)
+ case RXRPC_CA_CONGEST_AVOIDANCE:
+ if (call->acks_nr_snacks > 0)
goto packet_loss_detected;
/* We analyse the number of packets that get ACK'd per RTT
* period and increase the window if we managed to fill it.
*/
- if (call->peer->rtt_count == 0)
+ if (call->rtt_count == 0)
goto out;
- if (ktime_before(skb->tstamp,
+ if (ktime_before(call->acks_latest_ts,
ktime_add_us(call->cong_tstamp,
- call->peer->srtt_us >> 3)))
+ call->srtt_us >> 3)))
goto out_no_clear_ca;
- change = rxrpc_cong_rtt_window_end;
- call->cong_tstamp = skb->tstamp;
- if (cumulative_acks >= cwnd)
- cwnd++;
+ summary->change = rxrpc_cong_rtt_window_end;
+ call->cong_tstamp = call->acks_latest_ts;
+ if (call->cong_cumul_acks >= call->cong_cwnd)
+ call->cong_cwnd++;
goto out;
- case RXRPC_CALL_PACKET_LOSS:
- if (!summary->saw_nacks)
+ case RXRPC_CA_PACKET_LOSS:
+ if (call->acks_nr_snacks == 0)
goto resume_normality;
- if (summary->new_low_nack) {
- change = rxrpc_cong_new_low_nack;
+ if (summary->new_low_snack) {
+ summary->change = rxrpc_cong_new_low_nack;
call->cong_dup_acks = 1;
if (call->cong_extra > 1)
call->cong_extra = 1;
@@ -111,31 +99,35 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
if (call->cong_dup_acks < 3)
goto send_extra_data;
- change = rxrpc_cong_begin_retransmission;
- call->cong_mode = RXRPC_CALL_FAST_RETRANSMIT;
- call->cong_ssthresh = max_t(unsigned int,
- summary->flight_size / 2, 2);
- cwnd = call->cong_ssthresh + 3;
+ summary->change = rxrpc_cong_begin_retransmission;
+ call->cong_ca_state = RXRPC_CA_FAST_RETRANSMIT;
+ call->cong_ssthresh = umax(summary->in_flight / 2, 2);
+ call->cong_cwnd = call->cong_ssthresh + 3;
call->cong_extra = 0;
call->cong_dup_acks = 0;
- resend = true;
+ summary->need_retransmit = true;
+ summary->in_fast_or_rto_recovery = true;
goto out;
- case RXRPC_CALL_FAST_RETRANSMIT:
- if (!summary->new_low_nack) {
- if (summary->nr_new_acks == 0)
- cwnd += 1;
+ case RXRPC_CA_FAST_RETRANSMIT:
+ rxrpc_tlp_init(call);
+ summary->in_fast_or_rto_recovery = true;
+ if (!summary->new_low_snack) {
+ if (summary->nr_new_sacks == 0)
+ call->cong_cwnd += 1;
call->cong_dup_acks++;
if (call->cong_dup_acks == 2) {
- change = rxrpc_cong_retransmit_again;
+ summary->change = rxrpc_cong_retransmit_again;
call->cong_dup_acks = 0;
- resend = true;
+ summary->need_retransmit = true;
}
} else {
- change = rxrpc_cong_progress;
- cwnd = call->cong_ssthresh;
- if (!summary->saw_nacks)
+ summary->change = rxrpc_cong_progress;
+ call->cong_cwnd = call->cong_ssthresh;
+ if (call->acks_nr_snacks == 0) {
+ summary->exiting_fast_or_rto_recovery = true;
goto resume_normality;
+ }
}
goto out;
@@ -145,30 +137,25 @@ static void rxrpc_congestion_management(struct rxrpc_call *call,
}
resume_normality:
- change = rxrpc_cong_cleared_nacks;
+ summary->change = rxrpc_cong_cleared_nacks;
call->cong_dup_acks = 0;
call->cong_extra = 0;
- call->cong_tstamp = skb->tstamp;
- if (cwnd < call->cong_ssthresh)
- call->cong_mode = RXRPC_CALL_SLOW_START;
+ call->cong_tstamp = call->acks_latest_ts;
+ if (call->cong_cwnd < call->cong_ssthresh)
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
else
- call->cong_mode = RXRPC_CALL_CONGEST_AVOIDANCE;
+ call->cong_ca_state = RXRPC_CA_CONGEST_AVOIDANCE;
out:
- cumulative_acks = 0;
+ call->cong_cumul_acks = 0;
out_no_clear_ca:
- if (cwnd >= RXRPC_TX_MAX_WINDOW)
- cwnd = RXRPC_TX_MAX_WINDOW;
- call->cong_cwnd = cwnd;
- call->cong_cumul_acks = cumulative_acks;
- summary->mode = call->cong_mode;
- trace_rxrpc_congest(call, summary, acked_serial, change);
- if (resend)
- rxrpc_resend(call, skb);
+ if (call->cong_cwnd >= RXRPC_TX_MAX_WINDOW)
+ call->cong_cwnd = RXRPC_TX_MAX_WINDOW;
+ trace_rxrpc_congest(call, summary);
return;
packet_loss_detected:
- change = rxrpc_cong_saw_nack;
- call->cong_mode = RXRPC_CALL_PACKET_LOSS;
+ summary->change = rxrpc_cong_saw_nack;
+ call->cong_ca_state = RXRPC_CA_PACKET_LOSS;
call->cong_dup_acks = 0;
goto send_extra_data;
@@ -177,7 +164,7 @@ send_extra_data:
* state.
*/
if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) ||
- summary->nr_acks != call->tx_top - call->acks_hard_ack) {
+ call->acks_nr_sacks != call->tx_top - call->tx_bottom) {
call->cong_extra++;
wake_up(&call->waitq);
}
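Illustrative sketch (not part of the patch): the window handling above keeps the usual RFC 5681 shape — the window grows on every ACK batch while below ssthresh, and by at most one per RTT once it has been filled in congestion avoidance. The userspace caricature below shows only that growth rule; the state names and fields are local stand-ins, not the rxrpc structures.

/* Standalone sketch, not part of the patch: the cwnd growth rule used by
 * rxrpc_congestion_management() above, reduced to slow start vs congestion
 * avoidance.  Names are local stand-ins, not the rxrpc enums.
 */
#include <stdbool.h>
#include <stdio.h>

enum ca_state { SLOW_START, CONGEST_AVOIDANCE };

struct cc {
	enum ca_state state;
	unsigned int cwnd;
	unsigned int ssthresh;
	unsigned int cumul_acks;
};

/* Apply one ACK event carrying nr_new_acks newly acknowledged segments. */
static void on_ack(struct cc *cc, unsigned int nr_new_acks, bool rtt_window_ended)
{
	cc->cumul_acks += nr_new_acks;

	switch (cc->state) {
	case SLOW_START:
		if (cc->cumul_acks > 0)
			cc->cwnd++;
		if (cc->cwnd >= cc->ssthresh)
			cc->state = CONGEST_AVOIDANCE;
		cc->cumul_acks = 0;
		break;
	case CONGEST_AVOIDANCE:
		if (!rtt_window_ended)
			return;		/* grow at most once per RTT */
		if (cc->cumul_acks >= cc->cwnd)
			cc->cwnd++;	/* only if the window was filled */
		cc->cumul_acks = 0;
		break;
	}
}

int main(void)
{
	struct cc cc = { .state = SLOW_START, .cwnd = 4, .ssthresh = 8 };

	for (int i = 0; i < 6; i++) {
		on_ack(&cc, 2, i & 1);
		printf("state=%d cwnd=%u\n", cc.state, cc.cwnd);
	}
	return 0;
}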
@@ -189,26 +176,42 @@ send_extra_data:
*/
void rxrpc_congestion_degrade(struct rxrpc_call *call)
{
- ktime_t rtt, now;
+ ktime_t rtt, now, time_since;
- if (call->cong_mode != RXRPC_CALL_SLOW_START &&
- call->cong_mode != RXRPC_CALL_CONGEST_AVOIDANCE)
+ if (call->cong_ca_state != RXRPC_CA_SLOW_START &&
+ call->cong_ca_state != RXRPC_CA_CONGEST_AVOIDANCE)
return;
if (__rxrpc_call_state(call) == RXRPC_CALL_CLIENT_AWAIT_REPLY)
return;
- rtt = ns_to_ktime(call->peer->srtt_us * (1000 / 8));
+ rtt = ns_to_ktime(call->srtt_us * (NSEC_PER_USEC / 8));
now = ktime_get_real();
- if (!ktime_before(ktime_add(call->tx_last_sent, rtt), now))
+ time_since = ktime_sub(now, call->tx_last_sent);
+ if (ktime_before(time_since, rtt))
return;
- trace_rxrpc_reset_cwnd(call, now);
+ trace_rxrpc_reset_cwnd(call, time_since, rtt);
rxrpc_inc_stat(call->rxnet, stat_tx_data_cwnd_reset);
call->tx_last_sent = now;
- call->cong_mode = RXRPC_CALL_SLOW_START;
- call->cong_ssthresh = max_t(unsigned int, call->cong_ssthresh,
- call->cong_cwnd * 3 / 4);
- call->cong_cwnd = max_t(unsigned int, call->cong_cwnd / 2, RXRPC_MIN_CWND);
+ call->cong_ca_state = RXRPC_CA_SLOW_START;
+ call->cong_ssthresh = umax(call->cong_ssthresh, call->cong_cwnd * 3 / 4);
+ call->cong_cwnd = umax(call->cong_cwnd / 2, RXRPC_MIN_CWND);
+}
+
+/*
+ * Add an RTT sample derived from an ACK'd DATA packet.
+ */
+static void rxrpc_add_data_rtt_sample(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ int ix)
+{
+ ktime_t xmit_ts = ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+
+ rxrpc_call_add_rtt(call, rxrpc_rtt_rx_data_ack, -1,
+ summary->acked_serial, summary->ack_serial,
+ xmit_ts, call->acks_latest_ts);
+ __clear_bit(ix, &tq->rtt_samples); /* Prevent repeat RTT sample */
}
/*
@@ -217,37 +220,120 @@ void rxrpc_congestion_degrade(struct rxrpc_call *call)
static bool rxrpc_rotate_tx_window(struct rxrpc_call *call, rxrpc_seq_t to,
struct rxrpc_ack_summary *summary)
{
- struct rxrpc_txbuf *txb;
- bool rot_last = false;
+ struct rxrpc_txqueue *tq = call->tx_queue;
+ rxrpc_seq_t seq = call->tx_bottom + 1;
+ bool rot_last = false, trace = false;
- list_for_each_entry_rcu(txb, &call->tx_buffer, call_link, false) {
- if (before_eq(txb->seq, call->acks_hard_ack))
- continue;
- if (txb->flags & RXRPC_LAST_PACKET) {
+ _enter("%x,%x", call->tx_bottom, to);
+
+ trace_rxrpc_tx_rotate(call, seq, to);
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate);
+
+ if (call->acks_lowest_nak == call->tx_bottom) {
+ call->acks_lowest_nak = to;
+ } else if (after(to, call->acks_lowest_nak)) {
+ summary->new_low_snack = true;
+ call->acks_lowest_nak = to;
+ }
+
+ /* We may have a left over fully-consumed buffer at the front that we
+ * couldn't drop before (rotate_and_keep below).
+ */
+ if (seq == call->tx_qbase + RXRPC_NR_TXQUEUE) {
+ call->tx_qbase += RXRPC_NR_TXQUEUE;
+ call->tx_queue = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ tq = call->tx_queue;
+ }
+
+ do {
+ unsigned int ix = seq - call->tx_qbase;
+
+ _debug("tq=%x seq=%x i=%d f=%x", tq->qbase, seq, ix, tq->bufs[ix]->flags);
+ if (tq->bufs[ix]->flags & RXRPC_LAST_PACKET) {
set_bit(RXRPC_CALL_TX_LAST, &call->flags);
rot_last = true;
}
- if (txb->seq == to)
- break;
- }
- if (rot_last)
- set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
+ if (summary->acked_serial == tq->segment_serial[ix] &&
+ test_bit(ix, &tq->rtt_samples))
+ rxrpc_add_data_rtt_sample(call, summary, tq, ix);
+
+ if (ix == tq->nr_reported_acks) {
+ /* Packet directly hard ACK'd. */
+ tq->nr_reported_acks++;
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ if (seq == call->tlp_seq)
+ summary->tlp_probe_acked = true;
+ summary->nr_new_hacks++;
+ __set_bit(ix, &tq->segment_acked);
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_hack);
+ } else if (test_bit(ix, &tq->segment_acked)) {
+ /* Soft ACK -> hard ACK. */
+ call->acks_nr_sacks--;
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_sack);
+ } else {
+ /* Soft NAK -> hard ACK. */
+ call->acks_nr_snacks--;
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ if (seq == call->tlp_seq)
+ summary->tlp_probe_acked = true;
+ summary->nr_new_hacks++;
+ __set_bit(ix, &tq->segment_acked);
+ trace_rxrpc_rotate(call, tq, summary, seq, rxrpc_rotate_trace_snak);
+ }
- _enter("%x,%x,%x,%d", to, call->acks_hard_ack, call->tx_top, rot_last);
+ call->tx_nr_sent--;
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+ if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+ call->tx_nr_resent--;
+ __clear_bit(ix, &tq->ever_retransmitted);
- if (call->acks_lowest_nak == call->acks_hard_ack) {
- call->acks_lowest_nak = to;
- } else if (after(to, call->acks_lowest_nak)) {
- summary->new_low_nack = true;
- call->acks_lowest_nak = to;
+ rxrpc_put_txbuf(tq->bufs[ix], rxrpc_txbuf_put_rotated);
+ tq->bufs[ix] = NULL;
+
+ WRITE_ONCE(call->tx_bottom, seq);
+ trace_rxrpc_txqueue(call, (rot_last ?
+ rxrpc_txqueue_rotate_last :
+ rxrpc_txqueue_rotate));
+
+ seq++;
+ trace = true;
+ if (!(seq & RXRPC_TXQ_MASK)) {
+ trace_rxrpc_rack_update(call, summary);
+ trace = false;
+ prefetch(tq->next);
+ if (tq != call->tx_qtail) {
+ call->tx_qbase += RXRPC_NR_TXQUEUE;
+ call->tx_queue = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ tq = call->tx_queue;
+ } else {
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_keep);
+ tq = NULL;
+ break;
+ }
+ }
+
+ } while (before_eq(seq, to));
+
+ if (trace)
+ trace_rxrpc_rack_update(call, summary);
+
+ if (rot_last) {
+ set_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags);
+ if (tq) {
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_rotate_and_free);
+ kfree(tq);
+ call->tx_queue = NULL;
+ }
}
- smp_store_release(&call->acks_hard_ack, to);
+ _debug("%x,%x,%x,%d", to, call->tx_bottom, call->tx_top, rot_last);
- trace_rxrpc_txqueue(call, (rot_last ?
- rxrpc_txqueue_rotate_last :
- rxrpc_txqueue_rotate));
wake_up(&call->waitq);
return rot_last;
}
@@ -263,13 +349,10 @@ static void rxrpc_end_tx_phase(struct rxrpc_call *call, bool reply_begun,
{
ASSERT(test_bit(RXRPC_CALL_TX_LAST, &call->flags));
- call->resend_at = KTIME_MAX;
- trace_rxrpc_timer_can(call, rxrpc_timer_trace_resend);
-
- if (unlikely(call->cong_last_nack)) {
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- call->cong_last_nack = NULL;
- }
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+ call->rack_timo_at = KTIME_MAX;
+ trace_rxrpc_rack_timer(call, 0, false);
+ trace_rxrpc_timer_can(call, rxrpc_timer_trace_rack_off + call->rack_timer_mode);
switch (__rxrpc_call_state(call)) {
case RXRPC_CALL_CLIENT_SEND_REQUEST:
@@ -365,11 +448,19 @@ static void rxrpc_input_queue_data(struct rxrpc_call *call, struct sk_buff *skb,
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
bool last = sp->hdr.flags & RXRPC_LAST_PACKET;
+ spin_lock_irq(&call->recvmsg_queue.lock);
+
__skb_queue_tail(&call->recvmsg_queue, skb);
rxrpc_input_update_ack_window(call, window, wtop);
trace_rxrpc_receive(call, last ? why + 1 : why, sp->hdr.serial, sp->hdr.seq);
if (last)
+ /* Change the state inside the lock so that recvmsg syncs
+ * correctly with it and using sendmsg() to send a reply
+ * doesn't race.
+ */
rxrpc_end_rx_phase(call, sp->hdr.serial);
+
+ spin_unlock_irq(&call->recvmsg_queue.lock);
}
/*
@@ -442,7 +533,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_get_skb(skb, rxrpc_skb_get_to_recvmsg);
- spin_lock(&call->recvmsg_queue.lock);
rxrpc_input_queue_data(call, skb, window, wtop, rxrpc_receive_queue);
*_notify = true;
@@ -464,8 +554,6 @@ static void rxrpc_input_data_one(struct rxrpc_call *call, struct sk_buff *skb,
rxrpc_receive_queue_oos);
}
- spin_unlock(&call->recvmsg_queue.lock);
-
call->ackr_sack_base = sack;
} else {
unsigned int slot;
@@ -530,7 +618,7 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
unsigned int offset = sizeof(struct rxrpc_wire_header);
unsigned int len = skb->len - offset;
bool notify = false;
- int ack_reason = 0;
+ int ack_reason = 0, count = 1, stat_ix;
while (sp->hdr.flags & RXRPC_JUMBO_PACKET) {
if (len < RXRPC_JUMBO_SUBPKTLEN)
@@ -559,12 +647,16 @@ static bool rxrpc_input_split_jumbo(struct rxrpc_call *call, struct sk_buff *skb
sp->hdr.serial++;
offset += RXRPC_JUMBO_SUBPKTLEN;
len -= RXRPC_JUMBO_SUBPKTLEN;
+ count++;
}
sp->offset = offset;
sp->len = len;
rxrpc_input_data_one(call, skb, &notify, &ack_serial, &ack_reason);
+ stat_ix = umin(count, ARRAY_SIZE(call->rxnet->stat_rx_jumbo)) - 1;
+ atomic_inc(&call->rxnet->stat_rx_jumbo[stat_ix]);
+
if (ack_reason > 0) {
rxrpc_send_ACK(call, ack_reason, ack_serial,
rxrpc_propose_ack_input_data);
@@ -667,7 +759,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
smp_mb(); /* Read data before setting avail bit */
set_bit(i, &call->rtt_avail);
- rxrpc_peer_add_rtt(call, type, i, acked_serial, ack_serial,
+ rxrpc_call_add_rtt(call, type, i, acked_serial, ack_serial,
sent_at, resp_time);
matched = true;
}
@@ -677,7 +769,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
*/
if (after(acked_serial, orig_serial)) {
trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_obsolete, i,
- orig_serial, acked_serial, 0, 0);
+ orig_serial, acked_serial, 0, 0, 0);
clear_bit(i + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
smp_wmb();
set_bit(i, &call->rtt_avail);
@@ -685,7 +777,7 @@ static void rxrpc_complete_rtt_probe(struct rxrpc_call *call,
}
if (!matched)
- trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0);
+ trace_rxrpc_rtt_rx(call, rxrpc_rtt_rx_lost, 9, 0, acked_serial, 0, 0, 0);
}
/*
@@ -695,10 +787,13 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
struct rxrpc_acktrailer *trailer)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- struct rxrpc_peer *peer;
- unsigned int mtu;
+ struct rxrpc_peer *peer = call->peer;
+ unsigned int max_data, capacity;
bool wake = false;
- u32 rwind = ntohl(trailer->rwind);
+ u32 max_mtu = ntohl(trailer->maxMTU);
+ //u32 if_mtu = ntohl(trailer->ifMTU);
+ u32 rwind = ntohl(trailer->rwind);
+ u32 jumbo_max = ntohl(trailer->jumbo_max);
if (rwind > RXRPC_TX_MAX_WINDOW)
rwind = RXRPC_TX_MAX_WINDOW;
@@ -709,54 +804,147 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
call->tx_winsize = rwind;
}
- mtu = min(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU));
+ max_mtu = clamp(max_mtu, 500, 65535);
+ peer->ackr_max_data = max_mtu;
+
+ if (max_mtu < peer->max_data) {
+ trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_mtu,
+ rxrpc_pmtud_reduce_ack);
+ peer->max_data = max_mtu;
+ }
+
+ max_data = umin(max_mtu, peer->max_data);
+ capacity = max_data;
+ capacity += sizeof(struct rxrpc_jumbo_header); /* First subpacket has main hdr, not jumbo */
+ capacity /= sizeof(struct rxrpc_jumbo_header) + RXRPC_JUMBO_DATALEN;
- peer = call->peer;
- if (mtu < peer->maxdata) {
- spin_lock(&peer->lock);
- peer->maxdata = mtu;
- peer->mtu = mtu + peer->hdrsize;
- spin_unlock(&peer->lock);
+ if (jumbo_max == 0) {
+ /* The peer says it supports pmtu discovery */
+ peer->ackr_adv_pmtud = true;
+ } else {
+ peer->ackr_adv_pmtud = false;
+ capacity = clamp(capacity, 1, jumbo_max);
}
+ call->tx_jumbo_max = capacity;
+
if (wake)
wake_up(&call->waitq);
}
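Illustrative sketch (not part of the patch): the capacity computation above works out how many jumbo segments fit into the advertised MTU, counting one jumbo header per segment except the first (which reuses the wire header). The userspace check below assumes 1412-byte jumbo segments and a 4-byte jumbo header as stand-ins for RXRPC_JUMBO_DATALEN and struct rxrpc_jumbo_header.

/* Standalone sketch, not part of the patch: mirrors the capacity
 * computation in rxrpc_input_ack_trailer().  JUMBO_HDR and JUMBO_DATALEN
 * are assumed stand-ins for the 4-byte struct rxrpc_jumbo_header and
 * RXRPC_JUMBO_DATALEN (1412).
 */
#include <stdio.h>

#define JUMBO_HDR	4
#define JUMBO_DATALEN	1412

static unsigned int jumbo_capacity(unsigned int max_data)
{
	/* The first subpacket carries the wire header rather than a jumbo
	 * header, so one header's worth of space is credited back before
	 * dividing by the per-segment cost.
	 */
	return (max_data + JUMBO_HDR) / (JUMBO_HDR + JUMBO_DATALEN);
}

int main(void)
{
	printf("%u\n", jumbo_capacity(1444));	/* -> 1 segment */
	printf("%u\n", jumbo_capacity(65535));	/* -> 46 segments */
	return 0;
}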
+#if defined(CONFIG_X86) && __GNUC__ && !defined(__clang__)
+/* Clang doesn't support the %z constraint modifier */
+#define shiftr_adv_rotr(shift_from, rotate_into) ({ \
+ asm(" shr%z1 %1\n" \
+ " inc %0\n" \
+ " rcr%z2 %2\n" \
+ : "+d"(shift_from), "+m"(*(shift_from)), "+rm"(rotate_into) \
+ ); \
+ })
+#else
+#define shiftr_adv_rotr(shift_from, rotate_into) ({ \
+ typeof(rotate_into) __bit0 = *(shift_from) & 1; \
+ *(shift_from) >>= 1; \
+ shift_from++; \
+ rotate_into >>= 1; \
+ rotate_into |= __bit0 << (sizeof(rotate_into) * 8 - 1); \
+ })
+#endif
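Illustrative sketch (not part of the patch): the generic branch of shiftr_adv_rotr() pulls the low bit of each SACK byte into the top of a 64-bit accumulator, so that once the accumulator is shifted back down, bit i holds the ACK/NACK flag of byte i. The standalone version below only reads the low bit rather than shifting the source byte in place, which is enough to show the result.

/* Standalone sketch, not part of the patch: the portable branch of the
 * decant macro above, reading (rather than shifting) the low bit of each
 * SACK byte and rotating it into the top of the accumulator.
 */
#include <stdint.h>
#include <stdio.h>

#define NBITS (sizeof(uint64_t) * 8)

static void shiftr_adv_rotr(const uint8_t **acks, uint64_t *extracted)
{
	uint64_t bit0 = **acks & 1;

	(*acks)++;
	*extracted >>= 1;
	*extracted |= bit0 << (NBITS - 1);
}

int main(void)
{
	const uint8_t sacks[] = { 1, 0, 1, 1 };	/* hypothetical SACK bytes */
	const uint8_t *p = sacks;
	uint64_t extracted = ~0ULL;

	for (size_t i = 0; i < sizeof(sacks); i++)
		shiftr_adv_rotr(&p, &extracted);

	extracted >>= NBITS - sizeof(sacks);
	printf("%#llx\n", (unsigned long long)extracted);	/* prints 0xd */
	return 0;
}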
+
/*
- * Determine how many nacks from the previous ACK have now been satisfied.
+ * Deal with RTT samples from soft ACKs.
*/
-static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call,
- struct rxrpc_ack_summary *summary,
- rxrpc_seq_t seq)
+static void rxrpc_input_soft_rtt(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq)
{
- struct sk_buff *skb = call->cong_last_nack;
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int i, new_acks = 0, retained_nacks = 0;
- rxrpc_seq_t old_seq = sp->ack.first_ack;
- u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
+ for (int ix = 0; ix < RXRPC_NR_TXQUEUE; ix++)
+ if (summary->acked_serial == tq->segment_serial[ix])
+ return rxrpc_add_data_rtt_sample(call, summary, tq, ix);
+}
- if (after_eq(seq, old_seq + sp->ack.nr_acks)) {
- summary->nr_new_acks += sp->ack.nr_nacks;
- summary->nr_new_acks += seq - (old_seq + sp->ack.nr_acks);
- summary->nr_retained_nacks = 0;
- } else if (seq == old_seq) {
- summary->nr_retained_nacks = sp->ack.nr_nacks;
- } else {
- for (i = 0; i < sp->ack.nr_acks; i++) {
- if (acks[i] == RXRPC_ACK_TYPE_NACK) {
- if (before(old_seq + i, seq))
- new_acks++;
- else
- retained_nacks++;
- }
+/*
+ * Process a batch of soft ACKs specific to a transmission queue segment.
+ */
+static void rxrpc_input_soft_ack_tq(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long extracted_acks,
+ int nr_reported,
+ rxrpc_seq_t seq,
+ rxrpc_seq_t *lowest_nak)
+{
+ unsigned long old_reported = 0, flipped, new_acks = 0;
+ unsigned long a_to_n, n_to_a = 0;
+ int new, a, n;
+
+ if (tq->nr_reported_acks > 0)
+ old_reported = ~0UL >> (RXRPC_NR_TXQUEUE - tq->nr_reported_acks);
+
+ _enter("{%x,%lx,%d},%lx,%d,%x",
+ tq->qbase, tq->segment_acked, tq->nr_reported_acks,
+ extracted_acks, nr_reported, seq);
+
+ _debug("[%x]", tq->qbase);
+ _debug("tq %16lx %u", tq->segment_acked, tq->nr_reported_acks);
+ _debug("sack %16lx %u", extracted_acks, nr_reported);
+
+ /* See how many previously logged ACKs/NAKs have flipped. */
+ flipped = (tq->segment_acked ^ extracted_acks) & old_reported;
+ if (flipped) {
+ n_to_a = ~tq->segment_acked & flipped; /* Old NAK -> ACK */
+ a_to_n = tq->segment_acked & flipped; /* Old ACK -> NAK */
+ a = hweight_long(n_to_a);
+ n = hweight_long(a_to_n);
+ _debug("flip %16lx", flipped);
+ _debug("ntoa %16lx %d", n_to_a, a);
+ _debug("aton %16lx %d", a_to_n, n);
+ call->acks_nr_sacks += a - n;
+ call->acks_nr_snacks += n - a;
+ summary->nr_new_sacks += a;
+ summary->nr_new_snacks += n;
+ }
+
+ /* See how many new ACKs/NAKs have been acquired. */
+ new = nr_reported - tq->nr_reported_acks;
+ if (new > 0) {
+ new_acks = extracted_acks & ~old_reported;
+ if (new_acks) {
+ a = hweight_long(new_acks);
+ n = new - a;
+ _debug("new_a %16lx new=%d a=%d n=%d", new_acks, new, a, n);
+ call->acks_nr_sacks += a;
+ call->acks_nr_snacks += n;
+ summary->nr_new_sacks += a;
+ summary->nr_new_snacks += n;
+ } else {
+ call->acks_nr_snacks += new;
+ summary->nr_new_snacks += new;
}
+ }
+
+ tq->nr_reported_acks = nr_reported;
+ tq->segment_acked = extracted_acks;
+ trace_rxrpc_apply_acks(call, tq);
- summary->nr_new_acks += new_acks;
- summary->nr_retained_nacks = retained_nacks;
+ if (extracted_acks != ~0UL) {
+ rxrpc_seq_t lowest = seq + ffz(extracted_acks);
+
+ if (before(lowest, *lowest_nak))
+ *lowest_nak = lowest;
}
- return old_seq + sp->ack.nr_acks;
+ if (summary->acked_serial)
+ rxrpc_input_soft_rtt(call, summary, tq);
+
+ new_acks |= n_to_a;
+ if (new_acks)
+ rxrpc_input_rack(call, summary, tq, new_acks);
+
+ if (call->tlp_serial &&
+ rxrpc_seq_in_txq(tq, call->tlp_seq) &&
+ test_bit(call->tlp_seq - tq->qbase, &new_acks))
+ summary->tlp_probe_acked = true;
}
/*
@@ -770,39 +958,50 @@ static rxrpc_seq_t rxrpc_input_check_prev_ack(struct rxrpc_call *call,
*/
static void rxrpc_input_soft_acks(struct rxrpc_call *call,
struct rxrpc_ack_summary *summary,
- struct sk_buff *skb,
- rxrpc_seq_t seq,
- rxrpc_seq_t since)
+ struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- unsigned int i, old_nacks = 0;
+ struct rxrpc_txqueue *tq = call->tx_queue;
+ unsigned long extracted = ~0UL;
+ unsigned int nr = 0;
+ rxrpc_seq_t seq = call->acks_hard_ack + 1;
rxrpc_seq_t lowest_nak = seq + sp->ack.nr_acks;
u8 *acks = skb->data + sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
- for (i = 0; i < sp->ack.nr_acks; i++) {
- if (acks[i] == RXRPC_ACK_TYPE_ACK) {
- summary->nr_acks++;
- if (after_eq(seq, since))
- summary->nr_new_acks++;
- } else {
- summary->saw_nacks = true;
- if (before(seq, since)) {
- /* Overlap with previous ACK */
- old_nacks++;
- } else {
- summary->nr_new_nacks++;
- sp->ack.nr_nacks++;
- }
+ _enter("%x,%x,%u", tq->qbase, seq, sp->ack.nr_acks);
+
+ while (after(seq, tq->qbase + RXRPC_NR_TXQUEUE - 1))
+ tq = tq->next;
- if (before(seq, lowest_nak))
- lowest_nak = seq;
+ for (unsigned int i = 0; i < sp->ack.nr_acks; i++) {
+ /* Decant ACKs until we hit a txqueue boundary. */
+ shiftr_adv_rotr(acks, extracted);
+ if (i == 256) {
+ acks -= i;
+ i = 0;
}
seq++;
+ nr++;
+ if ((seq & RXRPC_TXQ_MASK) != 0)
+ continue;
+
+ _debug("bound %16lx %u", extracted, nr);
+
+ rxrpc_input_soft_ack_tq(call, summary, tq, extracted, RXRPC_NR_TXQUEUE,
+ seq - RXRPC_NR_TXQUEUE, &lowest_nak);
+ extracted = ~0UL;
+ nr = 0;
+ tq = tq->next;
+ prefetch(tq);
}
- if (lowest_nak != call->acks_lowest_nak) {
- call->acks_lowest_nak = lowest_nak;
- summary->new_low_nack = true;
+ if (nr) {
+ unsigned int nr_reported = seq & RXRPC_TXQ_MASK;
+
+ extracted >>= RXRPC_NR_TXQUEUE - nr_reported;
+ _debug("tail %16lx %u", extracted, nr_reported);
+ rxrpc_input_soft_ack_tq(call, summary, tq, extracted, nr_reported,
+ seq & ~RXRPC_TXQ_MASK, &lowest_nak);
}
/* We *can* have more nacks than we did - the peer is permitted to drop
@@ -810,9 +1009,14 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call,
* possible for the nack distribution to change whilst the number of
* nacks stays the same or goes down.
*/
- if (old_nacks < summary->nr_retained_nacks)
- summary->nr_new_acks += summary->nr_retained_nacks - old_nacks;
- summary->nr_retained_nacks = old_nacks;
+ if (lowest_nak != call->acks_lowest_nak) {
+ call->acks_lowest_nak = lowest_nak;
+ summary->new_low_snack = true;
+ }
+
+ _debug("summary A=%d+%d N=%d+%d",
+ call->acks_nr_sacks, summary->nr_new_sacks,
+ call->acks_nr_snacks, summary->nr_new_snacks);
}
/*
@@ -820,21 +1024,21 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call,
* with respect to the ack state conveyed by preceding ACKs.
*/
static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
- rxrpc_seq_t first_pkt, rxrpc_seq_t prev_pkt)
+ rxrpc_seq_t hard_ack, rxrpc_seq_t prev_pkt)
{
- rxrpc_seq_t base = READ_ONCE(call->acks_first_seq);
+ rxrpc_seq_t base = READ_ONCE(call->acks_hard_ack);
- if (after(first_pkt, base))
+ if (after(hard_ack, base))
return true; /* The window advanced */
- if (before(first_pkt, base))
+ if (before(hard_ack, base))
return false; /* firstPacket regressed */
if (after_eq(prev_pkt, call->acks_prev_seq))
return true; /* previousPacket hasn't regressed. */
/* Some rx implementations put a serial number in previousPacket. */
- if (after_eq(prev_pkt, base + call->tx_winsize))
+ if (after(prev_pkt, base + call->tx_winsize))
return false;
return true;
}
@@ -852,53 +1056,34 @@ static bool rxrpc_is_ack_valid(struct rxrpc_call *call,
static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_ack_summary summary = { 0 };
- struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_acktrailer trailer;
- rxrpc_serial_t ack_serial, acked_serial;
- rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt, since;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+ rxrpc_seq_t first_soft_ack, hard_ack, prev_pkt;
int nr_acks, offset, ioffset;
_enter("");
offset = sizeof(struct rxrpc_wire_header) + sizeof(struct rxrpc_ackpacket);
- ack_serial = sp->hdr.serial;
- acked_serial = sp->ack.acked_serial;
- first_soft_ack = sp->ack.first_ack;
- prev_pkt = sp->ack.prev_ack;
- nr_acks = sp->ack.nr_acks;
- hard_ack = first_soft_ack - 1;
- summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ?
- sp->ack.reason : RXRPC_ACK__INVALID);
-
- trace_rxrpc_rx_ack(call, ack_serial, acked_serial,
- first_soft_ack, prev_pkt,
- summary.ack_reason, nr_acks);
- rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]);
+ summary.ack_serial = sp->hdr.serial;
+ first_soft_ack = sp->ack.first_ack;
+ prev_pkt = sp->ack.prev_ack;
+ nr_acks = sp->ack.nr_acks;
+ hard_ack = first_soft_ack - 1;
+ summary.acked_serial = sp->ack.acked_serial;
+ summary.ack_reason = (sp->ack.reason < RXRPC_ACK__INVALID ?
+ sp->ack.reason : RXRPC_ACK__INVALID);
- if (acked_serial != 0) {
- switch (summary.ack_reason) {
- case RXRPC_ACK_PING_RESPONSE:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_ping_response);
- break;
- case RXRPC_ACK_REQUESTED:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_requested_ack);
- break;
- default:
- rxrpc_complete_rtt_probe(call, skb->tstamp, acked_serial, ack_serial,
- rxrpc_rtt_rx_other_ack);
- break;
- }
- }
+ trace_rxrpc_rx_ack(call, sp);
+ rxrpc_inc_stat(call->rxnet, stat_rx_acks[summary.ack_reason]);
+ prefetch(call->tx_queue);
/* If we get an EXCEEDS_WINDOW ACK from the server, it probably
* indicates that the client address changed due to NAT. The server
* lost the call because it switched to a different peer.
*/
if (unlikely(summary.ack_reason == RXRPC_ACK_EXCEEDS_WINDOW) &&
- first_soft_ack == 1 &&
+ hard_ack == 0 &&
prev_pkt == 0 &&
rxrpc_is_client_call(call)) {
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
@@ -911,9 +1096,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
* if we still have it buffered to the beginning.
*/
if (unlikely(summary.ack_reason == RXRPC_ACK_OUT_OF_SEQUENCE) &&
- first_soft_ack == 1 &&
+ hard_ack == 0 &&
prev_pkt == 0 &&
- call->acks_hard_ack == 0 &&
+ call->tx_bottom == 0 &&
rxrpc_is_client_call(call)) {
rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED,
0, -ENETRESET);
@@ -921,11 +1106,9 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
}
/* Discard any out-of-order or duplicate ACKs (outside lock). */
- if (!rxrpc_is_ack_valid(call, first_soft_ack, prev_pkt)) {
- trace_rxrpc_rx_discard_ack(call->debug_id, ack_serial,
- first_soft_ack, call->acks_first_seq,
- prev_pkt, call->acks_prev_seq);
- goto send_response;
+ if (!rxrpc_is_ack_valid(call, hard_ack, prev_pkt)) {
+ trace_rxrpc_rx_discard_ack(call, summary.ack_serial, hard_ack, prev_pkt);
+ goto send_response; /* Still respond if requested. */
}
trailer.maxMTU = 0;
@@ -937,34 +1120,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
if (nr_acks > 0)
skb_condense(skb);
- if (call->cong_last_nack) {
- since = rxrpc_input_check_prev_ack(call, &summary, first_soft_ack);
- rxrpc_free_skb(call->cong_last_nack, rxrpc_skb_put_last_nack);
- call->cong_last_nack = NULL;
- } else {
- summary.nr_new_acks = first_soft_ack - call->acks_first_seq;
- call->acks_lowest_nak = first_soft_ack + nr_acks;
- since = first_soft_ack;
- }
-
- call->acks_latest_ts = skb->tstamp;
- call->acks_first_seq = first_soft_ack;
+ call->acks_latest_ts = ktime_get_real();
+ call->acks_hard_ack = hard_ack;
call->acks_prev_seq = prev_pkt;
- switch (summary.ack_reason) {
- case RXRPC_ACK_PING:
- break;
- default:
- if (acked_serial && after(acked_serial, call->acks_highest_serial))
- call->acks_highest_serial = acked_serial;
- break;
+ if (summary.acked_serial) {
+ switch (summary.ack_reason) {
+ case RXRPC_ACK_PING_RESPONSE:
+ rxrpc_complete_rtt_probe(call, call->acks_latest_ts,
+ summary.acked_serial, summary.ack_serial,
+ rxrpc_rtt_rx_ping_response);
+ break;
+ default:
+ if (after(summary.acked_serial, call->acks_highest_serial))
+ call->acks_highest_serial = summary.acked_serial;
+ summary.rtt_sample_avail = true;
+ break;
+ }
}
/* Parse rwind and mtu sizes if provided. */
if (trailer.maxMTU)
rxrpc_input_ack_trailer(call, skb, &trailer);
- if (first_soft_ack == 0)
+ if (hard_ack + 1 == 0)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_zero);
/* Ignore ACKs unless we are or have just been transmitting. */
@@ -978,13 +1157,13 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
goto send_response;
}
- if (before(hard_ack, call->acks_hard_ack) ||
+ if (before(hard_ack, call->tx_bottom) ||
after(hard_ack, call->tx_top))
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_outside_window);
if (nr_acks > call->tx_top - hard_ack)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_sack_overflow);
- if (after(hard_ack, call->acks_hard_ack)) {
+ if (after(hard_ack, call->tx_bottom)) {
if (rxrpc_rotate_tx_window(call, hard_ack, &summary)) {
rxrpc_end_tx_phase(call, false, rxrpc_eproto_unexpected_ack);
goto send_response;
@@ -994,25 +1173,30 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb)
if (nr_acks > 0) {
if (offset > (int)skb->len - nr_acks)
return rxrpc_proto_abort(call, 0, rxrpc_eproto_ackr_short_sack);
- rxrpc_input_soft_acks(call, &summary, skb, first_soft_ack, since);
- rxrpc_get_skb(skb, rxrpc_skb_get_last_nack);
- call->cong_last_nack = skb;
+ rxrpc_input_soft_acks(call, &summary, skb);
}
if (test_bit(RXRPC_CALL_TX_LAST, &call->flags) &&
- summary.nr_acks == call->tx_top - hard_ack &&
+ call->acks_nr_sacks == call->tx_top - hard_ack &&
rxrpc_is_client_call(call))
- rxrpc_propose_ping(call, ack_serial,
+ rxrpc_propose_ping(call, summary.ack_serial,
rxrpc_propose_ack_ping_for_lost_reply);
- rxrpc_congestion_management(call, skb, &summary, acked_serial);
+ /* Drive the congestion management algorithm first and then RACK-TLP as
+	 * the latter depends on the state of, and state changes in, the former.
+ */
+ rxrpc_congestion_management(call, &summary);
+ rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+ rxrpc_tlp_process_ack(call, &summary);
+ if (call->tlp_serial && after_eq(summary.acked_serial, call->tlp_serial))
+ call->tlp_serial = 0;
send_response:
if (summary.ack_reason == RXRPC_ACK_PING)
- rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, ack_serial,
+ rxrpc_send_ACK(call, RXRPC_ACK_PING_RESPONSE, summary.ack_serial,
rxrpc_propose_ack_respond_to_ping);
else if (sp->hdr.flags & RXRPC_REQUEST_ACK)
- rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, ack_serial,
+ rxrpc_send_ACK(call, RXRPC_ACK_REQUESTED, summary.ack_serial,
rxrpc_propose_ack_respond_to_ack);
}
@@ -1111,5 +1295,5 @@ void rxrpc_implicit_end_call(struct rxrpc_call *call, struct sk_buff *skb)
break;
}
- rxrpc_input_call_event(call, skb);
+ rxrpc_input_call_event(call);
}
diff --git a/net/rxrpc/input_rack.c b/net/rxrpc/input_rack.c
new file mode 100644
index 000000000000..13c371261e0a
--- /dev/null
+++ b/net/rxrpc/input_rack.c
@@ -0,0 +1,418 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* RACK-TLP [RFC8958] Implementation
+ *
+ * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include "ar-internal.h"
+
+static bool rxrpc_rack_sent_after(ktime_t t1, rxrpc_seq_t seq1,
+ ktime_t t2, rxrpc_seq_t seq2)
+{
+ if (ktime_after(t1, t2))
+ return true;
+ return t1 == t2 && after(seq1, seq2);
+}
+
+/*
+ * Mark a packet lost.
+ */
+static void rxrpc_rack_mark_lost(struct rxrpc_call *call,
+ struct rxrpc_txqueue *tq, unsigned int ix)
+{
+ if (__test_and_set_bit(ix, &tq->segment_lost)) {
+ if (__test_and_clear_bit(ix, &tq->segment_retransmitted))
+ call->tx_nr_resent--;
+ } else {
+ call->tx_nr_lost++;
+ }
+ tq->segment_xmit_ts[ix] = UINT_MAX;
+}
+
+/*
+ * Get the transmission time of a packet in the Tx queue.
+ */
+static ktime_t rxrpc_get_xmit_ts(const struct rxrpc_txqueue *tq, unsigned int ix)
+{
+ if (tq->segment_xmit_ts[ix] == UINT_MAX)
+ return KTIME_MAX;
+ return ktime_add_us(tq->xmit_ts_base, tq->segment_xmit_ts[ix]);
+}
+
+/*
+ * Get a bitmask of nack bits for a queue segment and mask off any that aren't
+ * yet reported.
+ */
+static unsigned long rxrpc_tq_nacks(const struct rxrpc_txqueue *tq)
+{
+ unsigned long nacks = ~tq->segment_acked;
+
+ if (tq->nr_reported_acks < RXRPC_NR_TXQUEUE)
+ nacks &= (1UL << tq->nr_reported_acks) - 1;
+ return nacks;
+}
+
+/*
+ * Update the RACK state for the most recently sent packet that has been
+ * delivered [RFC8958 6.2 Step 2].
+ */
+static void rxrpc_rack_update(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+ ktime_t rtt = ktime_sub(call->acks_latest_ts, xmit_ts);
+
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+
+ if (test_bit(ix, &tq->segment_retransmitted)) {
+ /* Use Rx.serial instead of TCP.ACK.ts_option.echo_reply. */
+ if (before(call->acks_highest_serial, tq->segment_serial[ix]))
+ return;
+ if (rtt < minmax_get(&call->min_rtt))
+ return;
+ }
+
+ /* The RACK algorithm requires the segment ACKs to be traversed in
+ * order of segment transmission - but the only thing this seems to
+ * matter for is that RACK.rtt is set to the rtt of the most recently
+ * transmitted segment. We should be able to achieve the same by only
+ * setting RACK.rtt if the xmit time is greater.
+ */
+ if (ktime_after(xmit_ts, call->rack_rtt_ts)) {
+ call->rack_rtt = rtt;
+ call->rack_rtt_ts = xmit_ts;
+ }
+
+ if (rxrpc_rack_sent_after(xmit_ts, seq, call->rack_xmit_ts, call->rack_end_seq)) {
+ call->rack_rtt = rtt;
+ call->rack_xmit_ts = xmit_ts;
+ call->rack_end_seq = seq;
+ }
+}
+
+/*
+ * Detect data segment reordering [RFC8958 6.2 Step 3].
+ */
+static void rxrpc_rack_detect_reordering(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_seq_t seq = tq->qbase + ix;
+
+ /* Track the highest sequence number so far ACK'd. This is not
+ * necessarily the same as ack.firstPacket + ack.nAcks - 1 as the peer
+ * could put a NACK in the last SACK slot.
+ */
+ if (after(seq, call->rack_fack))
+ call->rack_fack = seq;
+ else if (before(seq, call->rack_fack) &&
+ test_bit(ix, &tq->segment_retransmitted))
+ call->rack_reordering_seen = true;
+}
+
+void rxrpc_input_rack_one(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned int ix)
+{
+ rxrpc_rack_update(call, summary, tq, ix);
+ rxrpc_rack_detect_reordering(call, summary, tq, ix);
+}
+
+void rxrpc_input_rack(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary,
+ struct rxrpc_txqueue *tq,
+ unsigned long new_acks)
+{
+ while (new_acks) {
+ unsigned int ix = __ffs(new_acks);
+
+ __clear_bit(ix, &new_acks);
+ rxrpc_input_rack_one(call, summary, tq, ix);
+ }
+
+ trace_rxrpc_rack_update(call, summary);
+}
+
+/*
+ * Update the reordering window [RFC8958 6.2 Step 4]. Returns the updated
+ * duration of the reordering window.
+ *
+ * Note that the Rx protocol doesn't have a 'DSACK option' per se, but ACKs can
+ * be given a 'DUPLICATE' reason with the serial number referring to the
+ * duplicated DATA packet. Rx does not indicate whether this was a
+ * reception of the same packet twice or of a retransmission of a packet we
+ * already received (though this could be determined by the transmitter based
+ * on the serial number).
+ */
+static ktime_t rxrpc_rack_update_reo_wnd(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+ rxrpc_seq_t snd_nxt = call->tx_transmitted + 1; /* Next seq to be sent */
+ bool have_dsack_option = summary->ack_reason == RXRPC_ACK_DUPLICATE;
+ int dup_thresh = 3;
+
+ /* DSACK-based reordering window adaptation */
+ if (!call->rack_dsack_round_none &&
+ after_eq(snd_una, call->rack_dsack_round))
+ call->rack_dsack_round_none = true;
+
+ /* Grow the reordering window per round that sees DSACK. Reset the
+ * window after 16 DSACK-free recoveries.
+ */
+ if (call->rack_dsack_round_none && have_dsack_option) {
+ call->rack_dsack_round_none = false;
+ call->rack_dsack_round = snd_nxt;
+ call->rack_reo_wnd_mult++;
+ call->rack_reo_wnd_persist = 16;
+ } else if (summary->exiting_fast_or_rto_recovery) {
+ call->rack_reo_wnd_persist--;
+ if (call->rack_reo_wnd_persist <= 0)
+ call->rack_reo_wnd_mult = 1;
+ }
+
+ if (!call->rack_reordering_seen) {
+ if (summary->in_fast_or_rto_recovery)
+ return 0;
+ if (call->acks_nr_sacks >= dup_thresh)
+ return 0;
+ }
+
+ return us_to_ktime(umin(call->rack_reo_wnd_mult * minmax_get(&call->min_rtt) / 4,
+ call->srtt_us >> 3));
+}
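Illustrative sketch (not part of the patch): the value returned above is min(reo_wnd_mult * min_RTT / 4, SRTT). The helper below assumes, as elsewhere in rxrpc, that srtt_us carries eight times the smoothed RTT in microseconds, hence the shift by 3.

/* Standalone sketch, not part of the patch: reo_wnd = min(mult * min_RTT / 4,
 * SRTT), with srtt_x8_us assumed to hold eight times the smoothed RTT in
 * microseconds as call->srtt_us does.
 */
#include <stdio.h>

static unsigned int reo_wnd_us(unsigned int mult, unsigned int min_rtt_us,
			       unsigned int srtt_x8_us)
{
	unsigned int grown = mult * min_rtt_us / 4;
	unsigned int srtt = srtt_x8_us >> 3;

	return grown < srtt ? grown : srtt;
}

int main(void)
{
	/* min RTT 20ms, SRTT 25ms: the window starts at 5ms and grows per
	 * DSACK round, but never beyond SRTT.
	 */
	for (unsigned int mult = 1; mult <= 8; mult++)
		printf("mult=%u reo_wnd=%uus\n",
		       mult, reo_wnd_us(mult, 20000, 25000 * 8));
	return 0;
}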
+
+/*
+ * Detect losses [RFC8958 6.2 Step 5].
+ */
+static ktime_t rxrpc_rack_detect_loss(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ struct rxrpc_txqueue *tq;
+ ktime_t timeout = 0, lost_after, now = ktime_get_real();
+
+ call->rack_reo_wnd = rxrpc_rack_update_reo_wnd(call, summary);
+ lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+ trace_rxrpc_rack_scan_loss(call);
+
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long nacks = rxrpc_tq_nacks(tq);
+
+ if (after(tq->qbase, call->tx_transmitted))
+ break;
+ trace_rxrpc_rack_scan_loss_tq(call, tq, nacks);
+
+ /* Skip ones marked lost but not yet retransmitted */
+ nacks &= ~tq->segment_lost | tq->segment_retransmitted;
+
+ while (nacks) {
+ unsigned int ix = __ffs(nacks);
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t remaining;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+ __clear_bit(ix, &nacks);
+
+ if (rxrpc_rack_sent_after(call->rack_xmit_ts, call->rack_end_seq,
+ xmit_ts, seq)) {
+ remaining = ktime_sub(ktime_add(xmit_ts, lost_after), now);
+ if (remaining <= 0) {
+ rxrpc_rack_mark_lost(call, tq, ix);
+ trace_rxrpc_rack_detect_loss(call, summary, seq);
+ } else {
+ timeout = max(remaining, timeout);
+ }
+ }
+ }
+ }
+
+ return timeout;
+}
+
+/*
+ * Detect losses and set a timer to retry the detection [RFC8985 6.2 Step 5].
+ */
+void rxrpc_rack_detect_loss_and_arm_timer(struct rxrpc_call *call,
+ struct rxrpc_ack_summary *summary)
+{
+ ktime_t timeout = rxrpc_rack_detect_loss(call, summary);
+
+ if (timeout) {
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RACK_REORDER;
+ call->rack_timo_at = ktime_add(ktime_get_real(), timeout);
+ trace_rxrpc_rack_timer(call, timeout, false);
+ trace_rxrpc_timer_set(call, timeout, rxrpc_timer_trace_rack_reo);
+ }
+}
+
+/*
+ * Handle RACK-TLP RTO expiration [RFC8985 6.3].
+ */
+static void rxrpc_rack_mark_losses_on_rto(struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq;
+ rxrpc_seq_t snd_una = call->acks_lowest_nak; /* Lowest unack'd seq */
+ ktime_t lost_after = ktime_add(call->rack_rtt, call->rack_reo_wnd);
+ ktime_t deadline = ktime_sub(ktime_get_real(), lost_after);
+
+ for (tq = call->tx_queue; tq; tq = tq->next) {
+ unsigned long unacked = ~tq->segment_acked;
+
+ trace_rxrpc_rack_mark_loss_tq(call, tq);
+ while (unacked) {
+ unsigned int ix = __ffs(unacked);
+ rxrpc_seq_t seq = tq->qbase + ix;
+ ktime_t xmit_ts = rxrpc_get_xmit_ts(tq, ix);
+
+ if (after(seq, call->tx_transmitted))
+ return;
+ __clear_bit(ix, &unacked);
+
+ if (seq == snd_una ||
+ ktime_before(xmit_ts, deadline))
+ rxrpc_rack_mark_lost(call, tq, ix);
+ }
+ }
+}
+
+/*
+ * Calculate the TLP loss probe timeout (PTO) [RFC8985 7.2].
+ */
+ktime_t rxrpc_tlp_calc_pto(struct rxrpc_call *call, ktime_t now)
+{
+ unsigned int flight_size = rxrpc_tx_in_flight(call);
+ ktime_t rto_at = ktime_add(call->tx_last_sent,
+ rxrpc_get_rto_backoff(call, false));
+ ktime_t pto;
+
+ if (call->rtt_count > 0) {
+ /* Use 2*SRTT as the timeout. */
+ pto = ns_to_ktime(call->srtt_us * NSEC_PER_USEC / 4);
+ if (flight_size)
+ pto = ktime_add(pto, call->tlp_max_ack_delay);
+ } else {
+ pto = NSEC_PER_SEC;
+ }
+
+ if (ktime_after(ktime_add(now, pto), rto_at))
+ pto = ktime_sub(rto_at, now);
+ return pto;
+}
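The PTO above can be restated in plain integer arithmetic as below (illustrative userspace sketch with hypothetical names; ktime replaced by nanosecond counts): 2*SRTT, plus the peer's worst-case ACK delay while data is in flight, and never later than the time remaining until the RTO would fire. The sketch assumes the RTO deadline lies in the future.

/* Userspace sketch of the TLP PTO arithmetic above.  srtt_us is scaled
 * by 8, so srtt_us * NSEC_PER_USEC / 4 gives 2 * SRTT in nanoseconds.
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC	1000ULL
#define NSEC_PER_SEC	1000000000ULL

static uint64_t tlp_pto_ns(uint32_t srtt_us_scaled8, int have_rtt_sample,
			   unsigned int flight_size, uint64_t max_ack_delay_ns,
			   uint64_t now_ns, uint64_t rto_at_ns)
{
	uint64_t pto;

	if (have_rtt_sample) {
		pto = srtt_us_scaled8 * NSEC_PER_USEC / 4;	/* 2 * SRTT */
		if (flight_size)
			pto += max_ack_delay_ns;
	} else {
		pto = NSEC_PER_SEC;				/* no sample yet */
	}

	if (now_ns + pto > rto_at_ns)				/* don't pass the RTO */
		pto = rto_at_ns - now_ns;
	return pto;
}

int main(void)
{
	printf("pto = %lluns\n",
	       (unsigned long long)tlp_pto_ns(25000 << 3, 1, 3, 2000000,
					      0, NSEC_PER_SEC));
	return 0;
}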
+
+/*
+ * Send a TLP loss probe on PTO expiration [RFC8985 7.3].
+ */
+void rxrpc_tlp_send_probe(struct rxrpc_call *call)
+{
+ unsigned int in_flight = rxrpc_tx_in_flight(call);
+
+ if (after_eq(call->acks_hard_ack, call->tx_transmitted))
+ return; /* Everything we transmitted has been acked. */
+
+ /* There must be no other loss probe still in flight and we need to
+ * have taken a new RTT sample since the last probe or the start of
+ * the connection.
+ */
+ if (!call->tlp_serial &&
+ call->tlp_rtt_taken != call->rtt_taken) {
+ call->tlp_is_retrans = false;
+ if (after(call->send_top, call->tx_transmitted) &&
+ rxrpc_tx_window_space(call) > 0) {
+ /* Transmit the lowest-sequence unsent DATA */
+ call->tx_last_serial = 0;
+ rxrpc_transmit_some_data(call, 1, rxrpc_txdata_tlp_new_data);
+ call->tlp_serial = call->tx_last_serial;
+ call->tlp_seq = call->tx_transmitted;
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_transmit_new);
+ in_flight = rxrpc_tx_in_flight(call);
+ } else {
+ /* Retransmit the highest-sequence DATA sent */
+ call->tx_last_serial = 0;
+ rxrpc_resend_tlp(call);
+ call->tlp_is_retrans = true;
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_retransmit);
+ }
+ } else {
+ trace_rxrpc_tlp_probe(call, rxrpc_tlp_probe_trace_busy);
+ }
+
+ if (in_flight != 0) {
+ ktime_t rto = rxrpc_get_rto_backoff(call, false);
+
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_RTO;
+ call->rack_timo_at = ktime_add(ktime_get_real(), rto);
+ trace_rxrpc_rack_timer(call, rto, false);
+ trace_rxrpc_timer_set(call, rto, rxrpc_timer_trace_rack_rto);
+ }
+}
+
+/*
+ * Detect losses using the ACK of a TLP loss probe [RFC8985 7.4].
+ */
+void rxrpc_tlp_process_ack(struct rxrpc_call *call, struct rxrpc_ack_summary *summary)
+{
+ if (!call->tlp_serial || after(call->tlp_seq, call->acks_hard_ack))
+ return;
+
+ if (!call->tlp_is_retrans) {
+ /* TLP of new data delivered */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_new_data);
+ call->tlp_serial = 0;
+ } else if (summary->ack_reason == RXRPC_ACK_DUPLICATE &&
+ summary->acked_serial == call->tlp_serial) {
+ /* General Case: Detected packet losses using RACK [7.4.1] */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_dup_acked);
+ call->tlp_serial = 0;
+ } else if (after(call->acks_hard_ack, call->tlp_seq)) {
+ /* Repaired the single loss */
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_hard_beyond);
+ call->tlp_serial = 0;
+ // TODO: Invoke congestion control to react to the loss
+ // event the probe has repaired
+ } else if (summary->tlp_probe_acked) {
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_acked);
+ /* Special Case: Detected a single loss repaired by the loss
+ * probe [7.4.2]
+ */
+ call->tlp_serial = 0;
+ } else {
+ trace_rxrpc_tlp_ack(call, summary, rxrpc_tlp_ack_trace_incomplete);
+ }
+}
+
+/*
+ * Handle RACK timer expiration.
+ */
+void rxrpc_rack_timer_expired(struct rxrpc_call *call, ktime_t overran_by)
+{
+ struct rxrpc_ack_summary summary = {};
+ enum rxrpc_rack_timer_mode mode = call->rack_timer_mode;
+
+ trace_rxrpc_rack_timer(call, overran_by, true);
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_OFF;
+
+ switch (mode) {
+ case RXRPC_CALL_RACKTIMER_RACK_REORDER:
+ rxrpc_rack_detect_loss_and_arm_timer(call, &summary);
+ break;
+ case RXRPC_CALL_RACKTIMER_TLP_PTO:
+ rxrpc_tlp_send_probe(call);
+ break;
+ case RXRPC_CALL_RACKTIMER_RTO:
+ // Might need to poke the congestion algo in some way
+ rxrpc_rack_mark_losses_on_rto(call);
+ break;
+ //case RXRPC_CALL_RACKTIMER_ZEROWIN:
+ default:
+ pr_warn("Unexpected rack timer %u", call->rack_timer_mode);
+ }
+}
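Taken out of the queue-walking loops above, the per-segment decision reduces to the test sketched below (a userspace restatement with made-up names, not the kernel code): a NACK'd segment is declared lost once a later-sent segment has been delivered and the segment's transmission time plus rack_rtt + reo_wnd has elapsed; otherwise the remaining time is what feeds the reorder timer. The sequence-number tie-break mirrors what rxrpc_rack_sent_after() is assumed to do, following RFC 8985's RACK.sent_after().

/* Userspace sketch of the per-segment RACK loss test used above.
 * Times are plain nanoseconds; "sent after" breaks timestamp ties by
 * sequence number.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seg {
	uint64_t xmit_ts;	/* when this segment was (re)transmitted */
	uint32_t seq;
};

static bool sent_after(uint64_t t1, uint32_t seq1, uint64_t t2, uint32_t seq2)
{
	return t1 > t2 || (t1 == t2 && (int32_t)(seq1 - seq2) > 0);
}

/* Returns true if lost; otherwise *timeout_ns is how long to wait. */
static bool rack_segment_lost(const struct seg *rack,	/* most recent delivery */
			      const struct seg *s,	/* NACK'd segment */
			      uint64_t rack_rtt_ns, uint64_t reo_wnd_ns,
			      uint64_t now_ns, uint64_t *timeout_ns)
{
	uint64_t deadline = s->xmit_ts + rack_rtt_ns + reo_wnd_ns;

	if (!sent_after(rack->xmit_ts, rack->seq, s->xmit_ts, s->seq))
		return false;			/* not yet covered by RACK */
	if (deadline <= now_ns)
		return true;			/* declare it lost */
	*timeout_ns = deadline - now_ns;	/* re-check when this expires */
	return false;
}

int main(void)
{
	struct seg rack   = { .xmit_ts = 1000000, .seq = 10 };
	struct seg nacked = { .xmit_ts =  400000, .seq = 7 };
	uint64_t timeout = 0;

	if (rack_segment_lost(&rack, &nacked, 300000, 100000, 900000, &timeout))
		printf("seq %u lost\n", (unsigned int)nacked.seq);
	else
		printf("re-check in %lluns\n", (unsigned long long)timeout);
	return 0;
}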
diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c
index 6716c021a532..e068f9b79d02 100644
--- a/net/rxrpc/insecure.c
+++ b/net/rxrpc/insecure.c
@@ -19,11 +19,14 @@ static int none_init_connection_security(struct rxrpc_connection *conn,
*/
static struct rxrpc_txbuf *none_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
{
- return rxrpc_alloc_data_txbuf(call, min_t(size_t, remain, RXRPC_JUMBO_DATALEN), 1, gfp);
+ return rxrpc_alloc_data_txbuf(call, umin(remain, RXRPC_JUMBO_DATALEN), 1, gfp);
}
static int none_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
{
+ txb->pkt_len = txb->len;
+ if (txb->len == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
return 0;
}
diff --git a/net/rxrpc/io_thread.c b/net/rxrpc/io_thread.c
index 07c74c77d802..64f8d77b8731 100644
--- a/net/rxrpc/io_thread.c
+++ b/net/rxrpc/io_thread.c
@@ -338,7 +338,6 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
struct rxrpc_channel *chan;
struct rxrpc_call *call = NULL;
unsigned int channel;
- bool ret;
if (sp->hdr.securityIndex != conn->security_ix)
return rxrpc_direct_abort(skb, rxrpc_eproto_wrong_security,
@@ -364,6 +363,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
if (sp->hdr.callNumber == 0)
return rxrpc_input_conn_packet(conn, skb);
+ /* Deal with path MTU discovery probing. */
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK &&
+ conn->pmtud_probe &&
+ after_eq(sp->ack.acked_serial, conn->pmtud_probe))
+ rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false);
+
/* Call-bound packets are routed by connection channel. */
channel = sp->hdr.cid & RXRPC_CHANNELMASK;
chan = &conn->channels[channel];
@@ -419,9 +424,9 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
peer_srx, skb);
}
- ret = rxrpc_input_call_event(call, skb);
+ rxrpc_queue_rx_call_packet(call, skb);
rxrpc_put_call(call, rxrpc_call_put_input);
- return ret;
+ return true;
}
/*
@@ -438,6 +443,8 @@ int rxrpc_io_thread(void *data)
ktime_t now;
#endif
bool should_stop;
+ LIST_HEAD(conn_attend_q);
+ LIST_HEAD(call_attend_q);
complete(&local->io_thread_ready);
@@ -448,43 +455,26 @@ int rxrpc_io_thread(void *data)
for (;;) {
rxrpc_inc_stat(local->rxnet, stat_io_loop);
- /* Deal with connections that want immediate attention. */
- conn = list_first_entry_or_null(&local->conn_attend_q,
- struct rxrpc_connection,
- attend_link);
- if (conn) {
- spin_lock_bh(&local->lock);
- list_del_init(&conn->attend_link);
- spin_unlock_bh(&local->lock);
-
- rxrpc_input_conn_event(conn, NULL);
- rxrpc_put_connection(conn, rxrpc_conn_put_poke);
- continue;
+ /* Inject a delay into packets if requested. */
+#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
+ now = ktime_get_real();
+ while ((skb = skb_peek(&local->rx_delay_queue))) {
+ if (ktime_before(now, skb->tstamp))
+ break;
+ skb = skb_dequeue(&local->rx_delay_queue);
+ skb_queue_tail(&local->rx_queue, skb);
}
+#endif
- if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
- &local->client_conn_flags))
- rxrpc_discard_expired_client_conns(local);
-
- /* Deal with calls that want immediate attention. */
- if ((call = list_first_entry_or_null(&local->call_attend_q,
- struct rxrpc_call,
- attend_link))) {
- spin_lock_bh(&local->lock);
- list_del_init(&call->attend_link);
- spin_unlock_bh(&local->lock);
-
- trace_rxrpc_call_poked(call);
- rxrpc_input_call_event(call, NULL);
- rxrpc_put_call(call, rxrpc_call_put_poke);
- continue;
+ if (!skb_queue_empty(&local->rx_queue)) {
+ spin_lock_irq(&local->rx_queue.lock);
+ skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
+ spin_unlock_irq(&local->rx_queue.lock);
+ trace_rxrpc_iothread_rx(local, skb_queue_len(&rx_queue));
}
- if (!list_empty(&local->new_client_calls))
- rxrpc_connect_client_calls(local);
-
- /* Process received packets and errors. */
- if ((skb = __skb_dequeue(&rx_queue))) {
+ /* Distribute packets and errors. */
+ while ((skb = __skb_dequeue(&rx_queue))) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
switch (skb->mark) {
case RXRPC_SKB_MARK_PACKET:
@@ -508,27 +498,46 @@ int rxrpc_io_thread(void *data)
rxrpc_free_skb(skb, rxrpc_skb_put_unknown);
break;
}
- continue;
}
- /* Inject a delay into packets if requested. */
-#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
- now = ktime_get_real();
- while ((skb = skb_peek(&local->rx_delay_queue))) {
- if (ktime_before(now, skb->tstamp))
- break;
- skb = skb_dequeue(&local->rx_delay_queue);
- skb_queue_tail(&local->rx_queue, skb);
+ /* Deal with connections that want immediate attention. */
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->conn_attend_q, &conn_attend_q);
+ spin_unlock_irq(&local->lock);
+
+ while ((conn = list_first_entry_or_null(&conn_attend_q,
+ struct rxrpc_connection,
+ attend_link))) {
+ spin_lock_irq(&local->lock);
+ list_del_init(&conn->attend_link);
+ spin_unlock_irq(&local->lock);
+ rxrpc_input_conn_event(conn, NULL);
+ rxrpc_put_connection(conn, rxrpc_conn_put_poke);
}
-#endif
- if (!skb_queue_empty(&local->rx_queue)) {
- spin_lock_irq(&local->rx_queue.lock);
- skb_queue_splice_tail_init(&local->rx_queue, &rx_queue);
- spin_unlock_irq(&local->rx_queue.lock);
- continue;
+ if (test_and_clear_bit(RXRPC_CLIENT_CONN_REAP_TIMER,
+ &local->client_conn_flags))
+ rxrpc_discard_expired_client_conns(local);
+
+ /* Deal with calls that want immediate attention. */
+ spin_lock_irq(&local->lock);
+ list_splice_tail_init(&local->call_attend_q, &call_attend_q);
+ spin_unlock_irq(&local->lock);
+
+ while ((call = list_first_entry_or_null(&call_attend_q,
+ struct rxrpc_call,
+ attend_link))) {
+ spin_lock_irq(&local->lock);
+ list_del_init(&call->attend_link);
+ spin_unlock_irq(&local->lock);
+ trace_rxrpc_call_poked(call);
+ rxrpc_input_call_event(call);
+ rxrpc_put_call(call, rxrpc_call_put_poke);
}
+ if (!list_empty(&local->new_client_calls))
+ rxrpc_connect_client_calls(local);
+
set_current_state(TASK_INTERRUPTIBLE);
should_stop = kthread_should_stop();
if (!skb_queue_empty(&local->rx_queue) ||
@@ -558,7 +567,7 @@ int rxrpc_io_thread(void *data)
}
timeout = nsecs_to_jiffies(delay_ns);
- timeout = max(timeout, 1UL);
+ timeout = umax(timeout, 1);
schedule_timeout(timeout);
__set_current_state(TASK_RUNNING);
continue;
diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c
index 2792d2304605..a74a4b43904f 100644
--- a/net/rxrpc/local_object.c
+++ b/net/rxrpc/local_object.c
@@ -215,9 +215,6 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net)
/* we want to set the don't fragment bit */
rxrpc_local_dont_fragment(local, true);
-
- /* We want receive timestamps. */
- sock_enable_timestamps(usk);
break;
default:
diff --git a/net/rxrpc/misc.c b/net/rxrpc/misc.c
index 657cf35089a6..8fcc8139d771 100644
--- a/net/rxrpc/misc.c
+++ b/net/rxrpc/misc.c
@@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255;
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
* made by gluing normal packets together that we're willing to handle.
*/
-unsigned int rxrpc_rx_mtu = 5692;
+unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46);
/*
* The maximum number of fragments in a received jumbo packet that we tell the
* sender that we're willing to handle.
*/
-unsigned int rxrpc_rx_jumbo_max = 4;
+unsigned int rxrpc_rx_jumbo_max = 46;
#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
/*
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index 5ea9601efd05..95905b85a8d7 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -72,22 +72,96 @@ static void rxrpc_set_keepalive(struct rxrpc_call *call, ktime_t now)
}
/*
+ * Allocate transmission buffers for an ACK and attach them to local->kvec[].
+ */
+static int rxrpc_alloc_ack(struct rxrpc_call *call, size_t sack_size)
+{
+ struct rxrpc_wire_header *whdr;
+ struct rxrpc_acktrailer *trailer;
+ struct rxrpc_ackpacket *ack;
+ struct kvec *kv = call->local->kvec;
+ gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS;
+ void *buf, *buf2 = NULL;
+ u8 *filler;
+
+ buf = page_frag_alloc(&call->local->tx_alloc,
+ sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
+ if (!buf)
+ return -ENOMEM;
+
+ if (sack_size) {
+ buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
+ if (!buf2) {
+ page_frag_free(buf);
+ return -ENOMEM;
+ }
+ }
+
+ whdr = buf;
+ ack = buf + sizeof(*whdr);
+ filler = buf + sizeof(*whdr) + sizeof(*ack) + 1;
+ trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3;
+
+ kv[0].iov_base = whdr;
+ kv[0].iov_len = sizeof(*whdr) + sizeof(*ack);
+ kv[1].iov_base = buf2;
+ kv[1].iov_len = sack_size;
+ kv[2].iov_base = filler;
+ kv[2].iov_len = 3 + sizeof(*trailer);
+ return 3; /* Number of kvec[] used. */
+}
+
+static void rxrpc_free_ack(struct rxrpc_call *call)
+{
+ page_frag_free(call->local->kvec[0].iov_base);
+ if (call->local->kvec[1].iov_base)
+ page_frag_free(call->local->kvec[1].iov_base);
+}
+
+/*
+ * Record the beginning of an RTT probe.
+ */
+static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
+ ktime_t now, enum rxrpc_rtt_tx_trace why)
+{
+ unsigned long avail = call->rtt_avail;
+ int rtt_slot = 9;
+
+ if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
+ goto no_slot;
+
+ rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
+ if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
+ goto no_slot;
+
+ call->rtt_serial[rtt_slot] = serial;
+ call->rtt_sent_at[rtt_slot] = now;
+ smp_wmb(); /* Write data before avail bit */
+ set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+
+ trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
+ return;
+
+no_slot:
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
+}
+
+/*
* Fill out an ACK packet.
*/
-static void rxrpc_fill_out_ack(struct rxrpc_call *call,
- struct rxrpc_txbuf *txb,
- u8 ack_reason,
- rxrpc_serial_t serial)
+static int rxrpc_fill_out_ack(struct rxrpc_call *call, int nr_kv, u8 ack_reason,
+ rxrpc_serial_t serial_to_ack, rxrpc_serial_t *_ack_serial)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
- struct rxrpc_acktrailer *trailer = txb->kvec[2].iov_base + 3;
+ struct kvec *kv = call->local->kvec;
+ struct rxrpc_wire_header *whdr = kv[0].iov_base;
+ struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3;
struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1);
- unsigned int qsize, sack, wrap, to;
+ unsigned int qsize, sack, wrap, to, max_mtu, if_mtu;
rxrpc_seq_t window, wtop;
+ ktime_t now = ktime_get_real();
int rsize;
- u32 mtu, jmax;
- u8 *filler = txb->kvec[2].iov_base;
- u8 *sackp = txb->kvec[1].iov_base;
+ u8 *filler = kv[2].iov_base;
+ u8 *sackp = kv[1].iov_base;
rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill);
@@ -95,14 +169,25 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
wtop = call->ackr_wtop;
sack = call->ackr_sack_base % RXRPC_SACK_SIZE;
+ *_ack_serial = rxrpc_get_next_serial(call->conn);
+
+ whdr->epoch = htonl(call->conn->proto.epoch);
+ whdr->cid = htonl(call->cid);
+ whdr->callNumber = htonl(call->call_id);
+ whdr->serial = htonl(*_ack_serial);
whdr->seq = 0;
whdr->type = RXRPC_PACKET_TYPE_ACK;
- txb->flags |= RXRPC_SLOW_START_OK;
+ whdr->flags = call->conn->out_clientflag | RXRPC_SLOW_START_OK;
+ whdr->userStatus = 0;
+ whdr->securityIndex = call->security_ix;
+ whdr->_rsvd = 0;
+ whdr->serviceId = htons(call->dest_srx.srx_service);
+
ack->bufferSpace = 0;
ack->maxSkew = 0;
ack->firstPacket = htonl(window);
ack->previousPacket = htonl(call->rx_highest_seq);
- ack->serial = htonl(serial);
+ ack->serial = htonl(serial_to_ack);
ack->reason = ack_reason;
ack->nAcks = wtop - window;
filler[0] = 0;
@@ -110,15 +195,13 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
filler[2] = 0;
if (ack_reason == RXRPC_ACK_PING)
- txb->flags |= RXRPC_REQUEST_ACK;
+ whdr->flags |= RXRPC_REQUEST_ACK;
if (after(wtop, window)) {
- txb->len += ack->nAcks;
- txb->kvec[1].iov_base = sackp;
- txb->kvec[1].iov_len = ack->nAcks;
+ kv[1].iov_len = ack->nAcks;
wrap = RXRPC_SACK_SIZE - sack;
- to = min_t(unsigned int, ack->nAcks, RXRPC_SACK_SIZE);
+ to = umin(ack->nAcks, RXRPC_SACK_SIZE);
if (sack + ack->nAcks <= RXRPC_SACK_SIZE) {
memcpy(sackp, call->ackr_sack_table + sack, ack->nAcks);
@@ -132,56 +215,42 @@ static void rxrpc_fill_out_ack(struct rxrpc_call *call,
ack->reason = RXRPC_ACK_IDLE;
}
- mtu = call->peer->if_mtu;
- mtu -= call->peer->hdrsize;
- jmax = rxrpc_rx_jumbo_max;
qsize = (window - 1) - call->rx_consumed;
rsize = max_t(int, call->rx_winsize - qsize, 0);
- txb->ack_rwind = rsize;
- trailer->maxMTU = htonl(rxrpc_rx_mtu);
- trailer->ifMTU = htonl(mtu);
- trailer->rwind = htonl(rsize);
- trailer->jumbo_max = htonl(jmax);
-}
-/*
- * Record the beginning of an RTT probe.
- */
-static void rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
- ktime_t now, enum rxrpc_rtt_tx_trace why)
-{
- unsigned long avail = call->rtt_avail;
- int rtt_slot = 9;
-
- if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
- goto no_slot;
-
- rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
- if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
- goto no_slot;
-
- call->rtt_serial[rtt_slot] = serial;
- call->rtt_sent_at[rtt_slot] = now;
- smp_wmb(); /* Write data before avail bit */
- set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
+ if_mtu = call->peer->if_mtu - call->peer->hdrsize;
+ if (call->peer->ackr_adv_pmtud) {
+ max_mtu = umax(call->peer->max_data, rxrpc_rx_mtu);
+ } else {
+ if_mtu = umin(if_mtu, 1444);
+ max_mtu = if_mtu;
+ }
- trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
- return;
+ trailer->maxMTU = htonl(max_mtu);
+ trailer->ifMTU = htonl(if_mtu);
+ trailer->rwind = htonl(rsize);
+ trailer->jumbo_max = 0; /* Advertise pmtu discovery */
-no_slot:
- trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
+ if (ack_reason == RXRPC_ACK_PING)
+ rxrpc_begin_rtt_probe(call, *_ack_serial, now, rxrpc_rtt_tx_ping);
+ if (whdr->flags & RXRPC_REQUEST_ACK)
+ call->rtt_last_req = now;
+ rxrpc_set_keepalive(call, now);
+ return nr_kv;
}
/*
* Transmit an ACK packet.
*/
-static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static void rxrpc_send_ack_packet(struct rxrpc_call *call, int nr_kv, size_t len,
+ rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct kvec *kv = call->local->kvec;
+ struct rxrpc_wire_header *whdr = kv[0].iov_base;
+ struct rxrpc_acktrailer *trailer = kv[2].iov_base + 3;
struct rxrpc_connection *conn;
struct rxrpc_ackpacket *ack = (struct rxrpc_ackpacket *)(whdr + 1);
struct msghdr msg;
- ktime_t now;
int ret;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
@@ -195,33 +264,34 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
msg.msg_controllen = 0;
msg.msg_flags = MSG_SPLICE_PAGES;
- whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
-
- txb->serial = rxrpc_get_next_serial(conn);
- whdr->serial = htonl(txb->serial);
- trace_rxrpc_tx_ack(call->debug_id, txb->serial,
+ trace_rxrpc_tx_ack(call->debug_id, serial,
ntohl(ack->firstPacket),
ntohl(ack->serial), ack->reason, ack->nAcks,
- txb->ack_rwind);
+ ntohl(trailer->rwind), why);
rxrpc_inc_stat(call->rxnet, stat_tx_ack_send);
- iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, txb->len);
- rxrpc_local_dont_fragment(conn->local, false);
- ret = do_udp_sendmsg(conn->local->socket, &msg, txb->len);
+ iov_iter_kvec(&msg.msg_iter, WRITE, kv, nr_kv, len);
+ rxrpc_local_dont_fragment(conn->local, why == rxrpc_propose_ack_ping_for_mtu_probe);
+
+ ret = do_udp_sendmsg(conn->local->socket, &msg, len);
call->peer->last_tx_at = ktime_get_seconds();
if (ret < 0) {
- trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret,
+ trace_rxrpc_tx_fail(call->debug_id, serial, ret,
rxrpc_tx_point_call_ack);
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe &&
+ ret == -EMSGSIZE)
+ rxrpc_input_probe_for_pmtud(conn, serial, true);
} else {
trace_rxrpc_tx_packet(call->debug_id, whdr,
rxrpc_tx_point_call_ack);
- now = ktime_get_real();
- if (ack->reason == RXRPC_ACK_PING)
- rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_ping);
- if (txb->flags & RXRPC_REQUEST_ACK)
- call->peer->rtt_last_req = now;
- rxrpc_set_keepalive(call, now);
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+ call->peer->pmtud_pending = false;
+ call->peer->pmtud_probing = true;
+ call->conn->pmtud_probe = serial;
+ call->conn->pmtud_call = call->debug_id;
+ trace_rxrpc_pmtud_tx(call);
+ }
}
rxrpc_tx_backoff(call, ret);
}
@@ -230,31 +300,62 @@ static void rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
* Queue an ACK for immediate transmission.
*/
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
- rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why)
+ rxrpc_serial_t serial_to_ack, enum rxrpc_propose_ack_trace why)
{
- struct rxrpc_txbuf *txb;
+ struct kvec *kv = call->local->kvec;
+ rxrpc_serial_t ack_serial;
+ size_t len;
+ int nr_kv;
if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
return;
rxrpc_inc_stat(call->rxnet, stat_tx_acks[ack_reason]);
- txb = rxrpc_alloc_ack_txbuf(call, call->ackr_wtop - call->ackr_window);
- if (!txb) {
+ nr_kv = rxrpc_alloc_ack(call, call->ackr_wtop - call->ackr_window);
+ if (nr_kv < 0) {
kleave(" = -ENOMEM");
return;
}
- txb->ack_why = why;
+ nr_kv = rxrpc_fill_out_ack(call, nr_kv, ack_reason, serial_to_ack, &ack_serial);
+ len = kv[0].iov_len;
+ len += kv[1].iov_len;
+ len += kv[2].iov_len;
+
+ /* Extend a path MTU probe ACK. */
+ if (why == rxrpc_propose_ack_ping_for_mtu_probe) {
+ size_t probe_mtu = call->peer->pmtud_trial + sizeof(struct rxrpc_wire_header);
+
+ if (len > probe_mtu)
+ goto skip;
+ while (len < probe_mtu) {
+ size_t part = umin(probe_mtu - len, PAGE_SIZE);
+
+ kv[nr_kv].iov_base = page_address(ZERO_PAGE(0));
+ kv[nr_kv].iov_len = part;
+ len += part;
+ nr_kv++;
+ }
+ }
- rxrpc_fill_out_ack(call, txb, ack_reason, serial);
call->ackr_nr_unacked = 0;
atomic_set(&call->ackr_nr_consumed, 0);
clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags);
- trace_rxrpc_send_ack(call, why, ack_reason, serial);
- rxrpc_send_ack_packet(call, txb);
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_ack_tx);
+ trace_rxrpc_send_ack(call, why, ack_reason, ack_serial);
+ rxrpc_send_ack_packet(call, nr_kv, len, ack_serial, why);
+skip:
+ rxrpc_free_ack(call);
+}
+
+/*
+ * Send an ACK probe for path MTU discovery.
+ */
+void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call)
+{
+ rxrpc_send_ACK(call, RXRPC_ACK_PING, 0,
+ rxrpc_propose_ack_ping_for_mtu_probe);
}
/*
@@ -324,14 +425,21 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
/*
* Prepare a (sub)packet for transmission.
*/
-static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_txbuf *txb,
- rxrpc_serial_t serial)
+static size_t rxrpc_prepare_data_subpacket(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req,
+ struct rxrpc_txbuf *txb,
+ struct rxrpc_wire_header *whdr,
+ rxrpc_serial_t serial, int subpkt)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct rxrpc_jumbo_header *jumbo = txb->data - sizeof(*jumbo);
enum rxrpc_req_ack_trace why;
struct rxrpc_connection *conn = call->conn;
+ struct kvec *kv = &call->local->kvec[1 + subpkt];
+ size_t len = txb->pkt_len;
+ bool last;
+ u8 flags;
- _enter("%x,{%d}", txb->seq, txb->len);
+ _enter("%x,%zd", txb->seq, len);
txb->serial = serial;
@@ -339,6 +447,15 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t
txb->seq == 1)
whdr->userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE;
+ txb->flags &= ~RXRPC_REQUEST_ACK;
+ flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
+ last = txb->flags & RXRPC_LAST_PACKET;
+
+ if (subpkt < req->n - 1) {
+ len = RXRPC_JUMBO_DATALEN;
+ goto dont_set_request_ack;
+ }
+
/* If our RTT cache needs working on, request an ACK. Also request
* ACKs if a DATA packet appears to have been lost.
*
@@ -346,113 +463,208 @@ static void rxrpc_prepare_data_subpacket(struct rxrpc_call *call, struct rxrpc_t
* service call, lest OpenAFS incorrectly send us an ACK with some
* soft-ACKs in it and then never follow up with a proper hard ACK.
*/
- if (txb->flags & RXRPC_REQUEST_ACK)
- why = rxrpc_reqack_already_on;
- else if ((txb->flags & RXRPC_LAST_PACKET) && rxrpc_sending_to_client(txb))
+ if (last && rxrpc_sending_to_client(txb))
why = rxrpc_reqack_no_srv_last;
else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
why = rxrpc_reqack_ack_lost;
else if (txb->flags & RXRPC_TXBUF_RESENT)
why = rxrpc_reqack_retrans;
- else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2)
+ else if (call->cong_ca_state == RXRPC_CA_SLOW_START && call->cong_cwnd <= RXRPC_MIN_CWND)
why = rxrpc_reqack_slow_start;
else if (call->tx_winsize <= 2)
why = rxrpc_reqack_small_txwin;
- else if (call->peer->rtt_count < 3 && txb->seq & 1)
+ else if (call->rtt_count < 3)
why = rxrpc_reqack_more_rtt;
- else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))
+ else if (ktime_before(ktime_add_ms(call->rtt_last_req, 1000), ktime_get_real()))
why = rxrpc_reqack_old_rtt;
+ else if (!last && !after(READ_ONCE(call->send_top), txb->seq))
+ why = rxrpc_reqack_app_stall;
else
goto dont_set_request_ack;
rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]);
trace_rxrpc_req_ack(call->debug_id, txb->seq, why);
- if (why != rxrpc_reqack_no_srv_last)
- txb->flags |= RXRPC_REQUEST_ACK;
+ if (why != rxrpc_reqack_no_srv_last) {
+ flags |= RXRPC_REQUEST_ACK;
+ trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_data, -1, serial);
+ call->rtt_last_req = req->now;
+ }
dont_set_request_ack:
- whdr->flags = txb->flags & RXRPC_TXBUF_WIRE_FLAGS;
- whdr->serial = htonl(txb->serial);
- whdr->cksum = txb->cksum;
+ /* There's a jumbo header prepended to the data if we need it. */
+ if (subpkt < req->n - 1)
+ flags |= RXRPC_JUMBO_PACKET;
+ else
+ flags &= ~RXRPC_JUMBO_PACKET;
+ if (subpkt == 0) {
+ whdr->flags = flags;
+ whdr->cksum = txb->cksum;
+ kv->iov_base = txb->data;
+ } else {
+ jumbo->flags = flags;
+ jumbo->pad = 0;
+ jumbo->cksum = txb->cksum;
+ kv->iov_base = jumbo;
+ len += sizeof(*jumbo);
+ }
- trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags, false);
+ trace_rxrpc_tx_data(call, txb->seq, txb->serial, flags, req->trace);
+ kv->iov_len = len;
+ return len;
}
/*
- * Prepare a packet for transmission.
+ * Prepare a transmission queue object for initial transmission. Returns the
+ * number of microseconds since the transmission queue base timestamp.
*/
-static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static unsigned int rxrpc_prepare_txqueue(struct rxrpc_txqueue *tq,
+ struct rxrpc_send_data_req *req)
{
- rxrpc_serial_t serial;
-
- /* Each transmission of a Tx packet needs a new serial number */
- serial = rxrpc_get_next_serial(call->conn);
-
- rxrpc_prepare_data_subpacket(call, txb, serial);
-
- return txb->len;
+ if (!tq)
+ return 0;
+ if (tq->xmit_ts_base == KTIME_MIN) {
+ tq->xmit_ts_base = req->now;
+ return 0;
+ }
+ return ktime_to_us(ktime_sub(req->now, tq->xmit_ts_base));
}
/*
- * Set timeouts after transmitting a packet.
+ * Prepare a (jumbo) packet for transmission.
*/
-static void rxrpc_tstamp_data_packets(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+static size_t rxrpc_prepare_data_packet(struct rxrpc_call *call,
+ struct rxrpc_send_data_req *req,
+ struct rxrpc_wire_header *whdr)
{
- ktime_t now = ktime_get_real();
- bool ack_requested = txb->flags & RXRPC_REQUEST_ACK;
+ struct rxrpc_txqueue *tq = req->tq;
+ rxrpc_serial_t serial;
+ unsigned int xmit_ts;
+ rxrpc_seq_t seq = req->seq;
+ size_t len = 0;
+ bool start_tlp = false;
- call->tx_last_sent = now;
- txb->last_sent = now;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit);
- if (ack_requested) {
- rxrpc_begin_rtt_probe(call, txb->serial, now, rxrpc_rtt_tx_data);
+ /* Each transmission of a Tx packet needs a new serial number */
+ serial = rxrpc_get_next_serials(call->conn, req->n);
+
+ whdr->epoch = htonl(call->conn->proto.epoch);
+ whdr->cid = htonl(call->cid);
+ whdr->callNumber = htonl(call->call_id);
+ whdr->seq = htonl(seq);
+ whdr->serial = htonl(serial);
+ whdr->type = RXRPC_PACKET_TYPE_DATA;
+ whdr->flags = 0;
+ whdr->userStatus = 0;
+ whdr->securityIndex = call->security_ix;
+ whdr->_rsvd = 0;
+ whdr->serviceId = htons(call->conn->service_id);
+
+ call->tx_last_serial = serial + req->n - 1;
+ call->tx_last_sent = req->now;
+ xmit_ts = rxrpc_prepare_txqueue(tq, req);
+ prefetch(tq->next);
+
+ for (int i = 0;;) {
+ int ix = seq & RXRPC_TXQ_MASK;
+ struct rxrpc_txbuf *txb = tq->bufs[seq & RXRPC_TXQ_MASK];
+
+ _debug("prep[%u] tq=%x q=%x", i, tq->qbase, seq);
+
+ /* Record (re-)transmission for RACK [RFC8985 6.1]. */
+ if (__test_and_clear_bit(ix, &tq->segment_lost))
+ call->tx_nr_lost--;
+ if (req->retrans) {
+ __set_bit(ix, &tq->ever_retransmitted);
+ __set_bit(ix, &tq->segment_retransmitted);
+ call->tx_nr_resent++;
+ } else {
+ call->tx_nr_sent++;
+ start_tlp = true;
+ }
+ tq->segment_xmit_ts[ix] = xmit_ts;
+ tq->segment_serial[ix] = serial;
+ if (i + 1 == req->n)
+ /* Only sample the last subpacket in a jumbo. */
+ __set_bit(ix, &tq->rtt_samples);
+ len += rxrpc_prepare_data_subpacket(call, req, txb, whdr, serial, i);
+ serial++;
+ seq++;
+ i++;
+ if (i >= req->n)
+ break;
+ if (!(seq & RXRPC_TXQ_MASK)) {
+ tq = tq->next;
+ trace_rxrpc_tq(call, tq, seq, rxrpc_tq_transmit_advance);
+ xmit_ts = rxrpc_prepare_txqueue(tq, req);
+ }
+ }
- call->peer->rtt_last_req = now;
- if (call->peer->rtt_count > 1) {
- ktime_t delay = rxrpc_get_rto_backoff(call->peer, false);
+ /* Set timeouts */
+ if (req->tlp_probe) {
+ /* Sending TLP loss probe [RFC8985 7.3]. */
+ call->tlp_serial = serial - 1;
+ call->tlp_seq = seq - 1;
+ } else if (start_tlp) {
+ /* Schedule TLP loss probe [RFC8985 7.2]. */
+ ktime_t pto;
+
+ if (!test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+ /* The first packet may take longer to elicit a response. */
+ pto = NSEC_PER_SEC;
+ else
+ pto = rxrpc_tlp_calc_pto(call, req->now);
- call->ack_lost_at = ktime_add(now, delay);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_lost_ack);
- }
+ call->rack_timer_mode = RXRPC_CALL_RACKTIMER_TLP_PTO;
+ call->rack_timo_at = ktime_add(req->now, pto);
+ trace_rxrpc_rack_timer(call, pto, false);
+ trace_rxrpc_timer_set(call, pto, rxrpc_timer_trace_rack_tlp_pto);
}
if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) {
ktime_t delay = ms_to_ktime(READ_ONCE(call->next_rx_timo));
- call->expect_rx_by = ktime_add(now, delay);
+ call->expect_rx_by = ktime_add(req->now, delay);
trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_expect_rx);
}
- rxrpc_set_keepalive(call, now);
+ rxrpc_set_keepalive(call, req->now);
+ page_frag_free(whdr);
+ return len;
}
/*
- * send a packet through the transport endpoint
+ * Send one or more packets through the transport endpoint
*/
-static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
+void rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_send_data_req *req)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
+ struct rxrpc_wire_header *whdr;
struct rxrpc_connection *conn = call->conn;
enum rxrpc_tx_point frag;
+ struct rxrpc_txqueue *tq = req->tq;
+ struct rxrpc_txbuf *txb;
struct msghdr msg;
- size_t len;
- int ret;
+ rxrpc_seq_t seq = req->seq;
+ size_t len = sizeof(*whdr);
+ bool new_call = test_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags);
+ int ret, stat_ix;
- _enter("%x,{%d}", txb->seq, txb->len);
+ _enter("%x,%x-%x", tq->qbase, seq, seq + req->n - 1);
- len = rxrpc_prepare_data_packet(call, txb);
+ whdr = page_frag_alloc(&call->local->tx_alloc, sizeof(*whdr), GFP_NOFS);
+ if (!whdr)
+ return; /* Drop the packet if no memory. */
- if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
- static int lose;
- if ((lose++ & 7) == 7) {
- ret = 0;
- trace_rxrpc_tx_data(call, txb->seq, txb->serial,
- txb->flags, true);
- goto done;
- }
- }
+ call->local->kvec[0].iov_base = whdr;
+ call->local->kvec[0].iov_len = sizeof(*whdr);
- iov_iter_kvec(&msg.msg_iter, WRITE, txb->kvec, txb->nr_kvec, len);
+ stat_ix = umin(req->n, ARRAY_SIZE(call->rxnet->stat_tx_jumbo)) - 1;
+ atomic_inc(&call->rxnet->stat_tx_jumbo[stat_ix]);
+
+ len += rxrpc_prepare_data_packet(call, req, whdr);
+ txb = tq->bufs[seq & RXRPC_TXQ_MASK];
+
+ iov_iter_kvec(&msg.msg_iter, WRITE, call->local->kvec, 1 + req->n, len);
msg.msg_name = &call->peer->srx.transport;
msg.msg_namelen = call->peer->srx.transport_len;
@@ -460,16 +672,11 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
msg.msg_controllen = 0;
msg.msg_flags = MSG_SPLICE_PAGES;
- /* Track what we've attempted to transmit at least once so that the
- * retransmission algorithm doesn't try to resend what we haven't sent
- * yet.
+ /* Send the packet with the don't fragment bit set unless we think it's
+ * too big or this is a retransmission.
*/
- if (txb->seq == call->tx_transmitted + 1)
- call->tx_transmitted = txb->seq;
-
- /* send the packet with the don't fragment bit set if we currently
- * think it's small enough */
- if (txb->len >= call->peer->maxdata) {
+ if (seq == call->tx_transmitted + 1 &&
+ len >= sizeof(struct rxrpc_wire_header) + call->peer->max_data) {
rxrpc_local_dont_fragment(conn->local, false);
frag = rxrpc_tx_point_call_data_frag;
} else {
@@ -477,7 +684,25 @@ static int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *t
frag = rxrpc_tx_point_call_data_nofrag;
}
-retry:
+ /* Track what we've attempted to transmit at least once so that the
+ * retransmission algorithm doesn't try to resend what we haven't sent
+ * yet.
+ */
+ if (seq == call->tx_transmitted + 1)
+ call->tx_transmitted = seq + req->n - 1;
+
+ if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
+ static int lose;
+
+ if ((lose++ & 7) == 7) {
+ ret = 0;
+ trace_rxrpc_tx_data(call, txb->seq, txb->serial, txb->flags,
+ rxrpc_txdata_inject_loss);
+ conn->peer->last_tx_at = ktime_get_seconds();
+ goto done;
+ }
+ }
+
/* send the packet by UDP
* - returns -EMSGSIZE if UDP would have to fragment the packet
* to go out of the interface
@@ -488,7 +713,11 @@ retry:
ret = do_udp_sendmsg(conn->local->socket, &msg, len);
conn->peer->last_tx_at = ktime_get_seconds();
- if (ret < 0) {
+ if (ret == -EMSGSIZE) {
+ rxrpc_inc_stat(call->rxnet, stat_tx_data_send_msgsize);
+ trace_rxrpc_tx_packet(call->debug_id, whdr, frag);
+ ret = 0;
+ } else if (ret < 0) {
rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
trace_rxrpc_tx_fail(call->debug_id, txb->serial, ret, frag);
} else {
@@ -496,28 +725,23 @@ retry:
}
rxrpc_tx_backoff(call, ret);
- if (ret == -EMSGSIZE && frag == rxrpc_tx_point_call_data_frag) {
- rxrpc_local_dont_fragment(conn->local, false);
- frag = rxrpc_tx_point_call_data_frag;
- goto retry;
- }
-done:
- if (ret >= 0) {
- rxrpc_tstamp_data_packets(call, txb);
- } else {
- /* Cancel the call if the initial transmission fails,
- * particularly if that's due to network routing issues that
- * aren't going away anytime soon. The layer above can arrange
- * the retransmission.
+ if (ret < 0) {
+ /* Cancel the call if the initial transmission fails or if we hit
+ * network routing issues that aren't going away
+ * anytime soon. The layer above can arrange the
+ * retransmission.
*/
- if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
+ if (new_call ||
+ ret == -ENETUNREACH ||
+ ret == -EHOSTUNREACH ||
+ ret == -ECONNREFUSED)
rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
RX_USER_ABORT, ret);
}
- _leave(" = %d [%u]", ret, call->peer->maxdata);
- return ret;
+done:
+ _leave(" = %d [%u]", ret, call->peer->max_data);
}
/*
@@ -692,41 +916,3 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
peer->last_tx_at = ktime_get_seconds();
_leave("");
}
-
-/*
- * Schedule an instant Tx resend.
- */
-static inline void rxrpc_instant_resend(struct rxrpc_call *call,
- struct rxrpc_txbuf *txb)
-{
- if (!__rxrpc_call_is_complete(call))
- kdebug("resend");
-}
-
-/*
- * Transmit one packet.
- */
-void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
-{
- int ret;
-
- ret = rxrpc_send_data_packet(call, txb);
- if (ret < 0) {
- switch (ret) {
- case -ENETUNREACH:
- case -EHOSTUNREACH:
- case -ECONNREFUSED:
- rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
- 0, ret);
- break;
- default:
- _debug("need instant resend %d", ret);
- rxrpc_instant_resend(call, txb);
- }
- } else {
- ktime_t delay = ns_to_ktime(call->peer->rto_us * NSEC_PER_USEC);
-
- call->resend_at = ktime_add(ktime_get_real(), delay);
- trace_rxrpc_timer_set(call, delay, rxrpc_timer_trace_resend_tx);
- }
-}
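For reference, the datagram the DATA transmit path above assembles is a single wire header followed by the subpackets, with each subpacket after the first prefixed by a 4-byte jumbo header and every non-terminal subpacket carrying exactly 1412 bytes of data. The sketch below (illustrative userspace arithmetic using the sizes documented in net/rxrpc/protocol.h, not kernel code) just totals that layout.

/* Userspace sketch of the on-the-wire length of a jumbo DATA packet with
 * n subpackets, the last of which carries last_data_len bytes.
 */
#include <stdio.h>

#define WIRE_HDR_LEN	28
#define JUMBO_HDR_LEN	4
#define JUMBO_DATALEN	1412

static size_t jumbo_len(unsigned int n, size_t last_data_len)
{
	size_t len = WIRE_HDR_LEN;
	unsigned int i;

	for (i = 0; i < n; i++) {
		if (i > 0)
			len += JUMBO_HDR_LEN;	/* later subpackets get a jumbo header */
		len += (i == n - 1) ? last_data_len : JUMBO_DATALEN;
	}
	return len;
}

int main(void)
{
	/* e.g. 4 subpackets, the last one holding 200 bytes of data */
	printf("udp payload = %zu bytes\n", jumbo_len(4, 200));
	return 0;
}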
diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c
index 5d0842efde69..7f4729234957 100644
--- a/net/rxrpc/peer_event.c
+++ b/net/rxrpc/peer_event.c
@@ -102,6 +102,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local,
*/
static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
{
+ unsigned int max_data;
+
/* wind down the local interface MTU */
if (mtu > 0 && peer->if_mtu == 65535 && mtu < peer->if_mtu)
peer->if_mtu = mtu;
@@ -120,11 +122,15 @@ static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu)
}
}
- if (mtu < peer->mtu) {
- spin_lock(&peer->lock);
- peer->mtu = mtu;
- peer->maxdata = peer->mtu - peer->hdrsize;
- spin_unlock(&peer->lock);
+ max_data = max_t(int, mtu - peer->hdrsize, 500);
+ if (max_data < peer->max_data) {
+ if (peer->pmtud_good > max_data)
+ peer->pmtud_good = max_data;
+ if (peer->pmtud_bad > max_data + 1)
+ peer->pmtud_bad = max_data + 1;
+
+ trace_rxrpc_pmtud_reduce(peer, 0, max_data, rxrpc_pmtud_reduce_icmp);
+ peer->max_data = max_data;
}
}
@@ -161,6 +167,13 @@ void rxrpc_input_error(struct rxrpc_local *local, struct sk_buff *skb)
goto out;
}
+ if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6 &&
+ serr->ee.ee_type == ICMPV6_PKT_TOOBIG &&
+ serr->ee.ee_code == 0)) {
+ rxrpc_adjust_mtu(peer, serr->ee.ee_info);
+ goto out;
+ }
+
rxrpc_store_error(peer, skb);
out:
rxrpc_put_peer(peer, rxrpc_peer_put_input_error);
@@ -205,23 +218,23 @@ static void rxrpc_distribute_error(struct rxrpc_peer *peer, struct sk_buff *skb,
struct rxrpc_call *call;
HLIST_HEAD(error_targets);
- spin_lock(&peer->lock);
+ spin_lock_irq(&peer->lock);
hlist_move_list(&peer->error_targets, &error_targets);
while (!hlist_empty(&error_targets)) {
call = hlist_entry(error_targets.first,
struct rxrpc_call, error_link);
hlist_del_init(&call->error_link);
- spin_unlock(&peer->lock);
+ spin_unlock_irq(&peer->lock);
rxrpc_see_call(call, rxrpc_call_see_distribute_error);
rxrpc_set_call_completion(call, compl, 0, -err);
- rxrpc_input_call_event(call, skb);
+ rxrpc_input_call_event(call);
- spin_lock(&peer->lock);
+ spin_lock_irq(&peer->lock);
}
- spin_unlock(&peer->lock);
+ spin_unlock_irq(&peer->lock);
}
/*
@@ -347,3 +360,84 @@ void rxrpc_peer_keepalive_worker(struct work_struct *work)
_leave("");
}
+
+/*
+ * Do path MTU probing.
+ */
+void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
+ bool sendmsg_fail)
+{
+ struct rxrpc_peer *peer = conn->peer;
+ unsigned int max_data = peer->max_data;
+ int good, trial, bad, jumbo;
+
+ good = peer->pmtud_good;
+ trial = peer->pmtud_trial;
+ bad = peer->pmtud_bad;
+ if (good >= bad - 1) {
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = false;
+ return;
+ }
+
+ if (!peer->pmtud_probing)
+ goto send_probe;
+
+ if (sendmsg_fail || after(acked_serial, conn->pmtud_probe)) {
+ /* Retry a lost probe. */
+ if (!peer->pmtud_lost) {
+ trace_rxrpc_pmtud_lost(conn, acked_serial);
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = true;
+ goto send_probe;
+ }
+
+ /* The probed size didn't seem to get through. */
+ bad = trial;
+ peer->pmtud_bad = bad;
+ if (bad <= max_data)
+ max_data = bad - 1;
+ } else {
+ /* It did get through. */
+ good = trial;
+ peer->pmtud_good = good;
+ if (good > max_data)
+ max_data = good;
+ }
+
+ max_data = umin(max_data, peer->ackr_max_data);
+ if (max_data != peer->max_data)
+ peer->max_data = max_data;
+
+ jumbo = max_data + sizeof(struct rxrpc_jumbo_header);
+ jumbo /= RXRPC_JUMBO_SUBPKTLEN;
+ peer->pmtud_jumbo = jumbo;
+
+ trace_rxrpc_pmtud_rx(conn, acked_serial);
+ conn->pmtud_probe = 0;
+ peer->pmtud_lost = false;
+
+ if (good < RXRPC_JUMBO(2) && bad > RXRPC_JUMBO(2))
+ trial = RXRPC_JUMBO(2);
+ else if (good < RXRPC_JUMBO(4) && bad > RXRPC_JUMBO(4))
+ trial = RXRPC_JUMBO(4);
+ else if (good < RXRPC_JUMBO(3) && bad > RXRPC_JUMBO(3))
+ trial = RXRPC_JUMBO(3);
+ else if (good < RXRPC_JUMBO(6) && bad > RXRPC_JUMBO(6))
+ trial = RXRPC_JUMBO(6);
+ else if (good < RXRPC_JUMBO(5) && bad > RXRPC_JUMBO(5))
+ trial = RXRPC_JUMBO(5);
+ else if (good < RXRPC_JUMBO(8) && bad > RXRPC_JUMBO(8))
+ trial = RXRPC_JUMBO(8);
+ else if (good < RXRPC_JUMBO(7) && bad > RXRPC_JUMBO(7))
+ trial = RXRPC_JUMBO(7);
+ else
+ trial = (good + bad) / 2;
+ peer->pmtud_trial = trial;
+
+ if (good >= bad)
+ return;
+
+send_probe:
+ peer->pmtud_pending = true;
+}
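The trial-size selection at the end of rxrpc_input_probe_for_pmtud() keeps a good/bad bracket and, instead of a plain bisection, prefers sizes corresponding to whole numbers of jumbo subpackets in the order 2, 4, 3, 6, 5, 8, 7. A userspace restatement of just that step is sketched below; the helper name and the bracket values in main() are invented for illustration.

/* Userspace sketch of the probe-size choice above: prefer sizes equal to
 * RXRPC_JUMBO(N) for N = 2,4,3,6,5,8,7 when they fall strictly inside the
 * (good, bad) bracket, otherwise bisect.  Sizes from protocol.h.
 */
#include <stdio.h>

#define WIRE_HDR_LEN	28
#define JUMBO_DATALEN	1412
#define JUMBO_SUBPKTLEN	1416
#define RXRPC_JUMBO(N)	(WIRE_HDR_LEN + JUMBO_DATALEN + ((N) - 1) * JUMBO_SUBPKTLEN)

static int pick_pmtud_trial(int good, int bad)
{
	static const int prefs[] = { 2, 4, 3, 6, 5, 8, 7 };
	unsigned int i;

	for (i = 0; i < sizeof(prefs) / sizeof(prefs[0]); i++) {
		int size = RXRPC_JUMBO(prefs[i]);

		if (good < size && bad > size)
			return size;
	}
	return (good + bad) / 2;	/* fall back to bisection */
}

int main(void)
{
	printf("trial = %d\n", pick_pmtud_trial(1444, 8900));
	return 0;
}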
diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
index 956fc7ea4b73..56e09d161a97 100644
--- a/net/rxrpc/peer_object.c
+++ b/net/rxrpc/peer_object.c
@@ -162,6 +162,11 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
#endif
peer->if_mtu = 1500;
+ if (peer->max_data < peer->if_mtu - peer->hdrsize) {
+ trace_rxrpc_pmtud_reduce(peer, 0, peer->if_mtu - peer->hdrsize,
+ rxrpc_pmtud_reduce_route);
+ peer->max_data = peer->if_mtu - peer->hdrsize;
+ }
memset(&fl, 0, sizeof(fl));
switch (peer->srx.transport.family) {
@@ -199,8 +204,16 @@ static void rxrpc_assess_MTU_size(struct rxrpc_local *local,
}
peer->if_mtu = dst_mtu(dst);
+ peer->hdrsize += dst->header_len + dst->trailer_len;
+ peer->tx_seg_max = dst->dev->gso_max_segs;
dst_release(dst);
+ peer->max_data = umin(RXRPC_JUMBO(1), peer->if_mtu - peer->hdrsize);
+ peer->pmtud_good = 500;
+ peer->pmtud_bad = peer->if_mtu - peer->hdrsize + 1;
+ peer->pmtud_trial = umin(peer->max_data, peer->pmtud_bad - 1);
+ peer->pmtud_pending = true;
+
_leave(" [if_mtu %u]", peer->if_mtu);
}
@@ -222,11 +235,8 @@ struct rxrpc_peer *rxrpc_alloc_peer(struct rxrpc_local *local, gfp_t gfp,
peer->service_conns = RB_ROOT;
seqlock_init(&peer->service_conn_lock);
spin_lock_init(&peer->lock);
- spin_lock_init(&peer->rtt_input_lock);
peer->debug_id = atomic_inc_return(&rxrpc_debug_id);
-
- rxrpc_peer_init_rtt(peer);
-
+ peer->recent_srtt_us = UINT_MAX;
peer->cong_ssthresh = RXRPC_TX_MAX_WINDOW;
trace_rxrpc_peer(peer->debug_id, 1, why);
}
@@ -242,9 +252,7 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
unsigned long hash_key)
{
peer->hash_key = hash_key;
- rxrpc_assess_MTU_size(local, peer);
- peer->mtu = peer->if_mtu;
- peer->rtt_last_req = ktime_get_real();
+
switch (peer->srx.transport.family) {
case AF_INET:
@@ -268,7 +276,9 @@ static void rxrpc_init_peer(struct rxrpc_local *local, struct rxrpc_peer *peer,
}
peer->hdrsize += sizeof(struct rxrpc_wire_header);
- peer->maxdata = peer->mtu - peer->hdrsize;
+ peer->max_data = peer->if_mtu - peer->hdrsize;
+
+ rxrpc_assess_MTU_size(local, peer);
}
/*
@@ -304,6 +314,7 @@ static void rxrpc_free_peer(struct rxrpc_peer *peer)
* Set up a new incoming peer. There shouldn't be any other matching peers
* since we've already done a search in the list from the non-reentrant context
* (the data_ready handler) that is the only place we can add new peers.
+ * Called with interrupts disabled.
*/
void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer)
{
@@ -313,10 +324,10 @@ void rxrpc_new_incoming_peer(struct rxrpc_local *local, struct rxrpc_peer *peer)
hash_key = rxrpc_peer_hash_key(local, &peer->srx);
rxrpc_init_peer(local, peer, hash_key);
- spin_lock_bh(&rxnet->peer_hash_lock);
+ spin_lock(&rxnet->peer_hash_lock);
hash_add_rcu(rxnet->peer_hash, &peer->hash_link, hash_key);
list_add_tail(&peer->keepalive_link, &rxnet->peer_keepalive_new);
- spin_unlock_bh(&rxnet->peer_hash_lock);
+ spin_unlock(&rxnet->peer_hash_lock);
}
/*
@@ -479,7 +490,7 @@ EXPORT_SYMBOL(rxrpc_kernel_get_call_peer);
*/
unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer)
{
- return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX;
+ return READ_ONCE(peer->recent_srtt_us);
}
EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index 263a2251e3d2..d803562ca0ac 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -52,7 +52,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_call *call;
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
enum rxrpc_call_state state;
- rxrpc_seq_t acks_hard_ack;
+ rxrpc_seq_t tx_bottom;
char lbuff[50], rbuff[50];
long timeout = 0;
@@ -79,7 +79,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
if (state != RXRPC_CALL_SERVER_PREALLOC)
timeout = ktime_ms_delta(READ_ONCE(call->expect_rx_by), ktime_get_real());
- acks_hard_ack = READ_ONCE(call->acks_hard_ack);
+ tx_bottom = READ_ONCE(call->tx_bottom);
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
" %-8.8s %08x %08x %08x %02x %08x %02x %08x %02x %06lx\n",
@@ -93,7 +93,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
rxrpc_call_states[state],
call->abort_code,
call->debug_id,
- acks_hard_ack, READ_ONCE(call->tx_top) - acks_hard_ack,
+ tx_bottom, READ_ONCE(call->tx_top) - tx_bottom,
call->ackr_window, call->ackr_wtop - call->ackr_window,
call->rx_serial,
call->cong_cwnd,
@@ -283,9 +283,7 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
- "Proto Local "
- " Remote "
- " Use SST MTU LastUse RTT RTO\n"
+ "Proto Local Remote Use SST Maxd LastUse RTT RTO\n"
);
return 0;
}
@@ -298,16 +296,15 @@ static int rxrpc_peer_seq_show(struct seq_file *seq, void *v)
now = ktime_get_seconds();
seq_printf(seq,
- "UDP %-47.47s %-47.47s %3u"
- " %3u %5u %6llus %8u %8u\n",
+ "UDP %-47.47s %-47.47s %3u %4u %5u %6llus %8d %8d\n",
lbuff,
rbuff,
refcount_read(&peer->ref),
peer->cong_ssthresh,
- peer->mtu,
+ peer->max_data,
now - peer->last_tx_at,
- peer->srtt_us >> 3,
- peer->rto_us);
+ READ_ONCE(peer->recent_srtt_us),
+ READ_ONCE(peer->recent_rto_us));
return 0;
}
@@ -476,10 +473,11 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
struct rxrpc_net *rxnet = rxrpc_net(seq_file_single_net(seq));
seq_printf(seq,
- "Data : send=%u sendf=%u fail=%u\n",
+ "Data : send=%u sendf=%u fail=%u emsz=%u\n",
atomic_read(&rxnet->stat_tx_data_send),
atomic_read(&rxnet->stat_tx_data_send_frag),
- atomic_read(&rxnet->stat_tx_data_send_fail));
+ atomic_read(&rxnet->stat_tx_data_send_fail),
+ atomic_read(&rxnet->stat_tx_data_send_msgsize));
seq_printf(seq,
"Data-Tx : nr=%u retrans=%u uf=%u cwr=%u\n",
atomic_read(&rxnet->stat_tx_data),
@@ -508,7 +506,7 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_DELAY]),
atomic_read(&rxnet->stat_tx_acks[RXRPC_ACK_IDLE]));
seq_printf(seq,
- "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u\n",
+ "Ack-Rx : req=%u dup=%u oos=%u exw=%u nos=%u png=%u prs=%u dly=%u idl=%u z=%u\n",
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_REQUESTED]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DUPLICATE]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_OUT_OF_SEQUENCE]),
@@ -517,13 +515,14 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_PING_RESPONSE]),
atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_DELAY]),
- atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]));
+ atomic_read(&rxnet->stat_rx_acks[RXRPC_ACK_IDLE]),
+ atomic_read(&rxnet->stat_rx_acks[0]));
seq_printf(seq,
- "Why-Req-A: acklost=%u already=%u mrtt=%u ortt=%u\n",
+ "Why-Req-A: acklost=%u mrtt=%u ortt=%u stall=%u\n",
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_ack_lost]),
- atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_already_on]),
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_more_rtt]),
- atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]));
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_old_rtt]),
+ atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_app_stall]));
seq_printf(seq,
"Why-Req-A: nolast=%u retx=%u slows=%u smtxw=%u\n",
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_no_srv_last]),
@@ -531,6 +530,30 @@ int rxrpc_stats_show(struct seq_file *seq, void *v)
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_slow_start]),
atomic_read(&rxnet->stat_why_req_ack[rxrpc_reqack_small_txwin]));
seq_printf(seq,
+ "Jumbo-Tx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",
+ atomic_read(&rxnet->stat_tx_jumbo[0]),
+ atomic_read(&rxnet->stat_tx_jumbo[1]),
+ atomic_read(&rxnet->stat_tx_jumbo[2]),
+ atomic_read(&rxnet->stat_tx_jumbo[3]),
+ atomic_read(&rxnet->stat_tx_jumbo[4]),
+ atomic_read(&rxnet->stat_tx_jumbo[5]),
+ atomic_read(&rxnet->stat_tx_jumbo[6]),
+ atomic_read(&rxnet->stat_tx_jumbo[7]),
+ atomic_read(&rxnet->stat_tx_jumbo[8]),
+ atomic_read(&rxnet->stat_tx_jumbo[9]));
+ seq_printf(seq,
+ "Jumbo-Rx : %u,%u,%u,%u,%u,%u,%u,%u,%u,%u\n",
+ atomic_read(&rxnet->stat_rx_jumbo[0]),
+ atomic_read(&rxnet->stat_rx_jumbo[1]),
+ atomic_read(&rxnet->stat_rx_jumbo[2]),
+ atomic_read(&rxnet->stat_rx_jumbo[3]),
+ atomic_read(&rxnet->stat_rx_jumbo[4]),
+ atomic_read(&rxnet->stat_rx_jumbo[5]),
+ atomic_read(&rxnet->stat_rx_jumbo[6]),
+ atomic_read(&rxnet->stat_rx_jumbo[7]),
+ atomic_read(&rxnet->stat_rx_jumbo[8]),
+ atomic_read(&rxnet->stat_rx_jumbo[9]));
+ seq_printf(seq,
"Buffers : txb=%u rxb=%u\n",
atomic_read(&rxrpc_nr_txbuf),
atomic_read(&rxrpc_n_rx_skbs));
@@ -567,6 +590,8 @@ int rxrpc_stats_clear(struct file *file, char *buf, size_t size)
atomic_set(&rxnet->stat_tx_ack_skip, 0);
memset(&rxnet->stat_tx_acks, 0, sizeof(rxnet->stat_tx_acks));
memset(&rxnet->stat_rx_acks, 0, sizeof(rxnet->stat_rx_acks));
+ memset(&rxnet->stat_tx_jumbo, 0, sizeof(rxnet->stat_tx_jumbo));
+ memset(&rxnet->stat_rx_jumbo, 0, sizeof(rxnet->stat_rx_jumbo));
memset(&rxnet->stat_why_req_ack, 0, sizeof(rxnet->stat_why_req_ack));
diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h
index 4fe6b4d20ada..42f70e4636f8 100644
--- a/net/rxrpc/protocol.h
+++ b/net/rxrpc/protocol.h
@@ -92,11 +92,16 @@ struct rxrpc_jumbo_header {
/*
* The maximum number of subpackets that can possibly fit in a UDP packet is:
*
- * ((max_IP - IP_hdr - UDP_hdr) / RXRPC_JUMBO_SUBPKTLEN) + 1
- * = ((65535 - 28 - 28) / 1416) + 1
- * = 46 non-terminal packets and 1 terminal packet.
+ * (max_UDP - wirehdr + jumbohdr) / (jumbohdr + 1412)
+ * = ((65535 - 28 + 4) / 1416)
+ * = 45 non-terminal packets and 1 terminal packet.
*/
-#define RXRPC_MAX_NR_JUMBO 47
+#define RXRPC_MAX_NR_JUMBO 46
+
+/* Size of a jumbo packet with N subpackets, excluding UDP+IP */
+#define RXRPC_JUMBO(N) ((int)sizeof(struct rxrpc_wire_header) + \
+ RXRPC_JUMBO_DATALEN + \
+ ((N) - 1) * RXRPC_JUMBO_SUBPKTLEN)
/*****************************************************************************/
/*
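A quick sanity check of the new macro against the figures in the comment above and the rxrpc_rx_mtu default of RXRPC_JUMBO(46) in misc.c: 46 subpackets come to 65160 bytes, which still fits inside a maximal UDP payload of 65507 bytes (65535 minus the IP and UDP headers), while 47 would not. Illustrative userspace arithmetic only:

/* Check that RXRPC_JUMBO(46) fits in a UDP datagram and RXRPC_JUMBO(47)
 * does not, using the sizes from the comment above.
 */
#include <stdio.h>

#define WIRE_HDR_LEN	28
#define JUMBO_DATALEN	1412
#define JUMBO_SUBPKTLEN	1416
#define RXRPC_JUMBO(N)	(WIRE_HDR_LEN + JUMBO_DATALEN + ((N) - 1) * JUMBO_SUBPKTLEN)

int main(void)
{
	printf("RXRPC_JUMBO(46) = %d (max UDP payload 65507)\n", RXRPC_JUMBO(46));
	printf("RXRPC_JUMBO(47) = %d\n", RXRPC_JUMBO(47));
	return 0;
}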
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index a482f88c5fc5..32cd5f1d541d 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -36,16 +36,16 @@ void rxrpc_notify_socket(struct rxrpc_call *call)
sk = &rx->sk;
if (rx && sk->sk_state < RXRPC_CLOSE) {
if (call->notify_rx) {
- spin_lock(&call->notify_lock);
+ spin_lock_irq(&call->notify_lock);
call->notify_rx(sk, call, call->user_call_ID);
- spin_unlock(&call->notify_lock);
+ spin_unlock_irq(&call->notify_lock);
} else {
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
if (list_empty(&call->recvmsg_link)) {
rxrpc_get_call(call, rxrpc_call_get_notify_socket);
list_add_tail(&call->recvmsg_link, &rx->recvmsg_q);
}
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
if (!sock_flag(sk, SOCK_DEAD)) {
_debug("call %ps", sk->sk_data_ready);
@@ -337,14 +337,14 @@ try_again:
* We also want to weed out calls that got requeued whilst we were
* shovelling data out.
*/
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
l = rx->recvmsg_q.next;
call = list_entry(l, struct rxrpc_call, recvmsg_link);
if (!rxrpc_call_is_complete(call) &&
skb_queue_empty(&call->recvmsg_queue)) {
list_del_init(&call->recvmsg_link);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
release_sock(&rx->sk);
trace_rxrpc_recvmsg(call->debug_id, rxrpc_recvmsg_unqueue, 0);
rxrpc_put_call(call, rxrpc_call_put_recvmsg);
@@ -355,7 +355,7 @@ try_again:
list_del_init(&call->recvmsg_link);
else
rxrpc_get_call(call, rxrpc_call_get_recvmsg);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
call_debug_id = call->debug_id;
trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_dequeue, 0);
@@ -445,9 +445,9 @@ error_unlock_call:
error_requeue_call:
if (!(flags & MSG_PEEK)) {
- spin_lock(&rx->recvmsg_lock);
+ spin_lock_irq(&rx->recvmsg_lock);
list_add(&call->recvmsg_link, &rx->recvmsg_q);
- spin_unlock(&rx->recvmsg_lock);
+ spin_unlock_irq(&rx->recvmsg_lock);
trace_rxrpc_recvmsg(call_debug_id, rxrpc_recvmsg_requeue, 0);
} else {
rxrpc_put_call(call, rxrpc_call_put_recvmsg);
diff --git a/net/rxrpc/rtt.c b/net/rxrpc/rtt.c
index cdab7b7d08a0..7474f88d7b18 100644
--- a/net/rxrpc/rtt.c
+++ b/net/rxrpc/rtt.c
@@ -12,22 +12,22 @@
#include "ar-internal.h"
#define RXRPC_RTO_MAX (120 * USEC_PER_SEC)
-#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * MSEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */
+#define RXRPC_TIMEOUT_INIT ((unsigned int)(1 * USEC_PER_SEC)) /* RFC6298 2.1 initial RTO value */
#define rxrpc_jiffies32 ((u32)jiffies) /* As rxrpc_jiffies32 */
-static u32 rxrpc_rto_min_us(struct rxrpc_peer *peer)
+static u32 rxrpc_rto_min_us(struct rxrpc_call *call)
{
return 200;
}
-static u32 __rxrpc_set_rto(const struct rxrpc_peer *peer)
+static u32 __rxrpc_set_rto(const struct rxrpc_call *call)
{
- return (peer->srtt_us >> 3) + peer->rttvar_us;
+ return (call->srtt_us >> 3) + call->rttvar_us;
}
static u32 rxrpc_bound_rto(u32 rto)
{
- return min(rto, RXRPC_RTO_MAX);
+ return clamp(200000, rto + 100000, RXRPC_RTO_MAX);
}
/*
@@ -40,10 +40,10 @@ static u32 rxrpc_bound_rto(u32 rto)
* To save cycles in the RFC 1323 implementation it was better to break
* it up into three procedures. -- erics
*/
-static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
+static void rxrpc_rtt_estimator(struct rxrpc_call *call, long sample_rtt_us)
{
long m = sample_rtt_us; /* RTT */
- u32 srtt = peer->srtt_us;
+ u32 srtt = call->srtt_us;
/* The following amusing code comes from Jacobson's
* article in SIGCOMM '88. Note that rtt and mdev
@@ -66,7 +66,7 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
srtt += m; /* rtt = 7/8 rtt + 1/8 new */
if (m < 0) {
m = -m; /* m is now abs(error) */
- m -= (peer->mdev_us >> 2); /* similar update on mdev */
+ m -= (call->mdev_us >> 2); /* similar update on mdev */
/* This is similar to one of Eifel findings.
* Eifel blocks mdev updates when rtt decreases.
* This solution is a bit different: we use finer gain
@@ -78,31 +78,31 @@ static void rxrpc_rtt_estimator(struct rxrpc_peer *peer, long sample_rtt_us)
if (m > 0)
m >>= 3;
} else {
- m -= (peer->mdev_us >> 2); /* similar update on mdev */
+ m -= (call->mdev_us >> 2); /* similar update on mdev */
}
- peer->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
- if (peer->mdev_us > peer->mdev_max_us) {
- peer->mdev_max_us = peer->mdev_us;
- if (peer->mdev_max_us > peer->rttvar_us)
- peer->rttvar_us = peer->mdev_max_us;
+ call->mdev_us += m; /* mdev = 3/4 mdev + 1/4 new */
+ if (call->mdev_us > call->mdev_max_us) {
+ call->mdev_max_us = call->mdev_us;
+ if (call->mdev_max_us > call->rttvar_us)
+ call->rttvar_us = call->mdev_max_us;
}
} else {
/* no previous measure. */
srtt = m << 3; /* take the measured time to be rtt */
- peer->mdev_us = m << 1; /* make sure rto = 3*rtt */
- peer->rttvar_us = max(peer->mdev_us, rxrpc_rto_min_us(peer));
- peer->mdev_max_us = peer->rttvar_us;
+ call->mdev_us = m << 1; /* make sure rto = 3*rtt */
+ call->rttvar_us = umax(call->mdev_us, rxrpc_rto_min_us(call));
+ call->mdev_max_us = call->rttvar_us;
}
- peer->srtt_us = max(1U, srtt);
+ call->srtt_us = umax(srtt, 1);
}
/*
* Calculate rto without backoff. This is the second half of Van Jacobson's
* routine referred to above.
*/
-static void rxrpc_set_rto(struct rxrpc_peer *peer)
+static void rxrpc_set_rto(struct rxrpc_call *call)
{
u32 rto;
@@ -113,7 +113,7 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer)
* is invisible. Actually, Linux-2.4 also generates erratic
* ACKs in some circumstances.
*/
- rto = __rxrpc_set_rto(peer);
+ rto = __rxrpc_set_rto(call);
/* 2. Fixups made earlier cannot be right.
* If we do not estimate RTO correctly without them,
@@ -124,61 +124,73 @@ static void rxrpc_set_rto(struct rxrpc_peer *peer)
/* NOTE: clamping at RXRPC_RTO_MIN is not required, current algo
* guarantees that rto is higher.
*/
- peer->rto_us = rxrpc_bound_rto(rto);
+ call->rto_us = rxrpc_bound_rto(rto);
}
-static void rxrpc_ack_update_rtt(struct rxrpc_peer *peer, long rtt_us)
+static void rxrpc_update_rtt_min(struct rxrpc_call *call, ktime_t resp_time, long rtt_us)
+{
+ /* Window size 5mins in approx usec (ipv4.sysctl_tcp_min_rtt_wlen) */
+ u32 wlen_us = 5ULL * NSEC_PER_SEC / 1024;
+
+ minmax_running_min(&call->min_rtt, wlen_us, resp_time / 1024,
+ (u32)rtt_us ? : jiffies_to_usecs(1));
+}
+
+static void rxrpc_ack_update_rtt(struct rxrpc_call *call, ktime_t resp_time, long rtt_us)
{
if (rtt_us < 0)
return;
- //rxrpc_update_rtt_min(peer, rtt_us);
- rxrpc_rtt_estimator(peer, rtt_us);
- rxrpc_set_rto(peer);
+ /* Update RACK min RTT [RFC8985 6.1 Step 1]. */
+ rxrpc_update_rtt_min(call, resp_time, rtt_us);
+
+ rxrpc_rtt_estimator(call, rtt_us);
+ rxrpc_set_rto(call);
- /* RFC6298: only reset backoff on valid RTT measurement. */
- peer->backoff = 0;
+ /* Only reset backoff on valid RTT measurement [RFC6298]. */
+ call->backoff = 0;
}
/*
* Add RTT information to cache. This is called in softirq mode and has
- * exclusive access to the peer RTT data.
+ * exclusive access to the call RTT data.
*/
-void rxrpc_peer_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
+void rxrpc_call_add_rtt(struct rxrpc_call *call, enum rxrpc_rtt_rx_trace why,
int rtt_slot,
rxrpc_serial_t send_serial, rxrpc_serial_t resp_serial,
ktime_t send_time, ktime_t resp_time)
{
- struct rxrpc_peer *peer = call->peer;
s64 rtt_us;
rtt_us = ktime_to_us(ktime_sub(resp_time, send_time));
if (rtt_us < 0)
return;
- spin_lock(&peer->rtt_input_lock);
- rxrpc_ack_update_rtt(peer, rtt_us);
- if (peer->rtt_count < 3)
- peer->rtt_count++;
- spin_unlock(&peer->rtt_input_lock);
+ rxrpc_ack_update_rtt(call, resp_time, rtt_us);
+ if (call->rtt_count < 3)
+ call->rtt_count++;
+ call->rtt_taken++;
+
+ WRITE_ONCE(call->peer->recent_srtt_us, call->srtt_us / 8);
+ WRITE_ONCE(call->peer->recent_rto_us, call->rto_us);
trace_rxrpc_rtt_rx(call, why, rtt_slot, send_serial, resp_serial,
- peer->srtt_us >> 3, peer->rto_us);
+ rtt_us, call->srtt_us, call->rto_us);
}
/*
* Get the retransmission timeout to set in nanoseconds, backing it off each
* time we retransmit.
*/
-ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
+ktime_t rxrpc_get_rto_backoff(struct rxrpc_call *call, bool retrans)
{
u64 timo_us;
- u32 backoff = READ_ONCE(peer->backoff);
+ u32 backoff = READ_ONCE(call->backoff);
- timo_us = peer->rto_us;
+ timo_us = call->rto_us;
timo_us <<= backoff;
if (retrans && timo_us * 2 <= RXRPC_RTO_MAX)
- WRITE_ONCE(peer->backoff, backoff + 1);
+ WRITE_ONCE(call->backoff, backoff + 1);
if (timo_us < 1)
timo_us = 1;
@@ -186,10 +198,11 @@ ktime_t rxrpc_get_rto_backoff(struct rxrpc_peer *peer, bool retrans)
return ns_to_ktime(timo_us * NSEC_PER_USEC);
}
-void rxrpc_peer_init_rtt(struct rxrpc_peer *peer)
+void rxrpc_call_init_rtt(struct rxrpc_call *call)
{
- peer->rto_us = RXRPC_TIMEOUT_INIT;
- peer->mdev_us = RXRPC_TIMEOUT_INIT;
- peer->backoff = 0;
- //minmax_reset(&peer->rtt_min, rxrpc_jiffies32, ~0U);
+ call->rtt_last_req = KTIME_MIN;
+ call->rto_us = RXRPC_TIMEOUT_INIT;
+ call->mdev_us = RXRPC_TIMEOUT_INIT;
+ call->backoff = 0;
+ //minmax_reset(&call->rtt_min, rxrpc_jiffies32, ~0U);
}
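
For reference, the arithmetic that rxrpc_rtt_estimator() and rxrpc_set_rto() apply above is the classic Jacobson/RFC 6298 estimator, with srtt kept scaled by 8 and the deviation term scaled by 4 so that rto = srtt/8 + rttvar approximates SRTT + 4*RTTVAR. Below is a minimal userspace sketch of that update with illustrative names only; the kernel's separate mdev/mdev_max windowing is omitted.

/* Minimal sketch of the estimator above (illustrative names; the
 * kernel's mdev_max/rttvar windowing is omitted).  srtt_us is stored
 * scaled by 8, rttvar_us by 4, so rto = srtt/8 + rttvar ~ SRTT + 4*RTTVAR.
 */
#include <stdint.h>
#include <stdio.h>

struct rtt_state {
	uint32_t srtt_us;	/* smoothed RTT, scaled by 8 */
	uint32_t rttvar_us;	/* deviation estimate, scaled by 4 */
	uint32_t rto_us;	/* retransmission timeout */
};

static void rtt_sample(struct rtt_state *s, uint32_t sample_us)
{
	if (s->srtt_us) {
		int32_t m = (int32_t)sample_us - (int32_t)(s->srtt_us >> 3);

		s->srtt_us += m;			/* 7/8 old + 1/8 new */
		if (m < 0)
			m = -m;
		m -= s->rttvar_us >> 2;
		s->rttvar_us += m;			/* 3/4 old + 1/4 |err| */
	} else {
		s->srtt_us = sample_us << 3;		/* first sample */
		s->rttvar_us = sample_us << 1;		/* so rto starts ~3*rtt */
	}
	s->rto_us = (s->srtt_us >> 3) + s->rttvar_us;	/* RFC 6298 */
}

int main(void)
{
	struct rtt_state s = { 0 };
	uint32_t samples[] = { 100000, 120000, 80000 };

	for (unsigned int i = 0; i < 3; i++) {
		rtt_sample(&s, samples[i]);
		printf("srtt=%uus rto=%uus\n",
		       (unsigned int)(s.srtt_us >> 3), (unsigned int)s.rto_us);
	}
	return 0;
}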
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 48a1475e6b06..6cb37b0eb77f 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -148,14 +148,14 @@ error:
static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t remain, gfp_t gfp)
{
struct rxrpc_txbuf *txb;
- size_t shdr, space;
+ size_t shdr, alloc, limit, part;
- remain = min(remain, 65535 - sizeof(struct rxrpc_wire_header));
+ remain = umin(remain, 65535 - sizeof(struct rxrpc_wire_header));
switch (call->conn->security_level) {
default:
- space = min_t(size_t, remain, RXRPC_JUMBO_DATALEN);
- return rxrpc_alloc_data_txbuf(call, space, 1, gfp);
+ alloc = umin(remain, RXRPC_JUMBO_DATALEN);
+ return rxrpc_alloc_data_txbuf(call, alloc, 1, gfp);
case RXRPC_SECURITY_AUTH:
shdr = sizeof(struct rxkad_level1_hdr);
break;
@@ -164,15 +164,21 @@ static struct rxrpc_txbuf *rxkad_alloc_txbuf(struct rxrpc_call *call, size_t rem
break;
}
- space = min_t(size_t, round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN), remain + shdr);
- space = round_up(space, RXKAD_ALIGN);
+ limit = round_down(RXRPC_JUMBO_DATALEN, RXKAD_ALIGN) - shdr;
+ if (remain < limit) {
+ part = remain;
+ alloc = round_up(shdr + part, RXKAD_ALIGN);
+ } else {
+ part = limit;
+ alloc = RXRPC_JUMBO_DATALEN;
+ }
- txb = rxrpc_alloc_data_txbuf(call, space, RXKAD_ALIGN, gfp);
+ txb = rxrpc_alloc_data_txbuf(call, alloc, RXKAD_ALIGN, gfp);
if (!txb)
return NULL;
txb->offset += shdr;
- txb->space -= shdr;
+ txb->space = part;
return txb;
}
@@ -251,8 +257,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
struct rxrpc_txbuf *txb,
struct skcipher_request *req)
{
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
- struct rxkad_level1_hdr *hdr = (void *)(whdr + 1);
+ struct rxkad_level1_hdr *hdr = txb->data;
struct rxrpc_crypt iv;
struct scatterlist sg;
size_t pad;
@@ -263,13 +268,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
check = txb->seq ^ call->call_id;
hdr->data_size = htonl((u32)check << 16 | txb->len);
- txb->len += sizeof(struct rxkad_level1_hdr);
- pad = txb->len;
+ txb->pkt_len = sizeof(struct rxkad_level1_hdr) + txb->len;
+ pad = txb->pkt_len;
pad = RXKAD_ALIGN - pad;
pad &= RXKAD_ALIGN - 1;
if (pad) {
- memset(txb->kvec[0].iov_base + txb->offset, 0, pad);
- txb->len += pad;
+ memset(txb->data + txb->offset, 0, pad);
+ txb->pkt_len += pad;
}
/* start the encryption afresh */
@@ -294,11 +299,10 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
- struct rxrpc_wire_header *whdr = txb->kvec[0].iov_base;
- struct rxkad_level2_hdr *rxkhdr = (void *)(whdr + 1);
+ struct rxkad_level2_hdr *rxkhdr = txb->data;
struct rxrpc_crypt iv;
struct scatterlist sg;
- size_t pad;
+ size_t content, pad;
u16 check;
int ret;
@@ -309,23 +313,20 @@ static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
rxkhdr->data_size = htonl(txb->len | (u32)check << 16);
rxkhdr->checksum = 0;
- txb->len += sizeof(struct rxkad_level2_hdr);
- pad = txb->len;
- pad = RXKAD_ALIGN - pad;
- pad &= RXKAD_ALIGN - 1;
- if (pad) {
- memset(txb->kvec[0].iov_base + txb->offset, 0, pad);
- txb->len += pad;
- }
+ content = sizeof(struct rxkad_level2_hdr) + txb->len;
+ txb->pkt_len = round_up(content, RXKAD_ALIGN);
+ pad = txb->pkt_len - content;
+ if (pad)
+ memset(txb->data + txb->offset, 0, pad);
/* encrypt from the session key */
token = call->conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- sg_init_one(&sg, rxkhdr, txb->len);
+ sg_init_one(&sg, rxkhdr, txb->pkt_len);
skcipher_request_set_sync_tfm(req, call->conn->rxkad.cipher);
skcipher_request_set_callback(req, 0, NULL, NULL);
- skcipher_request_set_crypt(req, &sg, &sg, txb->len, iv.x);
+ skcipher_request_set_crypt(req, &sg, &sg, txb->pkt_len, iv.x);
ret = crypto_skcipher_encrypt(req);
skcipher_request_zero(req);
return ret;
@@ -384,19 +385,32 @@ static int rxkad_secure_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
switch (call->conn->security_level) {
case RXRPC_SECURITY_PLAIN:
+ txb->pkt_len = txb->len;
ret = 0;
break;
case RXRPC_SECURITY_AUTH:
ret = rxkad_secure_packet_auth(call, txb, req);
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
break;
case RXRPC_SECURITY_ENCRYPT:
ret = rxkad_secure_packet_encrypt(call, txb, req);
+ if (txb->alloc_size == RXRPC_JUMBO_DATALEN)
+ txb->jumboable = true;
break;
default:
ret = -EPERM;
break;
}
+ /* Clear excess space in the packet */
+ if (txb->pkt_len < txb->alloc_size) {
+ size_t gap = txb->alloc_size - txb->pkt_len;
+ void *p = txb->data;
+
+ memset(p + txb->pkt_len, 0, gap);
+ }
+
skcipher_request_free(req);
_leave(" = %d [set %x]", ret, y);
return ret;
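
The sizing logic in rxkad_alloc_txbuf() above caps the payload so that the security header plus data, rounded up to the rxkad alignment, never exceeds one jumbo segment. A standalone sketch of that arithmetic follows; the concrete constants (1412-byte segment, 8-byte alignment, 4/8-byte level-1/level-2 headers) are assumptions for illustration, not values taken from the patch.

/* Standalone sketch of the buffer sizing above.  Constants are
 * illustrative assumptions, not taken from the kernel headers.
 */
#include <stdio.h>
#include <stddef.h>

#define JUMBO_DATALEN	1412			/* assumed segment size */
#define KAD_ALIGN	8			/* assumed crypto alignment */

#define round_down(x, a)	((x) / (a) * (a))
#define round_up(x, a)		((((x) + (a) - 1) / (a)) * (a))

static void size_txbuf(size_t remain, size_t shdr)
{
	size_t limit = round_down(JUMBO_DATALEN, KAD_ALIGN) - shdr;
	size_t part, alloc;

	if (remain < limit) {
		part = remain;				/* everything fits */
		alloc = round_up(shdr + part, KAD_ALIGN);
	} else {
		part = limit;				/* fill one segment */
		alloc = JUMBO_DATALEN;
	}
	printf("remain=%zu shdr=%zu -> part=%zu alloc=%zu\n",
	       remain, shdr, part, alloc);
}

int main(void)
{
	size_txbuf(100, 4);	/* small write, assumed level-1 header */
	size_txbuf(4000, 8);	/* large write, assumed level-2 header */
	return 0;
}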
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index b1536da2246b..e848a4777b8c 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -515,7 +515,7 @@ static int rxperf_process_call(struct rxperf_call *call)
reply_len + sizeof(rxperf_magic_cookie));
while (reply_len > 0) {
- len = min_t(size_t, reply_len, PAGE_SIZE);
+ len = umin(reply_len, PAGE_SIZE);
bvec_set_page(&bv, ZERO_PAGE(0), len, 0);
iov_iter_bvec(&msg.msg_iter, WRITE, &bv, 1, len);
msg.msg_flags = MSG_MORE;
diff --git a/net/rxrpc/security.c b/net/rxrpc/security.c
index cb8dd1d3b1d4..9784adc8f275 100644
--- a/net/rxrpc/security.c
+++ b/net/rxrpc/security.c
@@ -114,10 +114,10 @@ found:
if (conn->state == RXRPC_CONN_CLIENT_UNSECURED) {
ret = conn->security->init_connection_security(conn, token);
if (ret == 0) {
- spin_lock(&conn->state_lock);
+ spin_lock_irq(&conn->state_lock);
if (conn->state == RXRPC_CONN_CLIENT_UNSECURED)
conn->state = RXRPC_CONN_CLIENT;
- spin_unlock(&conn->state_lock);
+ spin_unlock_irq(&conn->state_lock);
}
}
mutex_unlock(&conn->security_lock);
diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
index 708a1484dada..84dc6c94f23b 100644
--- a/net/rxrpc/sendmsg.c
+++ b/net/rxrpc/sendmsg.c
@@ -94,9 +94,11 @@ no_wait:
*/
static bool rxrpc_check_tx_space(struct rxrpc_call *call, rxrpc_seq_t *_tx_win)
{
+ rxrpc_seq_t tx_bottom = READ_ONCE(call->tx_bottom);
+
if (_tx_win)
- *_tx_win = call->tx_bottom;
- return call->tx_prepared - call->tx_bottom < 256;
+ *_tx_win = tx_bottom;
+ return call->send_top - tx_bottom < 256;
}
/*
@@ -132,13 +134,13 @@ static int rxrpc_wait_for_tx_window_waitall(struct rxrpc_sock *rx,
rxrpc_seq_t tx_start, tx_win;
signed long rtt, timeout;
- rtt = READ_ONCE(call->peer->srtt_us) >> 3;
+ rtt = READ_ONCE(call->srtt_us) >> 3;
rtt = usecs_to_jiffies(rtt) * 2;
if (rtt < 2)
rtt = 2;
timeout = rtt;
- tx_start = smp_load_acquire(&call->acks_hard_ack);
+ tx_start = READ_ONCE(call->tx_bottom);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -195,8 +197,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
DECLARE_WAITQUEUE(myself, current);
int ret;
- _enter(",{%u,%u,%u,%u}",
- call->tx_bottom, call->acks_hard_ack, call->tx_top, call->tx_winsize);
+ _enter(",{%u,%u,%u}",
+ call->tx_bottom, call->tx_top, call->tx_winsize);
add_wait_queue(&call->waitq, &myself);
@@ -240,37 +242,77 @@ static void rxrpc_queue_packet(struct rxrpc_sock *rx, struct rxrpc_call *call,
struct rxrpc_txbuf *txb,
rxrpc_notify_end_tx_t notify_end_tx)
{
+ struct rxrpc_txqueue *sq = call->send_queue;
rxrpc_seq_t seq = txb->seq;
bool poke, last = txb->flags & RXRPC_LAST_PACKET;
-
+ int ix = seq & RXRPC_TXQ_MASK;
rxrpc_inc_stat(call->rxnet, stat_tx_data);
- ASSERTCMP(txb->seq, ==, call->tx_prepared + 1);
-
- /* We have to set the timestamp before queueing as the retransmit
- * algorithm can see the packet as soon as we queue it.
- */
- txb->last_sent = ktime_get_real();
+ ASSERTCMP(txb->seq, ==, call->send_top + 1);
if (last)
trace_rxrpc_txqueue(call, rxrpc_txqueue_queue_last);
else
trace_rxrpc_txqueue(call, rxrpc_txqueue_queue);
+ if (WARN_ON_ONCE(sq->bufs[ix]))
+ trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue_dup);
+ else
+ trace_rxrpc_tq(call, sq, seq, rxrpc_tq_queue);
+
/* Add the packet to the call's output buffer */
- spin_lock(&call->tx_lock);
- poke = list_empty(&call->tx_sendmsg);
- list_add_tail(&txb->call_link, &call->tx_sendmsg);
- call->tx_prepared = seq;
- if (last)
+ poke = (READ_ONCE(call->tx_bottom) == call->send_top);
+ sq->bufs[ix] = txb;
+ /* Order send_top after the queue->next pointer and txb content. */
+ smp_store_release(&call->send_top, seq);
+ if (last) {
+ set_bit(RXRPC_CALL_TX_NO_MORE, &call->flags);
rxrpc_notify_end_tx(rx, call, notify_end_tx);
- spin_unlock(&call->tx_lock);
+ call->send_queue = NULL;
+ }
if (poke)
rxrpc_poke_call(call, rxrpc_call_poke_start);
}
/*
+ * Allocate a new txqueue unit and add it to the transmission queue.
+ */
+static int rxrpc_alloc_txqueue(struct sock *sk, struct rxrpc_call *call)
+{
+ struct rxrpc_txqueue *tq;
+
+ tq = kzalloc(sizeof(*tq), sk->sk_allocation);
+ if (!tq)
+ return -ENOMEM;
+
+ tq->xmit_ts_base = KTIME_MIN;
+ for (int i = 0; i < RXRPC_NR_TXQUEUE; i++)
+ tq->segment_xmit_ts[i] = UINT_MAX;
+
+ if (call->send_queue) {
+ tq->qbase = call->send_top + 1;
+ call->send_queue->next = tq;
+ call->send_queue = tq;
+ } else if (WARN_ON(call->tx_queue)) {
+ kfree(tq);
+ return -ENOMEM;
+ } else {
+ /* We start at seq 1, so pretend seq 0 is hard-acked. */
+ tq->nr_reported_acks = 1;
+ tq->segment_acked = 1UL;
+ tq->qbase = 0;
+ call->tx_qbase = 0;
+ call->send_queue = tq;
+ call->tx_qtail = tq;
+ call->tx_queue = tq;
+ }
+
+ trace_rxrpc_tq(call, tq, call->send_top, rxrpc_tq_alloc);
+ return 0;
+}
+
+/*
* send data through a socket
* - must be called in process context
* - The caller holds the call user access mutex, but not the socket lock.
@@ -288,6 +330,13 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
bool more = msg->msg_flags & MSG_MORE;
int ret, copied = 0;
+ if (test_bit(RXRPC_CALL_TX_NO_MORE, &call->flags)) {
+ trace_rxrpc_abort(call->debug_id, rxrpc_sendmsg_late_send,
+ call->cid, call->call_id, call->rx_consumed,
+ 0, -EPROTO);
+ return -EPROTO;
+ }
+
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
ret = rxrpc_wait_to_be_connected(call, &timeo);
@@ -344,6 +393,13 @@ reload:
if (!rxrpc_check_tx_space(call, NULL))
goto wait_for_space;
+ /* See if we need to begin/extend the Tx queue. */
+ if (!call->send_queue || !((call->send_top + 1) & RXRPC_TXQ_MASK)) {
+ ret = rxrpc_alloc_txqueue(sk, call);
+ if (ret < 0)
+ goto maybe_error;
+ }
+
/* Work out the maximum size of a packet. Assume that
* the security header is going to be in the padded
* region (enc blocksize), but the trailer is not.
@@ -360,10 +416,10 @@ reload:
/* append next segment of data to the current buffer */
if (msg_data_left(msg) > 0) {
- size_t copy = min_t(size_t, txb->space, msg_data_left(msg));
+ size_t copy = umin(txb->space, msg_data_left(msg));
_debug("add %zu", copy);
- if (!copy_from_iter_full(txb->kvec[0].iov_base + txb->offset,
+ if (!copy_from_iter_full(txb->data + txb->offset,
copy, &msg->msg_iter))
goto efault;
_debug("added");
@@ -385,16 +441,10 @@ reload:
(msg_data_left(msg) == 0 && !more)) {
if (msg_data_left(msg) == 0 && !more)
txb->flags |= RXRPC_LAST_PACKET;
- else if (call->tx_top - call->acks_hard_ack <
- call->tx_winsize)
- txb->flags |= RXRPC_MORE_PACKETS;
ret = call->security->secure_packet(call, txb);
if (ret < 0)
goto out;
-
- txb->kvec[0].iov_len += txb->len;
- txb->len = txb->kvec[0].iov_len;
rxrpc_queue_packet(rx, call, txb, notify_end_tx);
txb = NULL;
}
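
rxrpc_queue_packet() above fills a slot in a fixed-size rxrpc_txqueue segment indexed by seq & RXRPC_TXQ_MASK and only then publishes the new sequence number with smp_store_release(), so a reader that acquire-loads send_top is guaranteed to see the buffer pointer it covers. A minimal single-producer sketch of that ordering using C11 atomics; the names and the 64-slot segment size are illustrative, not the kernel's.

/* Single-producer sketch of the publish ordering above; C11 atomics
 * stand in for smp_store_release()/smp_load_acquire().  Names and the
 * 64-slot segment are illustrative.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SEG_SHIFT	6
#define SEG_MASK	((1u << SEG_SHIFT) - 1)

struct txq {
	void *bufs[1u << SEG_SHIFT];		/* slots, indexed by seq & mask */
	_Atomic uint32_t top;			/* last published sequence */
};

static void queue_buf(struct txq *q, uint32_t seq, void *buf)
{
	q->bufs[seq & SEG_MASK] = buf;		/* fill the slot first... */
	/* ...then publish: an acquire load of 'top' sees the slot data. */
	atomic_store_explicit(&q->top, seq, memory_order_release);
}

static void *peek_buf(struct txq *q, uint32_t seq)
{
	uint32_t top = atomic_load_explicit(&q->top, memory_order_acquire);

	return seq <= top ? q->bufs[seq & SEG_MASK] : NULL;
}

int main(void)
{
	static struct txq q;			/* zero-initialised, top = 0 */
	int payload = 42;

	queue_buf(&q, 1, &payload);
	printf("seq 1 visible: %s\n", peek_buf(&q, 1) ? "yes" : "no");
	return 0;
}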
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 9bf9a1f6e4cb..46a20cf4c402 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -11,6 +11,8 @@
#include "ar-internal.h"
static struct ctl_table_header *rxrpc_sysctl_reg_table;
+static const unsigned int rxrpc_rx_mtu_min = 500;
+static const unsigned int rxrpc_jumbo_max = RXRPC_MAX_NR_JUMBO;
static const unsigned int four = 4;
static const unsigned int max_backlog = RXRPC_BACKLOG_MAX - 1;
static const unsigned int n_65535 = 65535;
@@ -115,7 +117,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
- .extra1 = (void *)SYSCTL_ONE,
+ .extra1 = (void *)&rxrpc_rx_mtu_min,
.extra2 = (void *)&n_65535,
},
{
@@ -125,7 +127,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)SYSCTL_ONE,
- .extra2 = (void *)&four,
+ .extra2 = (void *)&rxrpc_jumbo_max,
},
};
diff --git a/net/rxrpc/txbuf.c b/net/rxrpc/txbuf.c
index c3913d8a50d3..c550991d48fa 100644
--- a/net/rxrpc/txbuf.c
+++ b/net/rxrpc/txbuf.c
@@ -19,17 +19,19 @@ atomic_t rxrpc_nr_txbuf;
struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_size,
size_t data_align, gfp_t gfp)
{
- struct rxrpc_wire_header *whdr;
struct rxrpc_txbuf *txb;
- size_t total, hoff;
+ size_t total, doff, jsize = sizeof(struct rxrpc_jumbo_header);
void *buf;
- txb = kmalloc(sizeof(*txb), gfp);
+ txb = kzalloc(sizeof(*txb), gfp);
if (!txb)
return NULL;
- hoff = round_up(sizeof(*whdr), data_align) - sizeof(*whdr);
- total = hoff + sizeof(*whdr) + data_size;
+ /* We put a jumbo header in the buffer, but not a full wire header to
+ * avoid delayed-corruption problems with zerocopy.
+ */
+ doff = round_up(jsize, data_align);
+ total = doff + data_size;
data_align = umax(data_align, L1_CACHE_BYTES);
mutex_lock(&call->conn->tx_data_alloc_lock);
@@ -41,36 +43,15 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
return NULL;
}
- whdr = buf + hoff;
-
- INIT_LIST_HEAD(&txb->call_link);
- INIT_LIST_HEAD(&txb->tx_link);
refcount_set(&txb->ref, 1);
- txb->last_sent = KTIME_MIN;
txb->call_debug_id = call->debug_id;
txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids);
+ txb->alloc_size = data_size;
txb->space = data_size;
- txb->len = 0;
- txb->offset = sizeof(*whdr);
+ txb->offset = 0;
txb->flags = call->conn->out_clientflag;
- txb->ack_why = 0;
- txb->seq = call->tx_prepared + 1;
- txb->serial = 0;
- txb->cksum = 0;
- txb->nr_kvec = 1;
- txb->kvec[0].iov_base = whdr;
- txb->kvec[0].iov_len = sizeof(*whdr);
-
- whdr->epoch = htonl(call->conn->proto.epoch);
- whdr->cid = htonl(call->cid);
- whdr->callNumber = htonl(call->call_id);
- whdr->seq = htonl(txb->seq);
- whdr->type = RXRPC_PACKET_TYPE_DATA;
- whdr->flags = 0;
- whdr->userStatus = 0;
- whdr->securityIndex = call->security_ix;
- whdr->_rsvd = 0;
- whdr->serviceId = htons(call->dest_srx.srx_service);
+ txb->seq = call->send_top + 1;
+ txb->data = buf + doff;
trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1,
rxrpc_txbuf_alloc_data);
@@ -79,84 +60,6 @@ struct rxrpc_txbuf *rxrpc_alloc_data_txbuf(struct rxrpc_call *call, size_t data_
return txb;
}
-/*
- * Allocate and partially initialise an ACK packet.
- */
-struct rxrpc_txbuf *rxrpc_alloc_ack_txbuf(struct rxrpc_call *call, size_t sack_size)
-{
- struct rxrpc_wire_header *whdr;
- struct rxrpc_acktrailer *trailer;
- struct rxrpc_ackpacket *ack;
- struct rxrpc_txbuf *txb;
- gfp_t gfp = rcu_read_lock_held() ? GFP_ATOMIC | __GFP_NOWARN : GFP_NOFS;
- void *buf, *buf2 = NULL;
- u8 *filler;
-
- txb = kmalloc(sizeof(*txb), gfp);
- if (!txb)
- return NULL;
-
- buf = page_frag_alloc(&call->local->tx_alloc,
- sizeof(*whdr) + sizeof(*ack) + 1 + 3 + sizeof(*trailer), gfp);
- if (!buf) {
- kfree(txb);
- return NULL;
- }
-
- if (sack_size) {
- buf2 = page_frag_alloc(&call->local->tx_alloc, sack_size, gfp);
- if (!buf2) {
- page_frag_free(buf);
- kfree(txb);
- return NULL;
- }
- }
-
- whdr = buf;
- ack = buf + sizeof(*whdr);
- filler = buf + sizeof(*whdr) + sizeof(*ack) + 1;
- trailer = buf + sizeof(*whdr) + sizeof(*ack) + 1 + 3;
-
- INIT_LIST_HEAD(&txb->call_link);
- INIT_LIST_HEAD(&txb->tx_link);
- refcount_set(&txb->ref, 1);
- txb->call_debug_id = call->debug_id;
- txb->debug_id = atomic_inc_return(&rxrpc_txbuf_debug_ids);
- txb->space = 0;
- txb->len = sizeof(*whdr) + sizeof(*ack) + 3 + sizeof(*trailer);
- txb->offset = 0;
- txb->flags = call->conn->out_clientflag;
- txb->ack_rwind = 0;
- txb->seq = 0;
- txb->serial = 0;
- txb->cksum = 0;
- txb->nr_kvec = 3;
- txb->kvec[0].iov_base = whdr;
- txb->kvec[0].iov_len = sizeof(*whdr) + sizeof(*ack);
- txb->kvec[1].iov_base = buf2;
- txb->kvec[1].iov_len = sack_size;
- txb->kvec[2].iov_base = filler;
- txb->kvec[2].iov_len = 3 + sizeof(*trailer);
-
- whdr->epoch = htonl(call->conn->proto.epoch);
- whdr->cid = htonl(call->cid);
- whdr->callNumber = htonl(call->call_id);
- whdr->seq = 0;
- whdr->type = RXRPC_PACKET_TYPE_ACK;
- whdr->flags = 0;
- whdr->userStatus = 0;
- whdr->securityIndex = call->security_ix;
- whdr->_rsvd = 0;
- whdr->serviceId = htons(call->dest_srx.srx_service);
-
- get_page(virt_to_head_page(trailer));
-
- trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 1,
- rxrpc_txbuf_alloc_ack);
- atomic_inc(&rxrpc_nr_txbuf);
- return txb;
-}
-
void rxrpc_get_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
{
int r;
@@ -174,13 +77,10 @@ void rxrpc_see_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
static void rxrpc_free_txbuf(struct rxrpc_txbuf *txb)
{
- int i;
-
trace_rxrpc_txbuf(txb->debug_id, txb->call_debug_id, txb->seq, 0,
rxrpc_txbuf_free);
- for (i = 0; i < txb->nr_kvec; i++)
- if (txb->kvec[i].iov_base)
- page_frag_free(txb->kvec[i].iov_base);
+ if (txb->data)
+ page_frag_free(txb->data);
kfree(txb);
atomic_dec(&rxrpc_nr_txbuf);
}
@@ -202,37 +102,3 @@ void rxrpc_put_txbuf(struct rxrpc_txbuf *txb, enum rxrpc_txbuf_trace what)
rxrpc_free_txbuf(txb);
}
}
-
-/*
- * Shrink the transmit buffer.
- */
-void rxrpc_shrink_call_tx_buffer(struct rxrpc_call *call)
-{
- struct rxrpc_txbuf *txb;
- rxrpc_seq_t hard_ack = smp_load_acquire(&call->acks_hard_ack);
- bool wake = false;
-
- _enter("%x/%x/%x", call->tx_bottom, call->acks_hard_ack, call->tx_top);
-
- while ((txb = list_first_entry_or_null(&call->tx_buffer,
- struct rxrpc_txbuf, call_link))) {
- hard_ack = smp_load_acquire(&call->acks_hard_ack);
- if (before(hard_ack, txb->seq))
- break;
-
- if (txb->seq != call->tx_bottom + 1)
- rxrpc_see_txbuf(txb, rxrpc_txbuf_see_out_of_step);
- ASSERTCMP(txb->seq, ==, call->tx_bottom + 1);
- smp_store_release(&call->tx_bottom, call->tx_bottom + 1);
- list_del_rcu(&txb->call_link);
-
- trace_rxrpc_txqueue(call, rxrpc_txqueue_dequeue);
-
- rxrpc_put_txbuf(txb, rxrpc_txbuf_put_rotated);
- if (after(call->acks_hard_ack, call->tx_bottom + 128))
- wake = true;
- }
-
- if (wake)
- wake_up(&call->waitq);
-}
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index af7c99845948..e296714803dc 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -68,7 +68,7 @@ geneve_opt_policy[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_MAX + 1] = {
[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
[TCA_TUNNEL_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
- .len = 128 },
+ .len = 127 },
};
static const struct nla_policy
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 4f648af8cfaa..ecec0a1e1c1a 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -2057,6 +2057,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcmsg *tcm;
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
+ int ret = -EMSGSIZE;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
if (!nlh)
@@ -2101,11 +2102,45 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
return skb->len;
+cls_op_not_supp:
+ ret = -EOPNOTSUPP;
out_nlmsg_trim:
nla_put_failure:
-cls_op_not_supp:
nlmsg_trim(skb, b);
- return -1;
+ return ret;
+}
+
+static struct sk_buff *tfilter_notify_prep(struct net *net,
+ struct sk_buff *oskb,
+ struct nlmsghdr *n,
+ struct tcf_proto *tp,
+ struct tcf_block *block,
+ struct Qdisc *q, u32 parent,
+ void *fh, int event,
+ u32 portid, bool rtnl_held,
+ struct netlink_ext_ack *extack)
+{
+ unsigned int size = oskb ? max(NLMSG_GOODSIZE, oskb->len) : NLMSG_GOODSIZE;
+ struct sk_buff *skb;
+ int ret;
+
+retry:
+ skb = alloc_skb(size, GFP_KERNEL);
+ if (!skb)
+ return ERR_PTR(-ENOBUFS);
+
+ ret = tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
+ n->nlmsg_seq, n->nlmsg_flags, event, false,
+ rtnl_held, extack);
+ if (ret <= 0) {
+ kfree_skb(skb);
+ if (ret == -EMSGSIZE) {
+ size += NLMSG_GOODSIZE;
+ goto retry;
+ }
+ return ERR_PTR(-EINVAL);
+ }
+ return skb;
}
static int tfilter_notify(struct net *net, struct sk_buff *oskb,
@@ -2121,16 +2156,10 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
if (!unicast && !rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
return 0;
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, event,
- false, rtnl_held, extack) <= 0) {
- kfree_skb(skb);
- return -EINVAL;
- }
+ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh, event,
+ portid, rtnl_held, extack);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
if (unicast)
err = rtnl_unicast(skb, net, portid);
@@ -2153,16 +2182,11 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC))
return tp->ops->delete(tp, fh, last, rtnl_held, extack);
- skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!skb)
- return -ENOBUFS;
-
- if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
- n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER,
- false, rtnl_held, extack) <= 0) {
+ skb = tfilter_notify_prep(net, oskb, n, tp, block, q, parent, fh,
+ RTM_DELTFILTER, portid, rtnl_held, extack);
+ if (IS_ERR(skb)) {
NL_SET_ERR_MSG(extack, "Failed to build del event notification");
- kfree_skb(skb);
- return -EINVAL;
+ return PTR_ERR(skb);
}
err = tp->ops->delete(tp, fh, last, rtnl_held, extack);
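
tfilter_notify_prep() above replaces the fixed NLMSG_GOODSIZE allocation with a loop: if the fill step returns -EMSGSIZE, the skb is freed, the size grows by another NLMSG_GOODSIZE, and the build is retried. A generic userspace sketch of that grow-and-retry pattern; the helper names and the 64-byte chunk are illustrative, not the kernel API.

/* Generic grow-and-retry sketch of the pattern above; names and sizes
 * are illustrative.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK	64				/* stand-in for NLMSG_GOODSIZE */

/* Pretend fill step: needs 150 bytes, reports -EMSGSIZE when short. */
static int fill_msg(char *buf, size_t size)
{
	const size_t need = 150;

	if (size < need)
		return -EMSGSIZE;
	memset(buf, 'x', need);
	return (int)need;
}

static char *build_msg(size_t *out_len)
{
	size_t size = CHUNK;

	for (;;) {
		char *buf = malloc(size);
		int ret;

		if (!buf)
			return NULL;
		ret = fill_msg(buf, size);
		if (ret > 0) {
			*out_len = (size_t)ret;
			return buf;
		}
		free(buf);
		if (ret != -EMSGSIZE)
			return NULL;		/* hard failure: give up */
		size += CHUNK;			/* too small: grow and retry */
	}
}

int main(void)
{
	size_t len = 0;
	char *msg = build_msg(&len);

	printf("built %zu-byte message\n", len);
	free(msg);
	return 0;
}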
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 03505673d523..099ff6a3e1f5 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -766,7 +766,7 @@ geneve_opt_policy[TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX + 1] = {
[TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA] = { .type = NLA_BINARY,
- .len = 128 },
+ .len = 127 },
};
static const struct nla_policy
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index fac9c946a4c7..6c625dcd0651 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1560,7 +1560,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
}
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
- NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
return -EINVAL;
}
@@ -1674,7 +1674,7 @@ replay:
}
if (tca[TCA_KIND] &&
nla_strcmp(tca[TCA_KIND], q->ops->id)) {
- NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
return -EINVAL;
}
if (q->flags & TCQ_F_INGRESS) {
@@ -1750,7 +1750,7 @@ replay:
return -EEXIST;
}
if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
- NL_SET_ERR_MSG(extack, "Invalid qdisc name");
+ NL_SET_ERR_MSG(extack, "Invalid qdisc name: must match existing qdisc");
return -EINVAL;
}
err = qdisc_change(q, tca, extack);
@@ -2254,6 +2254,12 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
return -EOPNOTSUPP;
}
+ /* Prevent creation of traffic classes with classid TC_H_ROOT */
+ if (clid == TC_H_ROOT) {
+ NL_SET_ERR_MSG(extack, "Cannot create traffic class with classid TC_H_ROOT");
+ return -EINVAL;
+ }
+
new_cl = cl;
err = -EOPNOTSUPP;
if (cops->change)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 2c2e2a67f3b2..48dd8c88903f 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -484,13 +484,14 @@ static bool cobalt_queue_empty(struct cobalt_vars *vars,
/* Call this with a freshly dequeued packet for possible congestion marking.
* Returns true as an instruction to drop the packet, false for delivery.
*/
-static bool cobalt_should_drop(struct cobalt_vars *vars,
- struct cobalt_params *p,
- ktime_t now,
- struct sk_buff *skb,
- u32 bulk_flows)
-{
- bool next_due, over_target, drop = false;
+static enum skb_drop_reason cobalt_should_drop(struct cobalt_vars *vars,
+ struct cobalt_params *p,
+ ktime_t now,
+ struct sk_buff *skb,
+ u32 bulk_flows)
+{
+ enum skb_drop_reason reason = SKB_NOT_DROPPED_YET;
+ bool next_due, over_target;
ktime_t schedule;
u64 sojourn;
@@ -533,7 +534,8 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
if (next_due && vars->dropping) {
/* Use ECN mark if possible, otherwise drop */
- drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
+ if (!(vars->ecn_marked = INET_ECN_set_ce(skb)))
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
vars->count++;
if (!vars->count)
@@ -556,16 +558,17 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
}
/* Simple BLUE implementation. Lack of ECN is deliberate. */
- if (vars->p_drop)
- drop |= (get_random_u32() < vars->p_drop);
+ if (vars->p_drop && reason == SKB_NOT_DROPPED_YET &&
+ get_random_u32() < vars->p_drop)
+ reason = SKB_DROP_REASON_CAKE_FLOOD;
/* Overload the drop_next field as an activity timeout */
if (!vars->count)
vars->drop_next = ktime_add_ns(now, p->interval);
- else if (ktime_to_ns(schedule) > 0 && !drop)
+ else if (ktime_to_ns(schedule) > 0 && reason == SKB_NOT_DROPPED_YET)
vars->drop_next = now;
- return drop;
+ return reason;
}
static bool cake_update_flowkeys(struct flow_keys *keys,
@@ -1585,12 +1588,11 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
flow->dropped++;
b->tin_dropped++;
- sch->qstats.drops++;
if (q->rate_flags & CAKE_FLAG_INGRESS)
cake_advance_shaper(q, b, skb, now, true);
- __qdisc_drop(skb, to_free);
+ qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
sch->q.qlen--;
qdisc_tree_reduce_backlog(sch, 1, len);
@@ -1965,13 +1967,14 @@ static void cake_clear_tin(struct Qdisc *sch, u16 tin)
q->cur_tin = tin;
for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
while (!!(skb = cake_dequeue_one(sch)))
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_QUEUE_PURGE);
}
static struct sk_buff *cake_dequeue(struct Qdisc *sch)
{
struct cake_sched_data *q = qdisc_priv(sch);
struct cake_tin_data *b = &q->tins[q->cur_tin];
+ enum skb_drop_reason reason;
ktime_t now = ktime_get();
struct cake_flow *flow;
struct list_head *head;
@@ -2153,12 +2156,12 @@ retry:
goto begin;
}
+ reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
+ (b->bulk_flow_count *
+ !!(q->rate_flags &
+ CAKE_FLAG_INGRESS)));
/* Last packet in queue may be marked, shouldn't be dropped */
- if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
- (b->bulk_flow_count *
- !!(q->rate_flags &
- CAKE_FLAG_INGRESS))) ||
- !flow->head)
+ if (reason == SKB_NOT_DROPPED_YET || !flow->head)
break;
/* drop this packet, get another one */
@@ -2172,7 +2175,7 @@ retry:
b->tin_dropped++;
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
qdisc_qstats_drop(sch);
- kfree_skb(skb);
+ kfree_skb_reason(skb, reason);
if (q->rate_flags & CAKE_FLAG_INGRESS)
goto retry;
}
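
cobalt_should_drop() now reports why a packet should go (CoDel congestion versus the BLUE overload component) instead of a bare bool, which lets cake_dequeue() hand a precise reason to kfree_skb_reason(). A small sketch of folding the two decisions into one reason value; the enum names are illustrative, not the kernel's skb_drop_reason values.

/* Sketch of collapsing the two drop decisions above into one reason
 * value; illustrative names only.
 */
#include <stdbool.h>
#include <stdio.h>

enum toy_drop_reason {
	NOT_DROPPED,
	DROP_CONGESTED,		/* CoDel wanted a drop and ECN wasn't possible */
	DROP_FLOOD,		/* BLUE overload component fired */
};

static enum toy_drop_reason should_drop(bool codel_due, bool ecn_ok,
					bool blue_fired)
{
	enum toy_drop_reason reason = NOT_DROPPED;

	if (codel_due && !ecn_ok)
		reason = DROP_CONGESTED;	/* can't mark, so drop */
	if (blue_fired && reason == NOT_DROPPED)
		reason = DROP_FLOOD;		/* only if not already dropping */
	return reason;
}

int main(void)
{
	printf("%d %d %d\n",
	       should_drop(true, true, false),	/* ECN-marked, delivered */
	       should_drop(true, false, false),	/* congested drop */
	       should_drop(false, false, true));/* flood drop */
	return 0;
}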
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 3e8d4fe4d91e..12dd71139da3 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED);
qdisc_qstats_drop(sch);
}
@@ -65,10 +65,7 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
&q->stats, qdisc_pkt_len, codel_get_enqueue_time,
drop_func, dequeue_func);
- /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
- * or HTB crashes. Defer it for next round.
- */
- if (q->stats.drop_count && sch->q.qlen) {
+ if (q->stats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
q->stats.drop_count = 0;
q->stats.drop_len = 0;
@@ -89,7 +86,8 @@ static int codel_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
q = qdisc_priv(sch);
q->drop_overlimit++;
- return qdisc_drop(skb, sch, to_free);
+ return qdisc_drop_reason(skb, sch, to_free,
+ SKB_DROP_REASON_QDISC_OVERLIMIT);
}
static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = {
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index a5e87f9ea986..2ca5332cfcc5 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -537,6 +537,8 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
}
+#define FQDR(reason) SKB_DROP_REASON_FQ_##reason
+
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
@@ -548,7 +550,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
q->stat_band_drops[band]++;
- return qdisc_drop(skb, sch, to_free);
+ return qdisc_drop_reason(skb, sch, to_free,
+ FQDR(BAND_LIMIT));
}
now = ktime_get_ns();
@@ -558,8 +561,9 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* Check if packet timestamp is too far in the future. */
if (fq_packet_beyond_horizon(skb, q, now)) {
if (q->horizon_drop) {
- q->stat_horizon_drops++;
- return qdisc_drop(skb, sch, to_free);
+ q->stat_horizon_drops++;
+ return qdisc_drop_reason(skb, sch, to_free,
+ FQDR(HORIZON_LIMIT));
}
q->stat_horizon_caps++;
skb->tstamp = now + q->horizon;
@@ -572,7 +576,8 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (f != &q->internal) {
if (unlikely(f->qlen >= q->flow_plimit)) {
q->stat_flows_plimit++;
- return qdisc_drop(skb, sch, to_free);
+ return qdisc_drop_reason(skb, sch, to_free,
+ FQDR(FLOW_LIMIT));
}
if (fq_flow_is_detached(f)) {
@@ -597,6 +602,7 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
return NET_XMIT_SUCCESS;
}
+#undef FQDR
static void fq_check_throttled(struct fq_sched_data *q, u64 now)
{
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 4f908c11ba95..6c9029f71e88 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -168,6 +168,7 @@ static unsigned int fq_codel_drop(struct Qdisc *sch, unsigned int max_packets,
skb = dequeue_head(flow);
len += qdisc_pkt_len(skb);
mem += get_codel_cb(skb)->mem_usage;
+ tcf_set_drop_reason(skb, SKB_DROP_REASON_QDISC_OVERLIMIT);
__qdisc_drop(skb, to_free);
} while (++i < max_packets && len < threshold);
@@ -274,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx)
{
struct Qdisc *sch = ctx;
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED);
qdisc_qstats_drop(sch);
}
@@ -314,10 +315,8 @@ begin:
}
qdisc_bstats_update(sch, skb);
flow->deficit -= qdisc_pkt_len(skb);
- /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
- * or HTB crashes. Defer it for next round.
- */
- if (q->cstats.drop_count && sch->q.qlen) {
+
+ if (q->cstats.drop_count) {
qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
q->cstats.drop_len);
q->cstats.drop_count = 0;
diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c
index c38f33ff80bd..93c36afbf576 100644
--- a/net/sched/sch_fq_pie.c
+++ b/net/sched/sch_fq_pie.c
@@ -130,6 +130,7 @@ static inline void flow_queue_add(struct fq_pie_flow *flow,
static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
struct fq_pie_sched_data *q = qdisc_priv(sch);
struct fq_pie_flow *sel_flow;
int ret;
@@ -161,6 +162,8 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
q->overmemory++;
}
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
+
if (!pie_drop_early(sch, &q->p_params, &sel_flow->vars,
sel_flow->backlog, skb->len)) {
enqueue = true;
@@ -198,8 +201,7 @@ static int fq_pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
out:
q->stats.dropped++;
sel_flow->vars.accu_prob = 0;
- __qdisc_drop(skb, to_free);
- qdisc_qstats_drop(sch);
+ qdisc_drop_reason(skb, sch, to_free, reason);
return NET_XMIT_CN;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 8874ae668095..14ab2f4c190a 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -551,25 +551,20 @@ static void dev_watchdog(struct timer_list *t)
netdev_put(dev, &dev->watchdog_dev_tracker);
}
-void __netdev_watchdog_up(struct net_device *dev)
-{
- if (dev->netdev_ops->ndo_tx_timeout) {
- if (dev->watchdog_timeo <= 0)
- dev->watchdog_timeo = 5*HZ;
- if (!mod_timer(&dev->watchdog_timer,
- round_jiffies(jiffies + dev->watchdog_timeo)))
- netdev_hold(dev, &dev->watchdog_dev_tracker,
- GFP_ATOMIC);
- }
-}
-EXPORT_SYMBOL_GPL(__netdev_watchdog_up);
-
-static void dev_watchdog_up(struct net_device *dev)
+void netdev_watchdog_up(struct net_device *dev)
{
- __netdev_watchdog_up(dev);
+ if (!dev->netdev_ops->ndo_tx_timeout)
+ return;
+ if (dev->watchdog_timeo <= 0)
+ dev->watchdog_timeo = 5*HZ;
+ if (!mod_timer(&dev->watchdog_timer,
+ round_jiffies(jiffies + dev->watchdog_timeo)))
+ netdev_hold(dev, &dev->watchdog_dev_tracker,
+ GFP_ATOMIC);
}
+EXPORT_SYMBOL_GPL(netdev_watchdog_up);
-static void dev_watchdog_down(struct net_device *dev)
+static void netdev_watchdog_down(struct net_device *dev)
{
netif_tx_lock_bh(dev);
if (del_timer(&dev->watchdog_timer))
@@ -591,7 +586,7 @@ void netif_carrier_on(struct net_device *dev)
atomic_inc(&dev->carrier_up_count);
linkwatch_fire_event(dev);
if (netif_running(dev))
- __netdev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(netif_carrier_on);
@@ -1267,7 +1262,7 @@ void dev_activate(struct net_device *dev)
if (need_watchdog) {
netif_trans_update(dev);
- dev_watchdog_up(dev);
+ netdev_watchdog_up(dev);
}
}
EXPORT_SYMBOL(dev_activate);
@@ -1282,15 +1277,17 @@ static void qdisc_deactivate(struct Qdisc *qdisc)
static void dev_deactivate_queue(struct net_device *dev,
struct netdev_queue *dev_queue,
- void *_qdisc_default)
+ void *_sync_needed)
{
- struct Qdisc *qdisc_default = _qdisc_default;
+ bool *sync_needed = _sync_needed;
struct Qdisc *qdisc;
qdisc = rtnl_dereference(dev_queue->qdisc);
if (qdisc) {
+ if (qdisc->enqueue)
+ *sync_needed = true;
qdisc_deactivate(qdisc);
- rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
+ rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);
}
}
@@ -1357,24 +1354,22 @@ static bool some_qdisc_is_busy(struct net_device *dev)
*/
void dev_deactivate_many(struct list_head *head)
{
+ bool sync_needed = false;
struct net_device *dev;
list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_deactivate_queue,
- &noop_qdisc);
+ &sync_needed);
if (dev_ingress_queue(dev))
dev_deactivate_queue(dev, dev_ingress_queue(dev),
- &noop_qdisc);
+ &sync_needed);
- dev_watchdog_down(dev);
+ netdev_watchdog_down(dev);
}
- /* Wait for outstanding qdisc-less dev_queue_xmit calls or
- * outstanding qdisc enqueuing calls.
- * This is avoided if all devices are in dismantle phase :
- * Caller will call synchronize_net() for us
- */
- synchronize_net();
+ /* Wait for outstanding qdisc enqueuing calls. */
+ if (sync_needed)
+ synchronize_net();
list_for_each_entry(dev, head, close_list) {
netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);
diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c
index 7d2151c62c4a..532fde548b88 100644
--- a/net/sched/sch_gred.c
+++ b/net/sched/sch_gred.c
@@ -251,10 +251,10 @@ static int gred_enqueue(struct sk_buff *skb, struct Qdisc *sch,
q->stats.pdrop++;
drop:
- return qdisc_drop(skb, sch, to_free);
+ return qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
congestion_drop:
- qdisc_drop(skb, sch, to_free);
+ qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_CONGESTED);
return NET_XMIT_CN;
}
@@ -913,7 +913,8 @@ static void gred_destroy(struct Qdisc *sch)
for (i = 0; i < table->DPs; i++)
gred_destroy_vq(table->tab[i]);
- gred_offload(sch, TC_GRED_DESTROY);
+ if (table->opt)
+ gred_offload(sch, TC_GRED_DESTROY);
kfree(table->opt);
}
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index b3dcb845b327..bb1fa9aa530b 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -85,6 +85,7 @@ EXPORT_SYMBOL_GPL(pie_drop_early);
static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
struct pie_sched_data *q = qdisc_priv(sch);
bool enqueue = false;
@@ -93,6 +94,8 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
goto out;
}
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
+
if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog,
skb->len)) {
enqueue = true;
@@ -121,7 +124,7 @@ static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
out:
q->stats.dropped++;
q->vars.accu_prob = 0;
- return qdisc_drop(skb, sch, to_free);
+ return qdisc_drop_reason(skb, sch, to_free, reason);
}
static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 6029bc29b51e..ef8a2afed26b 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -70,6 +70,7 @@ static int red_use_nodrop(struct red_sched_data *q)
static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_CONGESTED;
struct red_sched_data *q = qdisc_priv(sch);
struct Qdisc *child = q->qdisc;
unsigned int len;
@@ -107,6 +108,7 @@ static int red_enqueue(struct sk_buff *skb, struct Qdisc *sch,
break;
case RED_HARD_MARK:
+ reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
qdisc_qstats_overlimit(sch);
if (red_use_harddrop(q) || !red_use_ecn(q)) {
q->stats.forced_drop++;
@@ -143,7 +145,7 @@ congestion_drop:
if (!skb)
return NET_XMIT_CN | ret;
- qdisc_drop(skb, sch, to_free);
+ qdisc_drop_reason(skb, sch, to_free, reason);
return NET_XMIT_CN;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index b717e15a3a17..d2835f1168e1 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -280,6 +280,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct sk_buff **to_free)
{
+ enum skb_drop_reason reason = SKB_DROP_REASON_QDISC_OVERLIMIT;
struct sfb_sched_data *q = qdisc_priv(sch);
unsigned int len = qdisc_pkt_len(skb);
struct Qdisc *child = q->qdisc;
@@ -380,6 +381,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
r = get_random_u16() & SFB_MAX_PROB;
+ reason = SKB_DROP_REASON_QDISC_CONGESTED;
if (unlikely(r < p_min)) {
if (unlikely(p_min > SFB_MAX_PROB / 2)) {
@@ -414,7 +416,7 @@ enqueue:
return ret;
drop:
- qdisc_drop(skb, sch, to_free);
+ qdisc_drop_reason(skb, sch, to_free, reason);
return NET_XMIT_CN;
other_drop:
if (ret & __NET_XMIT_BYPASS)
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 65d5b59da583..58b42dcf8f20 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -631,6 +631,15 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
struct red_parms *p = NULL;
struct sk_buff *to_free = NULL;
struct sk_buff *tail = NULL;
+ unsigned int maxflows;
+ unsigned int quantum;
+ unsigned int divisor;
+ int perturb_period;
+ u8 headdrop;
+ u8 maxdepth;
+ int limit;
+ u8 flags;
+
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
return -EINVAL;
@@ -652,39 +661,64 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt,
if (!p)
return -ENOMEM;
}
- if (ctl->limit == 1) {
- NL_SET_ERR_MSG_MOD(extack, "invalid limit");
- return -EINVAL;
- }
+
sch_tree_lock(sch);
+
+ limit = q->limit;
+ divisor = q->divisor;
+ headdrop = q->headdrop;
+ maxdepth = q->maxdepth;
+ maxflows = q->maxflows;
+ perturb_period = q->perturb_period;
+ quantum = q->quantum;
+ flags = q->flags;
+
+ /* update and validate configuration */
if (ctl->quantum)
- q->quantum = ctl->quantum;
- WRITE_ONCE(q->perturb_period, ctl->perturb_period * HZ);
+ quantum = ctl->quantum;
+ perturb_period = ctl->perturb_period * HZ;
if (ctl->flows)
- q->maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
+ maxflows = min_t(u32, ctl->flows, SFQ_MAX_FLOWS);
if (ctl->divisor) {
- q->divisor = ctl->divisor;
- q->maxflows = min_t(u32, q->maxflows, q->divisor);
+ divisor = ctl->divisor;
+ maxflows = min_t(u32, maxflows, divisor);
}
if (ctl_v1) {
if (ctl_v1->depth)
- q->maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
+ maxdepth = min_t(u32, ctl_v1->depth, SFQ_MAX_DEPTH);
if (p) {
- swap(q->red_parms, p);
- red_set_parms(q->red_parms,
+ red_set_parms(p,
ctl_v1->qth_min, ctl_v1->qth_max,
ctl_v1->Wlog,
ctl_v1->Plog, ctl_v1->Scell_log,
NULL,
ctl_v1->max_P);
}
- q->flags = ctl_v1->flags;
- q->headdrop = ctl_v1->headdrop;
+ flags = ctl_v1->flags;
+ headdrop = ctl_v1->headdrop;
}
if (ctl->limit) {
- q->limit = min_t(u32, ctl->limit, q->maxdepth * q->maxflows);
- q->maxflows = min_t(u32, q->maxflows, q->limit);
+ limit = min_t(u32, ctl->limit, maxdepth * maxflows);
+ maxflows = min_t(u32, maxflows, limit);
}
+ if (limit == 1) {
+ sch_tree_unlock(sch);
+ kfree(p);
+ NL_SET_ERR_MSG_MOD(extack, "invalid limit");
+ return -EINVAL;
+ }
+
+ /* commit configuration */
+ q->limit = limit;
+ q->divisor = divisor;
+ q->headdrop = headdrop;
+ q->maxdepth = maxdepth;
+ q->maxflows = maxflows;
+ WRITE_ONCE(q->perturb_period, perturb_period);
+ q->quantum = quantum;
+ q->flags = flags;
+ if (p)
+ swap(q->red_parms, p);
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit) {
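
The sfq_change() rework above follows a validate-then-commit pattern: the current settings are copied into locals, the new values are applied and checked (including the limit == 1 rejection), and only a fully valid configuration is written back, so a failed change leaves the live qdisc untouched. A generic sketch of that pattern with an illustrative two-field configuration.

/* Generic validate-then-commit sketch; illustrative config fields. */
#include <errno.h>
#include <stdio.h>

struct cfg {
	unsigned int limit;
	unsigned int maxflows;
};

static int change_cfg(struct cfg *live, unsigned int new_limit,
		      unsigned int new_flows)
{
	struct cfg tmp = *live;			/* start from current values */

	if (new_flows)
		tmp.maxflows = new_flows;
	if (new_limit) {
		tmp.limit = new_limit;
		if (tmp.maxflows > tmp.limit)
			tmp.maxflows = tmp.limit;
	}
	if (tmp.limit == 1)
		return -EINVAL;			/* reject; 'live' untouched */

	*live = tmp;				/* commit the whole set */
	return 0;
}

int main(void)
{
	struct cfg live = { .limit = 127, .maxflows = 128 };

	printf("bad: %d, limit still %u\n", change_cfg(&live, 1, 0), live.limit);
	printf("ok:  %d, limit now %u\n", change_cfg(&live, 64, 0), live.limit);
	return 0;
}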
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
index 20ff7386b74b..f485f62ab721 100644
--- a/net/sched/sch_skbprio.c
+++ b/net/sched/sch_skbprio.c
@@ -123,8 +123,6 @@ static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
/* Check to update highest and lowest priorities. */
if (skb_queue_empty(lp_qdisc)) {
if (q->lowest_prio == q->highest_prio) {
- /* The incoming packet is the only packet in queue. */
- BUG_ON(sch->q.qlen != 1);
q->lowest_prio = prio;
q->highest_prio = prio;
} else {
@@ -156,7 +154,6 @@ static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
/* Update highest priority field. */
if (skb_queue_empty(hpq)) {
if (q->lowest_prio == q->highest_prio) {
- BUG_ON(sch->q.qlen);
q->highest_prio = 0;
q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
} else {
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 8b9a1b96695e..29727ed1008e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -43,6 +43,7 @@
#include <net/addrconf.h>
#include <net/inet_common.h>
#include <net/inet_ecn.h>
+#include <net/inet_sock.h>
#include <net/udp_tunnel.h>
#include <net/inet_dscp.h>
@@ -427,16 +428,19 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
struct dst_entry *dst = NULL;
union sctp_addr *daddr = &t->ipaddr;
union sctp_addr dst_saddr;
- u8 tos = READ_ONCE(inet_sk(sk)->tos);
+ dscp_t dscp;
if (t->dscp & SCTP_DSCP_SET_MASK)
- tos = t->dscp & SCTP_DSCP_VAL_MASK;
+ dscp = inet_dsfield_to_dscp(t->dscp);
+ else
+ dscp = inet_sk_dscp(inet_sk(sk));
+
memset(&_fl, 0x0, sizeof(_fl));
fl4->daddr = daddr->v4.sin_addr.s_addr;
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
if (asoc) {
- fl4->flowi4_tos = tos & INET_DSCP_MASK;
+ fl4->flowi4_tos = inet_dscp_to_dsfield(dscp);
fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk);
fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
fl4->fl4_sport = htons(asoc->base.bind_addr.port);
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 36ee34f483d7..53725ee7ba06 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -72,8 +72,9 @@
/* Forward declarations for internal helper functions. */
static bool sctp_writeable(const struct sock *sk);
static void sctp_wfree(struct sk_buff *skb);
-static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len);
+static int sctp_wait_for_sndbuf(struct sctp_association *asoc,
+ struct sctp_transport *transport,
+ long *timeo_p, size_t msg_len);
static int sctp_wait_for_packet(struct sock *sk, int *err, long *timeo_p);
static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p);
static int sctp_wait_for_accept(struct sock *sk, long timeo);
@@ -1828,7 +1829,7 @@ static int sctp_sendmsg_to_asoc(struct sctp_association *asoc,
if (sctp_wspace(asoc) <= 0 || !sk_wmem_schedule(sk, msg_len)) {
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
- err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len);
+ err = sctp_wait_for_sndbuf(asoc, transport, &timeo, msg_len);
if (err)
goto err;
if (unlikely(sinfo->sinfo_stream >= asoc->stream.outcnt)) {
@@ -9214,8 +9215,9 @@ void sctp_sock_rfree(struct sk_buff *skb)
/* Helper function to wait for space in the sndbuf. */
-static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
- size_t msg_len)
+static int sctp_wait_for_sndbuf(struct sctp_association *asoc,
+ struct sctp_transport *transport,
+ long *timeo_p, size_t msg_len)
{
struct sock *sk = asoc->base.sk;
long current_timeo = *timeo_p;
@@ -9225,7 +9227,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
pr_debug("%s: asoc:%p, timeo:%ld, msg_len:%zu\n", __func__, asoc,
*timeo_p, msg_len);
- /* Increment the association's refcnt. */
+ /* Increment the transport and association's refcnt. */
+ if (transport)
+ sctp_transport_hold(transport);
sctp_association_hold(asoc);
/* Wait on the association specific sndbuf space. */
@@ -9234,7 +9238,7 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
TASK_INTERRUPTIBLE);
if (asoc->base.dead)
goto do_dead;
- if (!*timeo_p)
+ if ((!*timeo_p) || (transport && transport->dead))
goto do_nonblock;
if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING)
goto do_error;
@@ -9259,7 +9263,9 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
out:
finish_wait(&asoc->wait, &wait);
- /* Release the association's refcnt. */
+ /* Release the transport and association's refcnt. */
+ if (transport)
+ sctp_transport_put(transport);
sctp_association_put(asoc);
return err;
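
sctp_wait_for_sndbuf() now pins the transport with sctp_transport_hold() before sleeping and gives up if the transport was marked dead in the meantime, so the waiter cannot dereference a transport torn down by sctp_transport_free(). A minimal sketch of that hold/recheck/put pattern; the plain counter stands in for the kernel refcount and the names are illustrative.

/* Sketch of the hold/recheck/put pattern above; illustrative names. */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct transport {
	int refcnt;
	bool dead;
};

static void transport_hold(struct transport *t) { t->refcnt++; }

static void transport_put(struct transport *t)
{
	if (--t->refcnt == 0)
		free(t);
}

/* Wait (conceptually) for send space on this transport. */
static int wait_for_space(struct transport *t, bool (*have_space)(void))
{
	int err = 0;

	transport_hold(t);			/* keep it alive across the sleep */
	while (!have_space()) {
		if (t->dead) {			/* transport torn down: give up */
			err = -EAGAIN;
			break;
		}
		break;				/* a real caller would sleep here */
	}
	transport_put(t);
	return err;
}

static bool no_space(void) { return false; }

int main(void)
{
	struct transport *t = calloc(1, sizeof(*t));

	if (!t)
		return 1;
	t->refcnt = 1;
	t->dead = true;
	printf("wait returned %d\n", wait_for_space(t, no_space));
	transport_put(t);
	return 0;
}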
diff --git a/net/sctp/stream.c b/net/sctp/stream.c
index c241cc552e8d..bfcff6d6a438 100644
--- a/net/sctp/stream.c
+++ b/net/sctp/stream.c
@@ -735,7 +735,7 @@ struct sctp_chunk *sctp_process_strreset_tsnreq(
* value SHOULD be the smallest TSN not acknowledged by the
* receiver of the request plus 2^31.
*/
- init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31);
+ init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1U << 31);
sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL,
init_tsn, GFP_ATOMIC);
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 8e1e97be4df7..ee3eac338a9d 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -525,6 +525,8 @@ static int proc_sctp_do_auth(const struct ctl_table *ctl, int write,
return ret;
}
+static DEFINE_MUTEX(sctp_sysctl_mutex);
+
static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -549,6 +551,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
if (new_value > max || new_value < min)
return -EINVAL;
+ mutex_lock(&sctp_sysctl_mutex);
net->sctp.udp_port = new_value;
sctp_udp_sock_stop(net);
if (new_value) {
@@ -561,6 +564,7 @@ static int proc_sctp_do_udp_port(const struct ctl_table *ctl, int write,
lock_sock(sk);
sctp_sk(sk)->udp_port = htons(net->sctp.udp_port);
release_sock(sk);
+ mutex_unlock(&sctp_sysctl_mutex);
}
return ret;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 2abe45af98e7..31eca29b6cfb 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -117,6 +117,8 @@ fail:
*/
void sctp_transport_free(struct sctp_transport *transport)
{
+ transport->dead = 1;
+
/* Try to delete the heartbeat timer. */
if (del_timer(&transport->hb_timer))
sctp_transport_put(transport);
diff --git a/net/shaper/shaper.c b/net/shaper/shaper.c
index 15463062fe7b..7101a48bce54 100644
--- a/net/shaper/shaper.c
+++ b/net/shaper/shaper.c
@@ -40,7 +40,7 @@ static void net_shaper_lock(struct net_shaper_binding *binding)
{
switch (binding->type) {
case NET_SHAPER_BINDING_TYPE_NETDEV:
- mutex_lock(&binding->netdev->lock);
+ netdev_lock(binding->netdev);
break;
}
}
@@ -49,7 +49,7 @@ static void net_shaper_unlock(struct net_shaper_binding *binding)
{
switch (binding->type) {
case NET_SHAPER_BINDING_TYPE_NETDEV:
- mutex_unlock(&binding->netdev->lock);
+ netdev_unlock(binding->netdev);
break;
}
}
@@ -1398,7 +1398,7 @@ void net_shaper_set_real_num_tx_queues(struct net_device *dev,
/* Only drivers implementing shapers support ensure
* the lock is acquired in advance.
*/
- lockdep_assert_held(&dev->lock);
+ netdev_assert_locked(dev);
/* Take action only when decreasing the tx queue number. */
for (i = txq; i < dev->real_num_tx_queues; ++i) {
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index ba834cefb177..3760131f1484 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -362,6 +362,9 @@ static void smc_destruct(struct sock *sk)
return;
}
+static struct lock_class_key smc_key;
+static struct lock_class_key smc_slock_key;
+
void smc_sk_init(struct net *net, struct sock *sk, int protocol)
{
struct smc_sock *smc = smc_sk(sk);
@@ -375,6 +378,8 @@ void smc_sk_init(struct net *net, struct sock *sk, int protocol)
INIT_WORK(&smc->connect_work, smc_connect_work);
INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
INIT_LIST_HEAD(&smc->accept_q);
+ sock_lock_init_class_and_name(sk, "slock-AF_SMC", &smc_slock_key,
+ "sk_lock-AF_SMC", &smc_key);
spin_lock_init(&smc->accept_q_lock);
spin_lock_init(&smc->conn.send_lock);
sk->sk_prot->hash(sk);
@@ -1117,7 +1122,10 @@ static int smc_find_proposal_devices(struct smc_sock *smc,
ini->check_smcrv2 = true;
ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
if (!(ini->smcr_version & SMC_V2) ||
- smc->clcsock->sk->sk_family != AF_INET ||
+#if IS_ENABLED(CONFIG_IPV6)
+ (smc->clcsock->sk->sk_family == AF_INET6 &&
+ !ipv6_addr_v4mapped(&smc->clcsock->sk->sk_v6_rcv_saddr)) ||
+#endif
!smc_clc_ueid_count() ||
smc_find_rdma_device(smc, ini))
ini->smcr_version &= ~SMC_V2;
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 3b125d348b4a..ac07b963aede 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -795,9 +795,14 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
if (lgr->smc_version == SMC_V2) {
lnk->smcibdev = ini->smcrv2.ib_dev_v2;
lnk->ibport = ini->smcrv2.ib_port_v2;
+ lnk->wr_rx_sge_cnt = lnk->smcibdev->ibdev->attrs.max_recv_sge < 2 ? 1 : 2;
+ lnk->wr_rx_buflen = smc_link_shared_v2_rxbuf(lnk) ?
+ SMC_WR_BUF_SIZE : SMC_WR_BUF_V2_SIZE;
} else {
lnk->smcibdev = ini->ib_dev;
lnk->ibport = ini->ib_port;
+ lnk->wr_rx_sge_cnt = 1;
+ lnk->wr_rx_buflen = SMC_WR_BUF_SIZE;
}
get_device(&lnk->smcibdev->ibdev->dev);
atomic_inc(&lnk->smcibdev->lnk_cnt);
@@ -2150,7 +2155,7 @@ static int smcr_buf_map_link(struct smc_buf_desc *buf_desc, bool is_rmb,
for_each_sg(buf_desc->sgt[lnk->link_idx].sgl, sg, nents, i) {
size = min_t(int, PAGE_SIZE - offset, buf_size);
sg_set_page(sg, vmalloc_to_page(buf), size, offset);
- buf += size / sizeof(*buf);
+ buf += size;
buf_size -= size;
offset = 0;
}
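
The smcr_buf_map_link() hunk above replaces buf += size / sizeof(*buf) with buf += size. With a typed pointer, p += n advances n elements, so dividing a byte count by the element size first silently drops the remainder whenever size is not a multiple of that element size; advancing a byte-sized pointer by size moves exactly size bytes. A tiny demonstration, assuming (for illustration only) a 4-byte element type.

/* Tiny demonstration of the difference; the 4-byte element type is an
 * illustrative assumption.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t mem[8] = { 0 };
	uint32_t *p = mem;			/* typed pointer */
	unsigned char *b = (unsigned char *)mem;/* byte pointer */
	size_t size = 6;			/* bytes we meant to skip */

	p += size / sizeof(*p);			/* 6/4 = 1 element = 4 bytes */
	b += size;				/* exactly 6 bytes */

	printf("typed advance: %td bytes, byte advance: %td bytes\n",
	       (char *)p - (char *)mem, (char *)b - (char *)mem);
	return 0;
}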
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 69b54ecd6503..48a1b1dcb576 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -122,10 +122,14 @@ struct smc_link {
} ____cacheline_aligned_in_smp;
struct completion tx_ref_comp;
- struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
+ u8 *wr_rx_bufs; /* WR recv payload buffers */
struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
/* above three vectors have wr_rx_cnt elements and use the same index */
+ int wr_rx_sge_cnt; /* rx sge, V1 is 1, V2 is either 2 or 1 */
+ int wr_rx_buflen; /* buffer len for the first sge; when two rx
+ * sges are used, the second sge maps the lgr-shared buffer.
+ */
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
u64 wr_rx_id; /* seq # of last recv WR */
@@ -506,6 +510,11 @@ static inline bool smc_link_active(struct smc_link *lnk)
return lnk->state == SMC_LNK_ACTIVE;
}
+static inline bool smc_link_shared_v2_rxbuf(struct smc_link *lnk)
+{
+ return lnk->wr_rx_sge_cnt > 1;
+}
+
static inline void smc_gid_be16_convert(__u8 *buf, u8 *gid_raw)
{
sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 9c563cdbea90..53828833a3f7 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -662,7 +662,6 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
/* create a queue pair within the protection domain for a link */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
- int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
struct ib_qp_init_attr qp_attr = {
.event_handler = smc_ib_qp_event_handler,
.qp_context = lnk,
@@ -676,7 +675,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
.max_send_wr = SMC_WR_BUF_CNT * 3,
.max_recv_wr = SMC_WR_BUF_CNT * 3,
.max_send_sge = SMC_IB_MAX_SEND_SGE,
- .max_recv_sge = sges_per_buf,
+ .max_recv_sge = lnk->wr_rx_sge_cnt,
.max_inline_data = 0,
},
.sq_sig_type = IB_SIGNAL_REQ_WR,
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 018ce8133b02..f865c58c3aa7 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -997,13 +997,14 @@ static int smc_llc_cli_conf_link(struct smc_link *link,
}
static void smc_llc_save_add_link_rkeys(struct smc_link *link,
- struct smc_link *link_new)
+ struct smc_link *link_new,
+ u8 *llc_msg)
{
struct smc_llc_msg_add_link_v2_ext *ext;
struct smc_link_group *lgr = link->lgr;
int max, i;
- ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 +
+ ext = (struct smc_llc_msg_add_link_v2_ext *)(llc_msg +
SMC_WR_TX_SIZE);
max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
down_write(&lgr->rmbs_lock);
@@ -1098,7 +1099,9 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
if (rc)
goto out_clear_lnk;
if (lgr->smc_version == SMC_V2) {
- smc_llc_save_add_link_rkeys(link, lnk_new);
+ u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ?
+ (u8 *)lgr->wr_rx_buf_v2 : (u8 *)llc;
+ smc_llc_save_add_link_rkeys(link, lnk_new, llc_msg);
} else {
rc = smc_llc_cli_rkey_exchange(link, lnk_new);
if (rc) {
@@ -1498,7 +1501,9 @@ int smc_llc_srv_add_link(struct smc_link *link,
if (rc)
goto out_err;
if (lgr->smc_version == SMC_V2) {
- smc_llc_save_add_link_rkeys(link, link_new);
+ u8 *llc_msg = smc_link_shared_v2_rxbuf(link) ?
+ (u8 *)lgr->wr_rx_buf_v2 : (u8 *)add_llc;
+ smc_llc_save_add_link_rkeys(link, link_new, llc_msg);
} else {
rc = smc_llc_srv_rkey_exchange(link, link_new);
if (rc)
@@ -1807,8 +1812,12 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
if (lgr->smc_version == SMC_V2) {
struct smc_llc_msg_delete_rkey_v2 *llcv2;
- memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc));
- llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2;
+ if (smc_link_shared_v2_rxbuf(link)) {
+ memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc));
+ llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2;
+ } else {
+ llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)llc;
+ }
llcv2->num_inval_rkeys = 0;
max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index 79047721df51..e7f1134453ef 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -197,7 +197,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
partial[i].offset = offset;
partial[i].len = size;
partial[i].private = (unsigned long)priv[i];
- buf += size / sizeof(*buf);
+ buf += size;
left -= size;
offset = 0;
}
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 994c0cd4fddb..b04a21b8c511 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -439,7 +439,7 @@ static inline void smc_wr_rx_demultiplex(struct ib_wc *wc)
return; /* short message */
temp_wr_id = wc->wr_id;
index = do_div(temp_wr_id, link->wr_rx_cnt);
- wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index];
+ wr_rx = (struct smc_wr_rx_hdr *)(link->wr_rx_bufs + index * link->wr_rx_buflen);
hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) {
if (handler->type == wr_rx->type)
handler->handler(wc, wr_rx);
@@ -555,7 +555,6 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
static void smc_wr_init_sge(struct smc_link *lnk)
{
- int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
u32 i;
@@ -608,13 +607,14 @@ static void smc_wr_init_sge(struct smc_link *lnk)
* the larger spillover buffer, allowing easy data mapping.
*/
for (i = 0; i < lnk->wr_rx_cnt; i++) {
- int x = i * sges_per_buf;
+ int x = i * lnk->wr_rx_sge_cnt;
lnk->wr_rx_sges[x].addr =
- lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
- lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
+ lnk->wr_rx_dma_addr + i * lnk->wr_rx_buflen;
+ lnk->wr_rx_sges[x].length = smc_link_shared_v2_rxbuf(lnk) ?
+ SMC_WR_TX_SIZE : lnk->wr_rx_buflen;
lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
- if (lnk->lgr->smc_version == SMC_V2) {
+ if (lnk->lgr->smc_version == SMC_V2 && smc_link_shared_v2_rxbuf(lnk)) {
lnk->wr_rx_sges[x + 1].addr =
lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
lnk->wr_rx_sges[x + 1].length =
@@ -624,7 +624,7 @@ static void smc_wr_init_sge(struct smc_link *lnk)
}
lnk->wr_rx_ibs[i].next = NULL;
lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
- lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
+ lnk->wr_rx_ibs[i].num_sge = lnk->wr_rx_sge_cnt;
}
lnk->wr_reg.wr.next = NULL;
lnk->wr_reg.wr.num_sge = 0;
@@ -655,7 +655,7 @@ void smc_wr_free_link(struct smc_link *lnk)
if (lnk->wr_rx_dma_addr) {
ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
- SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ lnk->wr_rx_buflen * lnk->wr_rx_cnt,
DMA_FROM_DEVICE);
lnk->wr_rx_dma_addr = 0;
}
@@ -740,13 +740,11 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
int smc_wr_alloc_link_mem(struct smc_link *link)
{
- int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
-
/* allocate link related memory */
link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
if (!link->wr_tx_bufs)
goto no_mem;
- link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE,
+ link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen,
GFP_KERNEL);
if (!link->wr_rx_bufs)
goto no_mem_wr_tx_bufs;
@@ -774,7 +772,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
if (!link->wr_tx_sges)
goto no_mem_wr_tx_rdma_sges;
link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
- sizeof(link->wr_rx_sges[0]) * sges_per_buf,
+ sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt,
GFP_KERNEL);
if (!link->wr_rx_sges)
goto no_mem_wr_tx_sges;
@@ -872,7 +870,7 @@ int smc_wr_create_link(struct smc_link *lnk)
smc_wr_tx_set_wr_id(&lnk->wr_tx_id, 0);
lnk->wr_rx_id = 0;
lnk->wr_rx_dma_addr = ib_dma_map_single(
- ibdev, lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ ibdev, lnk->wr_rx_bufs, lnk->wr_rx_buflen * lnk->wr_rx_cnt,
DMA_FROM_DEVICE);
if (ib_dma_mapping_error(ibdev, lnk->wr_rx_dma_addr)) {
lnk->wr_rx_dma_addr = 0;
@@ -880,13 +878,15 @@ int smc_wr_create_link(struct smc_link *lnk)
goto out;
}
if (lnk->lgr->smc_version == SMC_V2) {
- lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
- lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
- DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
- lnk->wr_rx_v2_dma_addr = 0;
- rc = -EIO;
- goto dma_unmap;
+ if (smc_link_shared_v2_rxbuf(lnk)) {
+ lnk->wr_rx_v2_dma_addr =
+ ib_dma_map_single(ibdev, lnk->lgr->wr_rx_buf_v2,
+ SMC_WR_BUF_V2_SIZE, DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
+ lnk->wr_rx_v2_dma_addr = 0;
+ rc = -EIO;
+ goto dma_unmap;
+ }
}
lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
@@ -935,7 +935,7 @@ dma_unmap:
lnk->wr_tx_v2_dma_addr = 0;
}
ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
- SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
+ lnk->wr_rx_buflen * lnk->wr_rx_cnt,
DMA_FROM_DEVICE);
lnk->wr_rx_dma_addr = 0;
out:
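Taken together, the smc_core/smc_wr hunks stop treating wr_rx_bufs as an array of fixed-size struct smc_wr_buf and instead index a flat byte buffer by the per-link wr_rx_buflen, so links whose device cannot post a second receive SGE get one large buffer per slot. A minimal standalone sketch of the indexing scheme (names are local to the example, not kernel API):

    #include <stdint.h>
    #include <stdlib.h>

    struct demo_link {
            uint8_t *wr_rx_bufs;   /* flat buffer: wr_rx_cnt * wr_rx_buflen bytes */
            size_t   wr_rx_buflen; /* per-slot length chosen at link init */
            size_t   wr_rx_cnt;
    };

    static int demo_link_init(struct demo_link *lnk, size_t cnt, size_t buflen)
    {
            lnk->wr_rx_bufs = calloc(cnt, buflen);
            if (!lnk->wr_rx_bufs)
                    return -1;
            lnk->wr_rx_cnt = cnt;
            lnk->wr_rx_buflen = buflen;
            return 0;
    }

    static void *demo_rx_slot(struct demo_link *lnk, size_t index)
    {
            /* old scheme: &((struct smc_wr_buf *)bufs)[index], valid only when
             * every slot is exactly SMC_WR_BUF_SIZE; the new scheme scales by
             * the per-link slot length instead.
             */
            return lnk->wr_rx_bufs + index * lnk->wr_rx_buflen;
    }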
diff --git a/net/socket.c b/net/socket.c
index 9a117248f18f..38227d00d198 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -110,6 +110,8 @@
#include <linux/ptp_clock_kernel.h>
#include <trace/events/sock.h>
+#include "core/dev.h"
+
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int sysctl_net_busy_read __read_mostly;
unsigned int sysctl_net_busy_poll __read_mostly;
@@ -477,6 +479,11 @@ struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
sock->file = file;
file->private_data = sock;
stream_open(SOCK_INODE(sock), file);
+ /*
+ * Disable permission and pre-content events, but enable legacy
+ * inotify events for legacy users.
+ */
+ file_set_fsnotify_mode(file, FMODE_NONOTIFY_PERM);
return file;
}
EXPORT_SYMBOL(sock_alloc_file);
@@ -774,34 +781,6 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg,
}
EXPORT_SYMBOL(kernel_sendmsg);
-/**
- * kernel_sendmsg_locked - send a message through @sock (kernel-space)
- * @sk: sock
- * @msg: message header
- * @vec: output s/g array
- * @num: output s/g array length
- * @size: total message data size
- *
- * Builds the message data with @vec and sends it through @sock.
- * Returns the number of bytes sent, or an error code.
- * Caller must hold @sk.
- */
-
-int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg,
- struct kvec *vec, size_t num, size_t size)
-{
- struct socket *sock = sk->sk_socket;
- const struct proto_ops *ops = READ_ONCE(sock->ops);
-
- if (!ops->sendmsg_locked)
- return sock_no_sendmsg_locked(sk, msg, size);
-
- iov_iter_kvec(&msg->msg_iter, ITER_SOURCE, vec, num, size);
-
- return ops->sendmsg_locked(sk, msg, msg_data_left(msg));
-}
-EXPORT_SYMBOL(kernel_sendmsg_locked);
-
static bool skb_is_err_queue(const struct sk_buff *skb)
{
/* pkt_type of skbs enqueued on the error queue are set to
@@ -1008,12 +987,23 @@ static void sock_recv_mark(struct msghdr *msg, struct sock *sk,
}
}
+static void sock_recv_priority(struct msghdr *msg, struct sock *sk,
+ struct sk_buff *skb)
+{
+ if (sock_flag(sk, SOCK_RCVPRIORITY) && skb) {
+ __u32 priority = skb->priority;
+
+ put_cmsg(msg, SOL_SOCKET, SO_PRIORITY, sizeof(__u32), &priority);
+ }
+}
+
void __sock_recv_cmsgs(struct msghdr *msg, struct sock *sk,
struct sk_buff *skb)
{
sock_recv_timestamp(msg, sk, skb);
sock_recv_drops(msg, sk, skb);
sock_recv_mark(msg, sk, skb);
+ sock_recv_priority(msg, sk, skb);
}
EXPORT_SYMBOL_GPL(__sock_recv_cmsgs);
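The new sock_recv_priority() helper copies skb->priority into an SO_PRIORITY control message whenever the SOCK_RCVPRIORITY flag is set on the receiving socket. A hedged userspace sketch of consuming it; SO_RCVPRIORITY is assumed to be the socket option that sets this flag, and its value below is taken from the uapi headers rather than from this diff:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/uio.h>

    #ifndef SO_RCVPRIORITY
    #define SO_RCVPRIORITY 82       /* assumption: see asm-generic/socket.h */
    #endif

    static void recv_with_priority(int fd)
    {
            char data[2048], cbuf[CMSG_SPACE(sizeof(uint32_t))];
            struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
            struct msghdr msg = {
                    .msg_iov = &iov, .msg_iovlen = 1,
                    .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
            };
            struct cmsghdr *cm;
            int one = 1;

            /* opt in: makes __sock_recv_cmsgs() attach SO_PRIORITY cmsgs */
            setsockopt(fd, SOL_SOCKET, SO_RCVPRIORITY, &one, sizeof(one));

            if (recvmsg(fd, &msg, 0) < 0)
                    return;

            for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
                    if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SO_PRIORITY) {
                            uint32_t prio;

                            memcpy(&prio, CMSG_DATA(cm), sizeof(prio));
                            printf("packet priority: %u\n", prio);
                    }
            }
    }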
@@ -1155,12 +1145,10 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
*/
static DEFINE_MUTEX(br_ioctl_mutex);
-static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br,
- unsigned int cmd, struct ifreq *ifr,
+static int (*br_ioctl_hook)(struct net *net, unsigned int cmd,
void __user *uarg);
-void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br,
- unsigned int cmd, struct ifreq *ifr,
+void brioctl_set(int (*hook)(struct net *net, unsigned int cmd,
void __user *uarg))
{
mutex_lock(&br_ioctl_mutex);
@@ -1169,8 +1157,7 @@ void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br,
}
EXPORT_SYMBOL(brioctl_set);
-int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
- struct ifreq *ifr, void __user *uarg)
+int br_ioctl_call(struct net *net, unsigned int cmd, void __user *uarg)
{
int err = -ENOPKG;
@@ -1179,7 +1166,7 @@ int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
mutex_lock(&br_ioctl_mutex);
if (br_ioctl_hook)
- err = br_ioctl_hook(net, br, cmd, ifr, uarg);
+ err = br_ioctl_hook(net, cmd, uarg);
mutex_unlock(&br_ioctl_mutex);
return err;
@@ -1279,7 +1266,9 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case SIOCSIFBR:
case SIOCBRADDBR:
case SIOCBRDELBR:
- err = br_ioctl_call(net, NULL, cmd, NULL, argp);
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
+ err = br_ioctl_call(net, cmd, argp);
break;
case SIOCGIFVLAN:
case SIOCSIFVLAN:
@@ -3439,6 +3428,8 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCGPGRP:
case SIOCBRADDBR:
case SIOCBRDELBR:
+ case SIOCBRADDIF:
+ case SIOCBRDELIF:
case SIOCGIFVLAN:
case SIOCSIFVLAN:
case SIOCGSKNS:
@@ -3478,8 +3469,6 @@ static int compat_sock_ioctl_trans(struct file *file, struct socket *sock,
case SIOCGIFPFLAGS:
case SIOCGIFTXQLEN:
case SIOCSIFTXQLEN:
- case SIOCBRADDIF:
- case SIOCBRDELIF:
case SIOCGIFNAME:
case SIOCSIFNAME:
case SIOCGMIIPHY:
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 8299ceb3e373..95696f42647e 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -347,7 +347,10 @@ static int strp_read_sock(struct strparser *strp)
struct socket *sock = strp->sk->sk_socket;
read_descriptor_t desc;
- if (unlikely(!sock || !sock->ops || !sock->ops->read_sock))
+ if (unlikely(!sock || !sock->ops))
+ return -EBUSY;
+
+ if (unlikely(!strp->cb.read_sock && !sock->ops->read_sock))
return -EBUSY;
desc.arg.data = strp;
@@ -355,7 +358,10 @@ static int strp_read_sock(struct strparser *strp)
desc.count = 1; /* give more than one skb per call */
/* sk should be locked here, so okay to do read_sock */
- sock->ops->read_sock(strp->sk, &desc, strp_recv);
+ if (strp->cb.read_sock)
+ strp->cb.read_sock(strp, &desc, strp_recv);
+ else
+ sock->ops->read_sock(strp->sk, &desc, strp_recv);
desc.error = strp->cb.read_sock_done(strp, desc.error);
@@ -468,6 +474,7 @@ int strp_init(struct strparser *strp, struct sock *sk,
strp->cb.unlock = cb->unlock ? : strp_sock_unlock;
strp->cb.rcv_msg = cb->rcv_msg;
strp->cb.parse_msg = cb->parse_msg;
+ strp->cb.read_sock = cb->read_sock;
strp->cb.read_sock_done = cb->read_sock_done ? : default_read_sock_done;
strp->cb.abort_parser = cb->abort_parser ? : strp_abort_strp;
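strparser now honours an optional cb.read_sock hook, falling back to sock->ops->read_sock when it is absent, so a consumer can feed records from its own queue. A kernel-side sketch; the callback name and body are hypothetical, and the signature follows the call site in the hunk above:

    #include <net/strparser.h>

    /* hypothetical callback: pull buffered skbs and feed them to recv_actor,
     * returning the number of bytes consumed, just like ->read_sock would. */
    static int my_read_sock(struct strparser *strp, read_descriptor_t *desc,
                            sk_read_actor_t recv_actor)
    {
            /* ... walk this user's own queue and call recv_actor() per skb ... */
            return 0;
    }

    static int my_strp_setup(struct strparser *strp, struct sock *sk,
                             const struct strp_callbacks *base)
    {
            struct strp_callbacks cb = *base;

            cb.read_sock = my_read_sock;    /* optional; NULL keeps old behaviour */
            return strp_init(strp, sk, &cb);
    }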
diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile
index ad1736d93b76..452f67deebc6 100644
--- a/net/sunrpc/auth_gss/Makefile
+++ b/net/sunrpc/auth_gss/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o
-auth_rpcgss-y := auth_gss.o gss_generic_token.o \
+auth_rpcgss-y := auth_gss.o \
gss_mech_switch.o svcauth_gss.o \
gss_rpc_upcall.o gss_rpc_xdr.o trace.o
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
deleted file mode 100644
index 4a4082bb22ad..000000000000
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * linux/net/sunrpc/gss_generic_token.c
- *
- * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c
- *
- * Copyright (c) 2000 The Regents of the University of Michigan.
- * All rights reserved.
- *
- * Andy Adamson <andros@umich.edu>
- */
-
-/*
- * Copyright 1993 by OpenVision Technologies, Inc.
- *
- * Permission to use, copy, modify, distribute, and sell this software
- * and its documentation for any purpose is hereby granted without fee,
- * provided that the above copyright notice appears in all copies and
- * that both that copyright notice and this permission notice appear in
- * supporting documentation, and that the name of OpenVision not be used
- * in advertising or publicity pertaining to distribution of the software
- * without specific, written prior permission. OpenVision makes no
- * representations about the suitability of this software for any
- * purpose. It is provided "as is" without express or implied warranty.
- *
- * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
- * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
- * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
- * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
- * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
- * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- * PERFORMANCE OF THIS SOFTWARE.
- */
-
-#include <linux/types.h>
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/sunrpc/sched.h>
-#include <linux/sunrpc/gss_asn1.h>
-
-
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-# define RPCDBG_FACILITY RPCDBG_AUTH
-#endif
-
-
-/* TWRITE_STR from gssapiP_generic.h */
-#define TWRITE_STR(ptr, str, len) \
- memcpy((ptr), (char *) (str), (len)); \
- (ptr) += (len);
-
-/* XXXX this code currently makes the assumption that a mech oid will
- never be longer than 127 bytes. This assumption is not inherent in
- the interfaces, so the code can be fixed if the OSI namespace
- balloons unexpectedly. */
-
-/* Each token looks like this:
-
-0x60 tag for APPLICATION 0, SEQUENCE
- (constructed, definite-length)
- <length> possible multiple bytes, need to parse/generate
- 0x06 tag for OBJECT IDENTIFIER
- <moid_length> compile-time constant string (assume 1 byte)
- <moid_bytes> compile-time constant string
- <inner_bytes> the ANY containing the application token
- bytes 0,1 are the token type
- bytes 2,n are the token data
-
-For the purposes of this abstraction, the token "header" consists of
-the sequence tag and length octets, the mech OID DER encoding, and the
-first two inner bytes, which indicate the token type. The token
-"body" consists of everything else.
-
-*/
-
-static int
-der_length_size( int length)
-{
- if (length < (1<<7))
- return 1;
- else if (length < (1<<8))
- return 2;
-#if (SIZEOF_INT == 2)
- else
- return 3;
-#else
- else if (length < (1<<16))
- return 3;
- else if (length < (1<<24))
- return 4;
- else
- return 5;
-#endif
-}
-
-static void
-der_write_length(unsigned char **buf, int length)
-{
- if (length < (1<<7)) {
- *(*buf)++ = (unsigned char) length;
- } else {
- *(*buf)++ = (unsigned char) (der_length_size(length)+127);
-#if (SIZEOF_INT > 2)
- if (length >= (1<<24))
- *(*buf)++ = (unsigned char) (length>>24);
- if (length >= (1<<16))
- *(*buf)++ = (unsigned char) ((length>>16)&0xff);
-#endif
- if (length >= (1<<8))
- *(*buf)++ = (unsigned char) ((length>>8)&0xff);
- *(*buf)++ = (unsigned char) (length&0xff);
- }
-}
-
-/* returns decoded length, or < 0 on failure. Advances buf and
- decrements bufsize */
-
-static int
-der_read_length(unsigned char **buf, int *bufsize)
-{
- unsigned char sf;
- int ret;
-
- if (*bufsize < 1)
- return -1;
- sf = *(*buf)++;
- (*bufsize)--;
- if (sf & 0x80) {
- if ((sf &= 0x7f) > ((*bufsize)-1))
- return -1;
- if (sf > SIZEOF_INT)
- return -1;
- ret = 0;
- for (; sf; sf--) {
- ret = (ret<<8) + (*(*buf)++);
- (*bufsize)--;
- }
- } else {
- ret = sf;
- }
-
- return ret;
-}
-
-/* returns the length of a token, given the mech oid and the body size */
-
-int
-g_token_size(struct xdr_netobj *mech, unsigned int body_size)
-{
- /* set body_size to sequence contents size */
- body_size += 2 + (int) mech->len; /* NEED overflow check */
- return 1 + der_length_size(body_size) + body_size;
-}
-
-EXPORT_SYMBOL_GPL(g_token_size);
-
-/* fills in a buffer with the token header. The buffer is assumed to
- be the right size. buf is advanced past the token header */
-
-void
-g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf)
-{
- *(*buf)++ = 0x60;
- der_write_length(buf, 2 + mech->len + body_size);
- *(*buf)++ = 0x06;
- *(*buf)++ = (unsigned char) mech->len;
- TWRITE_STR(*buf, mech->data, ((int) mech->len));
-}
-
-EXPORT_SYMBOL_GPL(g_make_token_header);
-
-/*
- * Given a buffer containing a token, reads and verifies the token,
- * leaving buf advanced past the token header, and setting body_size
- * to the number of remaining bytes. Returns 0 on success,
- * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the
- * mechanism in the token does not match the mech argument. buf and
- * *body_size are left unmodified on error.
- */
-u32
-g_verify_token_header(struct xdr_netobj *mech, int *body_size,
- unsigned char **buf_in, int toksize)
-{
- unsigned char *buf = *buf_in;
- int seqsize;
- struct xdr_netobj toid;
- int ret = 0;
-
- if ((toksize-=1) < 0)
- return G_BAD_TOK_HEADER;
- if (*buf++ != 0x60)
- return G_BAD_TOK_HEADER;
-
- if ((seqsize = der_read_length(&buf, &toksize)) < 0)
- return G_BAD_TOK_HEADER;
-
- if (seqsize != toksize)
- return G_BAD_TOK_HEADER;
-
- if ((toksize-=1) < 0)
- return G_BAD_TOK_HEADER;
- if (*buf++ != 0x06)
- return G_BAD_TOK_HEADER;
-
- if ((toksize-=1) < 0)
- return G_BAD_TOK_HEADER;
- toid.len = *buf++;
-
- if ((toksize-=toid.len) < 0)
- return G_BAD_TOK_HEADER;
- toid.data = buf;
- buf+=toid.len;
-
- if (! g_OID_equal(&toid, mech))
- ret = G_WRONG_MECH;
-
- /* G_WRONG_MECH is not returned immediately because it's more important
- to return G_BAD_TOK_HEADER if the token header is in fact bad */
-
- if ((toksize-=2) < 0)
- return G_BAD_TOK_HEADER;
-
- if (ret)
- return ret;
-
- *buf_in = buf;
- *body_size = toksize;
-
- return ret;
-}
-
-EXPORT_SYMBOL_GPL(g_verify_token_header);
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index d2b02710ab07..9a27201638e2 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -442,35 +442,6 @@ encryptor(struct scatterlist *sg, void *data)
return 0;
}
-int
-gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
- int offset, struct page **pages)
-{
- int ret;
- struct encryptor_desc desc;
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-
- BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
-
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
-
- memset(desc.iv, 0, sizeof(desc.iv));
- desc.req = req;
- desc.pos = offset;
- desc.outbuf = buf;
- desc.pages = pages;
- desc.fragno = 0;
- desc.fraglen = 0;
-
- sg_init_table(desc.infrags, 4);
- sg_init_table(desc.outfrags, 4);
-
- ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc);
- skcipher_request_zero(req);
- return ret;
-}
-
struct decryptor_desc {
u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
struct skcipher_request *req;
@@ -525,32 +496,6 @@ decryptor(struct scatterlist *sg, void *data)
return 0;
}
-int
-gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm, struct xdr_buf *buf,
- int offset)
-{
- int ret;
- struct decryptor_desc desc;
- SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm);
-
- /* XXXJBF: */
- BUG_ON((buf->len - offset) % crypto_sync_skcipher_blocksize(tfm) != 0);
-
- skcipher_request_set_sync_tfm(req, tfm);
- skcipher_request_set_callback(req, 0, NULL, NULL);
-
- memset(desc.iv, 0, sizeof(desc.iv));
- desc.req = req;
- desc.fragno = 0;
- desc.fraglen = 0;
-
- sg_init_table(desc.frags, 4);
-
- ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
- skcipher_request_zero(req);
- return ret;
-}
-
/*
* This function makes the assumption that it was ultimately called
* from gss_wrap().
diff --git a/net/sunrpc/auth_gss/gss_krb5_internal.h b/net/sunrpc/auth_gss/gss_krb5_internal.h
index 3afd4065bf3d..a47e9ec228a5 100644
--- a/net/sunrpc/auth_gss/gss_krb5_internal.h
+++ b/net/sunrpc/auth_gss/gss_krb5_internal.h
@@ -172,13 +172,6 @@ u32 krb5_decrypt(struct crypto_sync_skcipher *key, void *iv, void *in,
int xdr_extend_head(struct xdr_buf *buf, unsigned int base,
unsigned int shiftlen);
-int gss_encrypt_xdr_buf(struct crypto_sync_skcipher *tfm,
- struct xdr_buf *outbuf, int offset,
- struct page **pages);
-
-int gss_decrypt_xdr_buf(struct crypto_sync_skcipher *tfm,
- struct xdr_buf *inbuf, int offset);
-
u32 gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
struct xdr_buf *buf, struct page **pages);
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index fae632da1058..c84d0cf61980 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -13,7 +13,6 @@
#include <linux/module.h>
#include <linux/oid_registry.h>
#include <linux/sunrpc/msg_prot.h>
-#include <linux/sunrpc/gss_asn1.h>
#include <linux/sunrpc/auth_gss.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/gss_err.h>
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 7fcb0574fc79..7ce5e28a6c03 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -281,21 +281,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
return rv;
}
-/*
- * This is the generic cache management routine for all
- * the authentication caches.
- * It checks the currency of a cache item and will (later)
- * initiate an upcall to fill it if needed.
- *
- *
- * Returns 0 if the cache_head can be used, or cache_puts it and returns
- * -EAGAIN if upcall is pending and request has been queued
- * -ETIMEDOUT if upcall failed or request could not be queue or
- * upcall completed but item is still invalid (implying that
- * the cache item has been replaced with a newer one).
- * -ENOENT if cache entry was negative
- */
-int cache_check(struct cache_detail *detail,
+int cache_check_rcu(struct cache_detail *detail,
struct cache_head *h, struct cache_req *rqstp)
{
int rv;
@@ -336,6 +322,31 @@ int cache_check(struct cache_detail *detail,
rv = -ETIMEDOUT;
}
}
+
+ return rv;
+}
+EXPORT_SYMBOL_GPL(cache_check_rcu);
+
+/*
+ * This is the generic cache management routine for all
+ * the authentication caches.
+ * It checks the currency of a cache item and will (later)
+ * initiate an upcall to fill it if needed.
+ *
+ *
+ * Returns 0 if the cache_head can be used, or cache_puts it and returns
+ * -EAGAIN if upcall is pending and request has been queued
+ * -ETIMEDOUT if upcall failed or request could not be queued or
+ * upcall completed but item is still invalid (implying that
+ * the cache item has been replaced with a newer one).
+ * -ENOENT if cache entry was negative
+ */
+int cache_check(struct cache_detail *detail,
+ struct cache_head *h, struct cache_req *rqstp)
+{
+ int rv;
+
+ rv = cache_check_rcu(detail, h, rqstp);
if (rv)
cache_put(h, detail);
return rv;
@@ -1427,17 +1438,11 @@ static int c_show(struct seq_file *m, void *p)
seq_printf(m, "# expiry=%lld refcnt=%d flags=%lx\n",
convert_to_wallclock(cp->expiry_time),
kref_read(&cp->ref), cp->flags);
- if (!cache_get_rcu(cp))
- return 0;
- if (cache_check(cd, cp, NULL))
- /* cache_check does a cache_put on failure */
+ if (cache_check_rcu(cd, cp, NULL))
+ seq_puts(m, "# ");
+ else if (cache_is_expired(cd, cp))
seq_puts(m, "# ");
- else {
- if (cache_is_expired(cd, cp))
- seq_puts(m, "# ");
- cache_put(cp, cd);
- }
return cd->cache_show(m, cd, cp);
}
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0090162ee8c3..2fe88ea79a70 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -958,12 +958,17 @@ void rpc_shutdown_client(struct rpc_clnt *clnt)
trace_rpc_clnt_shutdown(clnt);
+ clnt->cl_shutdown = 1;
while (!list_empty(&clnt->cl_tasks)) {
rpc_killall_tasks(clnt);
wait_event_timeout(destroy_wait,
list_empty(&clnt->cl_tasks), 1*HZ);
}
+ /* wait for tasks still in workqueue or waitqueue */
+ wait_event_timeout(destroy_wait,
+ atomic_read(&clnt->cl_task_count) == 0, 1 * HZ);
+
rpc_release_client(clnt);
}
EXPORT_SYMBOL_GPL(rpc_shutdown_client);
@@ -1139,6 +1144,7 @@ void rpc_task_release_client(struct rpc_task *task)
list_del(&task->tk_task);
spin_unlock(&clnt->cl_lock);
task->tk_client = NULL;
+ atomic_dec(&clnt->cl_task_count);
rpc_release_client(clnt);
}
@@ -1189,10 +1195,7 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
task->tk_flags |= RPC_TASK_TIMEOUT;
if (clnt->cl_noretranstimeo)
task->tk_flags |= RPC_TASK_NO_RETRANS_TIMEOUT;
- /* Add to the client's list of all tasks */
- spin_lock(&clnt->cl_lock);
- list_add_tail(&task->tk_task, &clnt->cl_tasks);
- spin_unlock(&clnt->cl_lock);
+ atomic_inc(&clnt->cl_task_count);
}
static void
@@ -1787,9 +1790,14 @@ call_reserveresult(struct rpc_task *task)
if (status >= 0) {
if (task->tk_rqstp) {
task->tk_action = call_refresh;
+
+ /* Add to the client's list of all tasks */
+ spin_lock(&task->tk_client->cl_lock);
+ if (list_empty(&task->tk_task))
+ list_add_tail(&task->tk_task, &task->tk_client->cl_tasks);
+ spin_unlock(&task->tk_client->cl_lock);
return;
}
-
rpc_call_rpcerror(task, -EIO);
return;
}
@@ -1854,13 +1862,13 @@ call_refreshresult(struct rpc_task *task)
fallthrough;
case -EAGAIN:
status = -EACCES;
- fallthrough;
- case -EKEYEXPIRED:
if (!task->tk_cred_retry)
break;
task->tk_cred_retry--;
trace_rpc_retry_refresh_status(task);
return;
+ case -EKEYEXPIRED:
+ break;
case -ENOMEM:
rpc_delay(task, HZ >> 4);
return;
@@ -3319,8 +3327,11 @@ bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
-static void rpc_show_header(void)
+static void rpc_show_header(struct rpc_clnt *clnt)
{
+ printk(KERN_INFO "clnt[%pISpc] RPC tasks[%d]\n",
+ (struct sockaddr *)&clnt->cl_xprt->addr,
+ atomic_read(&clnt->cl_task_count));
printk(KERN_INFO "-pid- flgs status -client- --rqstp- "
"-timeout ---ops--\n");
}
@@ -3352,7 +3363,7 @@ void rpc_show_tasks(struct net *net)
spin_lock(&clnt->cl_lock);
list_for_each_entry(task, &clnt->cl_tasks, tk_task) {
if (!header) {
- rpc_show_header();
+ rpc_show_header(clnt);
header++;
}
rpc_show_task(clnt, task);
diff --git a/net/sunrpc/debugfs.c b/net/sunrpc/debugfs.c
index a176d5a0b0ee..32417db340de 100644
--- a/net/sunrpc/debugfs.c
+++ b/net/sunrpc/debugfs.c
@@ -74,6 +74,9 @@ tasks_stop(struct seq_file *f, void *v)
{
struct rpc_clnt *clnt = f->private;
spin_unlock(&clnt->cl_lock);
+ seq_printf(f, "clnt[%pISpc] RPC tasks[%d]\n",
+ (struct sockaddr *)&clnt->cl_xprt->addr,
+ atomic_read(&clnt->cl_task_count));
}
static const struct seq_operations tasks_seq_operations = {
@@ -179,6 +182,18 @@ xprt_info_show(struct seq_file *f, void *v)
seq_printf(f, "addr: %s\n", xprt->address_strings[RPC_DISPLAY_ADDR]);
seq_printf(f, "port: %s\n", xprt->address_strings[RPC_DISPLAY_PORT]);
seq_printf(f, "state: 0x%lx\n", xprt->state);
+ seq_printf(f, "netns: %u\n", xprt->xprt_net->ns.inum);
+
+ if (xprt->ops->get_srcaddr) {
+ int ret, buflen;
+ char buf[INET6_ADDRSTRLEN];
+
+ buflen = ARRAY_SIZE(buf);
+ ret = xprt->ops->get_srcaddr(xprt, buf, buflen);
+ if (ret < 0)
+ ret = sprintf(buf, "<closed>");
+ seq_printf(f, "saddr: %.*s\n", ret, buf);
+ }
return 0;
}
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 7ce3721c06ca..eadc00410ebc 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -630,7 +630,7 @@ static int __rpc_rmpipe(struct inode *dir, struct dentry *dentry)
static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
const char *name)
{
- struct qstr q = QSTR_INIT(name, strlen(name));
+ struct qstr q = QSTR(name);
struct dentry *dentry = d_hash_and_lookup(parent, &q);
if (!dentry) {
dentry = d_alloc(parent, &q);
@@ -1190,8 +1190,7 @@ static const struct rpc_filelist files[] = {
struct dentry *rpc_d_lookup_sb(const struct super_block *sb,
const unsigned char *dir_name)
{
- struct qstr dir = QSTR_INIT(dir_name, strlen(dir_name));
- return d_hash_and_lookup(sb->s_root, &dir);
+ return d_hash_and_lookup(sb->s_root, &QSTR(dir_name));
}
EXPORT_SYMBOL_GPL(rpc_d_lookup_sb);
@@ -1300,11 +1299,9 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
struct dentry *gssd_dentry;
struct dentry *clnt_dentry = NULL;
struct dentry *pipe_dentry = NULL;
- struct qstr q = QSTR_INIT(files[RPCAUTH_gssd].name,
- strlen(files[RPCAUTH_gssd].name));
/* We should never get this far if "gssd" doesn't exist */
- gssd_dentry = d_hash_and_lookup(root, &q);
+ gssd_dentry = d_hash_and_lookup(root, &QSTR(files[RPCAUTH_gssd].name));
if (!gssd_dentry)
return ERR_PTR(-ENOENT);
@@ -1314,9 +1311,8 @@ rpc_gssd_dummy_populate(struct dentry *root, struct rpc_pipe *pipe_data)
goto out;
}
- q.name = gssd_dummy_clnt_dir[0].name;
- q.len = strlen(gssd_dummy_clnt_dir[0].name);
- clnt_dentry = d_hash_and_lookup(gssd_dentry, &q);
+ clnt_dentry = d_hash_and_lookup(gssd_dentry,
+ &QSTR(gssd_dummy_clnt_dir[0].name));
if (!clnt_dentry) {
__rpc_depopulate(gssd_dentry, gssd_dummy_clnt_dir, 0, 1);
pipe_dentry = ERR_PTR(-ENOENT);
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index 79879b7d39cb..e7f9c295d13c 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -651,8 +651,8 @@ svc_init_buffer(struct svc_rqst *rqstp, unsigned int size, int node)
if (pages > RPCSVC_MAXPAGES)
pages = RPCSVC_MAXPAGES;
- ret = alloc_pages_bulk_array_node(GFP_KERNEL, node, pages,
- rqstp->rq_pages);
+ ret = alloc_pages_bulk_node(GFP_KERNEL, node, pages,
+ rqstp->rq_pages);
return ret == pages;
}
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 43c57124de52..ae25405d8bd2 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -606,7 +606,8 @@ int svc_port_is_privileged(struct sockaddr *sin)
}
/*
- * Make sure that we don't have too many active connections. If we have,
+ * Make sure that we don't have too many connections that have not yet
+ * demonstrated that they have access to the NFS server. If we have,
* something must be dropped. It's not clear what will happen if we allow
* "too many" connections, but when dealing with network-facing software,
* we have to code defensively. Here we do that by imposing hard limits.
@@ -618,34 +619,26 @@ int svc_port_is_privileged(struct sockaddr *sin)
* The only somewhat efficient mechanism would be to drop old
* connections from the same IP first. But right now we don't even
* record the client IP in svc_sock.
- *
- * single-threaded services that expect a lot of clients will probably
- * need to set sv_maxconn to override the default value which is based
- * on the number of threads
*/
static void svc_check_conn_limits(struct svc_serv *serv)
{
- unsigned int limit = serv->sv_maxconn ? serv->sv_maxconn :
- (serv->sv_nrthreads+3) * 20;
-
- if (serv->sv_tmpcnt > limit) {
- struct svc_xprt *xprt = NULL;
+ if (serv->sv_tmpcnt > XPT_MAX_TMP_CONN) {
+ struct svc_xprt *xprt = NULL, *xprti;
spin_lock_bh(&serv->sv_lock);
if (!list_empty(&serv->sv_tempsocks)) {
- /* Try to help the admin */
- net_notice_ratelimited("%s: too many open connections, consider increasing the %s\n",
- serv->sv_name, serv->sv_maxconn ?
- "max number of connections" :
- "number of threads");
/*
* Always select the oldest connection. It's not fair,
- * but so is life
+ * but nor is life.
*/
- xprt = list_entry(serv->sv_tempsocks.prev,
- struct svc_xprt,
- xpt_list);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
- svc_xprt_get(xprt);
+ list_for_each_entry_reverse(xprti, &serv->sv_tempsocks,
+ xpt_list) {
+ if (!test_bit(XPT_PEER_VALID, &xprti->xpt_flags)) {
+ xprt = xprti;
+ set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ svc_xprt_get(xprt);
+ break;
+ }
+ }
}
spin_unlock_bh(&serv->sv_lock);
@@ -671,8 +664,7 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
}
for (filled = 0; filled < pages; filled = ret) {
- ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
- rqstp->rq_pages);
+ ret = alloc_pages_bulk(GFP_KERNEL, pages, rqstp->rq_pages);
if (ret > filled)
/* Made progress, don't sleep yet */
continue;
@@ -1039,7 +1031,8 @@ static void svc_delete_xprt(struct svc_xprt *xprt)
spin_lock_bh(&serv->sv_lock);
list_del_init(&xprt->xpt_list);
- if (test_bit(XPT_TEMP, &xprt->xpt_flags))
+ if (test_bit(XPT_TEMP, &xprt->xpt_flags) &&
+ !test_bit(XPT_PEER_VALID, &xprt->xpt_flags))
serv->sv_tmpcnt--;
spin_unlock_bh(&serv->sv_lock);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 62e07c330a66..4e003cb516fe 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1097,6 +1097,12 @@ out_overflow:
* Checks that we have enough buffer space to encode 'nbytes' more
* bytes of data. If so, update the total xdr_buf length, and
* adjust the length of the current kvec.
+ *
+ * The returned pointer is valid only until the next call to
+ * xdr_reserve_space() or xdr_commit_encode() on @xdr. The current
+ * implementation of this API guarantees that space reserved for a
+ * four-byte data item remains valid until @xdr is destroyed, but
+ * that might not always be true in the future.
*/
__be32 * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes)
{
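The added kdoc text spells out the lifetime rule: the pointer returned by xdr_reserve_space() is only guaranteed valid until the next reserve or commit on the same stream. A short sketch of the intended usage pattern (the encoder name is illustrative):

    #include <linux/sunrpc/xdr.h>

    static int encode_two_words(struct xdr_stream *xdr, u32 a, u32 b)
    {
            __be32 *p;

            p = xdr_reserve_space(xdr, 2 * XDR_UNIT);
            if (!p)
                    return -EMSGSIZE;
            *p++ = cpu_to_be32(a);
            *p = cpu_to_be32(b);    /* finish with p before reserving again */
            return 0;
    }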
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 720d3ba742ec..7e98d4dd9f10 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -603,23 +603,6 @@ struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi,
}
/**
- * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor
- * @xpi: pointer to rpc_xprt_iter
- *
- * Returns a reference to the struct rpc_xprt that is currently
- * pointed to by the cursor.
- */
-struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi)
-{
- struct rpc_xprt *xprt;
-
- rcu_read_lock();
- xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt);
- rcu_read_unlock();
- return xprt;
-}
-
-/**
* xprt_iter_get_next - Returns the next rpc_xprt following the cursor
* @xpi: pointer to rpc_xprt_iter
*
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index c3fbf0779d4a..aca8bdf65d72 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -621,7 +621,8 @@ static void __svc_rdma_free(struct work_struct *work)
/* Destroy the CM ID */
rdma_destroy_id(rdma->sc_cm_id);
- rpcrdma_rn_unregister(device, &rdma->sc_rn);
+ if (!test_bit(XPT_LISTENER, &rdma->sc_xprt.xpt_flags))
+ rpcrdma_rn_unregister(device, &rdma->sc_rn);
kfree(rdma);
}
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 6488ead9e464..4d5fbacef496 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -472,7 +472,7 @@ bool switchdev_port_obj_act_is_deferred(struct net_device *dev,
EXPORT_SYMBOL_GPL(switchdev_port_obj_act_is_deferred);
static ATOMIC_NOTIFIER_HEAD(switchdev_notif_chain);
-static BLOCKING_NOTIFIER_HEAD(switchdev_blocking_notif_chain);
+static RAW_NOTIFIER_HEAD(switchdev_blocking_notif_chain);
/**
* register_switchdev_notifier - Register notifier
@@ -518,17 +518,27 @@ EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
int register_switchdev_blocking_notifier(struct notifier_block *nb)
{
- struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain;
+ struct raw_notifier_head *chain = &switchdev_blocking_notif_chain;
+ int err;
+
+ rtnl_lock();
+ err = raw_notifier_chain_register(chain, nb);
+ rtnl_unlock();
- return blocking_notifier_chain_register(chain, nb);
+ return err;
}
EXPORT_SYMBOL_GPL(register_switchdev_blocking_notifier);
int unregister_switchdev_blocking_notifier(struct notifier_block *nb)
{
- struct blocking_notifier_head *chain = &switchdev_blocking_notif_chain;
+ struct raw_notifier_head *chain = &switchdev_blocking_notif_chain;
+ int err;
- return blocking_notifier_chain_unregister(chain, nb);
+ rtnl_lock();
+ err = raw_notifier_chain_unregister(chain, nb);
+ rtnl_unlock();
+
+ return err;
}
EXPORT_SYMBOL_GPL(unregister_switchdev_blocking_notifier);
@@ -536,10 +546,11 @@ int call_switchdev_blocking_notifiers(unsigned long val, struct net_device *dev,
struct switchdev_notifier_info *info,
struct netlink_ext_ack *extack)
{
+ ASSERT_RTNL();
info->dev = dev;
info->extack = extack;
- return blocking_notifier_call_chain(&switchdev_blocking_notif_chain,
- val, info);
+ return raw_notifier_call_chain(&switchdev_blocking_notif_chain,
+ val, info);
}
EXPORT_SYMBOL_GPL(call_switchdev_blocking_notifiers);
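The blocking switchdev chain becomes a raw notifier chain serialized by RTNL: register/unregister take the lock internally (so they must not be called with RTNL already held), while call_switchdev_blocking_notifiers() now asserts that the caller holds it. A minimal caller sketch; the wrapper name is hypothetical:

    #include <linux/rtnetlink.h>
    #include <net/switchdev.h>

    static int notify_fdb_add_under_rtnl(struct net_device *dev,
                                         struct switchdev_notifier_info *info,
                                         struct netlink_ext_ack *extack)
    {
            int err;

            rtnl_lock();
            err = call_switchdev_blocking_notifiers(SWITCHDEV_FDB_ADD_TO_DEVICE,
                                                    dev, info, extack);
            rtnl_unlock();

            return notifier_to_errno(err);
    }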
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 5c2088a469ce..5689e1f48547 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1046,6 +1046,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
if (imp == TIPC_SYSTEM_IMPORTANCE) {
pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
+ __skb_queue_purge(list);
return -ENOBUFS;
}
rc = link_schedule_user(l, hdr);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index d1180370fdf4..e74940eab3a4 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -949,8 +949,8 @@ void tipc_nametbl_stop(struct net *net)
}
spin_unlock_bh(&tn->nametbl_lock);
- synchronize_net();
- kfree(nt);
+ /* TODO: clear tn->nametbl, implement proper RCU rules ? */
+ kfree_rcu(nt, rcu);
}
static int __tipc_nl_add_nametable_publ(struct tipc_nl_msg *msg,
diff --git a/net/tipc/name_table.h b/net/tipc/name_table.h
index 3bcd9ef8cee3..7ff6eeebaae6 100644
--- a/net/tipc/name_table.h
+++ b/net/tipc/name_table.h
@@ -90,6 +90,7 @@ struct publication {
/**
* struct name_table - table containing all existing port name publications
+ * @rcu: RCU callback head used for deferred freeing
* @services: name sequence hash lists
* @node_scope: all local publications with node scope
* - used by name_distr during re-init of name table
@@ -102,6 +103,7 @@ struct publication {
* @snd_nxt: next sequence number to be used
*/
struct name_table {
+ struct rcu_head rcu;
struct hlist_head services[TIPC_NAMETBL_SIZE];
struct list_head node_scope;
struct list_head cluster_scope;
diff --git a/net/tls/tls.h b/net/tls/tls.h
index e5e47452308a..774859b63f0d 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -145,7 +145,8 @@ void tls_err_abort(struct sock *sk, int err);
int init_prot_info(struct tls_prot_info *prot,
const struct tls_crypto_info *crypto_info,
const struct tls_cipher_desc *cipher_desc);
-int tls_set_sw_offload(struct sock *sk, int tx);
+int tls_set_sw_offload(struct sock *sk, int tx,
+ struct tls_crypto_info *new_crypto_info);
void tls_update_rx_zc_capable(struct tls_context *tls_ctx);
void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx);
void tls_sw_strparser_done(struct tls_context *tls_ctx);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index dc063c2c7950..e50b6e71df13 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -1227,7 +1227,7 @@ int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
context->resync_nh_reset = 1;
ctx->priv_ctx_rx = context;
- rc = tls_set_sw_offload(sk, 0);
+ rc = tls_set_sw_offload(sk, 0, NULL);
if (rc)
goto release_ctx;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 6b4b9f2749a6..4d7702ce17c0 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -423,9 +423,10 @@ static __poll_t tls_sk_poll(struct file *file, struct socket *sock,
ctx = tls_sw_ctx_rx(tls_ctx);
psock = sk_psock_get(sk);
- if (skb_queue_empty_lockless(&ctx->rx_list) &&
- !tls_strp_msg_ready(ctx) &&
- sk_psock_queue_empty(psock))
+ if ((skb_queue_empty_lockless(&ctx->rx_list) &&
+ !tls_strp_msg_ready(ctx) &&
+ sk_psock_queue_empty(psock)) ||
+ READ_ONCE(ctx->key_update_pending))
mask &= ~(EPOLLIN | EPOLLRDNORM);
if (psock)
@@ -612,11 +613,13 @@ static int validate_crypto_info(const struct tls_crypto_info *crypto_info,
static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
unsigned int optlen, int tx)
{
- struct tls_crypto_info *crypto_info;
- struct tls_crypto_info *alt_crypto_info;
+ struct tls_crypto_info *crypto_info, *alt_crypto_info;
+ struct tls_crypto_info *old_crypto_info = NULL;
struct tls_context *ctx = tls_get_ctx(sk);
const struct tls_cipher_desc *cipher_desc;
union tls_crypto_context *crypto_ctx;
+ union tls_crypto_context tmp = {};
+ bool update = false;
int rc = 0;
int conf;
@@ -633,9 +636,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
crypto_info = &crypto_ctx->info;
- /* Currently we don't support set crypto info more than one time */
- if (TLS_CRYPTO_INFO_READY(crypto_info))
- return -EBUSY;
+ if (TLS_CRYPTO_INFO_READY(crypto_info)) {
+ /* Currently we only support setting crypto info more
+ * than one time for TLS 1.3
+ */
+ if (crypto_info->version != TLS_1_3_VERSION) {
+ TLS_INC_STATS(sock_net(sk), tx ? LINUX_MIB_TLSTXREKEYERROR
+ : LINUX_MIB_TLSRXREKEYERROR);
+ return -EBUSY;
+ }
+
+ update = true;
+ old_crypto_info = crypto_info;
+ crypto_info = &tmp.info;
+ crypto_ctx = &tmp;
+ }
rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info));
if (rc) {
@@ -643,7 +658,14 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
goto err_crypto_info;
}
- rc = validate_crypto_info(crypto_info, alt_crypto_info);
+ if (update) {
+ /* Ensure that TLS version and ciphers are not modified */
+ if (crypto_info->version != old_crypto_info->version ||
+ crypto_info->cipher_type != old_crypto_info->cipher_type)
+ rc = -EINVAL;
+ } else {
+ rc = validate_crypto_info(crypto_info, alt_crypto_info);
+ }
if (rc)
goto err_crypto_info;
@@ -673,11 +695,17 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE);
TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE);
} else {
- rc = tls_set_sw_offload(sk, 1);
+ rc = tls_set_sw_offload(sk, 1,
+ update ? crypto_info : NULL);
if (rc)
goto err_crypto_info;
- TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW);
- TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
+
+ if (update) {
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXREKEYOK);
+ } else {
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW);
+ }
conf = TLS_SW;
}
} else {
@@ -687,14 +715,21 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE);
TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE);
} else {
- rc = tls_set_sw_offload(sk, 0);
+ rc = tls_set_sw_offload(sk, 0,
+ update ? crypto_info : NULL);
if (rc)
goto err_crypto_info;
- TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW);
- TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW);
+
+ if (update) {
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYOK);
+ } else {
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW);
+ }
conf = TLS_SW;
}
- tls_sw_strparser_arm(sk, ctx);
+ if (!update)
+ tls_sw_strparser_arm(sk, ctx);
}
if (tx)
@@ -702,6 +737,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
else
ctx->rx_conf = conf;
update_sk_prot(sk, ctx);
+
+ if (update)
+ return 0;
+
if (tx) {
ctx->sk_write_space = sk->sk_write_space;
sk->sk_write_space = tls_write_space;
@@ -713,6 +752,10 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval,
return 0;
err_crypto_info:
+ if (update) {
+ TLS_INC_STATS(sock_net(sk), tx ? LINUX_MIB_TLSTXREKEYERROR
+ : LINUX_MIB_TLSRXREKEYERROR);
+ }
memzero_explicit(crypto_ctx, sizeof(*crypto_ctx));
return rc;
}
@@ -809,6 +852,11 @@ static int tls_setsockopt(struct sock *sk, int level, int optname,
return do_tls_setsockopt(sk, optname, optval, optlen);
}
+static int tls_disconnect(struct sock *sk, int flags)
+{
+ return -EOPNOTSUPP;
+}
+
struct tls_context *tls_ctx_create(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -904,6 +952,7 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_BASE][TLS_BASE] = *base;
prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt;
prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt;
+ prot[TLS_BASE][TLS_BASE].disconnect = tls_disconnect;
prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close;
prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE];
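With these changes a second setsockopt(SOL_TLS, TLS_TX/TLS_RX) on a TLS 1.3 socket performs a key update instead of failing with -EBUSY, provided the version and cipher match the existing state. A hedged userspace sketch for the transmit direction; the new key material would come from the TLS library that negotiated the KeyUpdate:

    #include <linux/tls.h>
    #include <string.h>
    #include <sys/socket.h>

    #ifndef SOL_TLS
    #define SOL_TLS 282
    #endif

    static int ktls_tx_rekey(int fd, const unsigned char *key,
                             const unsigned char *iv, const unsigned char *salt,
                             const unsigned char *rec_seq)
    {
            struct tls12_crypto_info_aes_gcm_256 ci;

            memset(&ci, 0, sizeof(ci));
            ci.info.version = TLS_1_3_VERSION;        /* must match old state */
            ci.info.cipher_type = TLS_CIPHER_AES_GCM_256;
            memcpy(ci.key, key, TLS_CIPHER_AES_GCM_256_KEY_SIZE);
            memcpy(ci.iv, iv, TLS_CIPHER_AES_GCM_256_IV_SIZE);
            memcpy(ci.salt, salt, TLS_CIPHER_AES_GCM_256_SALT_SIZE);
            memcpy(ci.rec_seq, rec_seq, TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE);

            /* a repeated TLS_TX call is now treated as a rekey on TLS 1.3 */
            return setsockopt(fd, SOL_TLS, TLS_TX, &ci, sizeof(ci));
    }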
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 68982728f620..367666aa07b8 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -22,6 +22,11 @@ static const struct snmp_mib tls_mib_list[] = {
SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC),
SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY),
SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL),
+ SNMP_MIB_ITEM("TlsRxRekeyOk", LINUX_MIB_TLSRXREKEYOK),
+ SNMP_MIB_ITEM("TlsRxRekeyError", LINUX_MIB_TLSRXREKEYERROR),
+ SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK),
+ SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR),
+ SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED),
SNMP_MIB_SENTINEL
};
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 7bcc9b4408a2..914d4e1516a3 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -1314,6 +1314,10 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock,
int ret = 0;
long timeo;
+ /* a rekey is pending, let userspace deal with it */
+ if (unlikely(ctx->key_update_pending))
+ return -EKEYEXPIRED;
+
timeo = sock_rcvtimeo(sk, nonblock);
while (!tls_strp_msg_ready(ctx)) {
@@ -1720,6 +1724,36 @@ tls_decrypt_device(struct sock *sk, struct msghdr *msg,
return 1;
}
+static int tls_check_pending_rekey(struct sock *sk, struct tls_context *ctx,
+ struct sk_buff *skb)
+{
+ const struct strp_msg *rxm = strp_msg(skb);
+ const struct tls_msg *tlm = tls_msg(skb);
+ char hs_type;
+ int err;
+
+ if (likely(tlm->control != TLS_RECORD_TYPE_HANDSHAKE))
+ return 0;
+
+ if (rxm->full_len < 1)
+ return 0;
+
+ err = skb_copy_bits(skb, rxm->offset, &hs_type, 1);
+ if (err < 0) {
+ DEBUG_NET_WARN_ON_ONCE(1);
+ return err;
+ }
+
+ if (hs_type == TLS_HANDSHAKE_KEYUPDATE) {
+ struct tls_sw_context_rx *rx_ctx = ctx->priv_ctx_rx;
+
+ WRITE_ONCE(rx_ctx->key_update_pending, true);
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYRECEIVED);
+ }
+
+ return 0;
+}
+
static int tls_rx_one_record(struct sock *sk, struct msghdr *msg,
struct tls_decrypt_arg *darg)
{
@@ -1739,7 +1773,7 @@ static int tls_rx_one_record(struct sock *sk, struct msghdr *msg,
rxm->full_len -= prot->overhead_size;
tls_advance_record_sn(sk, prot, &tls_ctx->rx);
- return 0;
+ return tls_check_pending_rekey(sk, tls_ctx, darg->skb);
}
int decrypt_skb(struct sock *sk, struct scatterlist *sgout)
@@ -2684,12 +2718,22 @@ int init_prot_info(struct tls_prot_info *prot,
return 0;
}
-int tls_set_sw_offload(struct sock *sk, int tx)
+static void tls_finish_key_update(struct sock *sk, struct tls_context *tls_ctx)
+{
+ struct tls_sw_context_rx *ctx = tls_ctx->priv_ctx_rx;
+
+ WRITE_ONCE(ctx->key_update_pending, false);
+ /* wake-up pre-existing poll() */
+ ctx->saved_data_ready(sk);
+}
+
+int tls_set_sw_offload(struct sock *sk, int tx,
+ struct tls_crypto_info *new_crypto_info)
{
+ struct tls_crypto_info *crypto_info, *src_crypto_info;
struct tls_sw_context_tx *sw_ctx_tx = NULL;
struct tls_sw_context_rx *sw_ctx_rx = NULL;
const struct tls_cipher_desc *cipher_desc;
- struct tls_crypto_info *crypto_info;
char *iv, *rec_seq, *key, *salt;
struct cipher_context *cctx;
struct tls_prot_info *prot;
@@ -2701,44 +2745,47 @@ int tls_set_sw_offload(struct sock *sk, int tx)
ctx = tls_get_ctx(sk);
prot = &ctx->prot_info;
- if (tx) {
- ctx->priv_ctx_tx = init_ctx_tx(ctx, sk);
- if (!ctx->priv_ctx_tx)
- return -ENOMEM;
+ /* new_crypto_info != NULL means rekey */
+ if (!new_crypto_info) {
+ if (tx) {
+ ctx->priv_ctx_tx = init_ctx_tx(ctx, sk);
+ if (!ctx->priv_ctx_tx)
+ return -ENOMEM;
+ } else {
+ ctx->priv_ctx_rx = init_ctx_rx(ctx);
+ if (!ctx->priv_ctx_rx)
+ return -ENOMEM;
+ }
+ }
+ if (tx) {
sw_ctx_tx = ctx->priv_ctx_tx;
crypto_info = &ctx->crypto_send.info;
cctx = &ctx->tx;
aead = &sw_ctx_tx->aead_send;
} else {
- ctx->priv_ctx_rx = init_ctx_rx(ctx);
- if (!ctx->priv_ctx_rx)
- return -ENOMEM;
-
sw_ctx_rx = ctx->priv_ctx_rx;
crypto_info = &ctx->crypto_recv.info;
cctx = &ctx->rx;
aead = &sw_ctx_rx->aead_recv;
}
- cipher_desc = get_cipher_desc(crypto_info->cipher_type);
+ src_crypto_info = new_crypto_info ?: crypto_info;
+
+ cipher_desc = get_cipher_desc(src_crypto_info->cipher_type);
if (!cipher_desc) {
rc = -EINVAL;
goto free_priv;
}
- rc = init_prot_info(prot, crypto_info, cipher_desc);
+ rc = init_prot_info(prot, src_crypto_info, cipher_desc);
if (rc)
goto free_priv;
- iv = crypto_info_iv(crypto_info, cipher_desc);
- key = crypto_info_key(crypto_info, cipher_desc);
- salt = crypto_info_salt(crypto_info, cipher_desc);
- rec_seq = crypto_info_rec_seq(crypto_info, cipher_desc);
-
- memcpy(cctx->iv, salt, cipher_desc->salt);
- memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv);
- memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq);
+ iv = crypto_info_iv(src_crypto_info, cipher_desc);
+ key = crypto_info_key(src_crypto_info, cipher_desc);
+ salt = crypto_info_salt(src_crypto_info, cipher_desc);
+ rec_seq = crypto_info_rec_seq(src_crypto_info, cipher_desc);
if (!*aead) {
*aead = crypto_alloc_aead(cipher_desc->cipher_name, 0, 0);
@@ -2751,20 +2798,30 @@ int tls_set_sw_offload(struct sock *sk, int tx)
ctx->push_pending_record = tls_sw_push_pending_record;
+ /* setkey is the last operation that could fail during a
+ * rekey. If it succeeds, we can start modifying the
+ * context.
+ */
rc = crypto_aead_setkey(*aead, key, cipher_desc->key);
- if (rc)
- goto free_aead;
+ if (rc) {
+ if (new_crypto_info)
+ goto out;
+ else
+ goto free_aead;
+ }
- rc = crypto_aead_setauthsize(*aead, prot->tag_size);
- if (rc)
- goto free_aead;
+ if (!new_crypto_info) {
+ rc = crypto_aead_setauthsize(*aead, prot->tag_size);
+ if (rc)
+ goto free_aead;
+ }
- if (sw_ctx_rx) {
+ if (!tx && !new_crypto_info) {
tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv);
tls_update_rx_zc_capable(ctx);
sw_ctx_rx->async_capable =
- crypto_info->version != TLS_1_3_VERSION &&
+ src_crypto_info->version != TLS_1_3_VERSION &&
!!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC);
rc = tls_strp_init(&sw_ctx_rx->strp, sk);
@@ -2772,18 +2829,33 @@ int tls_set_sw_offload(struct sock *sk, int tx)
goto free_aead;
}
+ memcpy(cctx->iv, salt, cipher_desc->salt);
+ memcpy(cctx->iv + cipher_desc->salt, iv, cipher_desc->iv);
+ memcpy(cctx->rec_seq, rec_seq, cipher_desc->rec_seq);
+
+ if (new_crypto_info) {
+ unsafe_memcpy(crypto_info, new_crypto_info,
+ cipher_desc->crypto_info,
+ /* size was checked in do_tls_setsockopt_conf */);
+ memzero_explicit(new_crypto_info, cipher_desc->crypto_info);
+ if (!tx)
+ tls_finish_key_update(sk, ctx);
+ }
+
goto out;
free_aead:
crypto_free_aead(*aead);
*aead = NULL;
free_priv:
- if (tx) {
- kfree(ctx->priv_ctx_tx);
- ctx->priv_ctx_tx = NULL;
- } else {
- kfree(ctx->priv_ctx_rx);
- ctx->priv_ctx_rx = NULL;
+ if (!new_crypto_info) {
+ if (tx) {
+ kfree(ctx->priv_ctx_tx);
+ ctx->priv_ctx_tx = NULL;
+ } else {
+ kfree(ctx->priv_ctx_rx);
+ ctx->priv_ctx_rx = NULL;
+ }
}
out:
return rc;
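On the receive path, a peer KeyUpdate sets key_update_pending, and until the application installs the next RX key, tls_rx_rec_wait() fails with -EKEYEXPIRED and poll() masks EPOLLIN. A hedged userspace sketch of the resulting recv loop; install_next_rx_key() stands in for application/TLS-library logic and is not a real API:

    #include <errno.h>
    #include <sys/socket.h>
    #include <sys/types.h>

    /* hypothetical: derive the next receive traffic secret and install it
     * via setsockopt(fd, SOL_TLS, TLS_RX, ...), returning 0 on success. */
    int install_next_rx_key(int fd);

    static ssize_t ktls_recv(int fd, void *buf, size_t len)
    {
            ssize_t n;

            for (;;) {
                    n = recv(fd, buf, len, 0);
                    if (n >= 0 || errno != EKEYEXPIRED)
                            return n;
                    if (install_next_rx_key(fd) < 0)
                            return -1;
            }
    }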
diff --git a/net/unix/Kconfig b/net/unix/Kconfig
index 8b5d04210d7c..6f1783c1659b 100644
--- a/net/unix/Kconfig
+++ b/net/unix/Kconfig
@@ -17,9 +17,11 @@ config UNIX
Say Y unless you know what you are doing.
config AF_UNIX_OOB
- bool
+ bool "UNIX: out-of-bound messages"
depends on UNIX
default y
+ help
+ Support for MSG_OOB in UNIX domain sockets. If unsure, say Y.
config UNIX_DIAG
tristate "UNIX: socket monitoring interface"
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 6b1762300443..f0e613d97664 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -286,14 +286,9 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
}
#endif /* CONFIG_SECURITY_NETWORK */
-static inline int unix_our_peer(struct sock *sk, struct sock *osk)
-{
- return unix_peer(osk) == sk;
-}
-
static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
- return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
+ return !unix_peer(osk) || unix_peer(osk) == sk;
}
static inline int unix_recvq_full_lockless(const struct sock *sk)
@@ -627,7 +622,9 @@ static void unix_write_space(struct sock *sk)
static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
{
if (!skb_queue_empty(&sk->sk_receive_queue)) {
- skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge_reason(&sk->sk_receive_queue,
+ SKB_DROP_REASON_UNIX_DISCONNECT);
+
wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
/* If one link of bidirectional dgram pipe is disconnected,
@@ -645,7 +642,7 @@ static void unix_sock_destructor(struct sock *sk)
{
struct unix_sock *u = unix_sk(sk);
- skb_queue_purge(&sk->sk_receive_queue);
+ skb_queue_purge_reason(&sk->sk_receive_queue, SKB_DROP_REASON_SOCKET_CLOSE);
DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
@@ -720,8 +717,8 @@ static void unix_release_sock(struct sock *sk, int embrion)
if (state == TCP_LISTEN)
unix_release_sock(skb->sk, 1);
- /* passed fds are erased in the kfree_skb hook */
- kfree_skb(skb);
+ /* passed fds are erased in the kfree_skb hook */
+ kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
}
if (path.dentry)
@@ -1563,32 +1560,30 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
/* First of all allocate resources.
- If we will make it after state is locked,
- we will have to recheck all again in any case.
+ * If we will make it after state is locked,
+ * we will have to recheck all again in any case.
*/
/* create new sock for complete connection */
newsk = unix_create1(net, NULL, 0, sock->type);
if (IS_ERR(newsk)) {
err = PTR_ERR(newsk);
- newsk = NULL;
goto out;
}
- err = -ENOMEM;
-
/* Allocate skb for sending to listening sock */
skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
- if (skb == NULL)
- goto out;
+ if (!skb) {
+ err = -ENOMEM;
+ goto out_free_sk;
+ }
restart:
/* Find listening sock. */
other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
if (IS_ERR(other)) {
err = PTR_ERR(other);
- other = NULL;
- goto out;
+ goto out_free_skb;
}
unix_state_lock(other);
@@ -1600,23 +1595,25 @@ restart:
goto restart;
}
- err = -ECONNREFUSED;
- if (other->sk_state != TCP_LISTEN)
- goto out_unlock;
- if (other->sk_shutdown & RCV_SHUTDOWN)
+ if (other->sk_state != TCP_LISTEN ||
+ other->sk_shutdown & RCV_SHUTDOWN) {
+ err = -ECONNREFUSED;
goto out_unlock;
+ }
if (unix_recvq_full_lockless(other)) {
- err = -EAGAIN;
- if (!timeo)
+ if (!timeo) {
+ err = -EAGAIN;
goto out_unlock;
+ }
timeo = unix_wait_for_peer(other, timeo);
+ sock_put(other);
err = sock_intr_errno(timeo);
if (signal_pending(current))
- goto out;
- sock_put(other);
+ goto out_free_skb;
+
goto restart;
}
@@ -1701,15 +1698,13 @@ restart:
return 0;
out_unlock:
- if (other)
- unix_state_unlock(other);
-
+ unix_state_unlock(other);
+ sock_put(other);
+out_free_skb:
+ consume_skb(skb);
+out_free_sk:
+ unix_release_sock(newsk, 0);
out:
- kfree_skb(skb);
- if (newsk)
- unix_release_sock(newsk, 0);
- if (other)
- sock_put(other);
return err;
}
@@ -1964,7 +1959,6 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
- DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
struct sock *sk = sock->sk, *other = NULL;
struct unix_sock *u = unix_sk(sk);
struct scm_cookie scm;
@@ -1980,12 +1974,13 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
wait_for_unix_gc(scm.fp);
- err = -EOPNOTSUPP;
- if (msg->msg_flags&MSG_OOB)
+ if (msg->msg_flags & MSG_OOB) {
+ err = -EOPNOTSUPP;
goto out;
+ }
if (msg->msg_namelen) {
- err = unix_validate_addr(sunaddr, msg->msg_namelen);
+ err = unix_validate_addr(msg->msg_name, msg->msg_namelen);
if (err)
goto out;
@@ -1995,12 +1990,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
NULL);
if (err)
goto out;
- } else {
- sunaddr = NULL;
- err = -ENOTCONN;
- other = unix_peer_get(sk);
- if (!other)
- goto out;
}
if ((test_bit(SOCK_PASSCRED, &sock->flags) ||
@@ -2011,9 +2000,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
}
- err = -EMSGSIZE;
- if (len > READ_ONCE(sk->sk_sndbuf) - 32)
+ if (len > READ_ONCE(sk->sk_sndbuf) - 32) {
+ err = -EMSGSIZE;
goto out;
+ }
if (len > SKB_MAX_ALLOC) {
data_len = min_t(size_t,
@@ -2027,7 +2017,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
msg->msg_flags & MSG_DONTWAIT, &err,
PAGE_ALLOC_COSTLY_ORDER);
- if (skb == NULL)
+ if (!skb)
goto out;
err = unix_scm_to_skb(&scm, skb, true);
@@ -2043,17 +2033,18 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
-restart:
- if (!other) {
- err = -ECONNRESET;
- if (sunaddr == NULL)
- goto out_free;
-
- other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
- sk->sk_type);
+ if (msg->msg_namelen) {
+lookup:
+ other = unix_find_other(sock_net(sk), msg->msg_name,
+ msg->msg_namelen, sk->sk_type);
if (IS_ERR(other)) {
err = PTR_ERR(other);
- other = NULL;
+ goto out_free;
+ }
+ } else {
+ other = unix_peer_get(sk);
+ if (!other) {
+ err = -ENOTCONN;
goto out_free;
}
}
@@ -2061,36 +2052,37 @@ restart:
if (sk_filter(other, skb) < 0) {
/* Toss the packet but do not return any error to the sender */
err = len;
- goto out_free;
+ goto out_sock_put;
}
+restart:
sk_locked = 0;
unix_state_lock(other);
restart_locked:
- err = -EPERM;
- if (!unix_may_send(sk, other))
+
+ if (!unix_may_send(sk, other)) {
+ err = -EPERM;
goto out_unlock;
+ }
if (unlikely(sock_flag(other, SOCK_DEAD))) {
- /*
- * Check with 1003.1g - what should
- * datagram error
- */
- unix_state_unlock(other);
- sock_put(other);
+ /* Check with 1003.1g - what should datagram error */
- if (!sk_locked)
- unix_state_lock(sk);
+ unix_state_unlock(other);
- err = 0;
if (sk->sk_type == SOCK_SEQPACKET) {
/* We are here only when racing with unix_release_sock()
* is clearing @other. Never change state to TCP_CLOSE
* unlike SOCK_DGRAM wants.
*/
- unix_state_unlock(sk);
err = -EPIPE;
- } else if (unix_peer(sk) == other) {
+ goto out_sock_put;
+ }
+
+ if (!sk_locked)
+ unix_state_lock(sk);
+
+ if (unix_peer(sk) == other) {
unix_peer(sk) = NULL;
unix_dgram_peer_wake_disconnect_wakeup(sk, other);
@@ -2100,19 +2092,24 @@ restart_locked:
unix_dgram_disconnected(sk, other);
sock_put(other);
err = -ECONNREFUSED;
- } else {
- unix_state_unlock(sk);
+ goto out_sock_put;
}
- other = NULL;
- if (err)
- goto out_free;
- goto restart;
+ unix_state_unlock(sk);
+
+ if (!msg->msg_namelen) {
+ err = -ECONNRESET;
+ goto out_sock_put;
+ }
+
+ sock_put(other);
+ goto lookup;
}
- err = -EPIPE;
- if (other->sk_shutdown & RCV_SHUTDOWN)
+ if (other->sk_shutdown & RCV_SHUTDOWN) {
+ err = -EPIPE;
goto out_unlock;
+ }
if (sk->sk_type != SOCK_SEQPACKET) {
err = security_unix_may_send(sk->sk_socket, other->sk_socket);
@@ -2132,7 +2129,7 @@ restart_locked:
err = sock_intr_errno(timeo);
if (signal_pending(current))
- goto out_free;
+ goto out_sock_put;
goto restart;
}
@@ -2173,11 +2170,11 @@ out_unlock:
if (sk_locked)
unix_state_unlock(sk);
unix_state_unlock(other);
+out_sock_put:
+ sock_put(other);
out_free:
- kfree_skb(skb);
+ consume_skb(skb);
out:
- if (other)
- sock_put(other);
scm_destroy(&scm);
return err;
}
@@ -2193,7 +2190,7 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
{
struct unix_sock *ousk = unix_sk(other);
struct sk_buff *skb;
- int err = 0;
+ int err;
skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
@@ -2201,25 +2198,22 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
return err;
err = unix_scm_to_skb(scm, skb, !fds_sent);
- if (err < 0) {
- kfree_skb(skb);
- return err;
- }
+ if (err < 0)
+ goto out;
+
skb_put(skb, 1);
err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
- if (err) {
- kfree_skb(skb);
- return err;
- }
+ if (err)
+ goto out;
unix_state_lock(other);
if (sock_flag(other, SOCK_DEAD) ||
(other->sk_shutdown & RCV_SHUTDOWN)) {
unix_state_unlock(other);
- kfree_skb(skb);
- return -EPIPE;
+ err = -EPIPE;
+ goto out;
}
maybe_add_creds(skb, sock, other);
@@ -2234,6 +2228,9 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other
unix_state_unlock(other);
other->sk_data_ready(other);
+ return 0;
+out:
+ consume_skb(skb);
return err;
}
#endif
@@ -2242,13 +2239,11 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
struct sock *sk = sock->sk;
+ struct sk_buff *skb = NULL;
struct sock *other = NULL;
- int err, size;
- struct sk_buff *skb;
- int sent = 0;
struct scm_cookie scm;
bool fds_sent = false;
- int data_len;
+ int err, sent = 0;
err = scm_send(sock, msg, &scm, false);
if (err < 0)
@@ -2256,8 +2251,8 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
wait_for_unix_gc(scm.fp);
- err = -EOPNOTSUPP;
if (msg->msg_flags & MSG_OOB) {
+ err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
if (len)
len--;
@@ -2270,17 +2265,19 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
goto out_err;
} else {
- err = -ENOTCONN;
other = unix_peer(sk);
- if (!other)
+ if (!other) {
+ err = -ENOTCONN;
goto out_err;
+ }
}
if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
- goto pipe_err;
+ goto out_pipe;
while (sent < len) {
- size = len - sent;
+ int size = len - sent;
+ int data_len;
if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
skb = sock_alloc_send_pskb(sk, 0, 0,
@@ -2306,20 +2303,18 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
/* Only send the fds in the first buffer */
err = unix_scm_to_skb(&scm, skb, !fds_sent);
- if (err < 0) {
- kfree_skb(skb);
- goto out_err;
- }
+ if (err < 0)
+ goto out_free;
+
fds_sent = true;
if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
err = skb_splice_from_iter(skb, &msg->msg_iter, size,
sk->sk_allocation);
- if (err < 0) {
- kfree_skb(skb);
- goto out_err;
- }
+ if (err < 0)
+ goto out_free;
+
size = err;
refcount_add(size, &sk->sk_wmem_alloc);
} else {
@@ -2327,17 +2322,15 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
skb->data_len = data_len;
skb->len = size;
err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
- if (err) {
- kfree_skb(skb);
- goto out_err;
- }
+ if (err)
+ goto out_free;
}
unix_state_lock(other);
if (sock_flag(other, SOCK_DEAD) ||
(other->sk_shutdown & RCV_SHUTDOWN))
- goto pipe_err_free;
+ goto out_pipe_unlock;
maybe_add_creds(skb, sock, other);
scm_stat_add(other, skb);
@@ -2360,13 +2353,14 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
return sent;
-pipe_err_free:
+out_pipe_unlock:
unix_state_unlock(other);
- kfree_skb(skb);
-pipe_err:
- if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
+out_pipe:
+ if (!sent && !(msg->msg_flags & MSG_NOSIGNAL))
send_sig(SIGPIPE, current, 0);
err = -EPIPE;
+out_free:
+ consume_skb(skb);
out_err:
scm_destroy(&scm);
return sent ? : err;
@@ -2699,7 +2693,7 @@ unlock:
spin_unlock(&sk->sk_receive_queue.lock);
consume_skb(read_skb);
- kfree_skb(unread_skb);
+ kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
return skb;
}
@@ -2728,7 +2722,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
if (sock_flag(sk, SOCK_DEAD)) {
unix_state_unlock(sk);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
return -ECONNRESET;
}
@@ -2742,7 +2736,7 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
unix_state_unlock(sk);
if (drop) {
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
return -EAGAIN;
}
}
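
The af_unix conversion above consistently separates "skb reached its normal end of life" from "skb was dropped"; a minimal kernel-context sketch of that convention, using only helpers that appear in this diff (the function itself is hypothetical):

/* consume_skb() frees without being counted as a drop;
 * kfree_skb_reason() records why the packet was discarded, visible via
 * the skb:kfree_skb tracepoint and the drop monitor. */
static void example_free(struct sk_buff *skb, bool delivered)
{
	if (delivered)
		consume_skb(skb);
	else
		kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
}

Both helpers accept a NULL skb, which is why the reworked unwind paths above (e.g. out_free in unix_stream_sendmsg with skb initialised to NULL) can fall through to consume_skb() unconditionally.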
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 0068e758be4d..9848b7b78701 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -573,7 +573,7 @@ static void __unix_gc(struct work_struct *work)
UNIXCB(skb).fp->dead = true;
}
- __skb_queue_purge(&hitlist);
+ __skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE);
skip_gc:
WRITE_ONCE(gc_in_progress, false);
}
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 7e3db87ae433..fc6afbc8d680 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1551,7 +1551,11 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr,
timeout = vsk->connect_timeout;
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- while (sk->sk_state != TCP_ESTABLISHED && sk->sk_err == 0) {
+ /* If the socket is already closing or it is in an error state, there
+ * is no point in waiting.
+ */
+ while (sk->sk_state != TCP_ESTABLISHED &&
+ sk->sk_state != TCP_CLOSING && sk->sk_err == 0) {
if (flags & O_NONBLOCK) {
/* If we're not going to block, we schedule a timeout
* function to generate a timeout on the connection
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 56c232cf5b0f..31342ab502b4 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -13,12 +13,12 @@
#include <linux/hyperv.h>
#include <net/sock.h>
#include <net/af_vsock.h>
-#include <asm/hyperv-tlfs.h>
+#include <hyperv/hvhdk.h>
/* Older (VMBUS version 'VERSION_WIN10' or before) Windows hosts have some
* stricter requirements on the hv_sock ring buffer size of six 4K pages.
- * hyperv-tlfs defines HV_HYP_PAGE_SIZE as 4K. Newer hosts don't have this
- * limitation; but, keep the defaults the same for compat.
+ * HV_HYP_PAGE_SIZE is defined as 4K. Newer hosts don't have this limitation;
+ * but, keep the defaults the same for compat.
*/
#define RINGBUFFER_HVS_RCV_SIZE (HV_HYP_PAGE_SIZE * 6)
#define RINGBUFFER_HVS_SND_SIZE (HV_HYP_PAGE_SIZE * 6)
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 40b6375a5de4..9f918b77b40e 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -55,6 +55,56 @@ void cfg80211_chandef_create(struct cfg80211_chan_def *chandef,
}
EXPORT_SYMBOL(cfg80211_chandef_create);
+static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
+{
+ return nl80211_chan_width_to_mhz(c->width);
+}
+
+static u32 cfg80211_get_start_freq(const struct cfg80211_chan_def *chandef,
+ u32 cf)
+{
+ u32 start_freq, center_freq, bandwidth;
+
+ center_freq = MHZ_TO_KHZ((cf == 1) ?
+ chandef->center_freq1 : chandef->center_freq2);
+ bandwidth = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef));
+
+ if (bandwidth <= MHZ_TO_KHZ(20))
+ start_freq = center_freq;
+ else
+ start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10);
+
+ return start_freq;
+}
+
+static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef,
+ u32 cf)
+{
+ u32 end_freq, center_freq, bandwidth;
+
+ center_freq = MHZ_TO_KHZ((cf == 1) ?
+ chandef->center_freq1 : chandef->center_freq2);
+ bandwidth = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef));
+
+ if (bandwidth <= MHZ_TO_KHZ(20))
+ end_freq = center_freq;
+ else
+ end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10);
+
+ return end_freq;
+}
+
+#define for_each_subchan(chandef, freq, cf) \
+ for (u32 punctured = chandef->punctured, \
+ cf = 1, freq = cfg80211_get_start_freq(chandef, cf); \
+ freq <= cfg80211_get_end_freq(chandef, cf); \
+ freq += MHZ_TO_KHZ(20), \
+ ((cf == 1 && chandef->center_freq2 != 0 && \
+ freq > cfg80211_get_end_freq(chandef, cf)) ? \
+ (cf++, freq = cfg80211_get_start_freq(chandef, cf), \
+ punctured = 0) : (punctured >>= 1))) \
+ if (!(punctured & 1))
+
struct cfg80211_per_bw_puncturing_values {
u8 len;
const u16 *valid_values;
@@ -258,11 +308,6 @@ int nl80211_chan_width_to_mhz(enum nl80211_chan_width chan_width)
}
EXPORT_SYMBOL(nl80211_chan_width_to_mhz);
-static int cfg80211_chandef_get_width(const struct cfg80211_chan_def *c)
-{
- return nl80211_chan_width_to_mhz(c->width);
-}
-
static bool cfg80211_valid_center_freq(u32 center,
enum nl80211_chan_width width)
{
@@ -582,29 +627,11 @@ cfg80211_chandef_compatible(const struct cfg80211_chan_def *c1,
}
EXPORT_SYMBOL(cfg80211_chandef_compatible);
-static void cfg80211_set_chans_dfs_state(struct wiphy *wiphy, u32 center_freq,
- u32 bandwidth,
- enum nl80211_dfs_state dfs_state)
-{
- struct ieee80211_channel *c;
- u32 freq;
-
- for (freq = center_freq - bandwidth/2 + 10;
- freq <= center_freq + bandwidth/2 - 10;
- freq += 20) {
- c = ieee80211_get_channel(wiphy, freq);
- if (!c || !(c->flags & IEEE80211_CHAN_RADAR))
- continue;
-
- c->dfs_state = dfs_state;
- c->dfs_state_entered = jiffies;
- }
-}
-
void cfg80211_set_dfs_state(struct wiphy *wiphy,
const struct cfg80211_chan_def *chandef,
enum nl80211_dfs_state dfs_state)
{
+ struct ieee80211_channel *c;
int width;
if (WARN_ON(!cfg80211_chandef_valid(chandef)))
@@ -614,41 +641,14 @@ void cfg80211_set_dfs_state(struct wiphy *wiphy,
if (width < 0)
return;
- cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq1,
- width, dfs_state);
-
- if (!chandef->center_freq2)
- return;
- cfg80211_set_chans_dfs_state(wiphy, chandef->center_freq2,
- width, dfs_state);
-}
-
-static u32 cfg80211_get_start_freq(u32 center_freq,
- u32 bandwidth)
-{
- u32 start_freq;
-
- bandwidth = MHZ_TO_KHZ(bandwidth);
- if (bandwidth <= MHZ_TO_KHZ(20))
- start_freq = center_freq;
- else
- start_freq = center_freq - bandwidth / 2 + MHZ_TO_KHZ(10);
-
- return start_freq;
-}
-
-static u32 cfg80211_get_end_freq(u32 center_freq,
- u32 bandwidth)
-{
- u32 end_freq;
-
- bandwidth = MHZ_TO_KHZ(bandwidth);
- if (bandwidth <= MHZ_TO_KHZ(20))
- end_freq = center_freq;
- else
- end_freq = center_freq + bandwidth / 2 - MHZ_TO_KHZ(10);
+ for_each_subchan(chandef, freq, cf) {
+ c = ieee80211_get_channel_khz(wiphy, freq);
+ if (!c || !(c->flags & IEEE80211_CHAN_RADAR))
+ continue;
- return end_freq;
+ c->dfs_state = dfs_state;
+ c->dfs_state_entered = jiffies;
+ }
}
static bool
@@ -725,17 +725,12 @@ static bool cfg80211_dfs_permissive_chan(struct wiphy *wiphy,
}
static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy,
- u32 center_freq,
- u32 bandwidth,
- enum nl80211_iftype iftype)
+ const struct cfg80211_chan_def *chandef,
+ enum nl80211_iftype iftype)
{
struct ieee80211_channel *c;
- u32 freq, start_freq, end_freq;
-
- start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
- end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
- for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+ for_each_subchan(chandef, freq, cf) {
c = ieee80211_get_channel_khz(wiphy, freq);
if (!c)
return -EINVAL;
@@ -768,25 +763,9 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
if (width < 0)
return -EINVAL;
- ret = cfg80211_get_chans_dfs_required(wiphy,
- ieee80211_chandef_to_khz(chandef),
- width, iftype);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- return BIT(chandef->width);
-
- if (!chandef->center_freq2)
- return 0;
-
- ret = cfg80211_get_chans_dfs_required(wiphy,
- MHZ_TO_KHZ(chandef->center_freq2),
- width, iftype);
- if (ret < 0)
- return ret;
- else if (ret > 0)
- return BIT(chandef->width);
+ ret = cfg80211_get_chans_dfs_required(wiphy, chandef, iftype);
+ return (ret > 0) ? BIT(chandef->width) : ret;
break;
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_OCB:
@@ -806,16 +785,18 @@ int cfg80211_chandef_dfs_required(struct wiphy *wiphy,
}
EXPORT_SYMBOL(cfg80211_chandef_dfs_required);
-static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy,
- u32 center_freq,
- u32 bandwidth)
+bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
{
struct ieee80211_channel *c;
- u32 freq, start_freq, end_freq;
- int count = 0;
+ int width, count = 0;
- start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
- end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return false;
+
+ width = cfg80211_chandef_get_width(chandef);
+ if (width < 0)
+ return false;
/*
* Check entire range of channels for the bandwidth.
@@ -823,61 +804,24 @@ static int cfg80211_get_chans_dfs_usable(struct wiphy *wiphy,
* DFS_AVAILABLE). Return number of usable channels
* (require CAC). Allow DFS and non-DFS channel mix.
*/
- for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+ for_each_subchan(chandef, freq, cf) {
c = ieee80211_get_channel_khz(wiphy, freq);
if (!c)
- return -EINVAL;
+ return false;
if (c->flags & IEEE80211_CHAN_DISABLED)
- return -EINVAL;
+ return false;
if (c->flags & IEEE80211_CHAN_RADAR) {
if (c->dfs_state == NL80211_DFS_UNAVAILABLE)
- return -EINVAL;
+ return false;
if (c->dfs_state == NL80211_DFS_USABLE)
count++;
}
}
- return count;
-}
-
-bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy,
- const struct cfg80211_chan_def *chandef)
-{
- int width;
- int r1, r2 = 0;
-
- if (WARN_ON(!cfg80211_chandef_valid(chandef)))
- return false;
-
- width = cfg80211_chandef_get_width(chandef);
- if (width < 0)
- return false;
-
- r1 = cfg80211_get_chans_dfs_usable(wiphy,
- MHZ_TO_KHZ(chandef->center_freq1),
- width);
-
- if (r1 < 0)
- return false;
-
- switch (chandef->width) {
- case NL80211_CHAN_WIDTH_80P80:
- WARN_ON(!chandef->center_freq2);
- r2 = cfg80211_get_chans_dfs_usable(wiphy,
- MHZ_TO_KHZ(chandef->center_freq2),
- width);
- if (r2 < 0)
- return false;
- break;
- default:
- WARN_ON(chandef->center_freq2);
- break;
- }
-
- return (r1 + r2 > 0);
+ return count > 0;
}
EXPORT_SYMBOL(cfg80211_chandef_dfs_usable);
@@ -1039,10 +983,10 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
if (!reg_dfs_domain_same(wiphy, &rdev->wiphy))
continue;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
found = cfg80211_is_wiphy_oper_chan(&rdev->wiphy, chan) ||
cfg80211_offchan_chain_is_active(rdev, chan);
- wiphy_unlock(&rdev->wiphy);
if (found)
return true;
@@ -1051,26 +995,29 @@ bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy,
return false;
}
-static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
- u32 center_freq,
- u32 bandwidth)
+static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
{
struct ieee80211_channel *c;
- u32 freq, start_freq, end_freq;
+ int width;
bool dfs_offload;
+ if (WARN_ON(!cfg80211_chandef_valid(chandef)))
+ return false;
+
+ width = cfg80211_chandef_get_width(chandef);
+ if (width < 0)
+ return false;
+
dfs_offload = wiphy_ext_feature_isset(wiphy,
NL80211_EXT_FEATURE_DFS_OFFLOAD);
- start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
- end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
-
/*
* Check entire range of channels for the bandwidth.
* If any channel in between is disabled or has not
* had gone through CAC return false
*/
- for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+ for_each_subchan(chandef, freq, cf) {
c = ieee80211_get_channel_khz(wiphy, freq);
if (!c)
return false;
@@ -1087,124 +1034,54 @@ static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy,
return true;
}
-static bool cfg80211_chandef_dfs_available(struct wiphy *wiphy,
- const struct cfg80211_chan_def *chandef)
+unsigned int
+cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
{
+ struct ieee80211_channel *c;
int width;
- int r;
+ unsigned int t1 = 0, t2 = 0;
if (WARN_ON(!cfg80211_chandef_valid(chandef)))
- return false;
+ return 0;
width = cfg80211_chandef_get_width(chandef);
if (width < 0)
- return false;
-
- r = cfg80211_get_chans_dfs_available(wiphy,
- MHZ_TO_KHZ(chandef->center_freq1),
- width);
-
- /* If any of channels unavailable for cf1 just return */
- if (!r)
- return r;
-
- switch (chandef->width) {
- case NL80211_CHAN_WIDTH_80P80:
- WARN_ON(!chandef->center_freq2);
- r = cfg80211_get_chans_dfs_available(wiphy,
- MHZ_TO_KHZ(chandef->center_freq2),
- width);
- break;
- default:
- WARN_ON(chandef->center_freq2);
- break;
- }
-
- return r;
-}
-
-static unsigned int cfg80211_get_chans_dfs_cac_time(struct wiphy *wiphy,
- u32 center_freq,
- u32 bandwidth)
-{
- struct ieee80211_channel *c;
- u32 start_freq, end_freq, freq;
- unsigned int dfs_cac_ms = 0;
-
- start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
- end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
+ return 0;
- for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
+ for_each_subchan(chandef, freq, cf) {
c = ieee80211_get_channel_khz(wiphy, freq);
- if (!c)
- return 0;
-
- if (c->flags & IEEE80211_CHAN_DISABLED)
- return 0;
+ if (!c || (c->flags & IEEE80211_CHAN_DISABLED)) {
+ if (cf == 1)
+ t1 = INT_MAX;
+ else
+ t2 = INT_MAX;
+ continue;
+ }
if (!(c->flags & IEEE80211_CHAN_RADAR))
continue;
- if (c->dfs_cac_ms > dfs_cac_ms)
- dfs_cac_ms = c->dfs_cac_ms;
- }
-
- return dfs_cac_ms;
-}
-
-unsigned int
-cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy,
- const struct cfg80211_chan_def *chandef)
-{
- int width;
- unsigned int t1 = 0, t2 = 0;
+ if (cf == 1 && c->dfs_cac_ms > t1)
+ t1 = c->dfs_cac_ms;
- if (WARN_ON(!cfg80211_chandef_valid(chandef)))
- return 0;
+ if (cf == 2 && c->dfs_cac_ms > t2)
+ t2 = c->dfs_cac_ms;
+ }
- width = cfg80211_chandef_get_width(chandef);
- if (width < 0)
+ if (t1 == INT_MAX && t2 == INT_MAX)
return 0;
- t1 = cfg80211_get_chans_dfs_cac_time(wiphy,
- MHZ_TO_KHZ(chandef->center_freq1),
- width);
+ if (t1 == INT_MAX)
+ return t2;
- if (!chandef->center_freq2)
+ if (t2 == INT_MAX)
return t1;
- t2 = cfg80211_get_chans_dfs_cac_time(wiphy,
- MHZ_TO_KHZ(chandef->center_freq2),
- width);
-
return max(t1, t2);
}
EXPORT_SYMBOL(cfg80211_chandef_dfs_cac_time);
-static bool cfg80211_secondary_chans_ok(struct wiphy *wiphy,
- u32 center_freq, u32 bandwidth,
- u32 prohibited_flags,
- u32 permitting_flags)
-{
- struct ieee80211_channel *c;
- u32 freq, start_freq, end_freq;
-
- start_freq = cfg80211_get_start_freq(center_freq, bandwidth);
- end_freq = cfg80211_get_end_freq(center_freq, bandwidth);
-
- for (freq = start_freq; freq <= end_freq; freq += MHZ_TO_KHZ(20)) {
- c = ieee80211_get_channel_khz(wiphy, freq);
- if (!c)
- return false;
- if (c->flags & permitting_flags)
- continue;
- if (c->flags & prohibited_flags)
- return false;
- }
-
- return true;
-}
-
/* check if the operating channels are valid and supported */
static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels,
enum ieee80211_edmg_bw_config edmg_bw_config,
@@ -1270,6 +1147,7 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy,
bool ext_nss_cap, support_80_80 = false, support_320 = false;
const struct ieee80211_sband_iftype_data *iftd;
struct ieee80211_supported_band *sband;
+ struct ieee80211_channel *c;
int i;
if (WARN_ON(!cfg80211_chandef_valid(chandef)))
@@ -1420,19 +1298,17 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy,
if (width < 20)
prohibited_flags |= IEEE80211_CHAN_NO_OFDM;
+ for_each_subchan(chandef, freq, cf) {
+ c = ieee80211_get_channel_khz(wiphy, freq);
+ if (!c)
+ return false;
+ if (c->flags & permitting_flags)
+ continue;
+ if (c->flags & prohibited_flags)
+ return false;
+ }
- if (!cfg80211_secondary_chans_ok(wiphy,
- ieee80211_chandef_to_khz(chandef),
- width, prohibited_flags,
- permitting_flags))
- return false;
-
- if (!chandef->center_freq2)
- return true;
- return cfg80211_secondary_chans_ok(wiphy,
- MHZ_TO_KHZ(chandef->center_freq2),
- width, prohibited_flags,
- permitting_flags);
+ return true;
}
bool cfg80211_chandef_usable(struct wiphy *wiphy,
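
The for_each_subchan() iterator added above replaces the per-segment start/end helpers, walks the second 80+80 segment, and skips punctured 20 MHz subchannels; a minimal usage sketch in the style of the converted callers (freq is in kHz, cf names the segment, and the function below is hypothetical):

/* Count the enabled 20 MHz subchannels of a chandef. */
static int example_count_subchans(struct wiphy *wiphy,
				  const struct cfg80211_chan_def *chandef)
{
	struct ieee80211_channel *c;
	int n = 0;

	for_each_subchan(chandef, freq, cf) {
		c = ieee80211_get_channel_khz(wiphy, freq);
		if (c && !(c->flags & IEEE80211_CHAN_DISABLED))
			n++;
	}
	return n;
}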
diff --git a/net/wireless/core.c b/net/wireless/core.c
index afbdc549fb4a..ceb768925b85 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -143,10 +143,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev,
if (result)
return result;
- if (!IS_ERR_OR_NULL(rdev->wiphy.debugfsdir))
- debugfs_rename(rdev->wiphy.debugfsdir->d_parent,
- rdev->wiphy.debugfsdir,
- rdev->wiphy.debugfsdir->d_parent, newname);
+ debugfs_change_name(rdev->wiphy.debugfsdir, "%s", newname);
nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY);
@@ -191,7 +188,8 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
return err;
}
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (!wdev->netdev)
continue;
@@ -212,7 +210,6 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev,
continue;
nl80211_notify_iface(rdev, wdev, NL80211_CMD_NEW_INTERFACE);
}
- wiphy_unlock(&rdev->wiphy);
return 0;
}
@@ -221,9 +218,9 @@ static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data)
{
struct cfg80211_registered_device *rdev = data;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
rdev_rfkill_poll(rdev);
- wiphy_unlock(&rdev->wiphy);
}
void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
@@ -283,7 +280,7 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
/* otherwise, check iftype */
- wiphy_lock(wiphy);
+ guard(wiphy)(wiphy);
switch (wdev->iftype) {
case NL80211_IFTYPE_P2P_DEVICE:
@@ -295,8 +292,6 @@ void cfg80211_shutdown_all_interfaces(struct wiphy *wiphy)
default:
break;
}
-
- wiphy_unlock(wiphy);
}
}
EXPORT_SYMBOL_GPL(cfg80211_shutdown_all_interfaces);
@@ -331,9 +326,9 @@ static void cfg80211_event_work(struct work_struct *work)
rdev = container_of(work, struct cfg80211_registered_device,
event_work);
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
cfg80211_process_rdev_events(rdev);
- wiphy_unlock(&rdev->wiphy);
}
void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
@@ -347,10 +342,10 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
if (wdev->netdev)
dev_close(wdev->netdev);
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
cfg80211_leave(rdev, wdev);
cfg80211_remove_virtual_intf(rdev, wdev);
- wiphy_unlock(&rdev->wiphy);
}
}
}
@@ -423,9 +418,9 @@ static void cfg80211_wiphy_work(struct work_struct *work)
trace_wiphy_work_worker_start(&rdev->wiphy);
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
if (rdev->suspended)
- goto out;
+ return;
spin_lock_irq(&rdev->wiphy_work_lock);
wk = list_first_entry_or_null(&rdev->wiphy_work_list,
@@ -441,8 +436,6 @@ static void cfg80211_wiphy_work(struct work_struct *work)
} else {
spin_unlock_irq(&rdev->wiphy_work_lock);
}
-out:
- wiphy_unlock(&rdev->wiphy);
}
/* exported functions */
@@ -553,6 +546,9 @@ use_default_name:
INIT_WORK(&rdev->mgmt_registrations_update_wk,
cfg80211_mgmt_registrations_update_wk);
spin_lock_init(&rdev->mgmt_registrations_lock);
+ INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work);
+ INIT_LIST_HEAD(&rdev->wiphy_work_list);
+ spin_lock_init(&rdev->wiphy_work_lock);
#ifdef CONFIG_CFG80211_DEFAULT_PS
rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT;
@@ -570,9 +566,6 @@ use_default_name:
return NULL;
}
- INIT_WORK(&rdev->wiphy_work, cfg80211_wiphy_work);
- INIT_LIST_HEAD(&rdev->wiphy_work_list);
- spin_lock_init(&rdev->wiphy_work_lock);
INIT_WORK(&rdev->rfkill_block, cfg80211_rfkill_block_work);
INIT_WORK(&rdev->conn_work, cfg80211_conn_work);
INIT_WORK(&rdev->event_work, cfg80211_event_work);
@@ -1198,6 +1191,13 @@ void cfg80211_dev_free(struct cfg80211_registered_device *rdev)
{
struct cfg80211_internal_bss *scan, *tmp;
struct cfg80211_beacon_registration *reg, *treg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rdev->wiphy_work_lock, flags);
+ WARN_ON(!list_empty(&rdev->wiphy_work_list));
+ spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags);
+ cancel_work_sync(&rdev->wiphy_work);
+
rfkill_destroy(rdev->wiphy.rfkill);
list_for_each_entry_safe(reg, treg, &rdev->beacon_registrations, list) {
list_del(&reg->list);
@@ -1526,9 +1526,9 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
break;
case NETDEV_REGISTER:
if (!wdev->registered) {
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
cfg80211_register_wdev(rdev, wdev);
- wiphy_unlock(&rdev->wiphy);
}
break;
case NETDEV_UNREGISTER:
@@ -1537,16 +1537,16 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
* so check wdev->registered.
*/
if (wdev->registered && !wdev->registering) {
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
_cfg80211_unregister_wdev(wdev, false);
- wiphy_unlock(&rdev->wiphy);
}
break;
case NETDEV_GOING_DOWN:
- wiphy_lock(&rdev->wiphy);
- cfg80211_leave(rdev, wdev);
- cfg80211_remove_links(wdev);
- wiphy_unlock(&rdev->wiphy);
+ scoped_guard(wiphy, &rdev->wiphy) {
+ cfg80211_leave(rdev, wdev);
+ cfg80211_remove_links(wdev);
+ }
/* since we just did cfg80211_leave() nothing to do there */
cancel_work_sync(&wdev->disconnect_wk);
cancel_work_sync(&wdev->pmsr_free_wk);
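
The core.c changes replace open-coded wiphy_lock()/wiphy_unlock() pairs with scope-based locking; a minimal sketch of the two forms used above, with hypothetical wrapper functions around calls taken from this diff:

/* guard() holds the wiphy mutex until the enclosing scope ends, so
 * early returns no longer need an unlock/goto-out path. */
static int example_locked_op(struct cfg80211_registered_device *rdev)
{
	guard(wiphy)(&rdev->wiphy);

	if (rdev->suspended)
		return -EBUSY;	/* unlocks automatically */

	rdev_rfkill_poll(rdev);
	return 0;
}

/* scoped_guard() limits the critical section to an explicit block. */
static void example_scoped(struct cfg80211_registered_device *rdev,
			   struct wireless_dev *wdev)
{
	scoped_guard(wiphy, &rdev->wiphy) {
		cfg80211_leave(rdev, wdev);
	}
	/* lock released here, before any sleeping cleanup */
}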
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 4c45f994a8c0..826299f3d781 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -567,6 +567,10 @@ int cfg80211_remove_virtual_intf(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask);
+int cfg80211_assoc_ml_reconf(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_assoc_link *links,
+ u16 rem_links);
/**
* struct cfg80211_colocated_ap - colocated AP information
*
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index a5eb92d93074..e10f2b3b4b7f 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -352,6 +352,13 @@ cfg80211_mlme_check_mlo_compat(const struct ieee80211_multi_link_elem *mle_a,
return -EINVAL;
}
+ if (ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_a) !=
+ ieee80211_mle_get_ext_mld_capa_op((const u8 *)mle_b)) {
+ NL_SET_ERR_MSG(extack,
+ "extended link MLD capabilities/ops mismatch");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -627,10 +634,10 @@ void cfg80211_mgmt_registrations_update_wk(struct work_struct *wk)
rdev = container_of(wk, struct cfg80211_registered_device,
mgmt_registrations_update_wk);
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
cfg80211_mgmt_registrations_update(wdev);
- wiphy_unlock(&rdev->wiphy);
}
int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_portid,
@@ -1193,10 +1200,10 @@ cfg80211_background_cac_event(struct cfg80211_registered_device *rdev,
const struct cfg80211_chan_def *chandef,
enum nl80211_radar_event event)
{
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
__cfg80211_background_cac_event(rdev, rdev->background_radar_wdev,
chandef, event);
- wiphy_unlock(&rdev->wiphy);
}
void cfg80211_background_cac_done_wk(struct work_struct *work)
@@ -1287,3 +1294,80 @@ void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev)
&rdev->background_radar_chandef,
NL80211_RADAR_CAC_ABORTED);
}
+
+int cfg80211_assoc_ml_reconf(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_assoc_link *links,
+ u16 rem_links)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ int err;
+
+ lockdep_assert_wiphy(wdev->wiphy);
+
+ err = rdev_assoc_ml_reconf(rdev, dev, links, rem_links);
+ if (!err) {
+ int link_id;
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+ link_id++) {
+ if (!links[link_id].bss)
+ continue;
+
+ cfg80211_ref_bss(&rdev->wiphy, links[link_id].bss);
+ cfg80211_hold_bss(bss_from_pub(links[link_id].bss));
+ }
+ }
+
+ return err;
+}
+
+void cfg80211_mlo_reconf_add_done(struct net_device *dev,
+ struct cfg80211_mlo_reconf_done_data *data)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ int link_id;
+
+ lockdep_assert_wiphy(wiphy);
+
+ trace_cfg80211_mlo_reconf_add_done(dev, data->added_links,
+ data->buf, data->len);
+
+ if (WARN_ON(!wdev->valid_links))
+ return;
+
+ if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT))
+ return;
+
+ /* validate that a BSS is given for each added link */
+ for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+ struct cfg80211_bss *bss = data->links[link_id].bss;
+
+ if (!(data->added_links & BIT(link_id)))
+ continue;
+
+ if (WARN_ON(!bss))
+ return;
+ }
+
+ for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+ struct cfg80211_bss *bss = data->links[link_id].bss;
+
+ if (!bss)
+ continue;
+
+ if (data->added_links & BIT(link_id)) {
+ wdev->links[link_id].client.current_bss =
+ bss_from_pub(bss);
+ } else {
+ cfg80211_unhold_bss(bss_from_pub(bss));
+ cfg80211_put_bss(wiphy, bss);
+ }
+ }
+
+ wdev->valid_links |= data->added_links;
+ nl80211_mlo_reconf_add_done(dev, data);
+}
+EXPORT_SYMBOL(cfg80211_mlo_reconf_add_done);
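
cfg80211_mlo_reconf_add_done() exported above is the driver-facing completion; a hypothetical sketch of a driver reporting that link 1 was added, assuming the response frame and the link's BSS entry come from driver state (names below are illustrative, not from the patch):

static void example_report_reconf_done(struct net_device *dev,
				       struct cfg80211_bss *link1_bss,
				       const u8 *frame, size_t frame_len)
{
	struct cfg80211_mlo_reconf_done_data data = {
		.buf = frame,
		.len = frame_len,
		.added_links = BIT(1),
	};

	data.links[1].bss = link1_bss;

	/* The helper asserts the wiphy mutex is held by the caller. */
	cfg80211_mlo_reconf_add_done(dev, &data);
}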
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index f65e7441e644..b457fe78672b 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -294,6 +294,21 @@ static int validate_he_capa(const struct nlattr *attr,
return 0;
}
+static int validate_supported_selectors(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *supported_selectors = nla_data(attr);
+ u8 supported_selectors_len = nla_len(attr);
+
+ /* The top bit must not be set as it is not part of the selector */
+ for (int i = 0; i < supported_selectors_len; i++) {
+ if (supported_selectors[i] & 0x80)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* policy for the attributes */
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];
@@ -830,6 +845,11 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_MLO_TTLM_ULINK] = NLA_POLICY_EXACT_LEN(sizeof(u16) * 8),
[NL80211_ATTR_ASSOC_SPP_AMSDU] = { .type = NLA_FLAG },
[NL80211_ATTR_VIF_RADIO_MASK] = { .type = NLA_U32 },
+ [NL80211_ATTR_SUPPORTED_SELECTORS] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_supported_selectors,
+ NL80211_MAX_SUPP_SELECTORS),
+ [NL80211_ATTR_MLO_RECONF_REM_LINKS] = { .type = NLA_U16 },
+ [NL80211_ATTR_EPCS] = { .type = NLA_FLAG },
};
/* policy for the key attributes */
@@ -3626,7 +3646,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
} else
wdev = netdev->ieee80211_ptr;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
/*
* end workaround code, by now the rdev is available
@@ -3639,32 +3659,24 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
rtnl_unlock();
if (result)
- goto out;
+ return result;
if (info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS]) {
struct ieee80211_txq_params txq_params;
struct nlattr *tb[NL80211_TXQ_ATTR_MAX + 1];
- if (!rdev->ops->set_txq_params) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->set_txq_params)
+ return -EOPNOTSUPP;
- if (!netdev) {
- result = -EINVAL;
- goto out;
- }
+ if (!netdev)
+ return -EINVAL;
if (netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP &&
- netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) {
- result = -EINVAL;
- goto out;
- }
+ netdev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
+ return -EINVAL;
- if (!netif_running(netdev)) {
- result = -ENETDOWN;
- goto out;
- }
+ if (!netif_running(netdev))
+ return -ENETDOWN;
nla_for_each_nested(nl_txq_params,
info->attrs[NL80211_ATTR_WIPHY_TXQ_PARAMS],
@@ -3675,10 +3687,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
txq_params_policy,
info->extack);
if (result)
- goto out;
+ return result;
+
result = parse_txq_params(tb, &txq_params);
if (result)
- goto out;
+ return result;
txq_params.link_id =
nl80211_link_id_or_invalid(info->attrs);
@@ -3694,7 +3707,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
result = rdev_set_txq_params(rdev, netdev,
&txq_params);
if (result)
- goto out;
+ return result;
}
}
@@ -3711,7 +3724,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
}
if (result)
- goto out;
+ return result;
}
if (info->attrs[NL80211_ATTR_WIPHY_TX_POWER_SETTING]) {
@@ -3722,19 +3735,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if (!(rdev->wiphy.features & NL80211_FEATURE_VIF_TXPOWER))
txp_wdev = NULL;
- if (!rdev->ops->set_tx_power) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->set_tx_power)
+ return -EOPNOTSUPP;
idx = NL80211_ATTR_WIPHY_TX_POWER_SETTING;
type = nla_get_u32(info->attrs[idx]);
if (!info->attrs[NL80211_ATTR_WIPHY_TX_POWER_LEVEL] &&
- (type != NL80211_TX_POWER_AUTOMATIC)) {
- result = -EINVAL;
- goto out;
- }
+ (type != NL80211_TX_POWER_AUTOMATIC))
+ return -EINVAL;
if (type != NL80211_TX_POWER_AUTOMATIC) {
idx = NL80211_ATTR_WIPHY_TX_POWER_LEVEL;
@@ -3743,7 +3752,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
result = rdev_set_tx_power(rdev, txp_wdev, type, mbm);
if (result)
- goto out;
+ return result;
}
if (info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX] &&
@@ -3752,10 +3761,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if ((!rdev->wiphy.available_antennas_tx &&
!rdev->wiphy.available_antennas_rx) ||
- !rdev->ops->set_antenna) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ !rdev->ops->set_antenna)
+ return -EOPNOTSUPP;
tx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_TX]);
rx_ant = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_ANTENNA_RX]);
@@ -3763,17 +3770,15 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
/* reject antenna configurations which don't match the
* available antenna masks, except for the "all" mask */
if ((~tx_ant && (tx_ant & ~rdev->wiphy.available_antennas_tx)) ||
- (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx))) {
- result = -EINVAL;
- goto out;
- }
+ (~rx_ant && (rx_ant & ~rdev->wiphy.available_antennas_rx)))
+ return -EINVAL;
tx_ant = tx_ant & rdev->wiphy.available_antennas_tx;
rx_ant = rx_ant & rdev->wiphy.available_antennas_rx;
result = rdev_set_antenna(rdev, tx_ant, rx_ant);
if (result)
- goto out;
+ return result;
}
changed = 0;
@@ -3795,10 +3800,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]) {
frag_threshold = nla_get_u32(
info->attrs[NL80211_ATTR_WIPHY_FRAG_THRESHOLD]);
- if (frag_threshold < 256) {
- result = -EINVAL;
- goto out;
- }
+ if (frag_threshold < 256)
+ return -EINVAL;
if (frag_threshold != (u32) -1) {
/*
@@ -3819,10 +3822,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
}
if (info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]) {
- if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) {
- result = -EINVAL;
- goto out;
- }
+ if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK])
+ return -EINVAL;
coverage_class = nla_get_u8(
info->attrs[NL80211_ATTR_WIPHY_COVERAGE_CLASS]);
@@ -3830,20 +3831,17 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
}
if (info->attrs[NL80211_ATTR_WIPHY_DYN_ACK]) {
- if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION)) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ if (!(rdev->wiphy.features & NL80211_FEATURE_ACKTO_ESTIMATION))
+ return -EOPNOTSUPP;
changed |= WIPHY_PARAM_DYN_ACK;
}
if (info->attrs[NL80211_ATTR_TXQ_LIMIT]) {
if (!wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_TXQS)) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ NL80211_EXT_FEATURE_TXQS))
+ return -EOPNOTSUPP;
+
txq_limit = nla_get_u32(
info->attrs[NL80211_ATTR_TXQ_LIMIT]);
changed |= WIPHY_PARAM_TXQ_LIMIT;
@@ -3851,10 +3849,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]) {
if (!wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_TXQS)) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ NL80211_EXT_FEATURE_TXQS))
+ return -EOPNOTSUPP;
+
txq_memory_limit = nla_get_u32(
info->attrs[NL80211_ATTR_TXQ_MEMORY_LIMIT]);
changed |= WIPHY_PARAM_TXQ_MEMORY_LIMIT;
@@ -3862,10 +3859,9 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_TXQ_QUANTUM]) {
if (!wiphy_ext_feature_isset(&rdev->wiphy,
- NL80211_EXT_FEATURE_TXQS)) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ NL80211_EXT_FEATURE_TXQS))
+ return -EOPNOTSUPP;
+
txq_quantum = nla_get_u32(
info->attrs[NL80211_ATTR_TXQ_QUANTUM]);
changed |= WIPHY_PARAM_TXQ_QUANTUM;
@@ -3877,10 +3873,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
u8 old_coverage_class;
u32 old_txq_limit, old_txq_memory_limit, old_txq_quantum;
- if (!rdev->ops->set_wiphy_params) {
- result = -EOPNOTSUPP;
- goto out;
- }
+ if (!rdev->ops->set_wiphy_params)
+ return -EOPNOTSUPP;
old_retry_short = rdev->wiphy.retry_short;
old_retry_long = rdev->wiphy.retry_long;
@@ -3918,15 +3912,11 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info)
rdev->wiphy.txq_limit = old_txq_limit;
rdev->wiphy.txq_memory_limit = old_txq_memory_limit;
rdev->wiphy.txq_quantum = old_txq_quantum;
- goto out;
+ return result;
}
}
- result = 0;
-
-out:
- wiphy_unlock(&rdev->wiphy);
- return result;
+ return 0;
}
int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *chandef)
@@ -4010,10 +4000,10 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
goto nla_put_failure;
}
- if (rdev->ops->get_tx_power) {
+ if (rdev->ops->get_tx_power && !wdev->valid_links) {
int dbm, ret;
- ret = rdev_get_tx_power(rdev, wdev, &dbm);
+ ret = rdev_get_tx_power(rdev, wdev, 0, &dbm);
if (ret == 0 &&
nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
DBM_TO_MBM(dbm)))
@@ -4082,6 +4072,15 @@ static int nl80211_send_iface(struct sk_buff *msg, u32 portid, u32 seq, int flag
if (ret == 0 && nl80211_send_chandef(msg, &chandef))
goto nla_put_failure;
+ if (rdev->ops->get_tx_power) {
+ int dbm, ret;
+
+ ret = rdev_get_tx_power(rdev, wdev, link_id, &dbm);
+ if (ret == 0 &&
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_TX_POWER_LEVEL,
+ DBM_TO_MBM(dbm)))
+ goto nla_put_failure;
+ }
nla_nest_end(msg, link);
}
@@ -4144,22 +4143,22 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback *
if_idx = 0;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (if_idx < if_start) {
if_idx++;
continue;
}
+
if (nl80211_send_iface(skb, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
rdev, wdev,
- NL80211_CMD_NEW_INTERFACE) < 0) {
- wiphy_unlock(&rdev->wiphy);
+ NL80211_CMD_NEW_INTERFACE) < 0)
goto out;
- }
+
if_idx++;
}
- wiphy_unlock(&rdev->wiphy);
if_start = 0;
wp_idx++;
@@ -4522,16 +4521,13 @@ static int _nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
- int ret;
/* to avoid failing a new interface creation due to pending removal */
cfg80211_destroy_ifaces(rdev);
- wiphy_lock(&rdev->wiphy);
- ret = _nl80211_new_interface(skb, info);
- wiphy_unlock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
- return ret;
+ return _nl80211_new_interface(skb, info);
}
static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info)
@@ -10103,7 +10099,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
struct cfg80211_chan_def chandef;
enum nl80211_dfs_regions dfs_region;
unsigned int cac_time_ms;
- int err = -EINVAL;
+ int err;
flush_delayed_work(&rdev->dfs_update_channels_wk);
@@ -10118,35 +10114,29 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
return -EINVAL;
}
- wiphy_lock(wiphy);
+ guard(wiphy)(wiphy);
dfs_region = reg_get_dfs_region(wiphy);
if (dfs_region == NL80211_DFS_UNSET)
- goto unlock;
+ return -EINVAL;
err = nl80211_parse_chandef(rdev, info, &chandef);
if (err)
- goto unlock;
+ return err;
err = cfg80211_chandef_dfs_required(wiphy, &chandef, wdev->iftype);
if (err < 0)
- goto unlock;
+ return err;
- if (err == 0) {
- err = -EINVAL;
- goto unlock;
- }
+ if (err == 0)
+ return -EINVAL;
- if (!cfg80211_chandef_dfs_usable(wiphy, &chandef)) {
- err = -EINVAL;
- goto unlock;
- }
+ if (!cfg80211_chandef_dfs_usable(wiphy, &chandef))
+ return -EINVAL;
- if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND])) {
- err = cfg80211_start_background_radar_detection(rdev, wdev,
- &chandef);
- goto unlock;
- }
+ if (nla_get_flag(info->attrs[NL80211_ATTR_RADAR_BACKGROUND]))
+ return cfg80211_start_background_radar_detection(rdev, wdev,
+ &chandef);
if (cfg80211_beaconing_iface_active(wdev)) {
/* During MLO other link(s) can beacon, only the current link
@@ -10156,26 +10146,19 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
!wdev->links[link_id].ap.beacon_interval) {
/* nothing */
} else {
- err = -EBUSY;
- goto unlock;
+ return -EBUSY;
}
}
- if (wdev->links[link_id].cac_started) {
- err = -EBUSY;
- goto unlock;
- }
+ if (wdev->links[link_id].cac_started)
+ return -EBUSY;
/* CAC start is offloaded to HW and can't be started manually */
- if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD)) {
- err = -EOPNOTSUPP;
- goto unlock;
- }
+ if (wiphy_ext_feature_isset(wiphy, NL80211_EXT_FEATURE_DFS_OFFLOAD))
+ return -EOPNOTSUPP;
- if (!rdev->ops->start_radar_detection) {
- err = -EOPNOTSUPP;
- goto unlock;
- }
+ if (!rdev->ops->start_radar_detection)
+ return -EOPNOTSUPP;
cac_time_ms = cfg80211_chandef_dfs_cac_time(&rdev->wiphy, &chandef);
if (WARN_ON(!cac_time_ms))
@@ -10183,29 +10166,28 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms,
link_id);
- if (!err) {
- switch (wdev->iftype) {
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_P2P_GO:
- wdev->links[0].ap.chandef = chandef;
- break;
- case NL80211_IFTYPE_ADHOC:
- wdev->u.ibss.chandef = chandef;
- break;
- case NL80211_IFTYPE_MESH_POINT:
- wdev->u.mesh.chandef = chandef;
- break;
- default:
- break;
- }
- wdev->links[link_id].cac_started = true;
- wdev->links[link_id].cac_start_time = jiffies;
- wdev->links[link_id].cac_time_ms = cac_time_ms;
+ if (err)
+ return err;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ wdev->links[link_id].ap.chandef = chandef;
+ break;
+ case NL80211_IFTYPE_ADHOC:
+ wdev->u.ibss.chandef = chandef;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ wdev->u.mesh.chandef = chandef;
+ break;
+ default:
+ break;
}
-unlock:
- wiphy_unlock(wiphy);
+ wdev->links[link_id].cac_started = true;
+ wdev->links[link_id].cac_start_time = jiffies;
+ wdev->links[link_id].cac_time_ms = cac_time_ms;
- return err;
+ return 0;
}
static int nl80211_notify_radar_detection(struct sk_buff *skb,
@@ -10902,6 +10884,13 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
req.ie_len = nla_len(info->attrs[NL80211_ATTR_IE]);
}
+ if (info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]) {
+ req.supported_selectors =
+ nla_data(info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]);
+ req.supported_selectors_len =
+ nla_len(info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]);
+ }
+
auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]);
if (!nl80211_valid_auth_type(rdev, auth_type, NL80211_CMD_AUTHENTICATE))
return -EINVAL;
@@ -11132,12 +11121,84 @@ static struct cfg80211_bss *nl80211_assoc_bss(struct cfg80211_registered_device
return bss;
}
+static int nl80211_process_links(struct cfg80211_registered_device *rdev,
+ struct cfg80211_assoc_link *links,
+ int assoc_link_id,
+ const u8 *ssid, int ssid_len,
+ struct genl_info *info)
+{
+ unsigned int attrsize = NUM_NL80211_ATTR * sizeof(struct nlattr *);
+ struct nlattr **attrs __free(kfree) = kzalloc(attrsize, GFP_KERNEL);
+ struct nlattr *link;
+ unsigned int link_id;
+ int rem, err;
+
+ if (!attrs)
+ return -ENOMEM;
+
+ nla_for_each_nested(link, info->attrs[NL80211_ATTR_MLO_LINKS], rem) {
+ memset(attrs, 0, attrsize);
+
+ nla_parse_nested(attrs, NL80211_ATTR_MAX, link, NULL, NULL);
+
+ if (!attrs[NL80211_ATTR_MLO_LINK_ID]) {
+ NL_SET_BAD_ATTR(info->extack, link);
+ return -EINVAL;
+ }
+
+ link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]);
+ /* cannot use the same link ID again */
+ if (links[link_id].bss) {
+ NL_SET_BAD_ATTR(info->extack, link);
+ return -EINVAL;
+ }
+ links[link_id].bss =
+ nl80211_assoc_bss(rdev, ssid, ssid_len, attrs,
+ assoc_link_id, link_id);
+ if (IS_ERR(links[link_id].bss)) {
+ err = PTR_ERR(links[link_id].bss);
+ links[link_id].bss = NULL;
+ NL_SET_ERR_MSG_ATTR(info->extack, link,
+ "Error fetching BSS for link");
+ return err;
+ }
+
+ if (attrs[NL80211_ATTR_IE]) {
+ links[link_id].elems = nla_data(attrs[NL80211_ATTR_IE]);
+ links[link_id].elems_len =
+ nla_len(attrs[NL80211_ATTR_IE]);
+
+ if (cfg80211_find_elem(WLAN_EID_FRAGMENT,
+ links[link_id].elems,
+ links[link_id].elems_len)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[NL80211_ATTR_IE],
+ "cannot deal with fragmentation");
+ return -EINVAL;
+ }
+
+ if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
+ links[link_id].elems,
+ links[link_id].elems_len)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ attrs[NL80211_ATTR_IE],
+ "cannot deal with non-inheritance");
+ return -EINVAL;
+ }
+ }
+
+ links[link_id].disabled =
+ nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]);
+ }
+
+ return 0;
+}
+
static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
struct cfg80211_assoc_request req = {};
- struct nlattr **attrs = NULL;
const u8 *ap_addr, *ssid;
unsigned int link_id;
int err, ssid_len;
@@ -11184,6 +11245,13 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[NL80211_ATTR_PREV_BSSID])
req.prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]);
+ if (info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]) {
+ req.supported_selectors =
+ nla_data(info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]);
+ req.supported_selectors_len =
+ nla_len(info->attrs[NL80211_ATTR_SUPPORTED_SELECTORS]);
+ }
+
if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_HT]))
req.flags |= ASSOC_REQ_DISABLE_HT;
@@ -11269,10 +11337,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
req.link_id = nl80211_link_id_or_invalid(info->attrs);
if (info->attrs[NL80211_ATTR_MLO_LINKS]) {
- unsigned int attrsize = NUM_NL80211_ATTR * sizeof(*attrs);
- struct nlattr *link;
- int rem = 0;
-
if (req.link_id < 0)
return -EINVAL;
@@ -11287,72 +11351,10 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
req.ap_mld_addr = nla_data(info->attrs[NL80211_ATTR_MLD_ADDR]);
ap_addr = req.ap_mld_addr;
- attrs = kzalloc(attrsize, GFP_KERNEL);
- if (!attrs)
- return -ENOMEM;
-
- nla_for_each_nested(link,
- info->attrs[NL80211_ATTR_MLO_LINKS],
- rem) {
- memset(attrs, 0, attrsize);
-
- nla_parse_nested(attrs, NL80211_ATTR_MAX,
- link, NULL, NULL);
-
- if (!attrs[NL80211_ATTR_MLO_LINK_ID]) {
- err = -EINVAL;
- NL_SET_BAD_ATTR(info->extack, link);
- goto free;
- }
-
- link_id = nla_get_u8(attrs[NL80211_ATTR_MLO_LINK_ID]);
- /* cannot use the same link ID again */
- if (req.links[link_id].bss) {
- err = -EINVAL;
- NL_SET_BAD_ATTR(info->extack, link);
- goto free;
- }
- req.links[link_id].bss =
- nl80211_assoc_bss(rdev, ssid, ssid_len, attrs,
- req.link_id, link_id);
- if (IS_ERR(req.links[link_id].bss)) {
- err = PTR_ERR(req.links[link_id].bss);
- req.links[link_id].bss = NULL;
- NL_SET_ERR_MSG_ATTR(info->extack,
- link, "Error fetching BSS for link");
- goto free;
- }
-
- if (attrs[NL80211_ATTR_IE]) {
- req.links[link_id].elems =
- nla_data(attrs[NL80211_ATTR_IE]);
- req.links[link_id].elems_len =
- nla_len(attrs[NL80211_ATTR_IE]);
-
- if (cfg80211_find_elem(WLAN_EID_FRAGMENT,
- req.links[link_id].elems,
- req.links[link_id].elems_len)) {
- NL_SET_ERR_MSG_ATTR(info->extack,
- attrs[NL80211_ATTR_IE],
- "cannot deal with fragmentation");
- err = -EINVAL;
- goto free;
- }
-
- if (cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
- req.links[link_id].elems,
- req.links[link_id].elems_len)) {
- NL_SET_ERR_MSG_ATTR(info->extack,
- attrs[NL80211_ATTR_IE],
- "cannot deal with non-inheritance");
- err = -EINVAL;
- goto free;
- }
- }
-
- req.links[link_id].disabled =
- nla_get_flag(attrs[NL80211_ATTR_MLO_LINK_DISABLED]);
- }
+ err = nl80211_process_links(rdev, req.links, req.link_id,
+ ssid, ssid_len, info);
+ if (err)
+ goto free;
if (!req.links[req.link_id].bss) {
err = -EINVAL;
@@ -11372,9 +11374,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
err = -EINVAL;
goto free;
}
-
- kfree(attrs);
- attrs = NULL;
} else {
if (req.link_id >= 0)
return -EINVAL;
@@ -11434,7 +11433,6 @@ free:
for (link_id = 0; link_id < ARRAY_SIZE(req.links); link_id++)
cfg80211_put_bss(&rdev->wiphy, req.links[link_id].bss);
cfg80211_put_bss(&rdev->wiphy, req.bss);
- kfree(attrs);
return err;
}
@@ -16486,6 +16484,89 @@ nl80211_set_ttlm(struct sk_buff *skb, struct genl_info *info)
return rdev_set_ttlm(rdev, dev, &params);
}
+static int nl80211_assoc_ml_reconf(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct cfg80211_assoc_link links[IEEE80211_MLD_MAX_NUM_LINKS] = {};
+ unsigned int link_id;
+ u16 add_links, rem_links;
+ int err;
+
+ if (!wdev->valid_links)
+ return -EINVAL;
+
+ if (dev->ieee80211_ptr->conn_owner_nlportid &&
+ dev->ieee80211_ptr->conn_owner_nlportid != info->snd_portid)
+ return -EPERM;
+
+ if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION &&
+ dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ add_links = 0;
+ if (info->attrs[NL80211_ATTR_MLO_LINKS]) {
+ err = nl80211_process_links(rdev, links,
+ /* mark as MLO, but not assoc */
+ IEEE80211_MLD_MAX_NUM_LINKS,
+ NULL, 0, info);
+ if (err)
+ return err;
+
+ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS;
+ link_id++) {
+ if (!links[link_id].bss)
+ continue;
+ add_links |= BIT(link_id);
+ }
+ }
+
+ if (info->attrs[NL80211_ATTR_MLO_RECONF_REM_LINKS])
+ rem_links =
+ nla_get_u16(info->attrs[NL80211_ATTR_MLO_RECONF_REM_LINKS]);
+ else
+ rem_links = 0;
+
+ /* Validate that existing links are not added, removed links are valid
+ * and don't allow adding and removing the same links
+ */
+ if ((add_links & rem_links) || !(add_links | rem_links) ||
+ (wdev->valid_links & add_links) ||
+ ((wdev->valid_links & rem_links) != rem_links)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = -EOPNOTSUPP;
+
+out:
+ for (link_id = 0; link_id < ARRAY_SIZE(links); link_id++)
+ cfg80211_put_bss(&rdev->wiphy, links[link_id].bss);
+
+ return err;
+}
+
+static int
+nl80211_epcs_cfg(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ bool val;
+
+ if (wdev->iftype != NL80211_IFTYPE_STATION &&
+ wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)
+ return -EOPNOTSUPP;
+
+ if (!wdev->connected)
+ return -ENOLINK;
+
+ val = nla_get_flag(info->attrs[NL80211_ATTR_EPCS]);
+
+ return rdev_set_epcs(rdev, dev, val);
+}
+
#define NL80211_FLAG_NEED_WIPHY 0x01
#define NL80211_FLAG_NEED_NETDEV 0x02
#define NL80211_FLAG_NEED_RTNL 0x04
@@ -17678,6 +17759,18 @@ static const struct genl_small_ops nl80211_small_ops[] = {
.flags = GENL_UNS_ADMIN_PERM,
.internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
},
+ {
+ .cmd = NL80211_CMD_ASSOC_MLO_RECONF,
+ .doit = nl80211_assoc_ml_reconf,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+ },
+ {
+ .cmd = NL80211_CMD_EPCS_CFG,
+ .doit = nl80211_epcs_cfg,
+ .flags = GENL_UNS_ADMIN_PERM,
+ .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+ },
};
static struct genl_family nl80211_fam __ro_after_init = {
@@ -18453,10 +18546,9 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev,
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, peer_addr))
goto nla_put_failure;
- if ((td_bitmap_len > 0) && td_bitmap)
- if (nla_put(msg, NL80211_ATTR_TD_BITMAP,
- td_bitmap_len, td_bitmap))
- goto nla_put_failure;
+ if (td_bitmap_len > 0 && td_bitmap &&
+ nla_put(msg, NL80211_ATTR_TD_BITMAP, td_bitmap_len, td_bitmap))
+ goto nla_put_failure;
genlmsg_end(msg, hdr);
@@ -18574,6 +18666,23 @@ void cfg80211_links_removed(struct net_device *dev, u16 link_mask)
}
EXPORT_SYMBOL(cfg80211_links_removed);
+void nl80211_mlo_reconf_add_done(struct net_device *dev,
+ struct cfg80211_mlo_reconf_done_data *data)
+{
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct nl80211_mlme_event event = {
+ .cmd = NL80211_CMD_ASSOC_MLO_RECONF,
+ .buf = data->buf,
+ .buf_len = data->len,
+ .uapsd_queues = -1,
+ };
+
+ nl80211_send_mlme_event(rdev, dev, &event, GFP_KERNEL);
+}
+EXPORT_SYMBOL(nl80211_mlo_reconf_add_done);
+
void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev,
struct net_device *netdev, const u8 *bssid,
gfp_t gfp)
@@ -20406,6 +20515,39 @@ void cfg80211_schedule_channels_check(struct wireless_dev *wdev)
}
EXPORT_SYMBOL(cfg80211_schedule_channels_check);
+void cfg80211_epcs_changed(struct net_device *netdev, bool enabled)
+{
+ struct wireless_dev *wdev = netdev->ieee80211_ptr;
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_epcs_changed(wdev, enabled);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_EPCS_CFG);
+ if (!hdr) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ if (enabled && nla_put_flag(msg, NL80211_ATTR_EPCS))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0,
+ NL80211_MCGRP_MLME, GFP_KERNEL);
+ return;
+
+ nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_epcs_changed);
+
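/* Illustrative note (not part of the patch): NL80211_ATTR_EPCS is a flag
 * attribute, so the event built above carries it only when EPCS was
 * enabled; userspace listening for NL80211_CMD_EPCS_CFG should treat the
 * attribute's absence as "EPCS disabled".
 */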
/* initialisation/exit functions */
int __init nl80211_init(void)
diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h
index ffaab9a92e5b..5e25782af1e0 100644
--- a/net/wireless/nl80211.h
+++ b/net/wireless/nl80211.h
@@ -124,4 +124,7 @@ void cfg80211_free_coalesce(struct cfg80211_coalesce *coalesce);
/* peer measurement */
int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info);
+void nl80211_mlo_reconf_add_done(struct net_device *dev,
+ struct cfg80211_mlo_reconf_done_data *data);
+
#endif /* __NET_WIRELESS_NL80211_H */
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index 0396fa19bdf1..a117f5093ca2 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -630,9 +630,9 @@ void cfg80211_pmsr_free_wk(struct work_struct *work)
struct wireless_dev *wdev = container_of(work, struct wireless_dev,
pmsr_free_wk);
- wiphy_lock(wdev->wiphy);
+ guard(wiphy)(wdev->wiphy);
+
cfg80211_pmsr_process_abort(wdev);
- wiphy_unlock(wdev->wiphy);
}
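/* Illustrative note (not part of the patch): guard(wiphy)(w) uses the
 * scope-based cleanup helpers (linux/cleanup.h) to take the wiphy mutex
 * and release it automatically when the enclosing scope is left, so the
 * early returns introduced by this series no longer need a matching
 * wiphy_unlock(). Roughly, the repeated conversion is:
 *
 *	wiphy_lock(wiphy);
 *	...				// every exit path must unlock
 *	wiphy_unlock(wiphy);
 *
 * becomes:
 *
 *	guard(wiphy)(wiphy);
 *	...				// any return unlocks automatically
 *
 * scoped_guard(wiphy, wiphy) { ... } is the variant used when the lock
 * should only cover part of a function.
 */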
void cfg80211_pmsr_wdev_down(struct wireless_dev *wdev)
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index adb6105bbb7d..759da1623342 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -600,11 +600,12 @@ static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev,
}
static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev, int *dbm)
+ struct wireless_dev *wdev, unsigned int link_id,
+ int *dbm)
{
int ret;
- trace_rdev_get_tx_power(&rdev->wiphy, wdev);
- ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, dbm);
+ trace_rdev_get_tx_power(&rdev->wiphy, wdev, link_id);
+ ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, link_id, dbm);
trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm);
return ret;
}
@@ -1546,4 +1547,38 @@ rdev_get_radio_mask(struct cfg80211_registered_device *rdev,
return rdev->ops->get_radio_mask(wiphy, dev);
}
+
+static inline int
+rdev_assoc_ml_reconf(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_assoc_link *add_links,
+ u16 rem_links)
+{
+ struct wiphy *wiphy = &rdev->wiphy;
+ int ret = -EOPNOTSUPP;
+
+ trace_rdev_assoc_ml_reconf(wiphy, dev, add_links, rem_links);
+ if (rdev->ops->assoc_ml_reconf)
+ ret = rdev->ops->assoc_ml_reconf(wiphy, dev, add_links,
+ rem_links);
+ trace_rdev_return_int(wiphy, ret);
+
+ return ret;
+}
+
+static inline int
+rdev_set_epcs(struct cfg80211_registered_device *rdev,
+ struct net_device *dev, bool val)
+{
+ struct wiphy *wiphy = &rdev->wiphy;
+ int ret = -EOPNOTSUPP;
+
+ trace_rdev_set_epcs(wiphy, dev, val);
+ if (rdev->ops->set_epcs)
+ ret = rdev->ops->set_epcs(wiphy, dev, val);
+ trace_rdev_return_int(wiphy, ret);
+
+ return ret;
+}
+
#endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index eec95cb0ab9e..212e9561aae7 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2466,11 +2466,11 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy)
struct wireless_dev *wdev;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
- wiphy_lock(wiphy);
+ guard(wiphy)(wiphy);
+
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
if (!reg_wdev_chan_valid(wiphy, wdev))
cfg80211_leave(rdev, wdev);
- wiphy_unlock(wiphy);
}
static void reg_check_chans_work(struct work_struct *work)
@@ -2650,13 +2650,11 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy,
return;
rtnl_lock();
- wiphy_lock(wiphy);
-
- tmp = get_wiphy_regdom(wiphy);
- rcu_assign_pointer(wiphy->regd, new_regd);
- rcu_free_regdom(tmp);
-
- wiphy_unlock(wiphy);
+ scoped_guard(wiphy, wiphy) {
+ tmp = get_wiphy_regdom(wiphy);
+ rcu_assign_pointer(wiphy->regd, new_regd);
+ rcu_free_regdom(tmp);
+ }
rtnl_unlock();
}
EXPORT_SYMBOL(wiphy_apply_custom_regulatory);
@@ -2826,9 +2824,9 @@ reg_process_hint_driver(struct wiphy *wiphy,
tmp = get_wiphy_regdom(wiphy);
ASSERT_RTNL();
- wiphy_lock(wiphy);
- rcu_assign_pointer(wiphy->regd, regd);
- wiphy_unlock(wiphy);
+ scoped_guard(wiphy, wiphy) {
+ rcu_assign_pointer(wiphy->regd, regd);
+ }
rcu_free_regdom(tmp);
}
@@ -3206,9 +3204,9 @@ static void reg_process_self_managed_hints(void)
ASSERT_RTNL();
for_each_rdev(rdev) {
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
reg_process_self_managed_hint(&rdev->wiphy);
- wiphy_unlock(&rdev->wiphy);
}
reg_check_channels();
@@ -3601,14 +3599,12 @@ static bool is_wiphy_all_set_reg_flag(enum ieee80211_regulatory_flags flag)
struct wireless_dev *wdev;
for_each_rdev(rdev) {
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
- if (!(wdev->wiphy->regulatory_flags & flag)) {
- wiphy_unlock(&rdev->wiphy);
+ if (!(wdev->wiphy->regulatory_flags & flag))
return false;
- }
}
- wiphy_unlock(&rdev->wiphy);
}
return true;
@@ -3884,19 +3880,18 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd,
if (!driver_request->intersect) {
ASSERT_RTNL();
- wiphy_lock(request_wiphy);
- if (request_wiphy->regd)
- tmp = get_wiphy_regdom(request_wiphy);
-
- regd = reg_copy_regd(rd);
- if (IS_ERR(regd)) {
- wiphy_unlock(request_wiphy);
- return PTR_ERR(regd);
+ scoped_guard(wiphy, request_wiphy) {
+ if (request_wiphy->regd)
+ tmp = get_wiphy_regdom(request_wiphy);
+
+ regd = reg_copy_regd(rd);
+ if (IS_ERR(regd))
+ return PTR_ERR(regd);
+
+ rcu_assign_pointer(request_wiphy->regd, regd);
+ rcu_free_regdom(tmp);
}
- rcu_assign_pointer(request_wiphy->regd, regd);
- rcu_free_regdom(tmp);
- wiphy_unlock(request_wiphy);
reset_regdomains(false, rd);
return 0;
}
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index abca3d7ff56c..cd2124329521 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -272,12 +272,19 @@ cfg80211_gen_new_ie(const u8 *ie, size_t ielen,
{
const struct element *non_inherit_elem, *parent, *sub;
u8 *pos = new_ie;
- u8 id, ext_id;
+ const u8 *mbssid_index_ie;
+ u8 id, ext_id, bssid_index = 255;
unsigned int match_len;
non_inherit_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE,
subie, subie_len);
+ mbssid_index_ie = cfg80211_find_ie(WLAN_EID_MULTI_BSSID_IDX, subie,
+ subie_len);
+ if (mbssid_index_ie && mbssid_index_ie[1] > 0 &&
+ mbssid_index_ie[2] > 0 && mbssid_index_ie[2] <= 46)
+ bssid_index = mbssid_index_ie[2];
+
/* We copy the elements one by one from the parent to the generated
* elements.
* If they are not inherited (included in subie or in the non
@@ -316,6 +323,24 @@ cfg80211_gen_new_ie(const u8 *ie, size_t ielen,
continue;
}
+ /* For ML probe response, match the MLE in the frame body with
+ * MLD id being 'bssid_index'
+ */
+ if (parent->id == WLAN_EID_EXTENSION && parent->datalen > 1 &&
+ parent->data[0] == WLAN_EID_EXT_EHT_MULTI_LINK &&
+ bssid_index == ieee80211_mle_get_mld_id(parent->data + 1)) {
+ if (!cfg80211_copy_elem_with_frags(parent,
+ ie, ielen,
+ &pos, new_ie,
+ new_ie_len))
+ return 0;
+
+ /* Continue here to prevent processing the MLE in
+ * the sub-element, which the AP MLD should not carry
+ */
+ continue;
+ }
+
/* Already copied if an earlier element had the same type */
if (cfg80211_find_elem_match(id, ie, (u8 *)parent - ie,
&ext_id, match_len, 0))
@@ -704,7 +729,7 @@ cfg80211_parse_colocated_ap_iter(void *_data, u8 type,
bss_params)))
return RNR_ITER_CONTINUE;
- entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, GFP_ATOMIC);
+ entry = kzalloc(sizeof(*entry), GFP_ATOMIC);
if (!entry)
return RNR_ITER_ERROR;
@@ -713,6 +738,17 @@ cfg80211_parse_colocated_ap_iter(void *_data, u8 type,
if (!cfg80211_parse_ap_info(entry, tbtt_info, tbtt_info_len,
data->ssid_elem, data->s_ssid_tmp)) {
+ struct cfg80211_colocated_ap *tmp;
+
+ /* Don't add duplicate BSSIDs on the same channel. */
+ list_for_each_entry(tmp, &data->ap_list, list) {
+ if (ether_addr_equal(tmp->bssid, entry->bssid) &&
+ tmp->center_freq == entry->center_freq) {
+ kfree(entry);
+ return RNR_ITER_CONTINUE;
+ }
+ }
+
data->n_coloc++;
list_add_tail(&entry->list, &data->ap_list);
} else {
@@ -1235,7 +1271,8 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work)
rdev = container_of(work, struct cfg80211_registered_device,
sched_scan_res_wk);
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) {
if (req->report_results) {
req->report_results = false;
@@ -1250,7 +1287,6 @@ void cfg80211_sched_scan_results_wk(struct work_struct *work)
NL80211_CMD_SCHED_SCAN_RESULTS);
}
}
- wiphy_unlock(&rdev->wiphy);
}
void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid)
@@ -1285,9 +1321,9 @@ EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked);
void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid)
{
- wiphy_lock(wiphy);
+ guard(wiphy)(wiphy);
+
cfg80211_sched_scan_stopped_locked(wiphy, reqid);
- wiphy_unlock(wiphy);
}
EXPORT_SYMBOL(cfg80211_sched_scan_stopped);
@@ -3562,10 +3598,8 @@ int cfg80211_wext_siwscan(struct net_device *dev,
/* translate "Scan for SSID" request */
if (wreq) {
if (wrqu->data.flags & IW_SCAN_THIS_ESSID) {
- if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) {
- err = -EINVAL;
- goto out;
- }
+ if (wreq->essid_len > IEEE80211_MAX_SSID_LEN)
+ return -EINVAL;
memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len);
creq->ssids[0].ssid_len = wreq->essid_len;
}
@@ -3581,20 +3615,20 @@ int cfg80211_wext_siwscan(struct net_device *dev,
eth_broadcast_addr(creq->bssid);
- wiphy_lock(&rdev->wiphy);
-
- rdev->scan_req = creq;
- err = rdev_scan(rdev, creq);
- if (err) {
- rdev->scan_req = NULL;
- /* creq will be freed below */
- } else {
- nl80211_send_scan_start(rdev, dev->ieee80211_ptr);
- /* creq now owned by driver */
- creq = NULL;
- dev_hold(dev);
+ scoped_guard(wiphy, &rdev->wiphy) {
+ rdev->scan_req = creq;
+ err = rdev_scan(rdev, creq);
+ if (err) {
+ rdev->scan_req = NULL;
+ /* creq will be freed below */
+ } else {
+ nl80211_send_scan_start(rdev, dev->ieee80211_ptr);
+ /* creq now owned by driver */
+ creq = NULL;
+ dev_hold(dev);
+ }
}
- wiphy_unlock(&rdev->wiphy);
+
out:
kfree(creq);
return err;
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 268171600087..cf998500a965 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -252,7 +252,7 @@ void cfg80211_conn_work(struct work_struct *work)
u8 bssid_buf[ETH_ALEN], *bssid = NULL;
enum nl80211_timeout_reason treason;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (!wdev->netdev)
@@ -280,8 +280,6 @@ void cfg80211_conn_work(struct work_struct *work)
__cfg80211_connect_result(wdev->netdev, &cr, false);
}
}
-
- wiphy_unlock(&rdev->wiphy);
}
static void cfg80211_step_auth_next(struct cfg80211_conn *conn,
@@ -693,13 +691,13 @@ static bool cfg80211_is_all_idle(void)
* as chan dfs state, etc.
*/
for_each_rdev(rdev) {
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) {
if (wdev->conn || wdev->connected ||
cfg80211_beaconing_iface_active(wdev))
is_all_idle = false;
}
- wiphy_unlock(&rdev->wiphy);
}
return is_all_idle;
@@ -1583,7 +1581,7 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
container_of(work, struct wireless_dev, disconnect_wk);
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- wiphy_lock(wdev->wiphy);
+ guard(wiphy)(wdev->wiphy);
if (wdev->conn_owner_nlportid) {
switch (wdev->iftype) {
@@ -1619,6 +1617,4 @@ void cfg80211_autodisconnect_wk(struct work_struct *work)
break;
}
}
-
- wiphy_unlock(wdev->wiphy);
}
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index d5c9bb614fa6..4f0abd5d49df 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -1690,9 +1690,28 @@ TRACE_EVENT(rdev_set_wiphy_params,
WIPHY_PR_ARG, __entry->changed)
);
-DEFINE_EVENT(wiphy_wdev_evt, rdev_get_tx_power,
- TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
- TP_ARGS(wiphy, wdev)
+DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ unsigned int link_id),
+ TP_ARGS(wiphy, wdev, link_id),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ WDEV_ENTRY
+ __field(unsigned int, link_id)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ WDEV_ASSIGN;
+ __entry->link_id = link_id;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
+);
+
+DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_tx_power,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ unsigned int link_id),
+ TP_ARGS(wiphy, wdev, link_id)
);
TRACE_EVENT(rdev_set_tx_power,
@@ -2192,25 +2211,6 @@ TRACE_EVENT(rdev_set_noack_map,
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", noack_map: %u",
WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->noack_map)
);
-
-DECLARE_EVENT_CLASS(wiphy_wdev_link_evt,
- TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
- unsigned int link_id),
- TP_ARGS(wiphy, wdev, link_id),
- TP_STRUCT__entry(
- WIPHY_ENTRY
- WDEV_ENTRY
- __field(unsigned int, link_id)
- ),
- TP_fast_assign(
- WIPHY_ASSIGN;
- WDEV_ASSIGN;
- __entry->link_id = link_id;
- ),
- TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %u",
- WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
-);
-
DEFINE_EVENT(wiphy_wdev_link_evt, rdev_get_channel,
TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
unsigned int link_id),
@@ -3049,6 +3049,24 @@ TRACE_EVENT(rdev_set_ttlm,
WIPHY_PR_ARG, NETDEV_PR_ARG)
);
+TRACE_EVENT(rdev_set_epcs,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ bool val),
+ TP_ARGS(wiphy, netdev, val),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(bool, val)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ __entry->val = val;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", config=%u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->val)
+);
+
/*************************************************************
* cfg80211 exported functions traces *
*************************************************************/
@@ -4104,6 +4122,66 @@ TRACE_EVENT(cfg80211_links_removed,
__entry->link_mask)
);
+TRACE_EVENT(cfg80211_mlo_reconf_add_done,
+ TP_PROTO(struct net_device *netdev, u16 link_mask,
+ const u8 *buf, size_t len),
+ TP_ARGS(netdev, link_mask, buf, len),
+ TP_STRUCT__entry(
+ NETDEV_ENTRY
+ __field(u16, link_mask)
+ __dynamic_array(u8, buf, len)
+ ),
+ TP_fast_assign(
+ NETDEV_ASSIGN;
+ __entry->link_mask = link_mask;
+ memcpy(__get_dynamic_array(buf), buf, len);
+ ),
+ TP_printk(NETDEV_PR_FMT ", link_mask:0x%x",
+ NETDEV_PR_ARG, __entry->link_mask)
+);
+
+TRACE_EVENT(rdev_assoc_ml_reconf,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_assoc_link *add_links,
+ u16 rem_links),
+ TP_ARGS(wiphy, netdev, add_links, rem_links),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __field(u16, add_links)
+ __field(u16, rem_links)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ u32 i;
+
+ __entry->add_links = 0;
+ __entry->rem_links = rem_links;
+ for (i = 0; add_links && i < IEEE80211_MLD_MAX_NUM_LINKS; i++)
+ if (add_links[i].bss)
+ __entry->add_links |= BIT(i);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", add_links=0x%x, rem_links=0x%x",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ __entry->add_links, __entry->rem_links)
+);
+
+TRACE_EVENT(cfg80211_epcs_changed,
+ TP_PROTO(struct wireless_dev *wdev, bool enabled),
+ TP_ARGS(wdev, enabled),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ __field(u32, enabled)
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ __entry->enabled = enabled;
+ ),
+ TP_printk(WDEV_PR_FMT ", enabled=%u",
+ WDEV_PR_ARG, __entry->enabled)
+);
+
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 65c8e47246b7..60157943d351 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -2572,7 +2572,6 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
{
struct cfg80211_registered_device *rdev;
struct wireless_dev *wdev;
- int ret;
wdev = dev->ieee80211_ptr;
if (!wdev)
@@ -2584,11 +2583,9 @@ int cfg80211_get_station(struct net_device *dev, const u8 *mac_addr,
memset(sinfo, 0, sizeof(*sinfo));
- wiphy_lock(&rdev->wiphy);
- ret = rdev_get_station(rdev, dev, mac_addr, sinfo);
- wiphy_unlock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
- return ret;
+ return rdev_get_station(rdev, dev, mac_addr, sinfo);
}
EXPORT_SYMBOL(cfg80211_get_station);
diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c
index 90d5c0592667..a74b1afc594e 100644
--- a/net/wireless/wext-compat.c
+++ b/net/wireless/wext-compat.c
@@ -39,7 +39,6 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
struct cfg80211_registered_device *rdev;
struct vif_params vifparams;
enum nl80211_iftype type;
- int ret;
rdev = wiphy_to_rdev(wdev->wiphy);
@@ -62,11 +61,9 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info,
memset(&vifparams, 0, sizeof(vifparams));
- wiphy_lock(wdev->wiphy);
- ret = cfg80211_change_iface(rdev, dev, type, &vifparams);
- wiphy_unlock(wdev->wiphy);
+ guard(wiphy)(wdev->wiphy);
- return ret;
+ return cfg80211_change_iface(rdev, dev, type, &vifparams);
}
int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info,
@@ -258,23 +255,17 @@ int cfg80211_wext_siwrts(struct net_device *dev,
u32 orts = wdev->wiphy->rts_threshold;
int err;
- wiphy_lock(&rdev->wiphy);
- if (rts->disabled || !rts->fixed) {
+ guard(wiphy)(&rdev->wiphy);
+ if (rts->disabled || !rts->fixed)
wdev->wiphy->rts_threshold = (u32) -1;
- } else if (rts->value < 0) {
- err = -EINVAL;
- goto out;
- } else {
+ else if (rts->value < 0)
+ return -EINVAL;
+ else
wdev->wiphy->rts_threshold = rts->value;
- }
err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_RTS_THRESHOLD);
-
if (err)
wdev->wiphy->rts_threshold = orts;
-
-out:
- wiphy_unlock(&rdev->wiphy);
return err;
}
@@ -302,12 +293,12 @@ int cfg80211_wext_siwfrag(struct net_device *dev,
u32 ofrag = wdev->wiphy->frag_threshold;
int err;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
if (frag->disabled || !frag->fixed) {
wdev->wiphy->frag_threshold = (u32) -1;
} else if (frag->value < 256) {
- err = -EINVAL;
- goto out;
+ return -EINVAL;
} else {
/* Fragment length must be even, so strip LSB. */
wdev->wiphy->frag_threshold = frag->value & ~0x1;
@@ -316,9 +307,6 @@ int cfg80211_wext_siwfrag(struct net_device *dev,
err = rdev_set_wiphy_params(rdev, WIPHY_PARAM_FRAG_THRESHOLD);
if (err)
wdev->wiphy->frag_threshold = ofrag;
-out:
- wiphy_unlock(&rdev->wiphy);
-
return err;
}
@@ -352,7 +340,8 @@ static int cfg80211_wext_siwretry(struct net_device *dev,
(retry->flags & IW_RETRY_TYPE) != IW_RETRY_LIMIT)
return -EINVAL;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
if (retry->flags & IW_RETRY_LONG) {
wdev->wiphy->retry_long = retry->value;
changed |= WIPHY_PARAM_RETRY_LONG;
@@ -371,7 +360,6 @@ static int cfg80211_wext_siwretry(struct net_device *dev,
wdev->wiphy->retry_short = oshort;
wdev->wiphy->retry_long = olong;
}
- wiphy_unlock(&rdev->wiphy);
return err;
}
@@ -578,9 +566,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
struct iw_point *erq = &wrqu->encoding;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- int idx, err;
- bool remove = false;
struct key_params params;
+ bool remove = false;
+ int idx;
if (wdev->iftype != NL80211_IFTYPE_STATION &&
wdev->iftype != NL80211_IFTYPE_ADHOC)
@@ -592,11 +580,9 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
!rdev->ops->set_default_key)
return -EOPNOTSUPP;
- wiphy_lock(&rdev->wiphy);
- if (wdev->valid_links) {
- err = -EOPNOTSUPP;
- goto out;
- }
+ guard(wiphy)(&rdev->wiphy);
+ if (wdev->valid_links)
+ return -EOPNOTSUPP;
idx = erq->flags & IW_ENCODE_INDEX;
if (idx == 0) {
@@ -604,8 +590,7 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
if (idx < 0)
idx = 0;
} else if (idx < 1 || idx > 4) {
- err = -EINVAL;
- goto out;
+ return -EINVAL;
} else {
idx--;
}
@@ -614,7 +599,8 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
remove = true;
else if (erq->length == 0) {
/* No key data - just set the default TX key index */
- err = 0;
+ int err = 0;
+
if (wdev->connected ||
(wdev->iftype == NL80211_IFTYPE_ADHOC &&
wdev->u.ibss.current_bss))
@@ -622,28 +608,22 @@ static int cfg80211_wext_siwencode(struct net_device *dev,
true);
if (!err)
wdev->wext.default_key = idx;
- goto out;
+ return err;
}
memset(&params, 0, sizeof(params));
params.key = keybuf;
params.key_len = erq->length;
- if (erq->length == 5) {
+ if (erq->length == 5)
params.cipher = WLAN_CIPHER_SUITE_WEP40;
- } else if (erq->length == 13) {
+ else if (erq->length == 13)
params.cipher = WLAN_CIPHER_SUITE_WEP104;
- } else if (!remove) {
- err = -EINVAL;
- goto out;
- }
-
- err = cfg80211_set_encryption(rdev, dev, false, NULL, remove,
- wdev->wext.default_key == -1,
- idx, &params);
-out:
- wiphy_unlock(&rdev->wiphy);
+ else if (!remove)
+ return -EINVAL;
- return err;
+ return cfg80211_set_encryption(rdev, dev, false, NULL, remove,
+ wdev->wext.default_key == -1,
+ idx, &params);
}
static int cfg80211_wext_siwencodeext(struct net_device *dev,
@@ -659,7 +639,6 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev,
bool remove = false;
struct key_params params;
u32 cipher;
- int ret;
if (wdev->iftype != NL80211_IFTYPE_STATION &&
wdev->iftype != NL80211_IFTYPE_ADHOC)
@@ -734,16 +713,13 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev,
params.seq_len = 6;
}
- wiphy_lock(wdev->wiphy);
- ret = cfg80211_set_encryption(
- rdev, dev,
- !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY),
- addr, remove,
- ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY,
- idx, &params);
- wiphy_unlock(wdev->wiphy);
+ guard(wiphy)(wdev->wiphy);
- return ret;
+ return cfg80211_set_encryption(rdev, dev,
+ !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY),
+ addr, remove,
+ ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY,
+ idx, &params);
}
static int cfg80211_wext_giwencode(struct net_device *dev,
@@ -794,61 +770,41 @@ static int cfg80211_wext_siwfreq(struct net_device *dev,
struct cfg80211_chan_def chandef = {
.width = NL80211_CHAN_WIDTH_20_NOHT,
};
- int freq, ret;
+ int freq;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
switch (wdev->iftype) {
case NL80211_IFTYPE_STATION:
- ret = cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra);
- break;
+ return cfg80211_mgd_wext_siwfreq(dev, info, wextfreq, extra);
case NL80211_IFTYPE_ADHOC:
- ret = cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra);
- break;
+ return cfg80211_ibss_wext_siwfreq(dev, info, wextfreq, extra);
case NL80211_IFTYPE_MONITOR:
freq = cfg80211_wext_freq(wextfreq);
- if (freq < 0) {
- ret = freq;
- break;
- }
- if (freq == 0) {
- ret = -EINVAL;
- break;
- }
+ if (freq < 0)
+ return freq;
+ if (freq == 0)
+ return -EINVAL;
+
chandef.center_freq1 = freq;
chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
- if (!chandef.chan) {
- ret = -EINVAL;
- break;
- }
- ret = cfg80211_set_monitor_channel(rdev, dev, &chandef);
- break;
+ if (!chandef.chan)
+ return -EINVAL;
+ return cfg80211_set_monitor_channel(rdev, dev, &chandef);
case NL80211_IFTYPE_MESH_POINT:
freq = cfg80211_wext_freq(wextfreq);
- if (freq < 0) {
- ret = freq;
- break;
- }
- if (freq == 0) {
- ret = -EINVAL;
- break;
- }
+ if (freq < 0)
+ return freq;
+ if (freq == 0)
+ return -EINVAL;
chandef.center_freq1 = freq;
chandef.chan = ieee80211_get_channel(&rdev->wiphy, freq);
- if (!chandef.chan) {
- ret = -EINVAL;
- break;
- }
- ret = cfg80211_set_mesh_channel(rdev, wdev, &chandef);
- break;
+ if (!chandef.chan)
+ return -EINVAL;
+ return cfg80211_set_mesh_channel(rdev, wdev, &chandef);
default:
- ret = -EOPNOTSUPP;
- break;
+ return -EOPNOTSUPP;
}
-
- wiphy_unlock(&rdev->wiphy);
-
- return ret;
}
static int cfg80211_wext_giwfreq(struct net_device *dev,
@@ -861,35 +817,26 @@ static int cfg80211_wext_giwfreq(struct net_device *dev,
struct cfg80211_chan_def chandef = {};
int ret;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
switch (wdev->iftype) {
case NL80211_IFTYPE_STATION:
- ret = cfg80211_mgd_wext_giwfreq(dev, info, freq, extra);
- break;
+ return cfg80211_mgd_wext_giwfreq(dev, info, freq, extra);
case NL80211_IFTYPE_ADHOC:
- ret = cfg80211_ibss_wext_giwfreq(dev, info, freq, extra);
- break;
+ return cfg80211_ibss_wext_giwfreq(dev, info, freq, extra);
case NL80211_IFTYPE_MONITOR:
- if (!rdev->ops->get_channel) {
- ret = -EINVAL;
- break;
- }
+ if (!rdev->ops->get_channel)
+ return -EINVAL;
ret = rdev_get_channel(rdev, wdev, 0, &chandef);
if (ret)
- break;
+ return ret;
freq->m = chandef.chan->center_freq;
freq->e = 6;
- ret = 0;
- break;
+ return ret;
default:
- ret = -EINVAL;
- break;
+ return -EINVAL;
}
-
- wiphy_unlock(&rdev->wiphy);
-
- return ret;
}
static int cfg80211_wext_siwtxpower(struct net_device *dev,
@@ -900,7 +847,6 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev,
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
enum nl80211_tx_power_setting type;
int dbm = 0;
- int ret;
if ((data->txpower.flags & IW_TXPOW_TYPE) != IW_TXPOW_DBM)
return -EINVAL;
@@ -942,11 +888,9 @@ static int cfg80211_wext_siwtxpower(struct net_device *dev,
return 0;
}
- wiphy_lock(&rdev->wiphy);
- ret = rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm));
- wiphy_unlock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
- return ret;
+ return rdev_set_tx_power(rdev, wdev, type, DBM_TO_MBM(dbm));
}
static int cfg80211_wext_giwtxpower(struct net_device *dev,
@@ -965,9 +909,9 @@ static int cfg80211_wext_giwtxpower(struct net_device *dev,
if (!rdev->ops->get_tx_power)
return -EOPNOTSUPP;
- wiphy_lock(&rdev->wiphy);
- err = rdev_get_tx_power(rdev, wdev, &val);
- wiphy_unlock(&rdev->wiphy);
+ scoped_guard(wiphy, &rdev->wiphy) {
+ err = rdev_get_tx_power(rdev, wdev, 0, &val);
+ }
if (err)
return err;
@@ -1209,9 +1153,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev,
timeout = wrq->value / 1000;
}
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
err = rdev_set_power_mgmt(rdev, dev, ps, timeout);
- wiphy_unlock(&rdev->wiphy);
if (err)
return err;
@@ -1244,8 +1188,8 @@ static int cfg80211_wext_siwrate(struct net_device *dev,
struct cfg80211_bitrate_mask mask;
u32 fixed, maxrate;
struct ieee80211_supported_band *sband;
- int band, ridx, ret;
bool match = false;
+ int band, ridx;
if (!rdev->ops->set_bitrate_mask)
return -EOPNOTSUPP;
@@ -1283,14 +1227,12 @@ static int cfg80211_wext_siwrate(struct net_device *dev,
if (!match)
return -EINVAL;
- wiphy_lock(&rdev->wiphy);
- if (dev->ieee80211_ptr->valid_links)
- ret = -EOPNOTSUPP;
- else
- ret = rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask);
- wiphy_unlock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
- return ret;
+ if (dev->ieee80211_ptr->valid_links)
+ return -EOPNOTSUPP;
+
+ return rdev_set_bitrate_mask(rdev, dev, 0, NULL, &mask);
}
static int cfg80211_wext_giwrate(struct net_device *dev,
@@ -1319,9 +1261,9 @@ static int cfg80211_wext_giwrate(struct net_device *dev,
if (err)
return err;
- wiphy_lock(&rdev->wiphy);
- err = rdev_get_station(rdev, dev, addr, &sinfo);
- wiphy_unlock(&rdev->wiphy);
+ scoped_guard(wiphy, &rdev->wiphy) {
+ err = rdev_get_station(rdev, dev, addr, &sinfo);
+ }
if (err)
return err;
@@ -1420,23 +1362,17 @@ static int cfg80211_wext_siwap(struct net_device *dev,
struct sockaddr *ap_addr = &wrqu->ap_addr;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- int ret;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
switch (wdev->iftype) {
case NL80211_IFTYPE_ADHOC:
- ret = cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra);
- break;
+ return cfg80211_ibss_wext_siwap(dev, info, ap_addr, extra);
case NL80211_IFTYPE_STATION:
- ret = cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra);
- break;
+ return cfg80211_mgd_wext_siwap(dev, info, ap_addr, extra);
default:
- ret = -EOPNOTSUPP;
- break;
+ return -EOPNOTSUPP;
}
- wiphy_unlock(&rdev->wiphy);
-
- return ret;
}
static int cfg80211_wext_giwap(struct net_device *dev,
@@ -1446,23 +1382,17 @@ static int cfg80211_wext_giwap(struct net_device *dev,
struct sockaddr *ap_addr = &wrqu->ap_addr;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- int ret;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
switch (wdev->iftype) {
case NL80211_IFTYPE_ADHOC:
- ret = cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra);
- break;
+ return cfg80211_ibss_wext_giwap(dev, info, ap_addr, extra);
case NL80211_IFTYPE_STATION:
- ret = cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra);
- break;
+ return cfg80211_mgd_wext_giwap(dev, info, ap_addr, extra);
default:
- ret = -EOPNOTSUPP;
- break;
+ return -EOPNOTSUPP;
}
- wiphy_unlock(&rdev->wiphy);
-
- return ret;
}
static int cfg80211_wext_siwessid(struct net_device *dev,
@@ -1472,23 +1402,17 @@ static int cfg80211_wext_siwessid(struct net_device *dev,
struct iw_point *data = &wrqu->data;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- int ret;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
switch (wdev->iftype) {
case NL80211_IFTYPE_ADHOC:
- ret = cfg80211_ibss_wext_siwessid(dev, info, data, ssid);
- break;
+ return cfg80211_ibss_wext_siwessid(dev, info, data, ssid);
case NL80211_IFTYPE_STATION:
- ret = cfg80211_mgd_wext_siwessid(dev, info, data, ssid);
- break;
+ return cfg80211_mgd_wext_siwessid(dev, info, data, ssid);
default:
- ret = -EOPNOTSUPP;
- break;
+ return -EOPNOTSUPP;
}
- wiphy_unlock(&rdev->wiphy);
-
- return ret;
}
static int cfg80211_wext_giwessid(struct net_device *dev,
@@ -1498,26 +1422,20 @@ static int cfg80211_wext_giwessid(struct net_device *dev,
struct iw_point *data = &wrqu->data;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
- int ret;
data->flags = 0;
data->length = 0;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
switch (wdev->iftype) {
case NL80211_IFTYPE_ADHOC:
- ret = cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
- break;
+ return cfg80211_ibss_wext_giwessid(dev, info, data, ssid);
case NL80211_IFTYPE_STATION:
- ret = cfg80211_mgd_wext_giwessid(dev, info, data, ssid);
- break;
+ return cfg80211_mgd_wext_giwessid(dev, info, data, ssid);
default:
- ret = -EOPNOTSUPP;
- break;
+ return -EOPNOTSUPP;
}
- wiphy_unlock(&rdev->wiphy);
-
- return ret;
}
static int cfg80211_wext_siwpmksa(struct net_device *dev,
@@ -1528,7 +1446,6 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev,
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
struct cfg80211_pmksa cfg_pmksa;
struct iw_pmksa *pmksa = (struct iw_pmksa *)extra;
- int ret;
memset(&cfg_pmksa, 0, sizeof(struct cfg80211_pmksa));
@@ -1538,39 +1455,27 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev,
cfg_pmksa.bssid = pmksa->bssid.sa_data;
cfg_pmksa.pmkid = pmksa->pmkid;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
switch (pmksa->cmd) {
case IW_PMKSA_ADD:
- if (!rdev->ops->set_pmksa) {
- ret = -EOPNOTSUPP;
- break;
- }
+ if (!rdev->ops->set_pmksa)
+ return -EOPNOTSUPP;
- ret = rdev_set_pmksa(rdev, dev, &cfg_pmksa);
- break;
+ return rdev_set_pmksa(rdev, dev, &cfg_pmksa);
case IW_PMKSA_REMOVE:
- if (!rdev->ops->del_pmksa) {
- ret = -EOPNOTSUPP;
- break;
- }
+ if (!rdev->ops->del_pmksa)
+ return -EOPNOTSUPP;
- ret = rdev_del_pmksa(rdev, dev, &cfg_pmksa);
- break;
+ return rdev_del_pmksa(rdev, dev, &cfg_pmksa);
case IW_PMKSA_FLUSH:
- if (!rdev->ops->flush_pmksa) {
- ret = -EOPNOTSUPP;
- break;
- }
+ if (!rdev->ops->flush_pmksa)
+ return -EOPNOTSUPP;
- ret = rdev_flush_pmksa(rdev, dev);
- break;
+ return rdev_flush_pmksa(rdev, dev);
default:
- ret = -EOPNOTSUPP;
- break;
+ return -EOPNOTSUPP;
}
- wiphy_unlock(&rdev->wiphy);
-
- return ret;
}
static const iw_handler cfg80211_handlers[] = {
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index 3bb04b05c5ce..bea70eb6f034 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -640,10 +640,8 @@ EXPORT_SYMBOL(wireless_send_event);
#ifdef CONFIG_CFG80211_WEXT
static void wireless_warn_cfg80211_wext(void)
{
- char name[sizeof(current->comm)];
-
pr_warn_once("warning: `%s' uses wireless extensions which will stop working for Wi-Fi 7 hardware; use nl80211\n",
- get_task_comm(name, current));
+ current->comm);
}
#endif
diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c
index 8edd9ada69d0..573b6b15a446 100644
--- a/net/wireless/wext-sme.c
+++ b/net/wireless/wext-sme.c
@@ -302,8 +302,8 @@ int cfg80211_wext_siwgenie(struct net_device *dev,
struct iw_point *data = &wrqu->data;
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ int ie_len = data->length;
u8 *ie = extra;
- int ie_len = data->length, err;
if (wdev->iftype != NL80211_IFTYPE_STATION)
return -EOPNOTSUPP;
@@ -311,39 +311,31 @@ int cfg80211_wext_siwgenie(struct net_device *dev,
if (!ie_len)
ie = NULL;
- wiphy_lock(wdev->wiphy);
+ guard(wiphy)(wdev->wiphy);
/* no change */
- err = 0;
if (wdev->wext.ie_len == ie_len &&
memcmp(wdev->wext.ie, ie, ie_len) == 0)
- goto out;
+ return 0;
if (ie_len) {
ie = kmemdup(extra, ie_len, GFP_KERNEL);
- if (!ie) {
- err = -ENOMEM;
- goto out;
- }
- } else
+ if (!ie)
+ return -ENOMEM;
+ } else {
ie = NULL;
+ }
kfree(wdev->wext.ie);
wdev->wext.ie = ie;
wdev->wext.ie_len = ie_len;
- if (wdev->conn) {
- err = cfg80211_disconnect(rdev, dev,
- WLAN_REASON_DEAUTH_LEAVING, false);
- if (err)
- goto out;
- }
+ if (wdev->conn)
+ return cfg80211_disconnect(rdev, dev,
+ WLAN_REASON_DEAUTH_LEAVING, false);
/* userspace better not think we'll reconnect */
- err = 0;
- out:
- wiphy_unlock(wdev->wiphy);
- return err;
+ return 0;
}
int cfg80211_wext_siwmlme(struct net_device *dev,
@@ -353,7 +345,6 @@ int cfg80211_wext_siwmlme(struct net_device *dev,
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct iw_mlme *mlme = (struct iw_mlme *)extra;
struct cfg80211_registered_device *rdev;
- int err;
if (!wdev)
return -EOPNOTSUPP;
@@ -366,17 +357,13 @@ int cfg80211_wext_siwmlme(struct net_device *dev,
if (mlme->addr.sa_family != ARPHRD_ETHER)
return -EINVAL;
- wiphy_lock(&rdev->wiphy);
+ guard(wiphy)(&rdev->wiphy);
+
switch (mlme->cmd) {
case IW_MLME_DEAUTH:
case IW_MLME_DISASSOC:
- err = cfg80211_disconnect(rdev, dev, mlme->reason_code, true);
- break;
+ return cfg80211_disconnect(rdev, dev, mlme->reason_code, true);
default:
- err = -EOPNOTSUPP;
- break;
+ return -EOPNOTSUPP;
}
- wiphy_unlock(&rdev->wiphy);
-
- return err;
}
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 89d2bef96469..a373a7130d75 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -742,6 +742,9 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
goto free_err;
}
}
+
+ if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+ skb->skb_mstamp_ns = meta->request.launch_time;
}
}
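/* Illustrative note (not part of the patch): skb_mstamp_ns aliases
 * skb->tstamp, so copying the AF_XDP Tx metadata launch time into it
 * presumably lets time-aware transmission (e.g. an ETF-style qdisc or a
 * driver honouring Tx launch time) delay the frame until the requested
 * departure time.
 */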
@@ -802,8 +805,11 @@ static int __xsk_generic_xmit(struct sock *sk)
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (xsk_cq_reserve_addr_locked(xs->pool, desc.addr))
+ err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr);
+ if (err) {
+ err = -EAGAIN;
goto out;
+ }
skb = xsk_build_skb(xs, &desc);
if (IS_ERR(skb)) {
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 1f7975b49657..d158cb6dd391 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -105,7 +105,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
if (pool->unaligned)
pool->free_heads[i] = xskb;
else
- xp_init_xskb_addr(xskb, pool, i * pool->chunk_size);
+ xp_init_xskb_addr(xskb, pool, (u64)i * pool->chunk_size);
}
return pool;
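/* Illustrative note (not part of the patch): without the (u64) cast the
 * product i * pool->chunk_size is computed in 32 bits and can wrap for
 * large UMEMs, e.g. chunk_size = 4096 and i = 0x100000 gives 2^32, which
 * truncates to 0; promoting one operand keeps the full 64-bit address.
 */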
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index d7b16f2c23e9..f0157702718f 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -135,6 +135,22 @@ config NET_KEY_MIGRATE
If unsure, say N.
+config XFRM_IPTFS
+ tristate "IPsec IP-TFS/AGGFRAG (RFC 9347) encapsulation support"
+ depends on XFRM
+ help
+ Information on the IP-TFS/AGGFRAG encapsulation can be found
+ in RFC 9347. This feature supports demand driven (i.e.,
+ non-constant send rate) IP-TFS to take advantage of the
+ AGGFRAG ESP payload encapsulation. This payload type
+ supports aggregation and fragmentation of the inner IP
+ packet stream which in turn yields higher small-packet
+ bandwidth as well as reducing MTU/PMTU issues. Congestion
+ control is not implemented as the send rate is demand driven
+ rather than constant.
+
+ If unsure, say N.
+
config XFRM_ESPINTCP
bool
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 512e0b2f8514..5a1787587cb3 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -21,5 +21,6 @@ obj-$(CONFIG_XFRM_USER) += xfrm_user.o
obj-$(CONFIG_XFRM_USER_COMPAT) += xfrm_compat.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
+obj-$(CONFIG_XFRM_IPTFS) += xfrm_iptfs.o
obj-$(CONFIG_XFRM_ESPINTCP) += espintcp.o
obj-$(CONFIG_DEBUG_INFO_BTF) += xfrm_state_bpf.o
diff --git a/net/xfrm/trace_iptfs.h b/net/xfrm/trace_iptfs.h
new file mode 100644
index 000000000000..74391ba24445
--- /dev/null
+++ b/net/xfrm/trace_iptfs.h
@@ -0,0 +1,218 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* xfrm_trace_iptfs.h
+ *
+ * August 12 2023, Christian Hopps <chopps@labn.net>
+ *
+ * Copyright (c) 2023, LabN Consulting, L.L.C.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM iptfs
+
+#if !defined(_TRACE_IPTFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IPTFS_H
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+#include <net/ip.h>
+
+struct xfrm_iptfs_data;
+
+TRACE_EVENT(iptfs_egress_recv,
+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u16 blkoff),
+ TP_ARGS(skb, xtfs, blkoff),
+ TP_STRUCT__entry(__field(struct sk_buff *, skb)
+ __field(void *, head)
+ __field(void *, head_pg_addr)
+ __field(void *, pg0addr)
+ __field(u32, skb_len)
+ __field(u32, data_len)
+ __field(u32, headroom)
+ __field(u32, tailroom)
+ __field(u32, tail)
+ __field(u32, end)
+ __field(u32, pg0off)
+ __field(u8, head_frag)
+ __field(u8, frag_list)
+ __field(u8, nr_frags)
+ __field(u16, blkoff)),
+ TP_fast_assign(__entry->skb = skb;
+ __entry->head = skb->head;
+ __entry->skb_len = skb->len;
+ __entry->data_len = skb->data_len;
+ __entry->headroom = skb_headroom(skb);
+ __entry->tailroom = skb_tailroom(skb);
+ __entry->tail = (u32)skb->tail;
+ __entry->end = (u32)skb->end;
+ __entry->head_frag = skb->head_frag;
+ __entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
+ __entry->nr_frags = skb_shinfo(skb)->nr_frags;
+ __entry->blkoff = blkoff;
+ __entry->head_pg_addr = page_address(virt_to_head_page(skb->head));
+ __entry->pg0addr = (__entry->nr_frags
+ ? page_address(netmem_to_page(skb_shinfo(skb)->frags[0].netmem))
+ : NULL);
+ __entry->pg0off = (__entry->nr_frags
+ ? skb_shinfo(skb)->frags[0].offset
+ : 0);
+ ),
+ TP_printk("EGRESS: skb=%p len=%u data_len=%u headroom=%u head_frag=%u frag_list=%u nr_frags=%u blkoff=%u\n\t\ttailroom=%u tail=%u end=%u head=%p hdpgaddr=%p pg0->addr=%p pg0->data=%p pg0->off=%u",
+ __entry->skb, __entry->skb_len, __entry->data_len, __entry->headroom,
+ __entry->head_frag, __entry->frag_list, __entry->nr_frags, __entry->blkoff,
+ __entry->tailroom, __entry->tail, __entry->end, __entry->head,
+ __entry->head_pg_addr, __entry->pg0addr, __entry->pg0addr + __entry->pg0off,
+ __entry->pg0off)
+ )
+
+DECLARE_EVENT_CLASS(iptfs_ingress_preq_event,
+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs,
+ u32 pmtu, u8 was_gso),
+ TP_ARGS(skb, xtfs, pmtu, was_gso),
+ TP_STRUCT__entry(__field(struct sk_buff *, skb)
+ __field(u32, skb_len)
+ __field(u32, data_len)
+ __field(u32, pmtu)
+ __field(u32, queue_size)
+ __field(u32, proto_seq)
+ __field(u8, proto)
+ __field(u8, was_gso)
+ ),
+ TP_fast_assign(__entry->skb = skb;
+ __entry->skb_len = skb->len;
+ __entry->data_len = skb->data_len;
+ __entry->queue_size =
+ xtfs->cfg.max_queue_size - xtfs->queue_size;
+ __entry->proto = __trace_ip_proto(ip_hdr(skb));
+ __entry->proto_seq = __trace_ip_proto_seq(ip_hdr(skb));
+ __entry->pmtu = pmtu;
+ __entry->was_gso = was_gso;
+ ),
+ TP_printk("INGRPREQ: skb=%p len=%u data_len=%u qsize=%u proto=%u proto_seq=%u pmtu=%u was_gso=%u",
+ __entry->skb, __entry->skb_len, __entry->data_len,
+ __entry->queue_size, __entry->proto, __entry->proto_seq,
+ __entry->pmtu, __entry->was_gso));
+
+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_enqueue,
+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
+ TP_ARGS(skb, xtfs, pmtu, was_gso));
+
+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_no_queue_space,
+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
+ TP_ARGS(skb, xtfs, pmtu, was_gso));
+
+DEFINE_EVENT(iptfs_ingress_preq_event, iptfs_too_big,
+ TP_PROTO(struct sk_buff *skb, struct xfrm_iptfs_data *xtfs, u32 pmtu, u8 was_gso),
+ TP_ARGS(skb, xtfs, pmtu, was_gso));
+
+DECLARE_EVENT_CLASS(iptfs_ingress_postq_event,
+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff, struct iphdr *iph),
+ TP_ARGS(skb, mtu, blkoff, iph),
+ TP_STRUCT__entry(__field(struct sk_buff *, skb)
+ __field(u32, skb_len)
+ __field(u32, data_len)
+ __field(u32, mtu)
+ __field(u32, proto_seq)
+ __field(u16, blkoff)
+ __field(u8, proto)),
+ TP_fast_assign(__entry->skb = skb;
+ __entry->skb_len = skb->len;
+ __entry->data_len = skb->data_len;
+ __entry->mtu = mtu;
+ __entry->blkoff = blkoff;
+ __entry->proto = iph ? __trace_ip_proto(iph) : 0;
+ __entry->proto_seq = iph ? __trace_ip_proto_seq(iph) : 0;
+ ),
+ TP_printk("INGRPSTQ: skb=%p len=%u data_len=%u mtu=%u blkoff=%u proto=%u proto_seq=%u",
+ __entry->skb, __entry->skb_len, __entry->data_len, __entry->mtu,
+ __entry->blkoff, __entry->proto, __entry->proto_seq));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_dequeue,
+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+ struct iphdr *iph),
+ TP_ARGS(skb, mtu, blkoff, iph));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_fragmenting,
+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+ struct iphdr *iph),
+ TP_ARGS(skb, mtu, blkoff, iph));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_final_fragment,
+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+ struct iphdr *iph),
+ TP_ARGS(skb, mtu, blkoff, iph));
+
+DEFINE_EVENT(iptfs_ingress_postq_event, iptfs_first_toobig,
+ TP_PROTO(struct sk_buff *skb, u32 mtu, u16 blkoff,
+ struct iphdr *iph),
+ TP_ARGS(skb, mtu, blkoff, iph));
+
+TRACE_EVENT(iptfs_ingress_nth_peek,
+ TP_PROTO(struct sk_buff *skb, u32 remaining),
+ TP_ARGS(skb, remaining),
+ TP_STRUCT__entry(__field(struct sk_buff *, skb)
+ __field(u32, skb_len)
+ __field(u32, remaining)),
+ TP_fast_assign(__entry->skb = skb;
+ __entry->skb_len = skb->len;
+ __entry->remaining = remaining;
+ ),
+ TP_printk("INGRPSTQ: NTHPEEK: skb=%p len=%u remaining=%u",
+ __entry->skb, __entry->skb_len, __entry->remaining));
+
+TRACE_EVENT(iptfs_ingress_nth_add, TP_PROTO(struct sk_buff *skb, u8 share_ok),
+ TP_ARGS(skb, share_ok),
+ TP_STRUCT__entry(__field(struct sk_buff *, skb)
+ __field(u32, skb_len)
+ __field(u32, data_len)
+ __field(u8, share_ok)
+ __field(u8, head_frag)
+ __field(u8, pp_recycle)
+ __field(u8, cloned)
+ __field(u8, shared)
+ __field(u8, nr_frags)
+ __field(u8, frag_list)
+ ),
+ TP_fast_assign(__entry->skb = skb;
+ __entry->skb_len = skb->len;
+ __entry->data_len = skb->data_len;
+ __entry->share_ok = share_ok;
+ __entry->head_frag = skb->head_frag;
+ __entry->pp_recycle = skb->pp_recycle;
+ __entry->cloned = skb_cloned(skb);
+ __entry->shared = skb_shared(skb);
+ __entry->nr_frags = skb_shinfo(skb)->nr_frags;
+ __entry->frag_list = (bool)skb_shinfo(skb)->frag_list;
+ ),
+ TP_printk("INGRPSTQ: NTHADD: skb=%p len=%u data_len=%u share_ok=%u head_frag=%u pp_recycle=%u cloned=%u shared=%u nr_frags=%u frag_list=%u",
+ __entry->skb, __entry->skb_len, __entry->data_len, __entry->share_ok,
+ __entry->head_frag, __entry->pp_recycle, __entry->cloned, __entry->shared,
+ __entry->nr_frags, __entry->frag_list));
+
+DECLARE_EVENT_CLASS(iptfs_timer_event,
+ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
+ TP_ARGS(xtfs, time_val),
+ TP_STRUCT__entry(__field(u64, time_val)
+ __field(u64, set_time)),
+ TP_fast_assign(__entry->time_val = time_val;
+ __entry->set_time = xtfs->iptfs_settime;
+ ),
+ TP_printk("TIMER: set_time=%llu time_val=%llu",
+ __entry->set_time, __entry->time_val));
+
+DEFINE_EVENT(iptfs_timer_event, iptfs_timer_start,
+ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
+ TP_ARGS(xtfs, time_val));
+
+DEFINE_EVENT(iptfs_timer_event, iptfs_timer_expire,
+ TP_PROTO(struct xfrm_iptfs_data *xtfs, u64 time_val),
+ TP_ARGS(xtfs, time_val));
+
+#endif /* _TRACE_IPTFS_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../net/xfrm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace_iptfs
+#include <trace/define_trace.h>
diff --git a/net/xfrm/xfrm_compat.c b/net/xfrm/xfrm_compat.c
index 5b9ee63e30b6..b8d2e6930041 100644
--- a/net/xfrm/xfrm_compat.c
+++ b/net/xfrm/xfrm_compat.c
@@ -284,9 +284,15 @@ static int xfrm_xlate64_attr(struct sk_buff *dst, const struct nlattr *src)
case XFRMA_SA_DIR:
case XFRMA_NAT_KEEPALIVE_INTERVAL:
case XFRMA_SA_PCPU:
+ case XFRMA_IPTFS_DROP_TIME:
+ case XFRMA_IPTFS_REORDER_WINDOW:
+ case XFRMA_IPTFS_DONT_FRAG:
+ case XFRMA_IPTFS_INIT_DELAY:
+ case XFRMA_IPTFS_MAX_QSIZE:
+ case XFRMA_IPTFS_PKT_SIZE:
return xfrm_nla_cpy(dst, src, nla_len(src));
default:
- BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
pr_warn_once("unsupported nla_type %d\n", src->nla_type);
return -EOPNOTSUPP;
}
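/* Illustrative note (not part of the patch): keeping the BUILD_BUG_ON in
 * step with the newest attribute (now XFRMA_IPTFS_PKT_SIZE) makes the
 * build fail whenever someone adds an XFRMA_* value without also teaching
 * this 32/64-bit compat translation about it.
 */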
@@ -441,7 +447,7 @@ static int xfrm_xlate32_attr(void *dst, const struct nlattr *nla,
int err;
if (type > XFRMA_MAX) {
- BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU);
+ BUILD_BUG_ON(XFRMA_MAX != XFRMA_IPTFS_PKT_SIZE);
NL_SET_ERR_MSG(extack, "Bad attribute");
return -EOPNOTSUPP;
}
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index b33c4591e09a..97c8030cc417 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -42,7 +42,8 @@ static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb,
skb->transport_header = skb->network_header + hsize;
skb_reset_mac_len(skb);
- pskb_pull(skb, skb->mac_len + x->props.header_len);
+ pskb_pull(skb,
+ skb->mac_len + x->props.header_len - x->props.enc_hdr_len);
}
static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
@@ -68,6 +69,7 @@ static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb,
static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb)
{
switch (x->outer_mode.encap) {
+ case XFRM_MODE_IPTFS:
case XFRM_MODE_TUNNEL:
if (x->outer_mode.family == AF_INET)
return __xfrm_mode_tunnel_prep(x, skb,
@@ -242,11 +244,6 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
xfrm_address_t *daddr;
bool is_packet_offload;
- if (!x->type_offload) {
- NL_SET_ERR_MSG(extack, "Type doesn't support offload");
- return -EINVAL;
- }
-
if (xuo->flags &
~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND | XFRM_OFFLOAD_PACKET)) {
NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request");
@@ -308,6 +305,13 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
return -EINVAL;
}
+ xfrm_set_type_offload(x);
+ if (!x->type_offload) {
+ NL_SET_ERR_MSG(extack, "Type doesn't support offload");
+ dev_put(dev);
+ return -EINVAL;
+ }
+
xso->dev = dev;
netdev_tracker_alloc(dev, &xso->dev_tracker, GFP_ATOMIC);
xso->real_dev = dev;
@@ -330,6 +334,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
netdev_put(dev, &xso->dev_tracker);
xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED;
+ xfrm_unset_type_offload(x);
/* User explicitly requested packet offload mode and configured
* policy in addition to the XFRM state. So be civil to users,
* and return an error instead of taking fallback path.
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 841a60a6fbfe..7e6a71b9d6a3 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -48,7 +48,7 @@ static DEFINE_SPINLOCK(xfrm_input_afinfo_lock);
static struct xfrm_input_afinfo const __rcu *xfrm_input_afinfo[2][AF_INET6 + 1];
static struct gro_cells gro_cells;
-static struct net_device xfrm_napi_dev;
+static struct net_device *xfrm_napi_dev;
static DEFINE_PER_CPU(struct xfrm_trans_tasklet, xfrm_trans_tasklet);
@@ -446,6 +446,9 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
WARN_ON_ONCE(1);
break;
default:
+ if (x->mode_cbs && x->mode_cbs->input)
+ return x->mode_cbs->input(x, skb);
+
WARN_ON_ONCE(1);
break;
}
@@ -453,6 +456,10 @@ static int xfrm_inner_mode_input(struct xfrm_state *x,
return -EOPNOTSUPP;
}
+/* NOTE: encap_type - In addition to the normal (non-negative) values for
+ * encap_type, a negative value of -1 or -2 can be used to resume/restart this
+ * function after a previous invocation terminated early for async operation.
+ */
int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
{
const struct xfrm_state_afinfo *afinfo;
@@ -489,6 +496,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
family = x->props.family;
+ /* An encap_type of -2 indicates reconstructed inner packet */
+ if (encap_type == -2)
+ goto resume_decapped;
+
/* An encap_type of -1 indicates async resumption. */
if (encap_type == -1) {
async = 1;
@@ -679,11 +690,14 @@ resume:
XFRM_MODE_SKB_CB(skb)->protocol = nexthdr;
- if (xfrm_inner_mode_input(x, skb)) {
+ err = xfrm_inner_mode_input(x, skb);
+ if (err == -EINPROGRESS)
+ return 0;
+ else if (err) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR);
goto drop;
}
-
+resume_decapped:
if (x->outer_mode.flags & XFRM_MODE_FLAG_TUNNEL) {
decaps = 1;
break;
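/* Illustrative note (not part of the patch): a mode callback (such as the
 * IP-TFS input handler added later in this series) can return -EINPROGRESS
 * from xfrm_inner_mode_input() to keep the skb and finish reassembly
 * asynchronously; it then re-enters xfrm_input() with encap_type == -2,
 * which jumps to resume_decapped to continue with the reconstructed inner
 * packet.
 */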
@@ -811,8 +825,11 @@ void __init xfrm_input_init(void)
int err;
int i;
- init_dummy_netdev(&xfrm_napi_dev);
- err = gro_cells_init(&gro_cells, &xfrm_napi_dev);
+ xfrm_napi_dev = alloc_netdev_dummy(0);
+ if (!xfrm_napi_dev)
+ panic("Failed to allocate XFRM dummy netdev\n");
+
+ err = gro_cells_init(&gro_cells, xfrm_napi_dev);
if (err)
gro_cells.cells = NULL;
diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c
index 98f1e2b67c76..c397eb99d867 100644
--- a/net/xfrm/xfrm_interface_core.c
+++ b/net/xfrm/xfrm_interface_core.c
@@ -506,7 +506,7 @@ xmit:
skb_dst_set(skb, dst);
skb->dev = tdev;
- err = dst_output(xi->net, skb->sk, skb);
+ err = dst_output(xi->net, skb_to_full_sk(skb), skb);
if (net_xmit_eval(err) == 0) {
dev_sw_netstats_tx_add(dev, 1, length);
} else {
diff --git a/net/xfrm/xfrm_iptfs.c b/net/xfrm/xfrm_iptfs.c
new file mode 100644
index 000000000000..755f1eea8bfa
--- /dev/null
+++ b/net/xfrm/xfrm_iptfs.c
@@ -0,0 +1,2764 @@
+// SPDX-License-Identifier: GPL-2.0
+/* xfrm_iptfs: IPTFS encapsulation support
+ *
+ * April 21 2022, Christian Hopps <chopps@labn.net>
+ *
+ * Copyright (c) 2022, LabN Consulting, L.L.C.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/icmpv6.h>
+#include <linux/skbuff_ref.h>
+#include <net/gro.h>
+#include <net/icmp.h>
+#include <net/ip6_route.h>
+#include <net/inet_ecn.h>
+#include <net/xfrm.h>
+
+#include <crypto/aead.h>
+
+#include "xfrm_inout.h"
+#include "trace_iptfs.h"
+
+/* IPTFS encap (header) values. */
+#define IPTFS_SUBTYPE_BASIC 0
+#define IPTFS_SUBTYPE_CC 1
+
+/* ----------------------------------------------- */
+/* IP-TFS default SA values (tunnel egress/dir-in) */
+/* ----------------------------------------------- */
+
+/**
+ * define IPTFS_DEFAULT_DROP_TIME_USECS - default drop time
+ *
+ * The default IPTFS drop time in microseconds. The drop time is the amount of
+ * time before a missing out-of-order IPTFS tunnel packet is considered lost.
+ * See also the reorder window.
+ *
+ * Default 1s.
+ */
+#define IPTFS_DEFAULT_DROP_TIME_USECS 1000000
+
+/**
+ * define IPTFS_DEFAULT_REORDER_WINDOW - default reorder window size
+ *
+ * The default IPTFS reorder window size. The reorder window size dictates the
+ * maximum number of IPTFS tunnel packets in a sequence that may arrive out of
+ * order.
+ *
+ * Default 3. (tcp folks suggested)
+ */
+#define IPTFS_DEFAULT_REORDER_WINDOW 3
+
+/* ------------------------------------------------ */
+/* IPTFS default SA values (tunnel ingress/dir-out) */
+/* ------------------------------------------------ */
+
+/**
+ * define IPTFS_DEFAULT_INIT_DELAY_USECS - default initial output delay
+ *
+ * The initial output delay is the amount of time prior to servicing the output
+ * queue after queueing the first packet on said queue. This applies anytime the
+ * output queue was previously empty.
+ *
+ * Default 0.
+ */
+#define IPTFS_DEFAULT_INIT_DELAY_USECS 0
+
+/**
+ * define IPTFS_DEFAULT_MAX_QUEUE_SIZE - default max output queue size.
+ *
+ * The default IPTFS max output queue size in octets. The output queue is where
+ * received packets destined for output over an IPTFS tunnel are stored prior to
+ * being output in aggregated/fragmented form over the IPTFS tunnel.
+ *
+ * Default 1M.
+ */
+#define IPTFS_DEFAULT_MAX_QUEUE_SIZE (1024 * 10240)
+
+/* Assumed: skb->head is cache aligned.
+ *
+ * L2 Header resv: Arrange for cacheline to start at skb->data - 16 to keep the
+ * to-be-pushed L2 header in the same cacheline as resulting `skb->data` (i.e.,
+ * the L3 header). If cacheline size is > 64 then skb->data + pushed L2 will all
+ * be in a single cacheline if we simply reserve 64 bytes.
+ *
+ * L3 Header resv: For L3+L2 headers (i.e., skb->data points at the IPTFS payload)
+ * we want `skb->data` to be cacheline aligned and all pushed L2L3 headers will
+ * be in their own cacheline[s]. 128 works for cachelines up to 128 bytes; for
+ * any larger cacheline sizes the pushed headers will simply share the cacheline
+ * with the start of the IPTFS payload (skb->data).
+ */
+#define XFRM_IPTFS_MIN_L3HEADROOM 128
+#define XFRM_IPTFS_MIN_L2HEADROOM (L1_CACHE_BYTES > 64 ? 64 : 64 + 16)
+
+/* Min to try to share outer iptfs skb data vs copying into new skb */
+#define IPTFS_PKT_SHARE_MIN 129
+
+#define NSECS_IN_USEC 1000
+
+#define IPTFS_HRTIMER_MODE HRTIMER_MODE_REL_SOFT
+
+/**
+ * struct xfrm_iptfs_config - configuration for the IPTFS tunnel.
+ * @pkt_size: size of the outer IP packet. 0 to use interface and MTU discovery,
+ * otherwise the user specified value.
+ * @max_queue_size: The maximum number of octets allowed to be queued to be sent
+ * over the IPTFS SA. The queue size is measured as the size of all the
+ * packets enqueued.
+ * @reorder_win_size: the number of slots in the reorder window, thus the number of
+ * packets that may arrive out of order.
+ * @dont_frag: true to inhibit fragmenting across IPTFS outer packets.
+ */
+struct xfrm_iptfs_config {
+ u32 pkt_size; /* outer_packet_size or 0 */
+ u32 max_queue_size; /* octets */
+ u16 reorder_win_size;
+ u8 dont_frag : 1;
+};
+
+struct skb_wseq {
+ struct sk_buff *skb;
+ u64 drop_time;
+};
+
+/**
+ * struct xfrm_iptfs_data - mode specific xfrm state.
+ * @cfg: IPTFS tunnel config.
+ * @x: owning SA (xfrm_state).
+ * @queue: queued user packets to send.
+ * @queue_size: number of octets on queue (sum of packet sizes).
+ * @ecn_queue_size: octets of queued data above which ECN CE is marked.
+ * @init_delay_ns: nanoseconds to wait to send initial IPTFS packet.
+ * @iptfs_timer: output timer.
+ * @iptfs_settime: time the output timer was set.
+ * @payload_mtu: max payload size.
+ * @w_seq_set: true after first seq received.
+ * @w_wantseq: waiting for this seq number as next to process (in order).
+ * @w_saved: the saved buf array (reorder window).
+ * @w_savedlen: the saved len (not size).
+ * @drop_lock: lock to protect reorder queue.
+ * @drop_timer: timer for considering next packet lost.
+ * @drop_time_ns: timer interval in nanoseconds.
+ * @ra_newskb: new pkt being reassembled.
+ * @ra_wantseq: expected next sequence for reassembly.
+ * @ra_runt: last pkt bytes from very end of last skb.
+ * @ra_runtlen: size of ra_runt.
+ */
+struct xfrm_iptfs_data {
+ struct xfrm_iptfs_config cfg;
+
+ /* Ingress User Input */
+ struct xfrm_state *x; /* owning state */
+ struct sk_buff_head queue; /* output queue */
+
+ u32 queue_size; /* octets */
+ u32 ecn_queue_size; /* octets above which ECN mark */
+ u64 init_delay_ns; /* nanoseconds */
+ struct hrtimer iptfs_timer; /* output timer */
+ time64_t iptfs_settime; /* time timer was set */
+ u32 payload_mtu; /* max payload size */
+
+ /* Tunnel input reordering */
+ bool w_seq_set; /* true after first seq received */
+ u64 w_wantseq; /* expected next sequence */
+ struct skb_wseq *w_saved; /* the saved buf array */
+ u32 w_savedlen; /* the saved len (not size) */
+ spinlock_t drop_lock;
+ struct hrtimer drop_timer;
+ u64 drop_time_ns;
+
+ /* Tunnel input reassembly */
+ struct sk_buff *ra_newskb; /* new pkt being reassembled */
+ u64 ra_wantseq; /* expected next sequence */
+ u8 ra_runt[6]; /* last pkt bytes from last skb */
+ u8 ra_runtlen; /* count of ra_runt */
+};
+
+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu);
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me);
+static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me);
+
+/* ================= */
+/* Utility Functions */
+/* ================= */
+
+#ifdef TRACEPOINTS_ENABLED
+static u32 __trace_ip_proto(struct iphdr *iph)
+{
+ if (iph->version == 4)
+ return iph->protocol;
+ return ((struct ipv6hdr *)iph)->nexthdr;
+}
+
+static u32 __trace_ip_proto_seq(struct iphdr *iph)
+{
+ void *nexthdr;
+ u32 protocol = 0;
+
+ if (iph->version == 4) {
+ nexthdr = (void *)(iph + 1);
+ protocol = iph->protocol;
+ } else if (iph->version == 6) {
+ nexthdr = (void *)(((struct ipv6hdr *)(iph)) + 1);
+ protocol = ((struct ipv6hdr *)(iph))->nexthdr;
+ }
+ switch (protocol) {
+ case IPPROTO_ICMP:
+ return ntohs(((struct icmphdr *)nexthdr)->un.echo.sequence);
+ case IPPROTO_ICMPV6:
+ return ntohs(((struct icmp6hdr *)nexthdr)->icmp6_sequence);
+ case IPPROTO_TCP:
+ return ntohl(((struct tcphdr *)nexthdr)->seq);
+ case IPPROTO_UDP:
+ return ntohs(((struct udphdr *)nexthdr)->source);
+ default:
+ return 0;
+ }
+}
+#endif /*TRACEPOINTS_ENABLED*/
+
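+/* Build the 64-bit ESP sequence number from the low and high 32-bit values
+ * saved in the skb's XFRM control block.
+ */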
+static u64 __esp_seq(struct sk_buff *skb)
+{
+ u64 seq = ntohl(XFRM_SKB_CB(skb)->seq.input.low);
+
+ return seq | (u64)ntohl(XFRM_SKB_CB(skb)->seq.input.hi) << 32;
+}
+
+/* ======================= */
+/* IPTFS SK_BUFF Functions */
+/* ======================= */
+
+/**
+ * iptfs_alloc_skb() - Allocate a new `skb`.
+ * @tpl: the skb to copy required meta-data from.
+ * @len: the linear length of the head data, zero is fine.
+ * @l3resv: true if skb reserve needs to support pushing L3 headers
+ *
+ * A new `skb` is allocated and required meta-data is copied from `tpl`, the
+ * head data is sized to `len` + reserved space set according to the @l3resv
+ * boolean.
+ *
+ * When @l3resv is false, resv is XFRM_IPTFS_MIN_L2HEADROOM, which arranges for
+ * `skb->data - 16`, a good guess for cache alignment (placing the
+ * to-be-pushed L2 header at the start of a cacheline).
+ *
+ * Otherwise, @l3resv is true and resv is set to the correct reserved space for
+ * dst->dev plus the calculated L3 overhead for the xfrm dst or
+ * XFRM_IPTFS_MIN_L3HEADROOM, whichever is larger. This is then cache aligned so
+ * that all the headers will commonly fall in a cacheline when possible.
+ *
+ * l3resv=true is used on tunnel ingress (tx), because we need to reserve for
+ * the new IPTFS packet (i.e., L2+L3 headers). On tunnel egress (rx) the data
+ * being copied into the skb includes the user L3 headers already so we only
+ * need to reserve for L2.
+ *
+ * Return: the new skb or NULL.
+ */
+static struct sk_buff *iptfs_alloc_skb(struct sk_buff *tpl, u32 len, bool l3resv)
+{
+ struct sk_buff *skb;
+ u32 resv;
+
+ if (!l3resv) {
+ resv = XFRM_IPTFS_MIN_L2HEADROOM;
+ } else {
+ struct dst_entry *dst = skb_dst(tpl);
+
+ resv = LL_RESERVED_SPACE(dst->dev) + dst->header_len;
+ resv = max(resv, XFRM_IPTFS_MIN_L3HEADROOM);
+ resv = L1_CACHE_ALIGN(resv);
+ }
+
+ skb = alloc_skb(len + resv, GFP_ATOMIC | __GFP_NOWARN);
+ if (!skb)
+ return NULL;
+
+ skb_reserve(skb, resv);
+
+ if (!l3resv) {
+ /* xfrm_input resume needs dev and xfrm ext from tunnel pkt */
+ skb->dev = tpl->dev;
+ __skb_ext_copy(skb, tpl);
+ }
+
+ /* dropped by xfrm_input, used by xfrm_output */
+ skb_dst_copy(skb, tpl);
+
+ return skb;
+}
+
+/**
+ * iptfs_skb_head_to_frag() - initialize a skb_frag_t based on skb head data
+ * @skb: skb with the head data
+ * @frag: frag to initialize
+ */
+static void iptfs_skb_head_to_frag(const struct sk_buff *skb, skb_frag_t *frag)
+{
+ struct page *page = virt_to_head_page(skb->data);
+ unsigned char *addr = (unsigned char *)page_address(page);
+
+ skb_frag_fill_page_desc(frag, page, skb->data - addr, skb_headlen(skb));
+}
+
+/**
+ * struct iptfs_skb_frag_walk - use to track a walk through fragments
+ * @fragi: current fragment index
+ * @past: length of data in fragments before @fragi
+ * @total: length of data in all fragments
+ * @nr_frags: number of fragments present in array
+ * @initial_offset: the value passed in to skb_prepare_frag_walk()
+ * @frags: the page fragments inc. room for head page
+ * @pp_recycle: copy of skb->pp_recycle
+ */
+struct iptfs_skb_frag_walk {
+ u32 fragi;
+ u32 past;
+ u32 total;
+ u32 nr_frags;
+ u32 initial_offset;
+ skb_frag_t frags[MAX_SKB_FRAGS + 1];
+ bool pp_recycle;
+};
+
+/**
+ * iptfs_skb_prepare_frag_walk() - initialize a frag walk over an skb.
+ * @skb: the skb to walk.
+ * @initial_offset: start the walk @initial_offset into the skb.
+ * @walk: the walk to initialize
+ *
+ * Future calls to iptfs_skb_add_frags() will expect the @offset value to be
+ * at least @initial_offset.
+ */
+static void iptfs_skb_prepare_frag_walk(struct sk_buff *skb, u32 initial_offset,
+ struct iptfs_skb_frag_walk *walk)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ skb_frag_t *frag, *from;
+ u32 i;
+
+ walk->initial_offset = initial_offset;
+ walk->fragi = 0;
+ walk->past = 0;
+ walk->total = 0;
+ walk->nr_frags = 0;
+ walk->pp_recycle = skb->pp_recycle;
+
+ if (skb->head_frag) {
+ if (initial_offset >= skb_headlen(skb)) {
+ initial_offset -= skb_headlen(skb);
+ } else {
+ frag = &walk->frags[walk->nr_frags++];
+ iptfs_skb_head_to_frag(skb, frag);
+ frag->offset += initial_offset;
+ frag->len -= initial_offset;
+ walk->total += frag->len;
+ initial_offset = 0;
+ }
+ } else {
+ initial_offset -= skb_headlen(skb);
+ }
+
+ for (i = 0; i < shinfo->nr_frags; i++) {
+ from = &shinfo->frags[i];
+ if (initial_offset >= from->len) {
+ initial_offset -= from->len;
+ continue;
+ }
+ frag = &walk->frags[walk->nr_frags++];
+ *frag = *from;
+ if (initial_offset) {
+ frag->offset += initial_offset;
+ frag->len -= initial_offset;
+ initial_offset = 0;
+ }
+ walk->total += frag->len;
+ }
+}
+
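+/* Reposition @walk so that @offset (relative to the original skb) falls within
+ * the current fragment, and return the offset within that fragment.
+ */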
+static u32 iptfs_skb_reset_frag_walk(struct iptfs_skb_frag_walk *walk,
+ u32 offset)
+{
+ /* Adjust offset to refer to internal walk values */
+ offset -= walk->initial_offset;
+
+ /* Get to the correct fragment for offset */
+ while (offset < walk->past) {
+ walk->past -= walk->frags[--walk->fragi].len;
+ if (offset >= walk->past)
+ break;
+ }
+ while (offset >= walk->past + walk->frags[walk->fragi].len)
+ walk->past += walk->frags[walk->fragi++].len;
+
+ /* offset now relative to this current frag */
+ offset -= walk->past;
+ return offset;
+}
+
+/**
+ * iptfs_skb_can_add_frags() - check if ok to add frags from walk to skb
+ * @skb: skb to check for adding frags to
+ * @walk: the walk that will be used as source for frags.
+ * @offset: offset from beginning of original skb to start from.
+ * @len: amount of data to add frag references to in @skb.
+ *
+ * Return: true if ok to add frags.
+ */
+static bool iptfs_skb_can_add_frags(const struct sk_buff *skb,
+ struct iptfs_skb_frag_walk *walk,
+ u32 offset, u32 len)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u32 fragi, nr_frags, fraglen;
+
+ if (skb_has_frag_list(skb) || skb->pp_recycle != walk->pp_recycle)
+ return false;
+
+ /* Make offset relative to current frag after setting that */
+ offset = iptfs_skb_reset_frag_walk(walk, offset);
+
+ /* Verify we have array space for the fragments we need to add */
+ fragi = walk->fragi;
+ nr_frags = shinfo->nr_frags;
+ while (len && fragi < walk->nr_frags) {
+ skb_frag_t *frag = &walk->frags[fragi];
+
+ fraglen = frag->len;
+ if (offset) {
+ fraglen -= offset;
+ offset = 0;
+ }
+ if (++nr_frags > MAX_SKB_FRAGS)
+ return false;
+ if (len <= fraglen)
+ return true;
+ len -= fraglen;
+ fragi++;
+ }
+ /* We may not copy all @len but what we have will fit. */
+ return true;
+}
+
+/**
+ * iptfs_skb_add_frags() - add a range of fragment references into an skb
+ * @skb: skb to add references into
+ * @walk: the walk to add referenced fragments from.
+ * @offset: offset from beginning of original skb to start from.
+ * @len: amount of data to add frag references to in @skb.
+ *
+ * iptfs_skb_can_add_frags() should be called before this function to verify
+ * that the destination @skb is compatible with the walk and has space in the
+ * array for the to be added frag references.
+ *
+ * Return: The number of bytes not added to @skb b/c we reached the end of the
+ * walk before adding all of @len.
+ */
+static int iptfs_skb_add_frags(struct sk_buff *skb,
+ struct iptfs_skb_frag_walk *walk, u32 offset,
+ u32 len)
+{
+ struct skb_shared_info *shinfo = skb_shinfo(skb);
+ u32 fraglen;
+
+ if (!walk->nr_frags || offset >= walk->total + walk->initial_offset)
+ return len;
+
+ /* make offset relative to current frag after setting that */
+ offset = iptfs_skb_reset_frag_walk(walk, offset);
+
+ while (len && walk->fragi < walk->nr_frags) {
+ skb_frag_t *frag = &walk->frags[walk->fragi];
+ skb_frag_t *tofrag = &shinfo->frags[shinfo->nr_frags];
+
+ *tofrag = *frag;
+ if (offset) {
+ tofrag->offset += offset;
+ tofrag->len -= offset;
+ offset = 0;
+ }
+ __skb_frag_ref(tofrag);
+ shinfo->nr_frags++;
+
+ /* see if we are done */
+ fraglen = tofrag->len;
+ if (len < fraglen) {
+ tofrag->len = len;
+ skb->len += len;
+ skb->data_len += len;
+ return 0;
+ }
+ /* advance to next source fragment */
+ len -= fraglen; /* careful, use dst bv_len */
+ skb->len += fraglen; /* careful, " " " */
+ skb->data_len += fraglen; /* careful, " " " */
+ walk->past += frag->len; /* careful, use src bv_len */
+ walk->fragi++;
+ }
+ return len;
+}
+
+/* ================================== */
+/* IPTFS Trace Event Definitions */
+/* ================================== */
+
+#define CREATE_TRACE_POINTS
+#include "trace_iptfs.h"
+
+/* ================================== */
+/* IPTFS Receiving (egress) Functions */
+/* ================================== */
+
+/**
+ * iptfs_pskb_add_frags() - Create and add frags into a new sk_buff.
+ * @tpl: template to create new skb from.
+ * @walk: The source for fragments to add.
+ * @off: The offset into @walk to add frags from, also used with @st and
+ * @copy_len.
+ * @len: The length of data to add covering frags from @walk into the new
+ *	skb. This must be >= @copy_len.
+ * @st: The sequence state to copy from into the new head skb.
+ * @copy_len: Copy @copy_len bytes from @st at offset @off into the new skb
+ * linear space.
+ *
+ * Create a new sk_buff `skb` using the template @tpl. Copy @copy_len bytes from
+ * @st into the new skb linear space, and then add shared fragments from the
+ * frag walk for the remaining @len of data (i.e., @len - @copy_len bytes).
+ *
+ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs.
+ */
+static struct sk_buff *
+iptfs_pskb_add_frags(struct sk_buff *tpl, struct iptfs_skb_frag_walk *walk,
+ u32 off, u32 len, struct skb_seq_state *st, u32 copy_len)
+{
+ struct sk_buff *skb;
+
+ skb = iptfs_alloc_skb(tpl, copy_len, false);
+ if (!skb)
+ return NULL;
+
+ /* this should not normally be happening */
+ if (!iptfs_skb_can_add_frags(skb, walk, off + copy_len,
+ len - copy_len)) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ if (copy_len &&
+ skb_copy_seq_read(st, off, skb_put(skb, copy_len), copy_len)) {
+ XFRM_INC_STATS(dev_net(st->root_skb->dev),
+ LINUX_MIB_XFRMINERROR);
+ kfree_skb(skb);
+ return NULL;
+ }
+
+ iptfs_skb_add_frags(skb, walk, off + copy_len, len - copy_len);
+ return skb;
+}
+
+/**
+ * iptfs_pskb_extract_seq() - Create and load data into a new sk_buff.
+ * @skblen: the total data size for `skb`.
+ * @st: The source for the rest of the data to copy into `skb`.
+ * @off: The offset into @st to copy data from.
+ * @len: The length of data to copy from @st into `skb`. This must be <=
+ * @skblen.
+ *
+ * Create a new sk_buff `skb` with @skblen of packet data space. Then, using
+ * seq functions, copy @len bytes from @st into `skb` starting from @off.
+ *
+ * It is an error for @len to be greater than the amount of data left in @st.
+ *
+ * Return: The newly allocated sk_buff `skb` or NULL if an error occurs.
+ */
+static struct sk_buff *
+iptfs_pskb_extract_seq(u32 skblen, struct skb_seq_state *st, u32 off, int len)
+{
+ struct sk_buff *skb = iptfs_alloc_skb(st->root_skb, skblen, false);
+
+ if (!skb)
+ return NULL;
+ if (skb_copy_seq_read(st, off, skb_put(skb, len), len)) {
+ XFRM_INC_STATS(dev_net(st->root_skb->dev), LINUX_MIB_XFRMINERROR);
+ kfree_skb(skb);
+ return NULL;
+ }
+ return skb;
+}
+
+/**
+ * iptfs_input_save_runt() - save data in xtfs runt space.
+ * @xtfs: xtfs state
+ * @seq: the current sequence
+ * @buf: packet data
+ * @len: length of packet data
+ *
+ * Save the small (`len`) start of a fragmented packet in `buf` in the xtfs data
+ * runt space.
+ */
+static void iptfs_input_save_runt(struct xfrm_iptfs_data *xtfs, u64 seq,
+ u8 *buf, int len)
+{
+ memcpy(xtfs->ra_runt, buf, len);
+
+ xtfs->ra_runtlen = len;
+ xtfs->ra_wantseq = seq + 1;
+}
+
+/**
+ * __iptfs_iphlen() - return the v4/v6 header length using packet data.
+ * @data: pointer at octet with version nibble
+ *
+ * The version data has been checked to be valid (i.e., either 4 or 6).
+ *
+ * Return: the IP header size based on the IP version.
+ */
+static u32 __iptfs_iphlen(u8 *data)
+{
+ struct iphdr *iph = (struct iphdr *)data;
+
+ if (iph->version == 0x4)
+ return sizeof(*iph);
+ return sizeof(struct ipv6hdr);
+}
+
+/**
+ * __iptfs_iplen() - return the v4/v6 length using packet data.
+ * @data: pointer to ip (v4/v6) packet header
+ *
+ * Grab the IPv4 or IPv6 length value in the start of the inner packet header
+ * pointed to by `data`. Assumes data len is enough for the length field only.
+ *
+ * The version data has been checked to be valid (i.e., either 4 or 6).
+ *
+ * Return: the length value.
+ */
+static u32 __iptfs_iplen(u8 *data)
+{
+ struct iphdr *iph = (struct iphdr *)data;
+
+ if (iph->version == 0x4)
+ return ntohs(iph->tot_len);
+ return ntohs(((struct ipv6hdr *)iph)->payload_len) +
+ sizeof(struct ipv6hdr);
+}
+
+/**
+ * iptfs_complete_inner_skb() - finish preparing the inner packet for gro recv.
+ * @x: xfrm state
+ * @skb: the inner packet
+ *
+ * Finish the standard xfrm processing on the inner packet prior to sending back
+ * through gro_cells_receive. We do this separately b/c we are building a list
+ * of packets in the hopes that one day a list will be taken by
+ * xfrm_input.
+ */
+static void iptfs_complete_inner_skb(struct xfrm_state *x, struct sk_buff *skb)
+{
+ skb_reset_network_header(skb);
+
+ /* The packet is going back through gro_cells_receive no need to
+ * set this.
+ */
+ skb_reset_transport_header(skb);
+
+ /* Packet already has checksum value set. */
+ skb->ip_summed = CHECKSUM_NONE;
+
+	/* Our skb will contain the header data copied from the outer packet
+	 * which contained the start of this inner packet. This is true
+ * when we allocate a new skb as well as when we reuse the existing skb.
+ */
+ if (ip_hdr(skb)->version == 0x4) {
+ struct iphdr *iph = ip_hdr(skb);
+
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ ipv4_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
+ if (!(x->props.flags & XFRM_STATE_NOECN))
+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+ IP_ECN_set_ce(iph);
+
+ skb->protocol = htons(ETH_P_IP);
+ } else {
+ struct ipv6hdr *iph = ipv6_hdr(skb);
+
+ if (x->props.flags & XFRM_STATE_DECAP_DSCP)
+ ipv6_copy_dscp(XFRM_MODE_SKB_CB(skb)->tos, iph);
+ if (!(x->props.flags & XFRM_STATE_NOECN))
+ if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
+ IP6_ECN_set_ce(skb, iph);
+
+ skb->protocol = htons(ETH_P_IPV6);
+ }
+}
+
+static void __iptfs_reassem_done(struct xfrm_iptfs_data *xtfs, bool free)
+{
+ assert_spin_locked(&xtfs->drop_lock);
+
+	/* We don't care if this fails; locking takes care of things */
+ hrtimer_try_to_cancel(&xtfs->drop_timer);
+ if (free)
+ kfree_skb(xtfs->ra_newskb);
+ xtfs->ra_newskb = NULL;
+}
+
+/**
+ * iptfs_reassem_abort() - Abort the in-progress packet and free the state.
+ * @xtfs: xtfs state
+ */
+static void iptfs_reassem_abort(struct xfrm_iptfs_data *xtfs)
+{
+ __iptfs_reassem_done(xtfs, true);
+}
+
+/**
+ * iptfs_reassem_done() - In-progress packet is complete, clear the state.
+ * @xtfs: xtfs state
+ */
+static void iptfs_reassem_done(struct xfrm_iptfs_data *xtfs)
+{
+ __iptfs_reassem_done(xtfs, false);
+}
+
+/**
+ * iptfs_reassem_cont() - Continue the reassembly of an inner packet.
+ * @xtfs: xtfs state
+ * @seq: sequence of current packet
+ * @st: seq read state for current packet
+ * @skb: current packet
+ * @data: offset into sequential packet data
+ * @blkoff: packet blkoff value
+ * @list: list of skbs to enqueue completed packet on
+ *
+ * Process an IPTFS payload that has a non-zero `blkoff` or when we are
+ * expecting the continuation b/c we have a runt or in-progress packet.
+ *
+ * Return: the new data offset to continue processing from.
+ */
+static u32 iptfs_reassem_cont(struct xfrm_iptfs_data *xtfs, u64 seq,
+ struct skb_seq_state *st, struct sk_buff *skb,
+ u32 data, u32 blkoff, struct list_head *list)
+{
+ struct iptfs_skb_frag_walk _fragwalk;
+ struct iptfs_skb_frag_walk *fragwalk = NULL;
+ struct sk_buff *newskb = xtfs->ra_newskb;
+ u32 remaining = skb->len - data;
+ u32 runtlen = xtfs->ra_runtlen;
+ u32 copylen, fraglen, ipremain, iphlen, iphremain, rrem;
+
+ /* Handle packet fragment we aren't expecting */
+ if (!runtlen && !xtfs->ra_newskb)
+ return data + min(blkoff, remaining);
+
+ /* Important to remember that input to this function is an ordered
+ * packet stream (unless the user disabled the reorder window). Thus if
+ * we are waiting for, and expecting the next packet so we can continue
+ * assembly, a newer sequence number indicates older ones are not coming
+ * (or if they do should be ignored). Technically we can receive older
+ * ones when the reorder window is disabled; however, the user should
+ * have disabled fragmentation in this case, and regardless we don't
+ * deal with it.
+ *
+	 * blkoff could be zero if the stream is messed up (or it's an all-pad
+	 * insertion); be careful to handle that case in each of the cases below.
+ */
+
+ /* Too old case: This can happen when the reorder window is disabled so
+ * ordering isn't actually guaranteed.
+ */
+ if (seq < xtfs->ra_wantseq)
+ return data + remaining;
+
+	/* Too new case: We missed what we wanted; clean up. */
+ if (seq > xtfs->ra_wantseq) {
+ XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR);
+ goto abandon;
+ }
+
+ if (blkoff == 0) {
+ if ((*skb->data & 0xF0) != 0) {
+ XFRM_INC_STATS(xs_net(xtfs->x),
+ LINUX_MIB_XFRMINIPTFSERROR);
+ goto abandon;
+ }
+ /* Handle all pad case, advance expected sequence number.
+ * (RFC 9347 S2.2.3)
+ */
+ xtfs->ra_wantseq++;
+ /* will end parsing */
+ return data + remaining;
+ }
+
+ if (runtlen) {
+ /* Regardless of what happens we're done with the runt */
+ xtfs->ra_runtlen = 0;
+
+ /* The start of this inner packet was at the very end of the last
+ * iptfs payload which didn't include enough for the ip header
+ * length field. We must have *at least* that now.
+ */
+ rrem = sizeof(xtfs->ra_runt) - runtlen;
+ if (remaining < rrem || blkoff < rrem) {
+ XFRM_INC_STATS(xs_net(xtfs->x),
+ LINUX_MIB_XFRMINIPTFSERROR);
+ goto abandon;
+ }
+
+ /* fill in the runt data */
+ if (skb_copy_seq_read(st, data, &xtfs->ra_runt[runtlen],
+ rrem)) {
+ XFRM_INC_STATS(xs_net(xtfs->x),
+ LINUX_MIB_XFRMINBUFFERERROR);
+ goto abandon;
+ }
+
+ /* We have enough data to get the ip length value now,
+ * allocate an in progress skb
+ */
+ ipremain = __iptfs_iplen(xtfs->ra_runt);
+ if (ipremain < sizeof(xtfs->ra_runt)) {
+ /* length has to be at least runtsize large */
+ XFRM_INC_STATS(xs_net(xtfs->x),
+ LINUX_MIB_XFRMINIPTFSERROR);
+ goto abandon;
+ }
+
+ /* For the runt case we don't attempt sharing currently. NOTE:
+ * Currently, this IPTFS implementation will not create runts.
+ */
+
+ newskb = iptfs_alloc_skb(skb, ipremain, false);
+ if (!newskb) {
+ XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINERROR);
+ goto abandon;
+ }
+ xtfs->ra_newskb = newskb;
+
+ /* Copy the runt data into the buffer, but leave data
+ * pointers the same as normal non-runt case. The extra `rrem`
+ * recopied bytes are basically cacheline free. Allows using
+ * same logic below to complete.
+ */
+ memcpy(skb_put(newskb, runtlen), xtfs->ra_runt,
+ sizeof(xtfs->ra_runt));
+ }
+
+ /* Continue reassembling the packet */
+ ipremain = __iptfs_iplen(newskb->data);
+ iphlen = __iptfs_iphlen(newskb->data);
+
+ ipremain -= newskb->len;
+ if (blkoff < ipremain) {
+ /* Corrupt data, we don't have enough to complete the packet */
+ XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMINIPTFSERROR);
+ goto abandon;
+ }
+
+ /* We want the IP header in linear space */
+ if (newskb->len < iphlen) {
+ iphremain = iphlen - newskb->len;
+ if (blkoff < iphremain) {
+ XFRM_INC_STATS(xs_net(xtfs->x),
+ LINUX_MIB_XFRMINIPTFSERROR);
+ goto abandon;
+ }
+ fraglen = min(blkoff, remaining);
+ copylen = min(fraglen, iphremain);
+ if (skb_copy_seq_read(st, data, skb_put(newskb, copylen),
+ copylen)) {
+ XFRM_INC_STATS(xs_net(xtfs->x),
+ LINUX_MIB_XFRMINBUFFERERROR);
+ goto abandon;
+ }
+ /* this is a silly condition that might occur anyway */
+ if (copylen < iphremain) {
+ xtfs->ra_wantseq++;
+ return data + fraglen;
+ }
+ /* update data and things derived from it */
+ data += copylen;
+ blkoff -= copylen;
+ remaining -= copylen;
+ ipremain -= copylen;
+ }
+
+ fraglen = min(blkoff, remaining);
+ copylen = min(fraglen, ipremain);
+
+ /* If we may have the opportunity to share prepare a fragwalk. */
+ if (!skb_has_frag_list(skb) && !skb_has_frag_list(newskb) &&
+ (skb->head_frag || skb->len == skb->data_len) &&
+ skb->pp_recycle == newskb->pp_recycle) {
+ fragwalk = &_fragwalk;
+ iptfs_skb_prepare_frag_walk(skb, data, fragwalk);
+ }
+
+ /* Try share then copy. */
+ if (fragwalk &&
+ iptfs_skb_can_add_frags(newskb, fragwalk, data, copylen)) {
+ iptfs_skb_add_frags(newskb, fragwalk, data, copylen);
+ } else {
+ /* copy fragment data into newskb */
+ if (skb_copy_seq_read(st, data, skb_put(newskb, copylen),
+ copylen)) {
+ XFRM_INC_STATS(xs_net(xtfs->x),
+ LINUX_MIB_XFRMINBUFFERERROR);
+ goto abandon;
+ }
+ }
+
+ if (copylen < ipremain) {
+ xtfs->ra_wantseq++;
+ } else {
+ /* We are done with packet reassembly! */
+ iptfs_reassem_done(xtfs);
+ iptfs_complete_inner_skb(xtfs->x, newskb);
+ list_add_tail(&newskb->list, list);
+ }
+
+ /* will continue on to new data block or end */
+ return data + fraglen;
+
+abandon:
+ if (xtfs->ra_newskb) {
+ iptfs_reassem_abort(xtfs);
+ } else {
+ xtfs->ra_runtlen = 0;
+ xtfs->ra_wantseq = 0;
+ }
+ /* skip past fragment, maybe to end */
+ return data + min(blkoff, remaining);
+}
+
+static bool __input_process_payload(struct xfrm_state *x, u32 data,
+ struct skb_seq_state *skbseq,
+ struct list_head *sublist)
+{
+ u8 hbytes[sizeof(struct ipv6hdr)];
+ struct iptfs_skb_frag_walk _fragwalk;
+ struct iptfs_skb_frag_walk *fragwalk = NULL;
+ struct sk_buff *defer, *first_skb, *next, *skb;
+ const unsigned char *old_mac;
+ struct xfrm_iptfs_data *xtfs;
+ struct iphdr *iph;
+ struct net *net;
+ u32 first_iplen, iphlen, iplen, remaining, tail;
+ u32 capturelen;
+ u64 seq;
+
+ xtfs = x->mode_data;
+ net = xs_net(x);
+ skb = skbseq->root_skb;
+ first_skb = NULL;
+ defer = NULL;
+
+ seq = __esp_seq(skb);
+
+ /* Save the old mac header if set */
+ old_mac = skb_mac_header_was_set(skb) ? skb_mac_header(skb) : NULL;
+
+ /* New packets */
+
+ tail = skb->len;
+ while (data < tail) {
+ __be16 protocol = 0;
+
+ /* Gather information on the next data block.
+ * `data` points to the start of the data block.
+ */
+ remaining = tail - data;
+
+ /* try and copy enough bytes to read length from ipv4/ipv6 */
+ iphlen = min_t(u32, remaining, 6);
+ if (skb_copy_seq_read(skbseq, data, hbytes, iphlen)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+ goto done;
+ }
+
+ iph = (struct iphdr *)hbytes;
+ if (iph->version == 0x4) {
+ /* must have at least tot_len field present */
+ if (remaining < 4) {
+ /* save the bytes we have, advance data and exit */
+ iptfs_input_save_runt(xtfs, seq, hbytes,
+ remaining);
+ data += remaining;
+ break;
+ }
+
+ iplen = be16_to_cpu(iph->tot_len);
+ iphlen = iph->ihl << 2;
+ protocol = cpu_to_be16(ETH_P_IP);
+ XFRM_MODE_SKB_CB(skbseq->root_skb)->tos = iph->tos;
+ } else if (iph->version == 0x6) {
+ /* must have at least payload_len field present */
+ if (remaining < 6) {
+ /* save the bytes we have, advance data and exit */
+ iptfs_input_save_runt(xtfs, seq, hbytes,
+ remaining);
+ data += remaining;
+ break;
+ }
+
+ iplen = be16_to_cpu(((struct ipv6hdr *)hbytes)->payload_len);
+ iplen += sizeof(struct ipv6hdr);
+ iphlen = sizeof(struct ipv6hdr);
+ protocol = cpu_to_be16(ETH_P_IPV6);
+ XFRM_MODE_SKB_CB(skbseq->root_skb)->tos =
+ ipv6_get_dsfield((struct ipv6hdr *)iph);
+ } else if (iph->version == 0x0) {
+ /* pad */
+ data = tail;
+ break;
+ } else {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+ goto done;
+ }
+
+ if (unlikely(skbseq->stepped_offset)) {
+			/* We need to reset our seq read; it can't back up at
+ * this point.
+ */
+ struct sk_buff *save = skbseq->root_skb;
+
+ skb_abort_seq_read(skbseq);
+ skb_prepare_seq_read(save, data, tail, skbseq);
+ }
+
+ if (first_skb) {
+ skb = NULL;
+ } else {
+ first_skb = skb;
+ first_iplen = iplen;
+ fragwalk = NULL;
+
+ /* We are going to skip over `data` bytes to reach the
+ * start of the IP header of `iphlen` len for `iplen`
+ * inner packet.
+ */
+
+ if (skb_has_frag_list(skb)) {
+ defer = skb;
+ skb = NULL;
+ } else if (data + iphlen <= skb_headlen(skb) &&
+ /* make sure our header is 32-bit aligned? */
+ /* ((uintptr_t)(skb->data + data) & 0x3) == 0 && */
+ skb_tailroom(skb) + tail - data >= iplen) {
+ /* Reuse the received skb.
+ *
+ * We have enough headlen to pull past any
+ * initial fragment data, leaving at least the
+ * IP header in the linear buffer space.
+ *
+ * For linear buffer space we only require that
+ * linear buffer space is large enough to
+ * eventually hold the entire reassembled
+ * packet (by including tailroom in the check).
+ *
+ * For non-linear tailroom is 0 and so we only
+ * re-use if the entire packet is present
+ * already.
+ *
+ * NOTE: there are many more options for
+ * sharing, KISS for now. Also, this can produce
+ * skb's with the IP header unaligned to 32
+ * bits. If that ends up being a problem then a
+ * check should be added to the conditional
+ * above that the header lies on a 32-bit
+ * boundary as well.
+ */
+ skb_pull(skb, data);
+
+ /* our range just changed */
+ data = 0;
+ tail = skb->len;
+ remaining = skb->len;
+
+ skb->protocol = protocol;
+ skb_mac_header_rebuild(skb);
+ if (skb->mac_len)
+ eth_hdr(skb)->h_proto = skb->protocol;
+
+ /* all pointers could be changed now reset walk */
+ skb_abort_seq_read(skbseq);
+ skb_prepare_seq_read(skb, data, tail, skbseq);
+ } else if (skb->head_frag &&
+ /* We have the IP header right now */
+ remaining >= iphlen) {
+ fragwalk = &_fragwalk;
+ iptfs_skb_prepare_frag_walk(skb, data, fragwalk);
+ defer = skb;
+ skb = NULL;
+ } else {
+ /* We couldn't reuse the input skb so allocate a
+ * new one.
+ */
+ defer = skb;
+ skb = NULL;
+ }
+
+ /* Don't trim `first_skb` until the end as we are
+ * walking that data now.
+ */
+ }
+
+ capturelen = min(iplen, remaining);
+ if (!skb) {
+ if (!fragwalk ||
+ /* Large enough to be worth sharing */
+ iplen < IPTFS_PKT_SHARE_MIN ||
+ /* Have IP header + some data to share. */
+ capturelen <= iphlen ||
+ /* Try creating skb and adding frags */
+ !(skb = iptfs_pskb_add_frags(first_skb, fragwalk,
+ data, capturelen,
+ skbseq, iphlen))) {
+ skb = iptfs_pskb_extract_seq(iplen, skbseq, data, capturelen);
+ }
+ if (!skb) {
+ /* skip to next packet or done */
+ data += capturelen;
+ continue;
+ }
+
+ skb->protocol = protocol;
+ if (old_mac) {
+ /* rebuild the mac header */
+ skb_set_mac_header(skb, -first_skb->mac_len);
+ memcpy(skb_mac_header(skb), old_mac, first_skb->mac_len);
+ eth_hdr(skb)->h_proto = skb->protocol;
+ }
+ }
+
+ data += capturelen;
+
+ if (skb->len < iplen) {
+ /* Start reassembly */
+ spin_lock(&xtfs->drop_lock);
+
+ xtfs->ra_newskb = skb;
+ xtfs->ra_wantseq = seq + 1;
+ if (!hrtimer_is_queued(&xtfs->drop_timer)) {
+ /* softirq blocked lest the timer fire and interrupt us */
+ hrtimer_start(&xtfs->drop_timer,
+ xtfs->drop_time_ns,
+ IPTFS_HRTIMER_MODE);
+ }
+
+ spin_unlock(&xtfs->drop_lock);
+
+ break;
+ }
+
+ iptfs_complete_inner_skb(x, skb);
+ list_add_tail(&skb->list, sublist);
+ }
+
+ if (data != tail)
+ /* this should not happen from the above code */
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINIPTFSERROR);
+
+ if (first_skb && first_iplen && !defer && first_skb != xtfs->ra_newskb) {
+ /* first_skb is queued b/c !defer and not partial */
+ if (pskb_trim(first_skb, first_iplen)) {
+ /* error trimming */
+ list_del(&first_skb->list);
+ defer = first_skb;
+ }
+ first_skb->ip_summed = CHECKSUM_NONE;
+ }
+
+ /* Send the packets! */
+ list_for_each_entry_safe(skb, next, sublist, list) {
+ skb_list_del_init(skb);
+ if (xfrm_input(skb, 0, 0, -2))
+ kfree_skb(skb);
+ }
+done:
+ skb = skbseq->root_skb;
+ skb_abort_seq_read(skbseq);
+
+ if (defer) {
+ consume_skb(defer);
+ } else if (!first_skb) {
+		/* skb is the original passed-in skb; we didn't get far enough
+		 * to process it as the first_skb. If we had, it would either
+		 * have been saved in ra_newskb, trimmed and sent on as an skb,
+		 * or placed in defer to be freed.
+ */
+ kfree_skb(skb);
+ }
+ return true;
+}
+
+/**
+ * iptfs_input_ordered() - handle next in order IPTFS payload.
+ * @x: xfrm state
+ * @skb: current packet
+ *
+ * Process the IPTFS payload in `skb` and consume it afterwards.
+ */
+static void iptfs_input_ordered(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct ip_iptfs_cc_hdr iptcch;
+ struct skb_seq_state skbseq;
+ struct list_head sublist; /* rename this it's just a list */
+ struct xfrm_iptfs_data *xtfs;
+ struct ip_iptfs_hdr *ipth;
+ struct net *net;
+ u32 blkoff, data, remaining;
+ bool consumed = false;
+ u64 seq;
+
+ xtfs = x->mode_data;
+ net = xs_net(x);
+
+ seq = __esp_seq(skb);
+
+ /* Large enough to hold both types of header */
+ ipth = (struct ip_iptfs_hdr *)&iptcch;
+
+ skb_prepare_seq_read(skb, 0, skb->len, &skbseq);
+
+ /* Get the IPTFS header and validate it */
+
+ if (skb_copy_seq_read(&skbseq, 0, ipth, sizeof(*ipth))) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+ goto done;
+ }
+ data = sizeof(*ipth);
+
+ trace_iptfs_egress_recv(skb, xtfs, be16_to_cpu(ipth->block_offset));
+
+ /* Set data past the basic header */
+ if (ipth->subtype == IPTFS_SUBTYPE_CC) {
+ /* Copy the rest of the CC header */
+ remaining = sizeof(iptcch) - sizeof(*ipth);
+ if (skb_copy_seq_read(&skbseq, data, ipth + 1, remaining)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+ goto done;
+ }
+ data += remaining;
+ } else if (ipth->subtype != IPTFS_SUBTYPE_BASIC) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+ goto done;
+ }
+
+ if (ipth->flags != 0) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+ goto done;
+ }
+
+ INIT_LIST_HEAD(&sublist);
+
+ /* Handle fragment at start of payload, and/or waiting reassembly. */
+
+ blkoff = ntohs(ipth->block_offset);
+	/* cheap unlocked check first; re-checked under the lock below */
+ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) {
+ spin_lock(&xtfs->drop_lock);
+
+ /* check again after lock */
+ if (blkoff || xtfs->ra_runtlen || xtfs->ra_newskb) {
+ data = iptfs_reassem_cont(xtfs, seq, &skbseq, skb, data,
+ blkoff, &sublist);
+ }
+
+ spin_unlock(&xtfs->drop_lock);
+ }
+
+ /* New packets */
+ consumed = __input_process_payload(x, data, &skbseq, &sublist);
+done:
+ if (!consumed) {
+ skb = skbseq.root_skb;
+ skb_abort_seq_read(&skbseq);
+ kfree_skb(skb);
+ }
+}
+
+/* ------------------------------- */
+/* Input (Egress) Re-ordering Code */
+/* ------------------------------- */
+
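+/* Shift the saved reorder window array down by @shift slots, zeroing the
+ * vacated slots at the end.
+ */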
+static void __vec_shift(struct xfrm_iptfs_data *xtfs, u32 shift)
+{
+ u32 savedlen = xtfs->w_savedlen;
+
+ if (shift > savedlen)
+ shift = savedlen;
+ if (shift != savedlen)
+ memcpy(xtfs->w_saved, xtfs->w_saved + shift,
+ (savedlen - shift) * sizeof(*xtfs->w_saved));
+ memset(xtfs->w_saved + savedlen - shift, 0,
+ shift * sizeof(*xtfs->w_saved));
+ xtfs->w_savedlen -= shift;
+}
+
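+/* Sequence number is in the past (older than wanted): queue the late or
+ * duplicate packet on @freelist for the caller to free.
+ */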
+static void __reorder_past(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
+ struct list_head *freelist)
+{
+ list_add_tail(&inskb->list, freelist);
+}
+
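+/* Drop-timer helper: if slot 0's drop time has passed, declare the awaited
+ * sequence number lost, move the now-deliverable saved packets onto @list and
+ * shift the window. Re-arms the drop timer if saved entries remain. Returns
+ * the number of packets moved to @list.
+ */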
+static u32 __reorder_drop(struct xfrm_iptfs_data *xtfs, struct list_head *list)
+
+{
+ struct skb_wseq *s, *se;
+ const u32 savedlen = xtfs->w_savedlen;
+ time64_t now = ktime_get_raw_fast_ns();
+ u32 count = 0;
+ u32 scount = 0;
+
+ if (xtfs->w_saved[0].drop_time > now)
+ goto set_timer;
+
+ ++xtfs->w_wantseq;
+
+ /* Keep flushing packets until we reach a drop time greater than now. */
+ s = xtfs->w_saved;
+ se = s + savedlen;
+ do {
+ /* Walking past empty slots until we reach a packet */
+ for (; s < se && !s->skb; s++) {
+ if (s->drop_time > now)
+ goto outerdone;
+ }
+ /* Sending packets until we hit another empty slot. */
+ for (; s < se && s->skb; scount++, s++)
+ list_add_tail(&s->skb->list, list);
+ } while (s < se);
+outerdone:
+
+ count = s - xtfs->w_saved;
+ if (count) {
+ xtfs->w_wantseq += count;
+
+ /* Shift handled slots plus final empty slot into slot 0. */
+ __vec_shift(xtfs, count);
+ }
+
+ if (xtfs->w_savedlen) {
+set_timer:
+ /* Drifting is OK */
+ hrtimer_start(&xtfs->drop_timer,
+ xtfs->w_saved[0].drop_time - now,
+ IPTFS_HRTIMER_MODE);
+ }
+ return scount;
+}
+
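+/* The wanted sequence number arrived: deliver it to @list along with any
+ * consecutive saved packets, then shift the window past them.
+ */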
+static void __reorder_this(struct xfrm_iptfs_data *xtfs, struct sk_buff *inskb,
+ struct list_head *list)
+{
+ struct skb_wseq *s, *se;
+ const u32 savedlen = xtfs->w_savedlen;
+ u32 count = 0;
+
+ /* Got what we wanted. */
+ list_add_tail(&inskb->list, list);
+ ++xtfs->w_wantseq;
+ if (!savedlen)
+ return;
+
+ /* Flush remaining consecutive packets. */
+
+ /* Keep sending until we hit another missed pkt. */
+ for (s = xtfs->w_saved, se = s + savedlen; s < se && s->skb; s++)
+ list_add_tail(&s->skb->list, list);
+ count = s - xtfs->w_saved;
+ if (count)
+ xtfs->w_wantseq += count;
+
+ /* Shift handled slots plus final empty slot into slot 0. */
+ __vec_shift(xtfs, count + 1);
+}
+
+/* Set the slot's drop time and all the empty slots below it until reaching a
+ * filled slot which will already be set.
+ */
+static void iptfs_set_window_drop_times(struct xfrm_iptfs_data *xtfs, int index)
+{
+ const u32 savedlen = xtfs->w_savedlen;
+ struct skb_wseq *s = xtfs->w_saved;
+ time64_t drop_time;
+
+ assert_spin_locked(&xtfs->drop_lock);
+
+ if (savedlen > index + 1) {
+ /* we are below another, our drop time and the timer are already set */
+ return;
+ }
+ /* we are the most future so get a new drop time. */
+ drop_time = ktime_get_raw_fast_ns();
+ drop_time += xtfs->drop_time_ns;
+
+ /* Walk back through the array setting drop times as we go */
+ s[index].drop_time = drop_time;
+ while (index-- > 0 && !s[index].skb)
+ s[index].drop_time = drop_time;
+
+ /* If we walked all the way back, schedule the drop timer if needed */
+ if (index == -1 && !hrtimer_is_queued(&xtfs->drop_timer))
+ hrtimer_start(&xtfs->drop_timer, xtfs->drop_time_ns,
+ IPTFS_HRTIMER_MODE);
+}
+
+static void __reorder_future_fits(struct xfrm_iptfs_data *xtfs,
+ struct sk_buff *inskb,
+ struct list_head *freelist)
+{
+ const u64 inseq = __esp_seq(inskb);
+ const u64 wantseq = xtfs->w_wantseq;
+ const u64 distance = inseq - wantseq;
+ const u32 savedlen = xtfs->w_savedlen;
+ const u32 index = distance - 1;
+
+ /* Handle future sequence number received which fits in the window.
+ *
+ * We know we don't have the seq we want so we won't be able to flush
+ * anything.
+ */
+
+	/* slot count is 4, saved size is 3, savedlen is 2
+ *
+ * "window boundary" is based on the fixed window size
+ * distance is also slot number
+ * index is an array index (i.e., - 1 of slot)
+ * : : - implicit NULL after array len
+ *
+ * +--------- used length (savedlen == 2)
+ * | +----- array size (nslots - 1 == 3)
+ * | | + window boundary (nslots == 4)
+ * V V | V
+ * |
+ * 0 1 2 3 | slot number
+ * --- 0 1 2 | array index
+ * [-] [b] : :| array
+ *
+ * "2" "3" "4" *5*| seq numbers
+ *
+ * We receive seq number 5
+ * distance == 3 [inseq(5) - w_wantseq(2)]
+	 * index == 2 [distance(3) - 1]
+ */
+
+ if (xtfs->w_saved[index].skb) {
+ /* a dup of a future */
+ list_add_tail(&inskb->list, freelist);
+ return;
+ }
+
+ xtfs->w_saved[index].skb = inskb;
+ xtfs->w_savedlen = max(savedlen, index + 1);
+ iptfs_set_window_drop_times(xtfs, index);
+}
+
+static void __reorder_future_shifts(struct xfrm_iptfs_data *xtfs,
+ struct sk_buff *inskb,
+ struct list_head *list)
+{
+ const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+ const u64 inseq = __esp_seq(inskb);
+ u32 savedlen = xtfs->w_savedlen;
+ u64 wantseq = xtfs->w_wantseq;
+ struct skb_wseq *wnext;
+ struct sk_buff *slot0;
+ u32 beyond, shifting, slot;
+ u64 distance;
+
+ /* Handle future sequence number received.
+ *
+ * IMPORTANT: we are at least advancing w_wantseq (i.e., wantseq) by 1
+ * b/c we are beyond the window boundary.
+ *
+ * We know we don't have the wantseq so that counts as a drop.
+ */
+
+	/* example: slot count is 4, array size is 3, savedlen is 2, slot 0 is
+ * the missing sequence number.
+ *
+ * the final slot at savedlen (index savedlen - 1) is always occupied.
+ *
+ * beyond is "beyond array size" not savedlen.
+ *
+ * +--------- array length (savedlen == 2)
+ * | +----- array size (nslots - 1 == 3)
+ * | | +- window boundary (nslots == 4)
+ * V V |
+ * |
+ * 0 1 2 3 | slot number
+ * --- 0 1 2 | array index
+ * [b] [c] : :| array
+ * |
+ * "2" "3" "4" "5"|*6* seq numbers
+ *
+ * We receive seq number 6
+ * distance == 4 [inseq(6) - w_wantseq(2)]
+ * newslot == distance
+ * index == 3 [distance(4) - 1]
+ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(2), beyond(1))]
+ * slot0_skb == [b], and should match w_wantseq
+ *
+ * +--- window boundary (nslots == 4)
+ * 0 1 2 3 | 4 slot number
+ * --- 0 1 2 | 3 array index
+ * [b] : : : :| array
+ * "2" "3" "4" "5" *6* seq numbers
+ *
+ * We receive seq number 6
+ * distance == 4 [inseq(6) - w_wantseq(2)]
+ * newslot == distance
+ * index == 3 [distance(4) - 1]
+ * beyond == 1 [newslot(4) - lastslot((nslots(4) - 1))]
+	 * shifting == 1 [min(savedlen(1), beyond(1))]
+ * slot0_skb == [b] and should match w_wantseq
+ *
+ * +-- window boundary (nslots == 4)
+ * 0 1 2 3 | 4 5 6 slot number
+ * --- 0 1 2 | 3 4 5 array index
+ * [-] [c] : :| array
+ * "2" "3" "4" "5" "6" "7" *8* seq numbers
+ *
+ * savedlen = 2, beyond = 3
+ * iter 1: slot0 == NULL, missed++, lastdrop = 2 (2+1-1), slot0 = [-]
+ * iter 2: slot0 == NULL, missed++, lastdrop = 3 (2+2-1), slot0 = [c]
+ * 2 < 3, extra = 1 (3-2), missed += extra, lastdrop = 4 (2+2+1-1)
+ *
+ * We receive seq number 8
+ * distance == 6 [inseq(8) - w_wantseq(2)]
+ * newslot == distance
+ * index == 5 [distance(6) - 1]
+ * beyond == 3 [newslot(6) - lastslot((nslots(4) - 1))]
+	 * shifting == 2 [min(savedlen(2), beyond(3))]
+ *
+ * slot0_skb == NULL changed from [b] when "savedlen < beyond" is true.
+ */
+
+ /* Now send any packets that are being shifted out of saved, and account
+ * for missing packets that are exiting the window as we shift it.
+ */
+
+ distance = inseq - wantseq;
+ beyond = distance - (nslots - 1);
+
+ /* If savedlen > beyond we are shifting some, else all. */
+ shifting = min(savedlen, beyond);
+
+	/* slot0 tracks the buf most recently shifted down into slot 0 */
+ slot0 = NULL;
+ wnext = xtfs->w_saved;
+ for (slot = 1; slot <= shifting; slot++, wnext++) {
+ /* handle what was in slot0 before we occupy it */
+ if (slot0)
+ list_add_tail(&slot0->list, list);
+ slot0 = wnext->skb;
+ wnext->skb = NULL;
+ }
+
+	/* slot0 is now either NULL (in which case it's what we are now waiting
+	 * for) or a buf (in which case we need to handle it like we received
+	 * it); however, we may be advancing past that buffer as well.
+ */
+
+ /* Handle case where we need to shift more than we had saved, slot0 will
+ * be NULL iff savedlen is 0, otherwise slot0 will always be
+ * non-NULL b/c we shifted the final element, which is always set if
+ * there is any saved, into slot0.
+ */
+ if (savedlen < beyond) {
+ if (savedlen != 0)
+ list_add_tail(&slot0->list, list);
+ slot0 = NULL;
+ /* slot0 has had an empty slot pushed into it */
+ }
+
+ /* Remove the entries */
+ __vec_shift(xtfs, beyond);
+
+ /* Advance want seq */
+ xtfs->w_wantseq += beyond;
+
+ /* Process drops here when implementing congestion control */
+
+	/* We've shifted; plug the packet in at the end. */
+ xtfs->w_savedlen = nslots - 1;
+ xtfs->w_saved[xtfs->w_savedlen - 1].skb = inskb;
+ iptfs_set_window_drop_times(xtfs, xtfs->w_savedlen - 1);
+
+ /* if we don't have a slot0 then we must wait for it */
+ if (!slot0)
+ return;
+
+ /* If slot0, seq must match new want seq */
+
+ /* slot0 is valid, treat like we received expected. */
+ __reorder_this(xtfs, slot0, list);
+}
+
+/* Receive a new packet into the reorder window. Return a list of ordered
+ * packets from the window.
+ */
+static void iptfs_input_reorder(struct xfrm_iptfs_data *xtfs,
+ struct sk_buff *inskb, struct list_head *list,
+ struct list_head *freelist)
+{
+ const u32 nslots = xtfs->cfg.reorder_win_size + 1;
+ u64 inseq = __esp_seq(inskb);
+ u64 wantseq;
+
+ assert_spin_locked(&xtfs->drop_lock);
+
+ if (unlikely(!xtfs->w_seq_set)) {
+ xtfs->w_seq_set = true;
+ xtfs->w_wantseq = inseq;
+ }
+ wantseq = xtfs->w_wantseq;
+
+ if (likely(inseq == wantseq))
+ __reorder_this(xtfs, inskb, list);
+ else if (inseq < wantseq)
+ __reorder_past(xtfs, inskb, freelist);
+ else if ((inseq - wantseq) < nslots)
+ __reorder_future_fits(xtfs, inskb, freelist);
+ else
+ __reorder_future_shifts(xtfs, inskb, list);
+}
+
+/**
+ * iptfs_drop_timer() - Handle drop timer expiry.
+ * @me: the timer
+ *
+ * This is similar to our input function.
+ *
+ * The drop timer is set when we start an in progress reassembly, and also when
+ * we save a future packet in the window saved array.
+ *
+ * NOTE: packets in the saved window always have newer drop times the further
+ * they are in the future, i.e. for:
+ *
+ * if slots (S0, S1, ... Sn) and `Dn` is the drop time for slot `Sn`,
+ * then D(n-1) <= D(n).
+ *
+ * So, regardless of why the timer is firing we can always discard any
+ * in-progress fragment; either the reassembly timer fired, or slot 0 is going
+ * to be dropped as S0 must have the earliest drop time, and slot 0 holds the
+ * continuation fragment of the in-progress packet.
+ *
+ * Returns HRTIMER_NORESTART.
+ */
+static enum hrtimer_restart iptfs_drop_timer(struct hrtimer *me)
+{
+ struct sk_buff *skb, *next;
+ struct list_head list;
+ struct xfrm_iptfs_data *xtfs;
+ struct xfrm_state *x;
+ u32 count;
+
+ xtfs = container_of(me, typeof(*xtfs), drop_timer);
+ x = xtfs->x;
+
+ INIT_LIST_HEAD(&list);
+
+ spin_lock(&xtfs->drop_lock);
+
+ /* Drop any in progress packet */
+ skb = xtfs->ra_newskb;
+ xtfs->ra_newskb = NULL;
+
+ /* Now drop as many packets as we should from the reordering window
+ * saved array
+ */
+ count = xtfs->w_savedlen ? __reorder_drop(xtfs, &list) : 0;
+
+ spin_unlock(&xtfs->drop_lock);
+
+ if (skb)
+ kfree_skb_reason(skb, SKB_DROP_REASON_FRAG_REASM_TIMEOUT);
+
+ if (count) {
+ list_for_each_entry_safe(skb, next, &list, list) {
+ skb_list_del_init(skb);
+ iptfs_input_ordered(x, skb);
+ }
+ }
+
+ return HRTIMER_NORESTART;
+}
+
+/**
+ * iptfs_input() - handle receipt of iptfs payload
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * We have an IPTFS payload: order it if needed, then process newly in-order
+ * packets.
+ *
+ * Return: -EINPROGRESS to inform xfrm_input to stop processing the skb.
+ */
+static int iptfs_input(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct list_head freelist, list;
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+ struct sk_buff *next;
+
+ /* Fast path for no reorder window. */
+ if (xtfs->cfg.reorder_win_size == 0) {
+ iptfs_input_ordered(x, skb);
+ goto done;
+ }
+
+ /* Fetch list of in-order packets from the reordering window as well as
+ * a list of buffers we need to now free.
+ */
+ INIT_LIST_HEAD(&list);
+ INIT_LIST_HEAD(&freelist);
+
+ spin_lock(&xtfs->drop_lock);
+ iptfs_input_reorder(xtfs, skb, &list, &freelist);
+ spin_unlock(&xtfs->drop_lock);
+
+ list_for_each_entry_safe(skb, next, &list, list) {
+ skb_list_del_init(skb);
+ iptfs_input_ordered(x, skb);
+ }
+
+ list_for_each_entry_safe(skb, next, &freelist, list) {
+ skb_list_del_init(skb);
+ kfree_skb(skb);
+ }
+done:
+	/* We have dealt with the input SKB either way: we are re-using it
+	 * or we have freed it. Return -EINPROGRESS so that xfrm_input stops
+ * processing it.
+ */
+ return -EINPROGRESS;
+}
+
+/* ================================= */
+/* IPTFS Sending (ingress) Functions */
+/* ================================= */
+
+/* ------------------------- */
+/* Enqueue to send functions */
+/* ------------------------- */
+
+/**
+ * iptfs_enqueue() - enqueue packet if ok to send.
+ * @xtfs: xtfs state
+ * @skb: the packet
+ *
+ * Return: true if packet enqueued.
+ */
+static bool iptfs_enqueue(struct xfrm_iptfs_data *xtfs, struct sk_buff *skb)
+{
+ u64 newsz = xtfs->queue_size + skb->len;
+ struct iphdr *iph;
+
+ assert_spin_locked(&xtfs->x->lock);
+
+ if (newsz > xtfs->cfg.max_queue_size)
+ return false;
+
+ /* Set ECN CE if we are above our ECN queue threshold */
+ if (newsz > xtfs->ecn_queue_size) {
+ iph = ip_hdr(skb);
+ if (iph->version == 4)
+ IP_ECN_set_ce(iph);
+ else if (iph->version == 6)
+ IP6_ECN_set_ce(skb, ipv6_hdr(skb));
+ }
+
+ __skb_queue_tail(&xtfs->queue, skb);
+ xtfs->queue_size += skb->len;
+ return true;
+}
+
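+/* Current IPTFS payload MTU: the inner MTU derived from the xfrm dst's cached
+ * child MTU, capped by the user-configured payload_mtu when set.
+ */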
+static int iptfs_get_cur_pmtu(struct xfrm_state *x, struct xfrm_iptfs_data *xtfs,
+ struct sk_buff *skb)
+{
+ struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
+ u32 payload_mtu = xtfs->payload_mtu;
+ u32 pmtu = __iptfs_get_inner_mtu(x, xdst->child_mtu_cached);
+
+ if (payload_mtu && payload_mtu < pmtu)
+ pmtu = payload_mtu;
+
+ return pmtu;
+}
+
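+/* If @skb is larger than @pmtu, account the error, send the appropriate
+ * too-big notification (local error, ICMPv4 or ICMPv6) and return 1;
+ * otherwise return 0.
+ */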
+static int iptfs_is_too_big(struct sock *sk, struct sk_buff *skb, u32 pmtu)
+{
+ if (skb->len <= pmtu)
+ return 0;
+
+ /* We only send ICMP too big if the user has configured us as
+ * dont-fragment.
+ */
+ if (skb->dev)
+ XFRM_INC_STATS(dev_net(skb->dev), LINUX_MIB_XFRMOUTERROR);
+
+ if (sk)
+ xfrm_local_error(skb, pmtu);
+ else if (ip_hdr(skb)->version == 4)
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(pmtu));
+ else
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, pmtu);
+
+ return 1;
+}
+
+/* IPv4/IPv6 packet ingress to IPTFS tunnel, arrange to send in IPTFS payload
+ * (i.e., aggregating or fragmenting as appropriate).
+ * This is set in dst->output for an SA.
+ */
+static int iptfs_output_collect(struct net *net, struct sock *sk, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct xfrm_state *x = dst->xfrm;
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+ struct sk_buff *segs, *nskb;
+ u32 pmtu = 0;
+ bool ok = true;
+ bool was_gso;
+
+ /* We have hooked into dst_entry->output which means we have skipped the
+ * protocol specific netfilter (see xfrm4_output, xfrm6_output).
+	 * When our timer runs we will end up calling xfrm_output directly on
+ * the encapsulated traffic.
+ *
+ * For both cases this is the NF_INET_POST_ROUTING hook which allows
+ * changing the skb->dst entry which then may not be xfrm based anymore
+	 * in which case a REROUTED flag is set and dst_output is called.
+ *
+ * For IPv6 we are also skipping fragmentation handling for local
+ * sockets, which may or may not be good depending on our tunnel DF
+ * setting. Normally with fragmentation supported we want to skip this
+ * fragmentation.
+ */
+
+ if (xtfs->cfg.dont_frag)
+ pmtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+
+ /* Break apart GSO skbs. If the queue is nearing full then we want the
+ * accounting and queuing to be based on the individual packets not on the
+ * aggregate GSO buffer.
+ */
+ was_gso = skb_is_gso(skb);
+ if (!was_gso) {
+ segs = skb;
+ } else {
+ segs = skb_gso_segment(skb, 0);
+ if (IS_ERR_OR_NULL(segs)) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+ kfree_skb(skb);
+ if (IS_ERR(segs))
+ return PTR_ERR(segs);
+ return -EINVAL;
+ }
+ consume_skb(skb);
+ skb = NULL;
+ }
+
+ /* We can be running on multiple cores and from the network softirq or
+ * from user context depending on where the packet is coming from.
+ */
+ spin_lock_bh(&x->lock);
+
+ skb_list_walk_safe(segs, skb, nskb) {
+ skb_mark_not_on_list(skb);
+
+ /* Once we drop due to no queue space we continue to drop the
+ * rest of the packets from that GRO.
+ */
+ if (!ok) {
+nospace:
+ trace_iptfs_no_queue_space(skb, xtfs, pmtu, was_gso);
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOQSPACE);
+ kfree_skb_reason(skb, SKB_DROP_REASON_FULL_RING);
+ continue;
+ }
+
+ /* If the user indicated no iptfs fragmenting check before
+ * enqueue.
+ */
+ if (xtfs->cfg.dont_frag && iptfs_is_too_big(sk, skb, pmtu)) {
+ trace_iptfs_too_big(skb, xtfs, pmtu, was_gso);
+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+ continue;
+ }
+
+ /* Enqueue to send in tunnel */
+ ok = iptfs_enqueue(xtfs, skb);
+ if (!ok)
+ goto nospace;
+
+ trace_iptfs_enqueue(skb, xtfs, pmtu, was_gso);
+ }
+
+ /* Start a delay timer if we don't have one yet */
+ if (!hrtimer_is_queued(&xtfs->iptfs_timer)) {
+ hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE);
+ xtfs->iptfs_settime = ktime_get_raw_fast_ns();
+ trace_iptfs_timer_start(xtfs, xtfs->init_delay_ns);
+ }
+
+ spin_unlock_bh(&x->lock);
+ return 0;
+}
+
+/* -------------------------- */
+/* Dequeue and send functions */
+/* -------------------------- */
+
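+/* Push a zeroed basic IPTFS header onto @skb, set its block offset to @blkoff
+ * if non-zero, and point the network header at the IPTFS header (with the
+ * transport header at the inner packet).
+ */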
+static void iptfs_output_prepare_skb(struct sk_buff *skb, u32 blkoff)
+{
+ struct ip_iptfs_hdr *h;
+ size_t hsz = sizeof(*h);
+
+ /* now reset values to be pointing at the rest of the packets */
+ h = skb_push(skb, hsz);
+ memset(h, 0, hsz);
+ if (blkoff)
+ h->block_offset = htons(blkoff);
+
+ /* network_header current points at the inner IP packet
+ * move it to the iptfs header
+ */
+ skb->transport_header = skb->network_header;
+ skb->network_header -= hsz;
+
+ IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE;
+}
+
+/**
+ * iptfs_copy_create_frag() - create an inner fragment skb.
+ * @st: The source packet data.
+ * @offset: offset in @st of the new fragment data.
+ * @copy_len: the amount of data to copy from @st.
+ *
+ * Create a new skb holding a single IPTFS inner packet fragment. @copy_len must
+ * not be greater than the max fragment size.
+ *
+ * Return: the new fragment skb or an ERR_PTR().
+ */
+static struct sk_buff *iptfs_copy_create_frag(struct skb_seq_state *st, u32 offset, u32 copy_len)
+{
+ struct sk_buff *src = st->root_skb;
+ struct sk_buff *skb;
+ int err;
+
+ skb = iptfs_alloc_skb(src, copy_len, true);
+ if (!skb)
+ return ERR_PTR(-ENOMEM);
+
+ /* Now copy `copy_len` data from src */
+ err = skb_copy_seq_read(st, offset, skb_put(skb, copy_len), copy_len);
+ if (err) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ return skb;
+}
+
+/**
+ * iptfs_copy_create_frags() - create and send N-1 fragments of a larger skb.
+ * @skbp: the source packet skb (IN), skb holding the last fragment in
+ * the fragment stream (OUT).
+ * @xtfs: IPTFS SA state.
+ * @mtu: the max IPTFS fragment size.
+ *
+ * This function is responsible for fragmenting a larger inner packet into a
+ * sequence of IPTFS payload packets. The last fragment is returned rather than
+ * being sent so that the caller can append more inner packets (aggregation) if
+ * there is room.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_copy_create_frags(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu)
+{
+ struct skb_seq_state skbseq;
+ struct list_head sublist;
+ struct sk_buff *skb = *skbp;
+ struct sk_buff *nskb = *skbp;
+ u32 copy_len, offset;
+ u32 to_copy = skb->len - mtu;
+ u32 blkoff = 0;
+ int err = 0;
+
+ INIT_LIST_HEAD(&sublist);
+
+ skb_prepare_seq_read(skb, 0, skb->len, &skbseq);
+
+ /* A trimmed `skb` will be sent as the first fragment, later. */
+ offset = mtu;
+ to_copy = skb->len - offset;
+ while (to_copy) {
+ /* Send all but last fragment to allow agg. append */
+ trace_iptfs_first_fragmenting(nskb, mtu, to_copy, NULL);
+ list_add_tail(&nskb->list, &sublist);
+
+ /* FUTURE: if the packet has an odd/non-aligning length we could
+ * send less data in the penultimate fragment so that the last
+ * fragment then ends on an aligned boundary.
+ */
+ copy_len = min(to_copy, mtu);
+ nskb = iptfs_copy_create_frag(&skbseq, offset, copy_len);
+ if (IS_ERR(nskb)) {
+ XFRM_INC_STATS(xs_net(xtfs->x), LINUX_MIB_XFRMOUTERROR);
+ skb_abort_seq_read(&skbseq);
+ err = PTR_ERR(nskb);
+ nskb = NULL;
+ break;
+ }
+ iptfs_output_prepare_skb(nskb, to_copy);
+ offset += copy_len;
+ to_copy -= copy_len;
+ blkoff = to_copy;
+ }
+ skb_abort_seq_read(&skbseq);
+
+ /* return last fragment that will be unsent (or NULL) */
+ *skbp = nskb;
+ if (nskb)
+ trace_iptfs_first_final_fragment(nskb, mtu, blkoff, NULL);
+
+ /* trim the original skb to MTU */
+ if (!err)
+ err = pskb_trim(skb, mtu);
+
+ if (err) {
+ /* Free all frags. Don't bother sending a partial packet we will
+ * never complete.
+ */
+ kfree_skb(nskb);
+ list_for_each_entry_safe(skb, nskb, &sublist, list) {
+ skb_list_del_init(skb);
+ kfree_skb(skb);
+ }
+ return err;
+ }
+
+ /* prepare the initial fragment with an iptfs header */
+ iptfs_output_prepare_skb(skb, 0);
+
+ /* Send all but last fragment, if we fail to send a fragment then free
+ * the rest -- no point in sending a packet that can't be reassembled.
+ */
+ list_for_each_entry_safe(skb, nskb, &sublist, list) {
+ skb_list_del_init(skb);
+ if (!err)
+ err = xfrm_output(NULL, skb);
+ else
+ kfree_skb(skb);
+ }
+ if (err)
+ kfree_skb(*skbp);
+ return err;
+}
+
+/**
+ * iptfs_first_skb() - handle the first dequeued inner packet for output
+ * @skbp: the source packet skb (IN), skb holding the last fragment in
+ * the fragment stream (OUT).
+ * @xtfs: IPTFS SA state.
+ * @mtu: the max IPTFS fragment size.
+ *
+ * This function is responsible for fragmenting a larger inner packet into a
+ * sequence of IPTFS payload packets.
+ *
+ * The last fragment is returned rather than being sent so that the caller can
+ * append more inner packets (aggregation) if there is room.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_first_skb(struct sk_buff **skbp, struct xfrm_iptfs_data *xtfs, u32 mtu)
+{
+ struct sk_buff *skb = *skbp;
+ int err;
+
+ /* Classic ESP skips the don't fragment ICMP error if DF is clear on
+ * the inner packet or ignore_df is set. Otherwise it will send an ICMP
+ * or local error if the inner packet won't fit its MTU.
+ *
+ * With IPTFS we do not care about the inner packet DF bit. If the
+ * tunnel is configured to "don't fragment" we error back if things
+ * don't fit in our max packet size. Otherwise we iptfs-fragment as
+ * normal.
+ */
+
+ /* The opportunity for HW offload has ended */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ err = skb_checksum_help(skb);
+ if (err)
+ return err;
+ }
+
+ /* We've split gso up before queuing */
+
+ trace_iptfs_first_dequeue(skb, mtu, 0, ip_hdr(skb));
+
+ /* Consider the buffer Tx'd and no longer owned */
+ skb_orphan(skb);
+
+ /* Simple case -- it fits. `mtu` accounted for all the overhead
+ * including the basic IPTFS header.
+ */
+ if (skb->len <= mtu) {
+ iptfs_output_prepare_skb(skb, 0);
+ return 0;
+ }
+
+ return iptfs_copy_create_frags(skbp, xtfs, mtu);
+}
+
+static struct sk_buff **iptfs_rehome_fraglist(struct sk_buff **nextp, struct sk_buff *child)
+{
+ u32 fllen = 0;
+
+ /* It might be possible to account for a frag list in addition to page
+ * fragments, if that is a valid state to be in. The page fragments' size
+ * should be kept in data_len so that only the frag_list size is removed;
+ * this must be done above as well.
+ */
+ *nextp = skb_shinfo(child)->frag_list;
+ while (*nextp) {
+ fllen += (*nextp)->len;
+ nextp = &(*nextp)->next;
+ }
+ skb_frag_list_init(child);
+ child->len -= fllen;
+ child->data_len -= fllen;
+
+ return nextp;
+}
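+
+/* Worked example (illustrative, not part of the patch): if a child skb
+ * carries 2000 bytes of its own data plus a frag_list of two 1500-byte
+ * skbs, re-homing splices those two skbs onto the parent chain at *nextp
+ * and reduces the child's len/data_len by fllen = 3000, leaving the child
+ * describing only its own 2000 bytes.
+ */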
+
+static void iptfs_consume_frags(struct sk_buff *to, struct sk_buff *from)
+{
+ struct skb_shared_info *fromi = skb_shinfo(from);
+ struct skb_shared_info *toi = skb_shinfo(to);
+ unsigned int new_truesize;
+
+ /* If we have data in a head page, grab it */
+ if (!skb_headlen(from)) {
+ new_truesize = SKB_TRUESIZE(skb_end_offset(from));
+ } else {
+ iptfs_skb_head_to_frag(from, &toi->frags[toi->nr_frags]);
+ skb_frag_ref(to, toi->nr_frags++);
+ new_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+ }
+
+ /* Move any other page fragments rather than copy */
+ memcpy(&toi->frags[toi->nr_frags], fromi->frags,
+ sizeof(fromi->frags[0]) * fromi->nr_frags);
+ toi->nr_frags += fromi->nr_frags;
+ fromi->nr_frags = 0;
+ from->data_len = 0;
+ from->len = 0;
+ to->truesize += from->truesize - new_truesize;
+ from->truesize = new_truesize;
+
+ /* We are done with this SKB */
+ consume_skb(from);
+}
+
+static void iptfs_output_queued(struct xfrm_state *x, struct sk_buff_head *list)
+{
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+ struct sk_buff *skb, *skb2, **nextp;
+ struct skb_shared_info *shi, *shi2;
+
+ /* If we are fragmenting due to a large inner packet we will output all
+ * the outer IPTFS packets required to contain the fragments of the
+ * single large inner packet. These outer packets need to be sent
+ * consecutively (ESP seq-wise). Since this output function is always
+ * running from a timer we do not need a lock to provide this guarantee.
+ * We will output our packets consecutively before the timer is allowed
+ * to run again on some other CPU.
+ */
+
+ while ((skb = __skb_dequeue(list))) {
+ u32 mtu = iptfs_get_cur_pmtu(x, xtfs, skb);
+ bool share_ok = true;
+ int remaining;
+
+ /* protocol comes to us cleared sometimes */
+ skb->protocol = x->outer_mode.family == AF_INET ? htons(ETH_P_IP) :
+ htons(ETH_P_IPV6);
+
+ if (skb->len > mtu && xtfs->cfg.dont_frag) {
+ /* We handle this case before enqueueing so we are only
+ * here b/c the MTU changed after we enqueued and before we
+ * dequeued; just drop these.
+ */
+ XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR);
+
+ trace_iptfs_first_toobig(skb, mtu, 0, ip_hdr(skb));
+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
+ continue;
+ }
+
+ /* Convert first inner packet into an outer IPTFS packet,
+ * dealing with any fragmentation into multiple outer packets
+ * if necessary.
+ */
+ if (iptfs_first_skb(&skb, xtfs, mtu))
+ continue;
+
+ /* If fragmentation was required the returned skb is the last
+ * IPTFS fragment in the chain, and its IPTFS header blkoff has
+ * been set just past the end of the fragment data.
+ *
+ * In either case the space remaining to send more inner packet
+ * data is `mtu` - (skb->len - sizeof iptfs header). This is b/c
+ * the `mtu` value has the basic IPTFS header len accounted for,
+ * and we added that header to the skb so it is a part of
+ * skb->len, thus we subtract it from the skb length.
+ */
+ remaining = mtu - (skb->len - sizeof(struct ip_iptfs_hdr));
+
+ /* Re-home (un-nest) nested fragment lists. We need to do this
+ * b/c we will simply be appending any following aggregated
+ * inner packets using the frag list.
+ */
+ shi = skb_shinfo(skb);
+ nextp = &shi->frag_list;
+ while (*nextp) {
+ if (skb_has_frag_list(*nextp))
+ nextp = iptfs_rehome_fraglist(&(*nextp)->next, *nextp);
+ else
+ nextp = &(*nextp)->next;
+ }
+
+ if (shi->frag_list || skb_cloned(skb) || skb_shared(skb))
+ share_ok = false;
+
+ /* See if we have enough space to simply append.
+ *
+ * NOTE: Maybe do not append if we will be mis-aligned,
+ * SW-based endpoints will probably have to copy in this
+ * case.
+ */
+ while ((skb2 = skb_peek(list))) {
+ trace_iptfs_ingress_nth_peek(skb2, remaining);
+ if (skb2->len > remaining)
+ break;
+
+ __skb_unlink(skb2, list);
+
+ /* Consider the buffer Tx'd and no longer owned */
+ skb_orphan(skb);
+
+ /* If we don't have a cksum in the packet we need to add
+ * one before encapsulation.
+ */
+ if (skb2->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb_checksum_help(skb2)) {
+ XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTERROR);
+ kfree_skb(skb2);
+ continue;
+ }
+ }
+
+ /* skb->pp_recycle is passed to __skb_frag_unref for all
+ * frag pages so we can only share pages with skbs whose
+ * pp_recycle setting matches our own.
+ */
+ shi2 = skb_shinfo(skb2);
+ if (share_ok &&
+ (shi2->frag_list ||
+ (!skb2->head_frag && skb_headlen(skb)) ||
+ skb->pp_recycle != skb2->pp_recycle ||
+ skb_zcopy(skb2) ||
+ (shi->nr_frags + shi2->nr_frags + 1 > MAX_SKB_FRAGS)))
+ share_ok = false;
+
+ /* Do accounting */
+ skb->data_len += skb2->len;
+ skb->len += skb2->len;
+ remaining -= skb2->len;
+
+ trace_iptfs_ingress_nth_add(skb2, share_ok);
+
+ if (share_ok) {
+ iptfs_consume_frags(skb, skb2);
+ } else {
+ /* Append to the frag_list */
+ *nextp = skb2;
+ nextp = &skb2->next;
+ if (skb_has_frag_list(skb2))
+ nextp = iptfs_rehome_fraglist(nextp,
+ skb2);
+ skb->truesize += skb2->truesize;
+ }
+ }
+
+ xfrm_output(NULL, skb);
+ }
+}
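+
+/* Illustrative only (not part of the patch): iptfs_example_remaining() is a
+ * hypothetical helper restating the headroom computation used in the
+ * aggregation loop above. `mtu` already accounts for the basic IPTFS header,
+ * which was also added to skb->len, so it is subtracted back out. E.g. with
+ * mtu = 1438 and a 300-byte first inner packet, skb->len is 304 after
+ * iptfs_first_skb() and remaining = 1438 - (304 - 4) = 1138.
+ */
+static inline u32 iptfs_example_remaining(u32 mtu, const struct sk_buff *skb)
+{
+ return mtu - (skb->len - sizeof(struct ip_iptfs_hdr));
+}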
+
+static enum hrtimer_restart iptfs_delay_timer(struct hrtimer *me)
+{
+ struct sk_buff_head list;
+ struct xfrm_iptfs_data *xtfs;
+ struct xfrm_state *x;
+ time64_t settime;
+
+ xtfs = container_of(me, typeof(*xtfs), iptfs_timer);
+ x = xtfs->x;
+
+ /* Process all the queued packets
+ *
+ * softirq execution order: timer > tasklet > hrtimer
+ *
+ * Network rx will have run before us giving one last chance to queue
+ * ingress packets for us to process and transmit.
+ */
+
+ spin_lock(&x->lock);
+ __skb_queue_head_init(&list);
+ skb_queue_splice_init(&xtfs->queue, &list);
+ xtfs->queue_size = 0;
+ settime = xtfs->iptfs_settime;
+ spin_unlock(&x->lock);
+
+ /* After the above unlock, packets can begin queuing again, and the
+ * timer can be set again, from another CPU either in softirq or user
+ * context (not from this one since we are running at softirq level
+ * already).
+ */
+
+ trace_iptfs_timer_expire(xtfs, (unsigned long long)(ktime_get_raw_fast_ns() - settime));
+
+ iptfs_output_queued(x, &list);
+
+ return HRTIMER_NORESTART;
+}
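+
+/* Sketch only, not part of this hunk: the timer above is armed on the
+ * enqueue path (earlier in the patch). A hypothetical minimal arming helper
+ * pairing with the expiry handler could look like this, assuming it runs
+ * under x->lock when the queue first becomes non-empty.
+ */
+static inline void iptfs_example_arm_timer(struct xfrm_iptfs_data *xtfs)
+{
+ /* record the arming time so the expiry trace can report the delay */
+ xtfs->iptfs_settime = ktime_get_raw_fast_ns();
+ hrtimer_start(&xtfs->iptfs_timer, xtfs->init_delay_ns, IPTFS_HRTIMER_MODE);
+}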
+
+/**
+ * iptfs_encap_add_ipv4() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm4_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the TOS/DSCP bits. Sets the protocol to a different value and doesn't do
+ * anything with inner headers as they aren't pointing into a normal IP
+ * singleton inner packet.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_encap_add_ipv4(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct iphdr *top_iph;
+
+ skb_reset_inner_network_header(skb);
+ skb_reset_inner_transport_header(skb);
+
+ skb_set_network_header(skb, -(x->props.header_len - x->props.enc_hdr_len));
+ skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+ top_iph = ip_hdr(skb);
+ top_iph->ihl = 5;
+ top_iph->version = 4;
+ top_iph->protocol = IPPROTO_AGGFRAG;
+
+ /* As we have 0, fractional, 1 or N inner packets there's no obviously
+ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+ * 3.1.
+ */
+ top_iph->tos = 0;
+
+ top_iph->frag_off = htons(IP_DF);
+ top_iph->ttl = ip4_dst_hoplimit(xfrm_dst_child(dst));
+ top_iph->saddr = x->props.saddr.a4;
+ top_iph->daddr = x->id.daddr.a4;
+ ip_select_ident(dev_net(dst->dev), skb, NULL);
+
+ return 0;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/**
+ * iptfs_encap_add_ipv6() - add outer encaps
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * This was originally taken from xfrm6_tunnel_encap_add. The reason for the
+ * copy is that IP-TFS/AGGFRAG can have different functionality for how to set
+ * the flow label and TOS/DSCP bits. It also sets the protocol to a different
+ * value and doesn't do anything with inner headers as they aren't pointing into
+ * a normal IP singleton inner packet.
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_encap_add_ipv6(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct ipv6hdr *top_iph;
+ int dsfield;
+
+ skb_reset_inner_network_header(skb);
+ skb_reset_inner_transport_header(skb);
+
+ skb_set_network_header(skb, -x->props.header_len + x->props.enc_hdr_len);
+ skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr);
+ skb->transport_header = skb->network_header + sizeof(*top_iph);
+
+ top_iph = ipv6_hdr(skb);
+ top_iph->version = 6;
+ top_iph->priority = 0;
+ memset(top_iph->flow_lbl, 0, sizeof(top_iph->flow_lbl));
+ top_iph->nexthdr = IPPROTO_AGGFRAG;
+
+ /* As we have 0, fractional, 1 or N inner packets there's no obviously
+ * correct DSCP mapping to inherit. ECN should be cleared per RFC9347
+ * 3.1.
+ */
+ dsfield = 0;
+ ipv6_change_dsfield(top_iph, 0, dsfield);
+
+ top_iph->hop_limit = ip6_dst_hoplimit(xfrm_dst_child(dst));
+ top_iph->saddr = *(struct in6_addr *)&x->props.saddr;
+ top_iph->daddr = *(struct in6_addr *)&x->id.daddr;
+
+ return 0;
+}
+#endif
+
+/**
+ * iptfs_prepare_output() - prepare the skb for output
+ * @x: xfrm state
+ * @skb: the packet
+ *
+ * Return: Error value; if 0 then the skb fields are set as follows:
+ * - transport_header should point at ESP header
+ * - network_header should point at Outer IP header
+ * - mac_header should point at protocol/nexthdr of the outer IP
+ */
+static int iptfs_prepare_output(struct xfrm_state *x, struct sk_buff *skb)
+{
+ if (x->outer_mode.family == AF_INET)
+ return iptfs_encap_add_ipv4(x, skb);
+ if (x->outer_mode.family == AF_INET6) {
+#if IS_ENABLED(CONFIG_IPV6)
+ return iptfs_encap_add_ipv6(x, skb);
+#else
+ return -EAFNOSUPPORT;
+#endif
+ }
+ return -EOPNOTSUPP;
+}
+
+/* ========================== */
+/* State Management Functions */
+/* ========================== */
+
+/**
+ * __iptfs_get_inner_mtu() - return inner MTU with no fragmentation.
+ * @x: xfrm state.
+ * @outer_mtu: the outer mtu
+ *
+ * Return: Correct MTU taking into account the encap overhead.
+ */
+static u32 __iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+{
+ struct crypto_aead *aead;
+ u32 blksize;
+
+ aead = x->data;
+ blksize = ALIGN(crypto_aead_blocksize(aead), 4);
+ return ((outer_mtu - x->props.header_len - crypto_aead_authsize(aead)) &
+ ~(blksize - 1)) - 2;
+}
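+
+/* Worked example of the computation above (all values illustrative): with
+ * outer_mtu = 1500, x->props.header_len = 40 and an AEAD with a 16-byte
+ * block size and 16-byte authsize:
+ *
+ * blksize = ALIGN(16, 4) = 16
+ * (1500 - 40 - 16) & ~(16 - 1) = 1440
+ * 1440 - 2 = 1438
+ *
+ * i.e. the inner MTU is rounded down to a cipher-block boundary, with two
+ * bytes reserved for the ESP trailer's pad-length/next-header fields.
+ */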
+
+/**
+ * iptfs_get_inner_mtu() - return the inner MTU for an IPTFS xfrm.
+ * @x: xfrm state.
+ * @outer_mtu: Outer MTU for the encapsulated packet.
+ *
+ * Return: Correct MTU taking into account the encap overhead.
+ */
+static u32 iptfs_get_inner_mtu(struct xfrm_state *x, int outer_mtu)
+{
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+
+ /* If not dont-frag we have no MTU */
+ if (!xtfs->cfg.dont_frag)
+ return x->outer_mode.family == AF_INET ? IP_MAX_MTU : IP6_MAX_MTU;
+ return __iptfs_get_inner_mtu(x, outer_mtu);
+}
+
+/**
+ * iptfs_user_init() - initialize the SA with IPTFS options from netlink.
+ * @net: the net data
+ * @x: xfrm state
+ * @attrs: netlink attributes
+ * @extack: extack return data
+ *
+ * Return: 0 on success or a negative error code on failure
+ */
+static int iptfs_user_init(struct net *net, struct xfrm_state *x,
+ struct nlattr **attrs,
+ struct netlink_ext_ack *extack)
+{
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+ struct xfrm_iptfs_config *xc;
+ u64 q;
+
+ xc = &xtfs->cfg;
+ xc->max_queue_size = IPTFS_DEFAULT_MAX_QUEUE_SIZE;
+ xc->reorder_win_size = IPTFS_DEFAULT_REORDER_WINDOW;
+ xtfs->drop_time_ns = IPTFS_DEFAULT_DROP_TIME_USECS * NSECS_IN_USEC;
+ xtfs->init_delay_ns = IPTFS_DEFAULT_INIT_DELAY_USECS * NSECS_IN_USEC;
+
+ if (attrs[XFRMA_IPTFS_DONT_FRAG])
+ xc->dont_frag = true;
+ if (attrs[XFRMA_IPTFS_REORDER_WINDOW])
+ xc->reorder_win_size =
+ nla_get_u16(attrs[XFRMA_IPTFS_REORDER_WINDOW]);
+ /* saved array is for saving 1..N seq nums from wantseq */
+ if (xc->reorder_win_size) {
+ xtfs->w_saved = kcalloc(xc->reorder_win_size,
+ sizeof(*xtfs->w_saved), GFP_KERNEL);
+ if (!xtfs->w_saved) {
+ NL_SET_ERR_MSG(extack, "Cannot alloc reorder window");
+ return -ENOMEM;
+ }
+ }
+ if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
+ xc->pkt_size = nla_get_u32(attrs[XFRMA_IPTFS_PKT_SIZE]);
+ if (!xc->pkt_size) {
+ xtfs->payload_mtu = 0;
+ } else if (xc->pkt_size > x->props.header_len) {
+ xtfs->payload_mtu = xc->pkt_size - x->props.header_len;
+ } else {
+ NL_SET_ERR_MSG(extack,
+ "Packet size must be 0 or greater than IPTFS/ESP header length");
+ return -EINVAL;
+ }
+ }
+ if (attrs[XFRMA_IPTFS_MAX_QSIZE])
+ xc->max_queue_size = nla_get_u32(attrs[XFRMA_IPTFS_MAX_QSIZE]);
+ if (attrs[XFRMA_IPTFS_DROP_TIME])
+ xtfs->drop_time_ns =
+ (u64)nla_get_u32(attrs[XFRMA_IPTFS_DROP_TIME]) *
+ NSECS_IN_USEC;
+ if (attrs[XFRMA_IPTFS_INIT_DELAY])
+ xtfs->init_delay_ns =
+ (u64)nla_get_u32(attrs[XFRMA_IPTFS_INIT_DELAY]) * NSECS_IN_USEC;
+
+ q = (u64)xc->max_queue_size * 95;
+ do_div(q, 100);
+ xtfs->ecn_queue_size = (u32)q;
+
+ return 0;
+}
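+
+/* Worked example for the 95% calculation above (value illustrative): with
+ * max_queue_size = 1048576 bytes, q = 1048576 * 95 = 99614720 and after
+ * do_div(q, 100) ecn_queue_size is 996147 bytes, i.e. the queue level at
+ * which ECN congestion marking can begin (roughly 95% full).
+ */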
+
+static unsigned int iptfs_sa_len(const struct xfrm_state *x)
+{
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+ struct xfrm_iptfs_config *xc = &xtfs->cfg;
+ unsigned int l = 0;
+
+ if (x->dir == XFRM_SA_DIR_IN) {
+ l += nla_total_size(sizeof(u32)); /* drop time usec */
+ l += nla_total_size(sizeof(xc->reorder_win_size));
+ } else {
+ if (xc->dont_frag)
+ l += nla_total_size(0); /* dont-frag flag */
+ l += nla_total_size(sizeof(u32)); /* init delay usec */
+ l += nla_total_size(sizeof(xc->max_queue_size));
+ l += nla_total_size(sizeof(xc->pkt_size));
+ }
+
+ return l;
+}
+
+static int iptfs_copy_to_user(struct xfrm_state *x, struct sk_buff *skb)
+{
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+ struct xfrm_iptfs_config *xc = &xtfs->cfg;
+ int ret = 0;
+ u64 q;
+
+ if (x->dir == XFRM_SA_DIR_IN) {
+ q = xtfs->drop_time_ns;
+ do_div(q, NSECS_IN_USEC);
+ ret = nla_put_u32(skb, XFRMA_IPTFS_DROP_TIME, q);
+ if (ret)
+ return ret;
+
+ ret = nla_put_u16(skb, XFRMA_IPTFS_REORDER_WINDOW,
+ xc->reorder_win_size);
+ } else {
+ if (xc->dont_frag) {
+ ret = nla_put_flag(skb, XFRMA_IPTFS_DONT_FRAG);
+ if (ret)
+ return ret;
+ }
+
+ q = xtfs->init_delay_ns;
+ do_div(q, NSECS_IN_USEC);
+ ret = nla_put_u32(skb, XFRMA_IPTFS_INIT_DELAY, q);
+ if (ret)
+ return ret;
+
+ ret = nla_put_u32(skb, XFRMA_IPTFS_MAX_QSIZE, xc->max_queue_size);
+ if (ret)
+ return ret;
+
+ ret = nla_put_u32(skb, XFRMA_IPTFS_PKT_SIZE, xc->pkt_size);
+ }
+
+ return ret;
+}
+
+static void __iptfs_init_state(struct xfrm_state *x,
+ struct xfrm_iptfs_data *xtfs)
+{
+ __skb_queue_head_init(&xtfs->queue);
+ hrtimer_init(&xtfs->iptfs_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
+ xtfs->iptfs_timer.function = iptfs_delay_timer;
+
+ spin_lock_init(&xtfs->drop_lock);
+ hrtimer_init(&xtfs->drop_timer, CLOCK_MONOTONIC, IPTFS_HRTIMER_MODE);
+ xtfs->drop_timer.function = iptfs_drop_timer;
+
+ /* Modify type (esp) adjustment values */
+
+ if (x->props.family == AF_INET)
+ x->props.header_len += sizeof(struct iphdr) + sizeof(struct ip_iptfs_hdr);
+ else if (x->props.family == AF_INET6)
+ x->props.header_len += sizeof(struct ipv6hdr) + sizeof(struct ip_iptfs_hdr);
+ x->props.enc_hdr_len = sizeof(struct ip_iptfs_hdr);
+
+ /* Always keep a module reference when x->mode_data is set */
+ __module_get(x->mode_cbs->owner);
+
+ x->mode_data = xtfs;
+ xtfs->x = x;
+}
+
+static int iptfs_clone_state(struct xfrm_state *x, struct xfrm_state *orig)
+{
+ struct xfrm_iptfs_data *xtfs;
+
+ xtfs = kmemdup(orig->mode_data, sizeof(*xtfs), GFP_KERNEL);
+ if (!xtfs)
+ return -ENOMEM;
+
+ x->mode_data = xtfs;
+ xtfs->x = x;
+
+ xtfs->ra_newskb = NULL;
+ if (xtfs->cfg.reorder_win_size) {
+ xtfs->w_saved = kcalloc(xtfs->cfg.reorder_win_size,
+ sizeof(*xtfs->w_saved), GFP_KERNEL);
+ if (!xtfs->w_saved) {
+ kfree_sensitive(xtfs);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static int iptfs_init_state(struct xfrm_state *x)
+{
+ struct xfrm_iptfs_data *xtfs;
+
+ if (x->mode_data) {
+ /* We have arrived here from xfrm_state_clone() */
+ xtfs = x->mode_data;
+ } else {
+ xtfs = kzalloc(sizeof(*xtfs), GFP_KERNEL);
+ if (!xtfs)
+ return -ENOMEM;
+ }
+
+ __iptfs_init_state(x, xtfs);
+
+ return 0;
+}
+
+static void iptfs_destroy_state(struct xfrm_state *x)
+{
+ struct xfrm_iptfs_data *xtfs = x->mode_data;
+ struct sk_buff_head list;
+ struct skb_wseq *s, *se;
+ struct sk_buff *skb;
+
+ if (!xtfs)
+ return;
+
+ spin_lock_bh(&xtfs->x->lock);
+ hrtimer_cancel(&xtfs->iptfs_timer);
+ __skb_queue_head_init(&list);
+ skb_queue_splice_init(&xtfs->queue, &list);
+ spin_unlock_bh(&xtfs->x->lock);
+
+ while ((skb = __skb_dequeue(&list)))
+ kfree_skb(skb);
+
+ spin_lock_bh(&xtfs->drop_lock);
+ hrtimer_cancel(&xtfs->drop_timer);
+ spin_unlock_bh(&xtfs->drop_lock);
+
+ if (xtfs->ra_newskb)
+ kfree_skb(xtfs->ra_newskb);
+
+ for (s = xtfs->w_saved, se = s + xtfs->w_savedlen; s < se; s++) {
+ if (s->skb)
+ kfree_skb(s->skb);
+ }
+
+ kfree_sensitive(xtfs->w_saved);
+ kfree_sensitive(xtfs);
+
+ module_put(x->mode_cbs->owner);
+}
+
+static const struct xfrm_mode_cbs iptfs_mode_cbs = {
+ .owner = THIS_MODULE,
+ .init_state = iptfs_init_state,
+ .clone_state = iptfs_clone_state,
+ .destroy_state = iptfs_destroy_state,
+ .user_init = iptfs_user_init,
+ .copy_to_user = iptfs_copy_to_user,
+ .sa_len = iptfs_sa_len,
+ .get_inner_mtu = iptfs_get_inner_mtu,
+ .input = iptfs_input,
+ .output = iptfs_output_collect,
+ .prepare_output = iptfs_prepare_output,
+};
+
+static int __init xfrm_iptfs_init(void)
+{
+ int err;
+
+ pr_info("xfrm_iptfs: IPsec IP-TFS tunnel mode module\n");
+
+ err = xfrm_register_mode_cbs(XFRM_MODE_IPTFS, &iptfs_mode_cbs);
+ if (err < 0)
+ pr_info("%s: can't register IP-TFS\n", __func__);
+
+ return err;
+}
+
+static void __exit xfrm_iptfs_fini(void)
+{
+ xfrm_unregister_mode_cbs(XFRM_MODE_IPTFS);
+}
+
+module_init(xfrm_iptfs_init);
+module_exit(xfrm_iptfs_fini);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("IP-TFS support for xfrm ipsec tunnels");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index e5722c95b8bb..3cabc87978dd 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -472,6 +472,8 @@ static int xfrm_outer_mode_output(struct xfrm_state *x, struct sk_buff *skb)
WARN_ON_ONCE(1);
break;
default:
+ if (x->mode_cbs && x->mode_cbs->prepare_output)
+ return x->mode_cbs->prepare_output(x, skb);
WARN_ON_ONCE(1);
break;
}
@@ -610,6 +612,40 @@ out:
}
EXPORT_SYMBOL_GPL(xfrm_output_resume);
+static int xfrm_dev_direct_output(struct sock *sk, struct xfrm_state *x,
+ struct sk_buff *skb)
+{
+ struct dst_entry *dst = skb_dst(skb);
+ struct net *net = xs_net(x);
+ int err;
+
+ dst = skb_dst_pop(skb);
+ if (!dst) {
+ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+ kfree_skb(skb);
+ return -EHOSTUNREACH;
+ }
+ skb_dst_set(skb, dst);
+ nf_reset_ct(skb);
+
+ err = skb_dst(skb)->ops->local_out(net, sk, skb);
+ if (unlikely(err != 1)) {
+ kfree_skb(skb);
+ return err;
+ }
+
+ /* In transport mode the network destination is
+ * directly reachable, while in tunnel mode the
+ * inner packet's network may not be. With packet
+ * offload the HW is responsible for hard header
+ * mangling, so transmit the skb directly to the
+ * netdevice.
+ */
+ skb->dev = x->xso.dev;
+ __skb_push(skb, skb->dev->hard_header_len);
+ return dev_queue_xmit(skb);
+}
+
static int xfrm_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
return xfrm_output_resume(sk, skb, 1);
@@ -675,6 +711,10 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x)
return;
}
+ if (x->outer_mode.encap == XFRM_MODE_IPTFS) {
+ xo->inner_ipproto = IPPROTO_AGGFRAG;
+ return;
+ }
/* non-Tunnel Mode */
if (!skb->encapsulation)
@@ -729,6 +769,13 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
return -EHOSTUNREACH;
}
+ /* Exclusive direct xmit for tunnel mode, as
+ * some filtering or matching rules may apply
+ * in transport mode.
+ */
+ if (x->props.mode == XFRM_MODE_TUNNEL)
+ return xfrm_dev_direct_output(sk, x, skb);
+
return xfrm_output_resume(sk, skb, 0);
}
@@ -752,7 +799,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
skb->encapsulation = 1;
if (skb_is_gso(skb)) {
- if (skb->inner_protocol)
+ if (skb->inner_protocol && x->props.mode == XFRM_MODE_TUNNEL)
return xfrm_output_gso(net, sk, skb);
skb_shinfo(skb)->gso_type |= SKB_GSO_ESP;
@@ -796,7 +843,7 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
!skb_gso_validate_network_len(skb, ip_skb_dst_mtu(skb->sk, skb)))) {
skb->protocol = htons(ETH_P_IP);
- if (skb->sk)
+ if (skb->sk && sk_fullsock(skb->sk))
xfrm_local_error(skb, mtu);
else
icmp_send(skb, ICMP_DEST_UNREACH,
@@ -832,6 +879,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
{
int mtu, ret = 0;
struct dst_entry *dst = skb_dst(skb);
+ struct sock *sk = skb_to_full_sk(skb);
if (skb->ignore_df)
goto out;
@@ -846,9 +894,9 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb)
skb->dev = dst->dev;
skb->protocol = htons(ETH_P_IPV6);
- if (xfrm6_local_dontfrag(skb->sk))
+ if (xfrm6_local_dontfrag(sk))
ipv6_stub->xfrm6_local_rxpmtu(skb, mtu);
- else if (skb->sk)
+ else if (sk)
xfrm_local_error(skb, mtu);
else
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 4408c11c0835..6551e588fe52 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2497,6 +2497,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i];
if (tmpl->mode == XFRM_MODE_TUNNEL ||
+ tmpl->mode == XFRM_MODE_IPTFS ||
tmpl->mode == XFRM_MODE_BEET) {
remote = &tmpl->id.daddr;
local = &tmpl->saddr;
@@ -2748,13 +2749,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
dst1->input = dst_discard;
- rcu_read_lock();
- afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
- if (likely(afinfo))
- dst1->output = afinfo->output;
- else
- dst1->output = dst_discard_out;
- rcu_read_unlock();
+ if (xfrm[i]->mode_cbs && xfrm[i]->mode_cbs->output) {
+ dst1->output = xfrm[i]->mode_cbs->output;
+ } else {
+ rcu_read_lock();
+ afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family);
+ if (likely(afinfo))
+ dst1->output = afinfo->output;
+ else
+ dst1->output = dst_discard_out;
+ rcu_read_unlock();
+ }
xdst_prev = xdst;
@@ -2959,7 +2964,7 @@ static void xfrm_policy_queue_process(struct timer_list *t)
skb_dst_drop(skb);
skb_dst_set(skb, dst);
- dst_output(net, skb->sk, skb);
+ dst_output(net, skb_to_full_sk(skb), skb);
}
out:
@@ -3290,7 +3295,8 @@ no_transform:
ok:
xfrm_pols_put(pols, drop_pols);
if (dst && dst->xfrm &&
- dst->xfrm->props.mode == XFRM_MODE_TUNNEL)
+ (dst->xfrm->props.mode == XFRM_MODE_TUNNEL ||
+ dst->xfrm->props.mode == XFRM_MODE_IPTFS))
dst->flags |= DST_XFRM_TUNNEL;
return dst;
@@ -4519,6 +4525,7 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm
switch (t->mode) {
case XFRM_MODE_TUNNEL:
case XFRM_MODE_BEET:
+ case XFRM_MODE_IPTFS:
if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr,
m->old_family) &&
xfrm_addr_equal(&t->saddr, &m->old_saddr,
@@ -4561,7 +4568,8 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol,
continue;
n++;
if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL &&
- pol->xfrm_vec[i].mode != XFRM_MODE_BEET)
+ pol->xfrm_vec[i].mode != XFRM_MODE_BEET &&
+ pol->xfrm_vec[i].mode != XFRM_MODE_IPTFS)
continue;
/* update endpoints */
memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr,
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index eeb984be03a7..8e07dd614b0b 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -43,6 +43,8 @@ static const struct snmp_mib xfrm_mib_list[] = {
SNMP_MIB_ITEM("XfrmAcquireError", LINUX_MIB_XFRMACQUIREERROR),
SNMP_MIB_ITEM("XfrmOutStateDirError", LINUX_MIB_XFRMOUTSTATEDIRERROR),
SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR),
+ SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR),
+ SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE),
SNMP_MIB_SENTINEL
};
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 235bbefc2aba..dbdf8a39dffe 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -731,6 +731,7 @@ static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff
}
replay_esn->oseq = oseq;
+ xfrm_dev_state_advance_esn(x);
if (xfrm_aevent_is_on(net))
xfrm_replay_notify(x, XFRM_REPLAY_UPDATE);
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 711e816fc404..69af5964c886 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -424,18 +424,18 @@ void xfrm_unregister_type_offload(const struct xfrm_type_offload *type,
}
EXPORT_SYMBOL(xfrm_unregister_type_offload);
-static const struct xfrm_type_offload *
-xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load)
+void xfrm_set_type_offload(struct xfrm_state *x)
{
const struct xfrm_type_offload *type = NULL;
struct xfrm_state_afinfo *afinfo;
+ bool try_load = true;
retry:
- afinfo = xfrm_state_get_afinfo(family);
+ afinfo = xfrm_state_get_afinfo(x->props.family);
if (unlikely(afinfo == NULL))
- return NULL;
+ goto out;
- switch (proto) {
+ switch (x->id.proto) {
case IPPROTO_ESP:
type = afinfo->type_offload_esp;
break;
@@ -449,18 +449,16 @@ retry:
rcu_read_unlock();
if (!type && try_load) {
- request_module("xfrm-offload-%d-%d", family, proto);
+ request_module("xfrm-offload-%d-%d", x->props.family,
+ x->id.proto);
try_load = false;
goto retry;
}
- return type;
-}
-
-static void xfrm_put_type_offload(const struct xfrm_type_offload *type)
-{
- module_put(type->owner);
+out:
+ x->type_offload = type;
}
+EXPORT_SYMBOL(xfrm_set_type_offload);
static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = {
[XFRM_MODE_BEET] = {
@@ -477,6 +475,11 @@ static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = {
.flags = XFRM_MODE_FLAG_TUNNEL,
.family = AF_INET,
},
+ [XFRM_MODE_IPTFS] = {
+ .encap = XFRM_MODE_IPTFS,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+ .family = AF_INET,
+ },
};
static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
@@ -498,6 +501,11 @@ static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = {
.flags = XFRM_MODE_FLAG_TUNNEL,
.family = AF_INET6,
},
+ [XFRM_MODE_IPTFS] = {
+ .encap = XFRM_MODE_IPTFS,
+ .flags = XFRM_MODE_FLAG_TUNNEL,
+ .family = AF_INET6,
+ },
};
static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
@@ -525,6 +533,60 @@ static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family)
return NULL;
}
+static const struct xfrm_mode_cbs __rcu *xfrm_mode_cbs_map[XFRM_MODE_MAX];
+static DEFINE_SPINLOCK(xfrm_mode_cbs_map_lock);
+
+int xfrm_register_mode_cbs(u8 mode, const struct xfrm_mode_cbs *mode_cbs)
+{
+ if (mode >= XFRM_MODE_MAX)
+ return -EINVAL;
+
+ spin_lock_bh(&xfrm_mode_cbs_map_lock);
+ rcu_assign_pointer(xfrm_mode_cbs_map[mode], mode_cbs);
+ spin_unlock_bh(&xfrm_mode_cbs_map_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(xfrm_register_mode_cbs);
+
+void xfrm_unregister_mode_cbs(u8 mode)
+{
+ if (mode >= XFRM_MODE_MAX)
+ return;
+
+ spin_lock_bh(&xfrm_mode_cbs_map_lock);
+ RCU_INIT_POINTER(xfrm_mode_cbs_map[mode], NULL);
+ spin_unlock_bh(&xfrm_mode_cbs_map_lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(xfrm_unregister_mode_cbs);
+
+static const struct xfrm_mode_cbs *xfrm_get_mode_cbs(u8 mode)
+{
+ const struct xfrm_mode_cbs *cbs;
+ bool try_load = true;
+
+ if (mode >= XFRM_MODE_MAX)
+ return NULL;
+
+retry:
+ rcu_read_lock();
+
+ cbs = rcu_dereference(xfrm_mode_cbs_map[mode]);
+ if (cbs && !try_module_get(cbs->owner))
+ cbs = NULL;
+
+ rcu_read_unlock();
+
+ if (mode == XFRM_MODE_IPTFS && !cbs && try_load) {
+ request_module("xfrm-iptfs");
+ try_load = false;
+ goto retry;
+ }
+
+ return cbs;
+}
+
void xfrm_state_free(struct xfrm_state *x)
{
kmem_cache_free(xfrm_state_cache, x);
@@ -533,6 +595,8 @@ EXPORT_SYMBOL(xfrm_state_free);
static void ___xfrm_state_destroy(struct xfrm_state *x)
{
+ if (x->mode_cbs && x->mode_cbs->destroy_state)
+ x->mode_cbs->destroy_state(x);
hrtimer_cancel(&x->mtimer);
del_timer_sync(&x->rtimer);
kfree(x->aead);
@@ -543,8 +607,6 @@ static void ___xfrm_state_destroy(struct xfrm_state *x)
kfree(x->coaddr);
kfree(x->replay_esn);
kfree(x->preplay_esn);
- if (x->type_offload)
- xfrm_put_type_offload(x->type_offload);
if (x->type) {
x->type->destructor(x);
xfrm_put_type(x->type);
@@ -692,6 +754,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
x->replay_maxdiff = 0;
x->pcpu_num = UINT_MAX;
spin_lock_init(&x->lock);
+ x->mode_data = NULL;
}
return x;
}
@@ -717,6 +780,8 @@ void xfrm_dev_state_free(struct xfrm_state *x)
struct xfrm_dev_offload *xso = &x->xso;
struct net_device *dev = READ_ONCE(xso->dev);
+ xfrm_unset_type_offload(x);
+
if (dev && dev->xfrmdev_ops) {
spin_lock_bh(&xfrm_state_dev_gc_lock);
if (!hlist_unhashed(&x->dev_gclist))
@@ -1987,6 +2052,12 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
x->new_mapping_sport = 0;
x->dir = orig->dir;
+ x->mode_cbs = orig->mode_cbs;
+ if (x->mode_cbs && x->mode_cbs->clone_state) {
+ if (x->mode_cbs->clone_state(x, orig))
+ goto error;
+ }
+
return x;
error:
@@ -2320,6 +2391,7 @@ static int __xfrm6_state_sort_cmp(const void *p)
#endif
case XFRM_MODE_TUNNEL:
case XFRM_MODE_BEET:
+ case XFRM_MODE_IPTFS:
return 4;
}
return 5;
@@ -2346,6 +2418,7 @@ static int __xfrm6_tmpl_sort_cmp(const void *p)
#endif
case XFRM_MODE_TUNNEL:
case XFRM_MODE_BEET:
+ case XFRM_MODE_IPTFS:
return 3;
}
return 4;
@@ -3035,6 +3108,9 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
case XFRM_MODE_TUNNEL:
break;
default:
+ if (x->mode_cbs && x->mode_cbs->get_inner_mtu)
+ return x->mode_cbs->get_inner_mtu(x, mtu);
+
WARN_ON_ONCE(1);
break;
}
@@ -3044,7 +3120,7 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu)
}
EXPORT_SYMBOL_GPL(xfrm_state_mtu);
-int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
+int __xfrm_init_state(struct xfrm_state *x, bool init_replay,
struct netlink_ext_ack *extack)
{
const struct xfrm_mode *inner_mode;
@@ -3100,8 +3176,6 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
goto error;
}
- x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload);
-
err = x->type->init_state(x, extack);
if (err)
goto error;
@@ -3135,6 +3209,12 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload,
}
}
+ x->mode_cbs = xfrm_get_mode_cbs(x->props.mode);
+ if (x->mode_cbs) {
+ if (x->mode_cbs->init_state)
+ err = x->mode_cbs->init_state(x);
+ module_put(x->mode_cbs->owner);
+ }
error:
return err;
}
@@ -3145,7 +3225,7 @@ int xfrm_init_state(struct xfrm_state *x)
{
int err;
- err = __xfrm_init_state(x, true, false, NULL);
+ err = __xfrm_init_state(x, true, NULL);
if (!err)
x->km.state = XFRM_STATE_VALID;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index b2876e09328b..82a768500999 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -301,6 +301,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode");
goto out;
}
+ if ((attrs[XFRMA_IPTFS_DROP_TIME] ||
+ attrs[XFRMA_IPTFS_REORDER_WINDOW] ||
+ attrs[XFRMA_IPTFS_DONT_FRAG] ||
+ attrs[XFRMA_IPTFS_INIT_DELAY] ||
+ attrs[XFRMA_IPTFS_MAX_QSIZE] ||
+ attrs[XFRMA_IPTFS_PKT_SIZE]) &&
+ p->mode != XFRM_MODE_IPTFS) {
+ NL_SET_ERR_MSG(extack, "IP-TFS options can only be used in IP-TFS mode");
+ goto out;
+ }
break;
case IPPROTO_COMP:
@@ -373,6 +383,16 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
case XFRM_MODE_ROUTEOPTIMIZATION:
case XFRM_MODE_BEET:
break;
+ case XFRM_MODE_IPTFS:
+ if (p->id.proto != IPPROTO_ESP) {
+ NL_SET_ERR_MSG(extack, "IP-TFS mode only supported with ESP");
+ goto out;
+ }
+ if (sa_dir == 0) {
+ NL_SET_ERR_MSG(extack, "IP-TFS mode requires in or out direction attribute");
+ goto out;
+ }
+ break;
default:
NL_SET_ERR_MSG(extack, "Unsupported mode");
@@ -421,6 +441,18 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
goto out;
}
+ if (attrs[XFRMA_IPTFS_DROP_TIME]) {
+ NL_SET_ERR_MSG(extack, "IP-TFS drop time should not be set for output SA");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (attrs[XFRMA_IPTFS_REORDER_WINDOW]) {
+ NL_SET_ERR_MSG(extack, "IP-TFS reorder window should not be set for output SA");
+ err = -EINVAL;
+ goto out;
+ }
+
if (attrs[XFRMA_REPLAY_VAL]) {
struct xfrm_replay_state *replay;
@@ -458,6 +490,30 @@ static int verify_newsa_info(struct xfrm_usersa_info *p,
}
}
+
+ if (attrs[XFRMA_IPTFS_DONT_FRAG]) {
+ NL_SET_ERR_MSG(extack, "IP-TFS don't fragment should not be set for input SA");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (attrs[XFRMA_IPTFS_INIT_DELAY]) {
+ NL_SET_ERR_MSG(extack, "IP-TFS initial delay should not be set for input SA");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (attrs[XFRMA_IPTFS_MAX_QSIZE]) {
+ NL_SET_ERR_MSG(extack, "IP-TFS max queue size should not be set for input SA");
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (attrs[XFRMA_IPTFS_PKT_SIZE]) {
+ NL_SET_ERR_MSG(extack, "IP-TFS packet size should not be set for input SA");
+ err = -EINVAL;
+ goto out;
+ }
}
if (!sa_dir && attrs[XFRMA_SA_PCPU]) {
@@ -851,7 +907,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
goto error;
}
- err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack);
+ err = __xfrm_init_state(x, false, extack);
if (err)
goto error;
@@ -886,6 +942,12 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
goto error;
}
+ if (x->mode_cbs && x->mode_cbs->user_init) {
+ err = x->mode_cbs->user_init(net, x, attrs, extack);
+ if (err)
+ goto error;
+ }
+
return x;
error:
@@ -1301,6 +1363,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
if (ret)
goto out;
}
+ if (x->mode_cbs && x->mode_cbs->copy_to_user)
+ ret = x->mode_cbs->copy_to_user(x, skb);
+ if (ret)
+ goto out;
if (x->mapping_maxage) {
ret = nla_put_u32(skb, XFRMA_MTIMER_THRESH, x->mapping_maxage);
if (ret)
@@ -1958,6 +2024,8 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family,
return -EINVAL;
}
break;
+ case XFRM_MODE_IPTFS:
+ break;
default:
if (ut[i].family != prev_family) {
NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change");
@@ -3220,6 +3288,12 @@ const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT),
[XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 },
[XFRMA_SA_PCPU] = { .type = NLA_U32 },
+ [XFRMA_IPTFS_DROP_TIME] = { .type = NLA_U32 },
+ [XFRMA_IPTFS_REORDER_WINDOW] = { .type = NLA_U16 },
+ [XFRMA_IPTFS_DONT_FRAG] = { .type = NLA_FLAG },
+ [XFRMA_IPTFS_INIT_DELAY] = { .type = NLA_U32 },
+ [XFRMA_IPTFS_MAX_QSIZE] = { .type = NLA_U32 },
+ [XFRMA_IPTFS_PKT_SIZE] = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(xfrma_policy);
@@ -3554,6 +3628,9 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
if (x->nat_keepalive_interval)
l += nla_total_size(sizeof(x->nat_keepalive_interval));
+ if (x->mode_cbs && x->mode_cbs->sa_len)
+ l += x->mode_cbs->sa_len(x);
+
return l;
}