summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/iphc.c1
-rw-r--r--net/8021q/Makefile1
-rw-r--r--net/9p/client.c3
-rw-r--r--net/Kconfig2
-rw-r--r--net/atm/common.c2
-rw-r--r--net/atm/mpoa_proc.c6
-rw-r--r--net/ax25/ax25_addr.c1
-rw-r--r--net/ax25/ax25_ds_in.c1
-rw-r--r--net/ax25/ax25_ds_subr.c1
-rw-r--r--net/ax25/ax25_ip.c1
-rw-r--r--net/ax25/ax25_out.c1
-rw-r--r--net/batman-adv/Kconfig8
-rw-r--r--net/batman-adv/bat_iv_ogm.c4
-rw-r--r--net/batman-adv/bat_iv_ogm.h6
-rw-r--r--net/batman-adv/bat_v.c4
-rw-r--r--net/batman-adv/bat_v_ogm.h6
-rw-r--r--net/batman-adv/debugfs.c42
-rw-r--r--net/batman-adv/debugfs.h11
-rw-r--r--net/batman-adv/hard-interface.c37
-rw-r--r--net/batman-adv/originator.c17
-rw-r--r--net/batman-adv/translation-table.c7
-rw-r--r--net/batman-adv/types.h7
-rw-r--r--net/bluetooth/af_bluetooth.c2
-rw-r--r--net/bluetooth/hci_conn.c189
-rw-r--r--net/bluetooth/hci_core.c105
-rw-r--r--net/bluetooth/hci_debugfs.c19
-rw-r--r--net/bluetooth/hci_event.c554
-rw-r--r--net/bluetooth/hci_request.c616
-rw-r--r--net/bluetooth/hci_request.h8
-rw-r--r--net/bluetooth/hidp/core.c6
-rw-r--r--net/bluetooth/mgmt.c405
-rw-r--r--net/bluetooth/sco.c3
-rw-r--r--net/bpf/test_run.c17
-rw-r--r--net/bpfilter/Kconfig1
-rw-r--r--net/bridge/br_forward.c16
-rw-r--r--net/bridge/br_if.c62
-rw-r--r--net/bridge/br_netfilter_hooks.c1
-rw-r--r--net/bridge/br_netlink.c30
-rw-r--r--net/bridge/br_private.h5
-rw-r--r--net/bridge/br_sysfs_if.c94
-rw-r--r--net/bridge/netfilter/ebtable_filter.c1
-rw-r--r--net/bridge/netfilter/ebtable_nat.c1
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c3
-rw-r--r--net/caif/caif_dev.c4
-rw-r--r--net/caif/caif_socket.c2
-rw-r--r--net/core/datagram.c2
-rw-r--r--net/core/dev.c147
-rw-r--r--net/core/dev_ioctl.c7
-rw-r--r--net/core/dst.c1
-rw-r--r--net/core/ethtool.c1
-rw-r--r--net/core/fib_rules.c3
-rw-r--r--net/core/filter.c161
-rw-r--r--net/core/flow_dissector.c14
-rw-r--r--net/core/gen_stats.c16
-rw-r--r--net/core/lwt_bpf.c2
-rw-r--r--net/core/neighbour.c1
-rw-r--r--net/core/net-sysfs.c32
-rw-r--r--net/core/net_namespace.c28
-rw-r--r--net/core/page_pool.c2
-rw-r--r--net/core/pktgen.c12
-rw-r--r--net/core/rtnetlink.c82
-rw-r--r--net/core/skbuff.c19
-rw-r--r--net/core/sock.c57
-rw-r--r--net/core/utils.c2
-rw-r--r--net/core/xdp.c37
-rw-r--r--net/dcb/dcbnl.c97
-rw-r--r--net/dccp/proto.c2
-rw-r--r--net/decnet/Kconfig1
-rw-r--r--net/decnet/Makefile1
-rw-r--r--net/decnet/TODO5
-rw-r--r--net/decnet/dn_fib.c2
-rw-r--r--net/decnet/dn_nsp_in.c1
-rw-r--r--net/decnet/dn_nsp_out.c1
-rw-r--r--net/decnet/dn_route.c1
-rw-r--r--net/decnet/dn_rules.c2
-rw-r--r--net/decnet/netfilter/Makefile1
-rw-r--r--net/decnet/netfilter/dn_rtmsg.c1
-rw-r--r--net/dns_resolver/dns_key.c29
-rw-r--r--net/dsa/dsa2.c14
-rw-r--r--net/dsa/slave.c6
-rw-r--r--net/dsa/switch.c22
-rw-r--r--net/ieee802154/6lowpan/core.c6
-rw-r--r--net/ieee802154/6lowpan/reassembly.c2
-rw-r--r--net/ieee802154/core.c1
-rw-r--r--net/ieee802154/nl_policy.c1
-rw-r--r--net/ipv4/Kconfig4
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c5
-rw-r--r--net/ipv4/bpfilter/Makefile1
-rw-r--r--net/ipv4/devinet.c11
-rw-r--r--net/ipv4/esp4_offload.c6
-rw-r--r--net/ipv4/fib_frontend.c5
-rw-r--r--net/ipv4/igmp.c54
-rw-r--r--net/ipv4/inet_fragment.c24
-rw-r--r--net/ipv4/ip_forward.c3
-rw-r--r--net/ipv4/ip_fragment.c234
-rw-r--r--net/ipv4/ip_gre.c5
-rw-r--r--net/ipv4/ip_output.c2
-rw-r--r--net/ipv4/ip_sockglue.c11
-rw-r--r--net/ipv4/ipmr.c21
-rw-r--r--net/ipv4/netfilter.c53
-rw-r--r--net/ipv4/netfilter/Kconfig22
-rw-r--r--net/ipv4/netfilter/Makefile6
-rw-r--r--net/ipv4/netfilter/ip_tables.c1
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c472
-rw-r--r--net/ipv4/netfilter/nf_tproxy_ipv4.c18
-rw-r--r--net/ipv4/ping.c6
-rw-r--r--net/ipv4/proc.c1
-rw-r--r--net/ipv4/route.c6
-rw-r--r--net/ipv4/sysctl_net_ipv4.c31
-rw-r--r--net/ipv4/tcp.c71
-rw-r--r--net/ipv4/tcp_bbr.c4
-rw-r--r--net/ipv4/tcp_dctcp.c75
-rw-r--r--net/ipv4/tcp_input.c91
-rw-r--r--net/ipv4/tcp_ipv4.c23
-rw-r--r--net/ipv4/tcp_offload.c3
-rw-r--r--net/ipv4/tcp_output.c41
-rw-r--r--net/ipv4/tcp_recovery.c2
-rw-r--r--net/ipv4/tcp_timer.c51
-rw-r--r--net/ipv6/Kconfig3
-rw-r--r--net/ipv6/addrconf.c3
-rw-r--r--net/ipv6/af_inet6.c6
-rw-r--r--net/ipv6/calipso.c9
-rw-r--r--net/ipv6/datagram.c9
-rw-r--r--net/ipv6/esp6.c4
-rw-r--r--net/ipv6/esp6_offload.c6
-rw-r--r--net/ipv6/exthdrs.c111
-rw-r--r--net/ipv6/icmp.c7
-rw-r--r--net/ipv6/ila/ila_common.c1
-rw-r--r--net/ipv6/ila/ila_xlat.c1
-rw-r--r--net/ipv6/ip6_fib.c156
-rw-r--r--net/ipv6/ip6_gre.c3
-rw-r--r--net/ipv6/ip6_output.c2
-rw-r--r--net/ipv6/ip6_vti.c11
-rw-r--r--net/ipv6/ipv6_sockglue.c32
-rw-r--r--net/ipv6/mcast.c61
-rw-r--r--net/ipv6/ndisc.c2
-rw-r--r--net/ipv6/netfilter.c62
-rw-r--r--net/ipv6/netfilter/Kconfig27
-rw-r--r--net/ipv6/netfilter/Makefile6
-rw-r--r--net/ipv6/netfilter/ip6_tables.c1
-rw-r--r--net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c460
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c24
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c4
-rw-r--r--net/ipv6/netfilter/nf_tproxy_ipv6.c18
-rw-r--r--net/ipv6/reassembly.c97
-rw-r--r--net/ipv6/route.c51
-rw-r--r--net/ipv6/seg6_iptunnel.c2
-rw-r--r--net/ipv6/seg6_local.c4
-rw-r--r--net/ipv6/tcp_ipv6.c6
-rw-r--r--net/ipv6/xfrm6_mode_ro.c2
-rw-r--r--net/iucv/af_iucv.c3
-rw-r--r--net/kcm/Kconfig1
-rw-r--r--net/kcm/kcmsock.c1
-rw-r--r--net/key/af_key.c6
-rw-r--r--net/l2tp/l2tp_core.c19
-rw-r--r--net/l2tp/l2tp_core.h34
-rw-r--r--net/l2tp/l2tp_debugfs.c5
-rw-r--r--net/l2tp/l2tp_eth.c25
-rw-r--r--net/l2tp/l2tp_ip.c2
-rw-r--r--net/l2tp/l2tp_ip6.c3
-rw-r--r--net/l2tp/l2tp_netlink.c26
-rw-r--r--net/l2tp/l2tp_ppp.c166
-rw-r--r--net/llc/Kconfig2
-rw-r--r--net/llc/Makefile2
-rw-r--r--net/llc/llc_if.c1
-rw-r--r--net/mac80211/cfg.c2
-rw-r--r--net/mac80211/key.c24
-rw-r--r--net/mac80211/rc80211_minstrel.c1
-rw-r--r--net/mac80211/rx.c5
-rw-r--r--net/mac80211/tx.c2
-rw-r--r--net/mac80211/util.c3
-rw-r--r--net/mpls/mpls_iptunnel.c2
-rw-r--r--net/netfilter/Kconfig82
-rw-r--r--net/netfilter/Makefile19
-rw-r--r--net/netfilter/ipvs/ip_vs_conn.c67
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto.c19
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c18
-rw-r--r--net/netfilter/nf_conncount.c386
-rw-r--r--net/netfilter/nf_conntrack_core.c283
-rw-r--r--net/netfilter/nf_conntrack_expect.c3
-rw-r--r--net/netfilter/nf_conntrack_helper.c10
-rw-r--r--net/netfilter/nf_conntrack_l3proto_generic.c66
-rw-r--r--net/netfilter/nf_conntrack_netlink.c98
-rw-r--r--net/netfilter/nf_conntrack_proto.c844
-rw-r--r--net/netfilter/nf_conntrack_proto_dccp.c52
-rw-r--r--net/netfilter/nf_conntrack_proto_generic.c32
-rw-r--r--net/netfilter/nf_conntrack_proto_gre.c24
-rw-r--r--net/netfilter/nf_conntrack_proto_icmp.c (renamed from net/ipv4/netfilter/nf_conntrack_proto_icmp.c)19
-rw-r--r--net/netfilter/nf_conntrack_proto_icmpv6.c (renamed from net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c)17
-rw-r--r--net/netfilter/nf_conntrack_proto_sctp.c46
-rw-r--r--net/netfilter/nf_conntrack_proto_tcp.c52
-rw-r--r--net/netfilter/nf_conntrack_proto_udp.c55
-rw-r--r--net/netfilter/nf_conntrack_standalone.c28
-rw-r--r--net/netfilter/nf_flow_table_core.c13
-rw-r--r--net/netfilter/nf_nat_core.c12
-rw-r--r--net/netfilter/nf_osf.c218
-rw-r--r--net/netfilter/nf_tables_api.c529
-rw-r--r--net/netfilter/nf_tables_core.c16
-rw-r--r--net/netfilter/nf_tables_set_core.c28
-rw-r--r--net/netfilter/nfnetlink.c23
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c19
-rw-r--r--net/netfilter/nfnetlink_osf.c436
-rw-r--r--net/netfilter/nft_chain_filter.c4
-rw-r--r--net/netfilter/nft_compat.c13
-rw-r--r--net/netfilter/nft_connlimit.c36
-rw-r--r--net/netfilter/nft_ct.c2
-rw-r--r--net/netfilter/nft_dynset.c2
-rw-r--r--net/netfilter/nft_immediate.c3
-rw-r--r--net/netfilter/nft_lookup.c19
-rw-r--r--net/netfilter/nft_meta.c6
-rw-r--r--net/netfilter/nft_numgen.c4
-rw-r--r--net/netfilter/nft_osf.c106
-rw-r--r--net/netfilter/nft_set_bitmap.c19
-rw-r--r--net/netfilter/nft_set_hash.c30
-rw-r--r--net/netfilter/nft_set_rbtree.c26
-rw-r--r--net/netfilter/nft_socket.c17
-rw-r--r--net/netfilter/nft_tproxy.c316
-rw-r--r--net/netfilter/nft_tunnel.c566
-rw-r--r--net/netfilter/utils.c131
-rw-r--r--net/netfilter/xt_CT.c2
-rw-r--r--net/netfilter/xt_TEE.c4
-rw-r--r--net/netfilter/xt_TPROXY.c17
-rw-r--r--net/netfilter/xt_connlimit.c4
-rw-r--r--net/netfilter/xt_osf.c149
-rw-r--r--net/netlink/af_netlink.c12
-rw-r--r--net/nfc/llcp_commands.c9
-rw-r--r--net/nfc/llcp_sock.c2
-rw-r--r--net/nsh/nsh.c2
-rw-r--r--net/openvswitch/conntrack.c20
-rw-r--r--net/openvswitch/meter.c10
-rw-r--r--net/packet/af_packet.c6
-rw-r--r--net/qrtr/qrtr.c13
-rw-r--r--net/rds/Kconfig1
-rw-r--r--net/rds/Makefile1
-rw-r--r--net/rds/af_rds.c205
-rw-r--r--net/rds/bind.c138
-rw-r--r--net/rds/cong.c23
-rw-r--r--net/rds/connection.c283
-rw-r--r--net/rds/ib.c136
-rw-r--r--net/rds/ib.h53
-rw-r--r--net/rds/ib_cm.c320
-rw-r--r--net/rds/ib_frmr.c5
-rw-r--r--net/rds/ib_mr.h5
-rw-r--r--net/rds/ib_rdma.c47
-rw-r--r--net/rds/ib_recv.c33
-rw-r--r--net/rds/ib_send.c13
-rw-r--r--net/rds/loop.c7
-rw-r--r--net/rds/message.c1
-rw-r--r--net/rds/rdma.c19
-rw-r--r--net/rds/rdma_transport.c95
-rw-r--r--net/rds/rdma_transport.h5
-rw-r--r--net/rds/rds.h93
-rw-r--r--net/rds/recv.c78
-rw-r--r--net/rds/send.c128
-rw-r--r--net/rds/tcp.c154
-rw-r--r--net/rds/tcp.h2
-rw-r--r--net/rds/tcp_connect.c68
-rw-r--r--net/rds/tcp_listen.c87
-rw-r--r--net/rds/tcp_recv.c9
-rw-r--r--net/rds/tcp_send.c4
-rw-r--r--net/rds/threads.c69
-rw-r--r--net/rds/transport.c16
-rw-r--r--net/rxrpc/af_rxrpc.c2
-rw-r--r--net/rxrpc/ar-internal.h4
-rw-r--r--net/rxrpc/call_accept.c4
-rw-r--r--net/rxrpc/call_event.c2
-rw-r--r--net/rxrpc/conn_client.c3
-rw-r--r--net/rxrpc/conn_event.c17
-rw-r--r--net/rxrpc/input.c15
-rw-r--r--net/rxrpc/local_event.c5
-rw-r--r--net/rxrpc/output.c32
-rw-r--r--net/rxrpc/proc.c22
-rw-r--r--net/rxrpc/recvmsg.c56
-rw-r--r--net/rxrpc/rxkad.c32
-rw-r--r--net/sched/Kconfig17
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c18
-rw-r--r--net/sched/act_bpf.c4
-rw-r--r--net/sched/act_connmark.c1
-rw-r--r--net/sched/act_csum.c18
-rw-r--r--net/sched/act_ife.c5
-rw-r--r--net/sched/act_mirred.c57
-rw-r--r--net/sched/act_pedit.c3
-rw-r--r--net/sched/act_sample.c4
-rw-r--r--net/sched/act_skbedit.c10
-rw-r--r--net/sched/act_skbmod.c21
-rw-r--r--net/sched/act_tunnel_key.c32
-rw-r--r--net/sched/act_vlan.c19
-rw-r--r--net/sched/cls_api.c614
-rw-r--r--net/sched/cls_basic.c1
-rw-r--r--net/sched/cls_bpf.c4
-rw-r--r--net/sched/cls_flower.c317
-rw-r--r--net/sched/sch_cake.c15
-rw-r--r--net/sched/sch_cbs.c134
-rw-r--r--net/sched/sch_fq_codel.c25
-rw-r--r--net/sched/sch_skbprio.c320
-rw-r--r--net/sctp/Kconfig4
-rw-r--r--net/sctp/sm_sideeffect.c1
-rw-r--r--net/sctp/transport.c2
-rw-r--r--net/smc/af_smc.c134
-rw-r--r--net/smc/smc.h2
-rw-r--r--net/smc/smc_cdc.c38
-rw-r--r--net/smc/smc_cdc.h43
-rw-r--r--net/smc/smc_clc.c19
-rw-r--r--net/smc/smc_clc.h20
-rw-r--r--net/smc/smc_close.c2
-rw-r--r--net/smc/smc_core.c84
-rw-r--r--net/smc/smc_core.h17
-rw-r--r--net/smc/smc_diag.c15
-rw-r--r--net/smc/smc_ib.c44
-rw-r--r--net/smc/smc_ib.h3
-rw-r--r--net/smc/smc_llc.c80
-rw-r--r--net/smc/smc_llc.h7
-rw-r--r--net/smc/smc_pnet.c30
-rw-r--r--net/smc/smc_pnet.h3
-rw-r--r--net/smc/smc_rx.c19
-rw-r--r--net/smc/smc_tx.c46
-rw-r--r--net/smc/smc_tx.h4
-rw-r--r--net/smc/smc_wr.c32
-rw-r--r--net/socket.c30
-rw-r--r--net/strparser/strparser.c4
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c2
-rw-r--r--net/tipc/bcast.c2
-rw-r--r--net/tipc/bearer.c1
-rw-r--r--net/tipc/discover.c18
-rw-r--r--net/tipc/group.c9
-rw-r--r--net/tipc/link.c16
-rw-r--r--net/tipc/monitor.c3
-rw-r--r--net/tipc/name_table.c2
-rw-r--r--net/tipc/net.c17
-rw-r--r--net/tipc/node.c11
-rw-r--r--net/tipc/socket.c7
-rw-r--r--net/tls/tls_device.c304
-rw-r--r--net/tls/tls_device_fallback.c9
-rw-r--r--net/tls/tls_main.c32
-rw-r--r--net/tls/tls_sw.c144
-rw-r--r--net/unix/af_unix.c11
-rw-r--r--net/wimax/Makefile2
-rw-r--r--net/wimax/debugfs.c2
-rw-r--r--net/wimax/op-msg.c1
-rw-r--r--net/wimax/stack.c4
-rw-r--r--net/wireless/lib80211_crypt_tkip.c55
-rw-r--r--net/wireless/nl80211.c25
-rw-r--r--net/wireless/reg.c28
-rw-r--r--net/wireless/trace.h18
-rw-r--r--net/x25/Kconfig2
-rw-r--r--net/x25/x25_subr.c1
-rw-r--r--net/xdp/xdp_umem.c70
-rw-r--r--net/xdp/xsk.c34
-rw-r--r--net/xdp/xsk_queue.h11
-rw-r--r--net/xfrm/Kconfig9
-rw-r--r--net/xfrm/Makefile1
-rw-r--r--net/xfrm/xfrm_device.c19
-rw-r--r--net/xfrm/xfrm_input.c5
-rw-r--r--net/xfrm/xfrm_interface.c975
-rw-r--r--net/xfrm/xfrm_output.c3
-rw-r--r--net/xfrm/xfrm_policy.c317
-rw-r--r--net/xfrm/xfrm_state.c48
-rw-r--r--net/xfrm/xfrm_user.c113
364 files changed, 12291 insertions, 5755 deletions
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 6b1042e21656..52fad5dad9f7 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -770,6 +770,7 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
hdr.hop_limit, &hdr.daddr);
skb_push(skb, sizeof(hdr));
+ skb_reset_mac_header(skb);
skb_reset_network_header(skb);
skb_copy_to_linear_data(skb, &hdr, sizeof(hdr));
diff --git a/net/8021q/Makefile b/net/8021q/Makefile
index 9b703454b93e..e05d4d7aab35 100644
--- a/net/8021q/Makefile
+++ b/net/8021q/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_VLAN_8021Q) += 8021q.o
8021q-$(CONFIG_VLAN_8021Q_GVRP) += vlan_gvrp.o
8021q-$(CONFIG_VLAN_8021Q_MVRP) += vlan_mvrp.o
8021q-$(CONFIG_PROC_FS) += vlanproc.o
-
diff --git a/net/9p/client.c b/net/9p/client.c
index 18c5271910dc..5c1343195292 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -225,7 +225,8 @@ static int parse_opts(char *opts, struct p9_client *clnt)
}
free_and_return:
- v9fs_put_trans(clnt->trans_mod);
+ if (ret)
+ v9fs_put_trans(clnt->trans_mod);
kfree(tmp_options);
return ret;
}
diff --git a/net/Kconfig b/net/Kconfig
index f738a6f27665..228dfa382eec 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -12,7 +12,7 @@ menuconfig NET
The reason is that some programs need kernel networking support even
when running on a stand-alone machine that isn't connected to any
other computer.
-
+
If you are upgrading from an older kernel, you
should consider updating your networking tools too because changes
in the kernel and the tools often go hand in hand. The tools are
diff --git a/net/atm/common.c b/net/atm/common.c
index a7a68e509628..9f8cb0d2e71e 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -653,7 +653,7 @@ __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait)
struct atm_vcc *vcc;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
mask = 0;
vcc = ATM_SD(sock);
diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c
index b93cc0f18292..46d6cd9a36ae 100644
--- a/net/atm/mpoa_proc.c
+++ b/net/atm/mpoa_proc.c
@@ -307,9 +307,3 @@ void mpc_proc_clean(void)
}
#endif /* CONFIG_PROC_FS */
-
-
-
-
-
-
diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c
index ac2542b7be88..a14cfa736b63 100644
--- a/net/ax25/ax25_addr.c
+++ b/net/ax25/ax25_addr.c
@@ -304,4 +304,3 @@ void ax25_digi_invert(const ax25_digi *in, ax25_digi *out)
}
}
}
-
diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c
index 891596e74278..488fc2d7085a 100644
--- a/net/ax25/ax25_ds_in.c
+++ b/net/ax25/ax25_ds_in.c
@@ -299,4 +299,3 @@ int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type)
return queued;
}
-
diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c
index 28827e81ba2b..bc0329f43013 100644
--- a/net/ax25/ax25_ds_subr.c
+++ b/net/ax25/ax25_ds_subr.c
@@ -205,4 +205,3 @@ void ax25_dama_off(ax25_cb *ax25)
ax25->condition &= ~AX25_COND_DAMA_MODE;
ax25_dev_dama_off(ax25->ax25_dev);
}
-
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index 183b1c583d56..70417e9b932d 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -249,4 +249,3 @@ const struct header_ops ax25_header_ops = {
EXPORT_SYMBOL(ax25_header_ops);
EXPORT_SYMBOL(ax25_ip_xmit);
-
diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c
index b11a5f466fcc..3e5afc8dc93e 100644
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -394,4 +394,3 @@ int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr)
}
return 0;
}
-
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index de8034d80623..361116f77cb9 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -24,7 +24,6 @@ config BATMAN_ADV
depends on NET
select CRC16
select LIBCRC32C
- default n
help
B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
a routing protocol for multi-hop ad-hoc mesh networks. The
@@ -33,7 +32,7 @@ config BATMAN_ADV
tools.
config BATMAN_ADV_BATMAN_V
- bool "B.A.T.M.A.N. V protocol (experimental)"
+ bool "B.A.T.M.A.N. V protocol"
depends on BATMAN_ADV && !(CFG80211=m && BATMAN_ADV=y)
default y
help
@@ -60,7 +59,7 @@ config BATMAN_ADV_BLA
config BATMAN_ADV_DAT
bool "Distributed ARP Table"
depends on BATMAN_ADV && INET
- default n
+ default y
help
This option enables DAT (Distributed ARP Table), a DHT based
mechanism that increases ARP reliability on sparse wireless
@@ -70,7 +69,6 @@ config BATMAN_ADV_DAT
config BATMAN_ADV_NC
bool "Network Coding"
depends on BATMAN_ADV
- default n
help
This option enables network coding, a mechanism that aims to
increase the overall network throughput by fusing multiple
@@ -84,7 +82,6 @@ config BATMAN_ADV_NC
config BATMAN_ADV_MCAST
bool "Multicast optimisation"
depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
- default n
help
This option enables the multicast optimisation which aims to
reduce the air overhead while improving the reliability of
@@ -94,7 +91,6 @@ config BATMAN_ADV_DEBUGFS
bool "batman-adv debugfs entries"
depends on BATMAN_ADV
depends on DEBUG_FS
- default n
help
Enable this to export routing related debug tables via debugfs.
The information for each soft-interface and used hard-interface can be
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index be09a9883825..73bf6a93a3cf 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -2732,7 +2732,7 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
{
struct batadv_neigh_ifinfo *router_ifinfo = NULL;
struct batadv_neigh_node *router;
- struct batadv_gw_node *curr_gw;
+ struct batadv_gw_node *curr_gw = NULL;
int ret = 0;
void *hdr;
@@ -2780,6 +2780,8 @@ static int batadv_iv_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
ret = 0;
out:
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
if (router_ifinfo)
batadv_neigh_ifinfo_put(router_ifinfo);
if (router)
diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h
index 317cafd302cf..3dc6a7a43eb7 100644
--- a/net/batman-adv/bat_iv_ogm.h
+++ b/net/batman-adv/bat_iv_ogm.h
@@ -16,11 +16,11 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _BATMAN_ADV_BATADV_IV_OGM_H_
-#define _BATMAN_ADV_BATADV_IV_OGM_H_
+#ifndef _NET_BATMAN_ADV_BAT_IV_OGM_H_
+#define _NET_BATMAN_ADV_BAT_IV_OGM_H_
#include "main.h"
int batadv_iv_init(void);
-#endif /* _BATMAN_ADV_BATADV_IV_OGM_H_ */
+#endif /* _NET_BATMAN_ADV_BAT_IV_OGM_H_ */
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
index ec93337ee259..6baec4e68898 100644
--- a/net/batman-adv/bat_v.c
+++ b/net/batman-adv/bat_v.c
@@ -927,7 +927,7 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
{
struct batadv_neigh_ifinfo *router_ifinfo = NULL;
struct batadv_neigh_node *router;
- struct batadv_gw_node *curr_gw;
+ struct batadv_gw_node *curr_gw = NULL;
int ret = 0;
void *hdr;
@@ -995,6 +995,8 @@ static int batadv_v_gw_dump_entry(struct sk_buff *msg, u32 portid, u32 seq,
ret = 0;
out:
+ if (curr_gw)
+ batadv_gw_node_put(curr_gw);
if (router_ifinfo)
batadv_neigh_ifinfo_put(router_ifinfo);
if (router)
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
index ed36c5e79fde..e5be14c908c6 100644
--- a/net/batman-adv/bat_v_ogm.h
+++ b/net/batman-adv/bat_v_ogm.h
@@ -16,8 +16,8 @@
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
-#ifndef _BATMAN_ADV_BATADV_V_OGM_H_
-#define _BATMAN_ADV_BATADV_V_OGM_H_
+#ifndef _NET_BATMAN_ADV_BAT_V_OGM_H_
+#define _NET_BATMAN_ADV_BAT_V_OGM_H_
#include "main.h"
@@ -34,4 +34,4 @@ void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
int batadv_v_ogm_packet_recv(struct sk_buff *skb,
struct batadv_hard_iface *if_incoming);
-#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */
+#endif /* _NET_BATMAN_ADV_BAT_V_OGM_H_ */
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index 4229b01ac7b5..3cb82378300b 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -19,6 +19,7 @@
#include "debugfs.h"
#include "main.h"
+#include <linux/dcache.h>
#include <linux/debugfs.h>
#include <linux/err.h>
#include <linux/errno.h>
@@ -117,7 +118,7 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
#ifdef CONFIG_BATMAN_ADV_DAT
/**
- * batadv_dat_cache_open() - Prepare file handler for reads from dat_chache
+ * batadv_dat_cache_open() - Prepare file handler for reads from dat_cache
* @inode: inode which was opened
* @file: file handle to be initialized
*
@@ -344,6 +345,25 @@ out:
}
/**
+ * batadv_debugfs_rename_hardif() - Fix debugfs path for renamed hardif
+ * @hard_iface: hard interface which was renamed
+ */
+void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
+{
+ const char *name = hard_iface->net_dev->name;
+ struct dentry *dir;
+ struct dentry *d;
+
+ dir = hard_iface->debug_dir;
+ if (!dir)
+ return;
+
+ d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
+ if (!d)
+ pr_err("Can't rename debugfs dir to %s\n", name);
+}
+
+/**
* batadv_debugfs_del_hardif() - delete the base directory for a hard interface
* in debugfs.
* @hard_iface: hard interface which is deleted.
@@ -414,6 +434,26 @@ out:
}
/**
+ * batadv_debugfs_rename_meshif() - Fix debugfs path for renamed softif
+ * @dev: net_device which was renamed
+ */
+void batadv_debugfs_rename_meshif(struct net_device *dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(dev);
+ const char *name = dev->name;
+ struct dentry *dir;
+ struct dentry *d;
+
+ dir = bat_priv->debug_dir;
+ if (!dir)
+ return;
+
+ d = debugfs_rename(dir->d_parent, dir, dir->d_parent, name);
+ if (!d)
+ pr_err("Can't rename debugfs dir to %s\n", name);
+}
+
+/**
* batadv_debugfs_del_meshif() - Remove interface dependent debugfs entries
* @dev: netdev struct of the soft interface
*/
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 37b069698b04..08a592ffbee5 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -30,8 +30,10 @@ struct net_device;
void batadv_debugfs_init(void);
void batadv_debugfs_destroy(void);
int batadv_debugfs_add_meshif(struct net_device *dev);
+void batadv_debugfs_rename_meshif(struct net_device *dev);
void batadv_debugfs_del_meshif(struct net_device *dev);
int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface);
+void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface);
void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface);
#else
@@ -49,6 +51,10 @@ static inline int batadv_debugfs_add_meshif(struct net_device *dev)
return 0;
}
+static inline void batadv_debugfs_rename_meshif(struct net_device *dev)
+{
+}
+
static inline void batadv_debugfs_del_meshif(struct net_device *dev)
{
}
@@ -60,6 +66,11 @@ int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
}
static inline
+void batadv_debugfs_rename_hardif(struct batadv_hard_iface *hard_iface)
+{
+}
+
+static inline
void batadv_debugfs_del_hardif(struct batadv_hard_iface *hard_iface)
{
}
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index c405d15befd6..2f0d42f2f913 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -989,6 +989,32 @@ void batadv_hardif_remove_interfaces(void)
rtnl_unlock();
}
+/**
+ * batadv_hard_if_event_softif() - Handle events for soft interfaces
+ * @event: NETDEV_* event to handle
+ * @net_dev: net_device which generated an event
+ *
+ * Return: NOTIFY_* result
+ */
+static int batadv_hard_if_event_softif(unsigned long event,
+ struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv;
+
+ switch (event) {
+ case NETDEV_REGISTER:
+ batadv_sysfs_add_meshif(net_dev);
+ bat_priv = netdev_priv(net_dev);
+ batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
+ break;
+ case NETDEV_CHANGENAME:
+ batadv_debugfs_rename_meshif(net_dev);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
static int batadv_hard_if_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
@@ -997,12 +1023,8 @@ static int batadv_hard_if_event(struct notifier_block *this,
struct batadv_hard_iface *primary_if = NULL;
struct batadv_priv *bat_priv;
- if (batadv_softif_is_valid(net_dev) && event == NETDEV_REGISTER) {
- batadv_sysfs_add_meshif(net_dev);
- bat_priv = netdev_priv(net_dev);
- batadv_softif_create_vlan(bat_priv, BATADV_NO_FLAGS);
- return NOTIFY_DONE;
- }
+ if (batadv_softif_is_valid(net_dev))
+ return batadv_hard_if_event_softif(event, net_dev);
hard_iface = batadv_hardif_get_by_netdev(net_dev);
if (!hard_iface && (event == NETDEV_REGISTER ||
@@ -1051,6 +1073,9 @@ static int batadv_hard_if_event(struct notifier_block *this,
if (batadv_is_wifi_hardif(hard_iface))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
break;
+ case NETDEV_CHANGENAME:
+ batadv_debugfs_rename_hardif(hard_iface);
+ break;
default:
break;
}
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 716e5b43acfa..1d295da3e342 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1339,7 +1339,11 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
return false;
}
-static void _batadv_purge_orig(struct batadv_priv *bat_priv)
+/**
+ * batadv_purge_orig_ref() - Purge all outdated originators
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
{
struct batadv_hashtable *hash = bat_priv->orig_hash;
struct hlist_node *node_tmp;
@@ -1385,21 +1389,12 @@ static void batadv_purge_orig(struct work_struct *work)
delayed_work = to_delayed_work(work);
bat_priv = container_of(delayed_work, struct batadv_priv, orig_work);
- _batadv_purge_orig(bat_priv);
+ batadv_purge_orig_ref(bat_priv);
queue_delayed_work(batadv_event_workqueue,
&bat_priv->orig_work,
msecs_to_jiffies(BATADV_ORIG_WORK_PERIOD));
}
-/**
- * batadv_purge_orig_ref() - Purge all outdated originators
- * @bat_priv: the bat priv with all the soft interface information
- */
-void batadv_purge_orig_ref(struct batadv_priv *bat_priv)
-{
- _batadv_purge_orig(bat_priv);
-}
-
#ifdef CONFIG_BATMAN_ADV_DEBUGFS
/**
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 3986551397ca..12a2b7d21376 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1705,7 +1705,9 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
ether_addr_copy(common->addr, tt_addr);
common->vid = vid;
- common->flags = flags;
+ if (!is_multicast_ether_addr(common->addr))
+ common->flags = flags & (~BATADV_TT_SYNC_MASK);
+
tt_global_entry->roam_at = 0;
/* node must store current time in case of roaming. This is
* needed to purge this entry out on timeout (if nobody claims
@@ -1768,7 +1770,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
* TT_CLIENT_TEMP, therefore they have to be copied in the
* client entry
*/
- common->flags |= flags & (~BATADV_TT_SYNC_MASK);
+ if (!is_multicast_ether_addr(common->addr))
+ common->flags |= flags & (~BATADV_TT_SYNC_MASK);
/* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only
* one originator left in the list and we previously received a
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 360357f83f20..343d304851a5 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -43,12 +43,13 @@ struct seq_file;
#ifdef CONFIG_BATMAN_ADV_DAT
/**
- * batadv_dat_addr_t - it is the type used for all DHT addresses. If it is
- * changed, BATADV_DAT_ADDR_MAX is changed as well.
+ * typedef batadv_dat_addr_t - type used for all DHT addresses
+ *
+ * If it is changed, BATADV_DAT_ADDR_MAX is changed as well.
*
* *Please be careful: batadv_dat_addr_t must be UNSIGNED*
*/
-#define batadv_dat_addr_t u16
+typedef u16 batadv_dat_addr_t;
#endif /* CONFIG_BATMAN_ADV_DAT */
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 3264e1873219..deacc52d7ff1 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -159,7 +159,7 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk)
BT_DBG("parent %p, sk %p", parent, sk);
sock_hold(sk);
- lock_sock(sk);
+ lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q);
bt_sk(sk)->parent = parent;
release_sock(sk);
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 45ff5dc124cc..bd4978ce8c45 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -748,11 +748,30 @@ static bool conn_use_rpa(struct hci_conn *conn)
return hci_dev_test_flag(hdev, HCI_PRIVACY);
}
+static void set_ext_conn_params(struct hci_conn *conn,
+ struct hci_cp_le_ext_conn_param *p)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ memset(p, 0, sizeof(*p));
+
+ /* Set window to be the same value as the interval to
+ * enable continuous scanning.
+ */
+ p->scan_interval = cpu_to_le16(hdev->le_scan_interval);
+ p->scan_window = p->scan_interval;
+ p->conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+ p->conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+ p->conn_latency = cpu_to_le16(conn->le_conn_latency);
+ p->supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+ p->min_ce_len = cpu_to_le16(0x0000);
+ p->max_ce_len = cpu_to_le16(0x0000);
+}
+
static void hci_req_add_le_create_conn(struct hci_request *req,
struct hci_conn *conn,
bdaddr_t *direct_rpa)
{
- struct hci_cp_le_create_conn cp;
struct hci_dev *hdev = conn->hdev;
u8 own_addr_type;
@@ -775,25 +794,71 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
return;
}
- memset(&cp, 0, sizeof(cp));
+ if (use_ext_conn(hdev)) {
+ struct hci_cp_le_ext_create_conn *cp;
+ struct hci_cp_le_ext_conn_param *p;
+ u8 data[sizeof(*cp) + sizeof(*p) * 3];
+ u32 plen;
- /* Set window to be the same value as the interval to enable
- * continuous scanning.
- */
- cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
- cp.scan_window = cp.scan_interval;
+ cp = (void *) data;
+ p = (void *) cp->data;
+
+ memset(cp, 0, sizeof(*cp));
- bacpy(&cp.peer_addr, &conn->dst);
- cp.peer_addr_type = conn->dst_type;
- cp.own_address_type = own_addr_type;
- cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
- cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
- cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
- cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
- cp.min_ce_len = cpu_to_le16(0x0000);
- cp.max_ce_len = cpu_to_le16(0x0000);
+ bacpy(&cp->peer_addr, &conn->dst);
+ cp->peer_addr_type = conn->dst_type;
+ cp->own_addr_type = own_addr_type;
- hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
+ plen = sizeof(*cp);
+
+ if (scan_1m(hdev)) {
+ cp->phys |= LE_SCAN_PHY_1M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_2m(hdev)) {
+ cp->phys |= LE_SCAN_PHY_2M;
+ set_ext_conn_params(conn, p);
+
+ p++;
+ plen += sizeof(*p);
+ }
+
+ if (scan_coded(hdev)) {
+ cp->phys |= LE_SCAN_PHY_CODED;
+ set_ext_conn_params(conn, p);
+
+ plen += sizeof(*p);
+ }
+
+ hci_req_add(req, HCI_OP_LE_EXT_CREATE_CONN, plen, data);
+
+ } else {
+ struct hci_cp_le_create_conn cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ /* Set window to be the same value as the interval to enable
+ * continuous scanning.
+ */
+ cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
+ cp.scan_window = cp.scan_interval;
+
+ bacpy(&cp.peer_addr, &conn->dst);
+ cp.peer_addr_type = conn->dst_type;
+ cp.own_address_type = own_addr_type;
+ cp.conn_interval_min = cpu_to_le16(conn->le_conn_min_interval);
+ cp.conn_interval_max = cpu_to_le16(conn->le_conn_max_interval);
+ cp.conn_latency = cpu_to_le16(conn->le_conn_latency);
+ cp.supervision_timeout = cpu_to_le16(conn->le_supv_timeout);
+ cp.min_ce_len = cpu_to_le16(0x0000);
+ cp.max_ce_len = cpu_to_le16(0x0000);
+
+ hci_req_add(req, HCI_OP_LE_CREATE_CONN, sizeof(cp), &cp);
+ }
conn->state = BT_CONNECT;
clear_bit(HCI_CONN_SCANNING, &conn->flags);
@@ -803,35 +868,81 @@ static void hci_req_directed_advertising(struct hci_request *req,
struct hci_conn *conn)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_adv_param cp;
u8 own_addr_type;
u8 enable;
- /* Clear the HCI_LE_ADV bit temporarily so that the
- * hci_update_random_address knows that it's safe to go ahead
- * and write a new random address. The flag will be set back on
- * as soon as the SET_ADV_ENABLE HCI command completes.
- */
- hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_adv_params cp;
+ bdaddr_t random_addr;
- /* Set require_privacy to false so that the remote device has a
- * chance of identifying us.
- */
- if (hci_update_random_address(req, false, conn_use_rpa(conn),
- &own_addr_type) < 0)
- return;
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ if (hci_get_random_address(hdev, false, conn_use_rpa(conn), NULL,
+ &own_addr_type, &random_addr) < 0)
+ return;
- memset(&cp, 0, sizeof(cp));
- cp.type = LE_ADV_DIRECT_IND;
- cp.own_address_type = own_addr_type;
- cp.direct_addr_type = conn->dst_type;
- bacpy(&cp.direct_addr, &conn->dst);
- cp.channel_map = hdev->le_adv_channel_map;
+ memset(&cp, 0, sizeof(cp));
- hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_DIRECT_IND);
+ cp.own_addr_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.tx_power = HCI_TX_POWER_INVALID;
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ cp.handle = 0; /* Use instance 0 for directed adv */
+ cp.own_addr_type = own_addr_type;
+ cp.peer_addr_type = conn->dst_type;
+ bacpy(&cp.peer_addr, &conn->dst);
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
+
+ if (own_addr_type == ADDR_LE_DEV_RANDOM &&
+ bacmp(&random_addr, BDADDR_ANY) &&
+ bacmp(&random_addr, &hdev->random_addr)) {
+ struct hci_cp_le_set_adv_set_rand_addr cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.handle = 0;
+ bacpy(&cp.bdaddr, &random_addr);
+
+ hci_req_add(req,
+ HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+ sizeof(cp), &cp);
+ }
- enable = 0x01;
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+ __hci_req_enable_ext_advertising(req);
+ } else {
+ struct hci_cp_le_set_adv_param cp;
+
+ /* Clear the HCI_LE_ADV bit temporarily so that the
+ * hci_update_random_address knows that it's safe to go ahead
+ * and write a new random address. The flag will be set back on
+ * as soon as the SET_ADV_ENABLE HCI command completes.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ /* Set require_privacy to false so that the remote device has a
+ * chance of identifying us.
+ */
+ if (hci_update_random_address(req, false, conn_use_rpa(conn),
+ &own_addr_type) < 0)
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.type = LE_ADV_DIRECT_IND;
+ cp.own_address_type = own_addr_type;
+ cp.direct_addr_type = conn->dst_type;
+ bacpy(&cp.direct_addr, &conn->dst);
+ cp.channel_map = hdev->le_adv_channel_map;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+
+ enable = 0x01;
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable),
+ &enable);
+ }
conn->state = BT_CONNECT;
}
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index ee8ef1228263..74b29c7d841c 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -695,11 +695,42 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
if (hdev->commands[35] & (0x20 | 0x40))
events[1] |= 0x08; /* LE PHY Update Complete */
+ /* If the controller supports LE Set Extended Scan Parameters
+ * and LE Set Extended Scan Enable commands, enable the
+ * corresponding event.
+ */
+ if (use_ext_scan(hdev))
+ events[1] |= 0x10; /* LE Extended Advertising
+ * Report
+ */
+
+ /* If the controller supports the LE Extended Create Connection
+ * command, enable the corresponding event.
+ */
+ if (use_ext_conn(hdev))
+ events[1] |= 0x02; /* LE Enhanced Connection
+ * Complete
+ */
+
+ /* If the controller supports the LE Extended Advertising
+ * command, enable the corresponding event.
+ */
+ if (ext_adv_capable(hdev))
+ events[2] |= 0x02; /* LE Advertising Set
+ * Terminated
+ */
+
hci_req_add(req, HCI_OP_LE_SET_EVENT_MASK, sizeof(events),
events);
- if (hdev->commands[25] & 0x40) {
- /* Read LE Advertising Channel TX Power */
+ /* Read LE Advertising Channel TX Power */
+ if ((hdev->commands[25] & 0x40) && !ext_adv_capable(hdev)) {
+ /* HCI TS spec forbids mixing of legacy and extended
+ * advertising commands wherein READ_ADV_TX_POWER is
+ * also included. So do not call it if extended adv
+ * is supported otherwise controller will return
+ * COMMAND_DISALLOWED for extended commands.
+ */
hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
}
@@ -714,6 +745,17 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
}
+ if (hdev->commands[34] & 0x40) {
+ /* Read LE Resolving List Size */
+ hci_req_add(req, HCI_OP_LE_READ_RESOLV_LIST_SIZE,
+ 0, NULL);
+ }
+
+ if (hdev->commands[34] & 0x20) {
+ /* Clear LE Resolving List */
+ hci_req_add(req, HCI_OP_LE_CLEAR_RESOLV_LIST, 0, NULL);
+ }
+
if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
/* Read LE Maximum Data Length */
hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL);
@@ -722,6 +764,12 @@ static int hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_READ_DEF_DATA_LEN, 0, NULL);
}
+ if (ext_adv_capable(hdev)) {
+ /* Read LE Number of Supported Advertising Sets */
+ hci_req_add(req, HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS,
+ 0, NULL);
+ }
+
hci_set_le_support(req);
}
@@ -802,10 +850,9 @@ static int hci_init4_req(struct hci_request *req, unsigned long opt)
if (hdev->commands[35] & 0x20) {
struct hci_cp_le_set_default_phy cp;
- /* No transmitter PHY or receiver PHY preferences */
- cp.all_phys = 0x03;
- cp.tx_phys = 0;
- cp.rx_phys = 0;
+ cp.all_phys = 0x00;
+ cp.tx_phys = hdev->le_tx_def_phys;
+ cp.rx_phys = hdev->le_rx_def_phys;
hci_req_add(req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp), &cp);
}
@@ -1368,7 +1415,8 @@ static int hci_dev_do_open(struct hci_dev *hdev)
atomic_set(&hdev->cmd_cnt, 1);
set_bit(HCI_INIT, &hdev->flags);
- if (hci_dev_test_flag(hdev, HCI_SETUP)) {
+ if (hci_dev_test_flag(hdev, HCI_SETUP) ||
+ test_bit(HCI_QUIRK_NON_PERSISTENT_SETUP, &hdev->quirks)) {
hci_sock_dev_event(hdev, HCI_DEV_SETUP);
if (hdev->setup)
@@ -1432,6 +1480,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
if (!ret) {
hci_dev_hold(hdev);
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
set_bit(HCI_UP, &hdev->flags);
hci_sock_dev_event(hdev, HCI_DEV_UP);
hci_leds_update_powered(hdev, true);
@@ -1587,9 +1636,15 @@ int hci_dev_do_close(struct hci_dev *hdev)
if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
cancel_delayed_work(&hdev->service_cache);
- if (hci_dev_test_flag(hdev, HCI_MGMT))
+ if (hci_dev_test_flag(hdev, HCI_MGMT)) {
+ struct adv_info *adv_instance;
+
cancel_delayed_work_sync(&hdev->rpa_expired);
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list)
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+ }
+
/* Avoid potential lockdep warnings from the *_flush() calls by
* ensuring the workqueue is empty up front.
*/
@@ -1897,7 +1952,11 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
break;
case HCISETPTYPE:
+ if (hdev->pkt_type == (__u16) dr.dev_opt)
+ break;
+
hdev->pkt_type = (__u16) dr.dev_opt;
+ mgmt_phy_configuration_changed(hdev, NULL);
break;
case HCISETACLMTU:
@@ -2661,6 +2720,8 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
hdev->cur_adv_instance = 0x00;
}
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
+
list_del(&adv_instance->list);
kfree(adv_instance);
@@ -2669,6 +2730,14 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
return 0;
}
+void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired)
+{
+ struct adv_info *adv_instance, *n;
+
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list)
+ adv_instance->rpa_expired = rpa_expired;
+}
+
/* This function requires the caller holds hdev->lock */
void hci_adv_instances_clear(struct hci_dev *hdev)
{
@@ -2680,6 +2749,7 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
}
list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
+ cancel_delayed_work_sync(&adv_instance->rpa_expired_cb);
list_del(&adv_instance->list);
kfree(adv_instance);
}
@@ -2688,6 +2758,16 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
hdev->cur_adv_instance = 0x00;
}
+static void adv_instance_rpa_expired(struct work_struct *work)
+{
+ struct adv_info *adv_instance = container_of(work, struct adv_info,
+ rpa_expired_cb.work);
+
+ BT_DBG("");
+
+ adv_instance->rpa_expired = true;
+}
+
/* This function requires the caller holds hdev->lock */
int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
u16 adv_data_len, u8 *adv_data,
@@ -2736,6 +2816,11 @@ int hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags,
else
adv_instance->duration = duration;
+ adv_instance->tx_power = HCI_TX_POWER_INVALID;
+
+ INIT_DELAYED_WORK(&adv_instance->rpa_expired_cb,
+ adv_instance_rpa_expired);
+
BT_DBG("%s for %dMR", hdev->name, instance);
return 0;
@@ -2999,6 +3084,8 @@ struct hci_dev *hci_alloc_dev(void)
hdev->le_max_tx_time = 0x0148;
hdev->le_max_rx_len = 0x001b;
hdev->le_max_rx_time = 0x0148;
+ hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M;
+ hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M;
hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT;
hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT;
@@ -3017,6 +3104,7 @@ struct hci_dev *hci_alloc_dev(void)
INIT_LIST_HEAD(&hdev->identity_resolving_keys);
INIT_LIST_HEAD(&hdev->remote_oob_data);
INIT_LIST_HEAD(&hdev->le_white_list);
+ INIT_LIST_HEAD(&hdev->le_resolv_list);
INIT_LIST_HEAD(&hdev->le_conn_params);
INIT_LIST_HEAD(&hdev->pend_le_conns);
INIT_LIST_HEAD(&hdev->pend_le_reports);
@@ -3218,6 +3306,7 @@ void hci_unregister_dev(struct hci_dev *hdev)
hci_remote_oob_data_clear(hdev);
hci_adv_instances_clear(hdev);
hci_bdaddr_list_clear(&hdev->le_white_list);
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
hci_conn_params_clear_all(hdev);
hci_discovery_filter_clear(hdev);
hci_dev_unlock(hdev);
diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c
index 0d8ab5b3c177..51f5b1efc3a5 100644
--- a/net/bluetooth/hci_debugfs.c
+++ b/net/bluetooth/hci_debugfs.c
@@ -694,6 +694,21 @@ static int white_list_show(struct seq_file *f, void *ptr)
DEFINE_SHOW_ATTRIBUTE(white_list);
+static int resolv_list_show(struct seq_file *f, void *ptr)
+{
+ struct hci_dev *hdev = f->private;
+ struct bdaddr_list *b;
+
+ hci_dev_lock(hdev);
+ list_for_each_entry(b, &hdev->le_resolv_list, list)
+ seq_printf(f, "%pMR (type %u)\n", &b->bdaddr, b->bdaddr_type);
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(resolv_list);
+
static int identity_resolving_keys_show(struct seq_file *f, void *ptr)
{
struct hci_dev *hdev = f->private;
@@ -955,6 +970,10 @@ void hci_debugfs_create_le(struct hci_dev *hdev)
&hdev->le_white_list_size);
debugfs_create_file("white_list", 0444, hdev->debugfs, hdev,
&white_list_fops);
+ debugfs_create_u8("resolv_list_size", 0444, hdev->debugfs,
+ &hdev->le_resolv_list_size);
+ debugfs_create_file("resolv_list", 0444, hdev->debugfs, hdev,
+ &resolv_list_fops);
debugfs_create_file("identity_resolving_keys", 0400, hdev->debugfs,
hdev, &identity_resolving_keys_fops);
debugfs_create_file("long_term_keys", 0400, hdev->debugfs, hdev,
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index 235b5aaab23d..754714c8d752 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -221,6 +221,7 @@ static void hci_cc_reset(struct hci_dev *hdev, struct sk_buff *skb)
hdev->ssp_debug_mode = 0;
hci_bdaddr_list_clear(&hdev->le_white_list);
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
}
static void hci_cc_read_stored_link_key(struct hci_dev *hdev,
@@ -1041,6 +1042,57 @@ static void hci_cc_le_set_random_addr(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_default_phy(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_set_default_phy *cp;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_DEFAULT_PHY);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_tx_def_phys = cp->tx_phys;
+ hdev->le_rx_def_phys = cp->rx_phys;
+
+ hci_dev_unlock(hdev);
+}
+
+static void hci_cc_le_set_adv_set_random_addr(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_set_adv_set_rand_addr *cp;
+ struct adv_info *adv_instance;
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_ADV_SET_RAND_ADDR);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ if (!hdev->cur_adv_instance) {
+ /* Store in hdev for instance 0 (Set adv and Directed advs) */
+ bacpy(&hdev->random_addr, &cp->bdaddr);
+ } else {
+ adv_instance = hci_find_adv_instance(hdev,
+ hdev->cur_adv_instance);
+ if (adv_instance)
+ bacpy(&adv_instance->random_addr, &cp->bdaddr);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
{
__u8 *sent, status = *((__u8 *) skb->data);
@@ -1076,6 +1128,43 @@ static void hci_cc_le_set_adv_enable(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_adv_enable *cp;
+ struct hci_cp_ext_adv_set *adv_set;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_ENABLE);
+ if (!cp)
+ return;
+
+ adv_set = (void *) cp->data;
+
+ hci_dev_lock(hdev);
+
+ if (cp->enable) {
+ struct hci_conn *conn;
+
+ hci_dev_set_flag(hdev, HCI_LE_ADV);
+
+ conn = hci_lookup_le_connect(hdev);
+ if (conn)
+ queue_delayed_work(hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+ } else {
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_cp_le_set_scan_param *cp;
@@ -1097,6 +1186,31 @@ static void hci_cc_le_set_scan_param(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_ext_scan_param(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_scan_params *cp;
+ __u8 status = *((__u8 *) skb->data);
+ struct hci_cp_le_scan_phy_params *phy_param;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_PARAMS);
+ if (!cp)
+ return;
+
+ phy_param = (void *)cp->data;
+
+ hci_dev_lock(hdev);
+
+ hdev->le_scan_type = phy_param->type;
+
+ hci_dev_unlock(hdev);
+}
+
static bool has_pending_adv_report(struct hci_dev *hdev)
{
struct discovery_state *d = &hdev->discovery;
@@ -1126,24 +1240,11 @@ static void store_pending_adv_report(struct hci_dev *hdev, bdaddr_t *bdaddr,
d->last_adv_data_len = len;
}
-static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
- struct sk_buff *skb)
+static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable)
{
- struct hci_cp_le_set_scan_enable *cp;
- __u8 status = *((__u8 *) skb->data);
-
- BT_DBG("%s status 0x%2.2x", hdev->name, status);
-
- if (status)
- return;
-
- cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
- if (!cp)
- return;
-
hci_dev_lock(hdev);
- switch (cp->enable) {
+ switch (enable) {
case LE_SCAN_ENABLE:
hci_dev_set_flag(hdev, HCI_LE_SCAN);
if (hdev->le_scan_type == LE_SCAN_ACTIVE)
@@ -1189,13 +1290,63 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
default:
bt_dev_err(hdev, "use of reserved LE_Scan_Enable param %d",
- cp->enable);
+ enable);
break;
}
hci_dev_unlock(hdev);
}
+static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_scan_enable *cp;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_SCAN_ENABLE);
+ if (!cp)
+ return;
+
+ le_set_scan_enable_complete(hdev, cp->enable);
+}
+
+static void hci_cc_le_set_ext_scan_enable(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_cp_le_set_ext_scan_enable *cp;
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE);
+ if (!cp)
+ return;
+
+ le_set_scan_enable_complete(hdev, cp->enable);
+}
+
+static void hci_cc_le_read_num_adv_sets(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_num_supported_adv_sets *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x No of Adv sets %u", hdev->name, rp->status,
+ rp->num_of_sets);
+
+ if (rp->status)
+ return;
+
+ hdev->le_num_of_adv_sets = rp->num_of_sets;
+}
+
static void hci_cc_le_read_white_list_size(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -1306,6 +1457,32 @@ static void hci_cc_le_write_def_data_len(struct hci_dev *hdev,
hdev->le_def_tx_time = le16_to_cpu(sent->tx_time);
}
+static void hci_cc_le_clear_resolv_list(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ __u8 status = *((__u8 *) skb->data);
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ if (status)
+ return;
+
+ hci_bdaddr_list_clear(&hdev->le_resolv_list);
+}
+
+static void hci_cc_le_read_resolv_list_size(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_rp_le_read_resolv_list_size *rp = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x size %u", hdev->name, rp->status, rp->size);
+
+ if (rp->status)
+ return;
+
+ hdev->le_resolv_list_size = rp->size;
+}
+
static void hci_cc_le_read_max_data_len(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -1375,6 +1552,37 @@ static void hci_cc_set_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static void hci_cc_set_ext_adv_param(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_rp_le_set_ext_adv_params *rp = (void *) skb->data;
+ struct hci_cp_le_set_ext_adv_params *cp;
+ struct adv_info *adv_instance;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, rp->status);
+
+ if (rp->status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_EXT_ADV_PARAMS);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+ hdev->adv_addr_type = cp->own_addr_type;
+ if (!hdev->cur_adv_instance) {
+ /* Store in hdev for instance 0 */
+ hdev->adv_tx_power = rp->tx_power;
+ } else {
+ adv_instance = hci_find_adv_instance(hdev,
+ hdev->cur_adv_instance);
+ if (adv_instance)
+ adv_instance->tx_power = rp->tx_power;
+ }
+ /* Update adv data as tx power is known now */
+ hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
+ hci_dev_unlock(hdev);
+}
+
static void hci_cc_read_rssi(struct hci_dev *hdev, struct sk_buff *skb)
{
struct hci_rp_read_rssi *rp = (void *) skb->data;
@@ -1896,10 +2104,44 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status)
hci_dev_unlock(hdev);
}
+static void cs_le_create_conn(struct hci_dev *hdev, bdaddr_t *peer_addr,
+ u8 peer_addr_type, u8 own_address_type,
+ u8 filter_policy)
+{
+ struct hci_conn *conn;
+
+ conn = hci_conn_hash_lookup_le(hdev, peer_addr,
+ peer_addr_type);
+ if (!conn)
+ return;
+
+ /* Store the initiator and responder address information which
+ * is needed for SMP. These values will not change during the
+ * lifetime of the connection.
+ */
+ conn->init_addr_type = own_address_type;
+ if (own_address_type == ADDR_LE_DEV_RANDOM)
+ bacpy(&conn->init_addr, &hdev->random_addr);
+ else
+ bacpy(&conn->init_addr, &hdev->bdaddr);
+
+ conn->resp_addr_type = peer_addr_type;
+ bacpy(&conn->resp_addr, peer_addr);
+
+ /* We don't want the connection attempt to stick around
+ * indefinitely since LE doesn't have a page timeout concept
+ * like BR/EDR. Set a timer for any connection that doesn't use
+ * the white list for connecting.
+ */
+ if (filter_policy == HCI_LE_USE_PEER_ADDR)
+ queue_delayed_work(conn->hdev->workqueue,
+ &conn->le_conn_timeout,
+ conn->conn_timeout);
+}
+
static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
{
struct hci_cp_le_create_conn *cp;
- struct hci_conn *conn;
BT_DBG("%s status 0x%2.2x", hdev->name, status);
@@ -1916,35 +2158,34 @@ static void hci_cs_le_create_conn(struct hci_dev *hdev, u8 status)
hci_dev_lock(hdev);
- conn = hci_conn_hash_lookup_le(hdev, &cp->peer_addr,
- cp->peer_addr_type);
- if (!conn)
- goto unlock;
+ cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
+ cp->own_address_type, cp->filter_policy);
- /* Store the initiator and responder address information which
- * is needed for SMP. These values will not change during the
- * lifetime of the connection.
- */
- conn->init_addr_type = cp->own_address_type;
- if (cp->own_address_type == ADDR_LE_DEV_RANDOM)
- bacpy(&conn->init_addr, &hdev->random_addr);
- else
- bacpy(&conn->init_addr, &hdev->bdaddr);
+ hci_dev_unlock(hdev);
+}
- conn->resp_addr_type = cp->peer_addr_type;
- bacpy(&conn->resp_addr, &cp->peer_addr);
+static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_ext_create_conn *cp;
- /* We don't want the connection attempt to stick around
- * indefinitely since LE doesn't have a page timeout concept
- * like BR/EDR. Set a timer for any connection that doesn't use
- * the white list for connecting.
+ BT_DBG("%s status 0x%2.2x", hdev->name, status);
+
+ /* All connection failure handling is taken care of by the
+ * hci_le_conn_failed function which is triggered by the HCI
+ * request completion callbacks used for connecting.
*/
- if (cp->filter_policy == HCI_LE_USE_PEER_ADDR)
- queue_delayed_work(conn->hdev->workqueue,
- &conn->le_conn_timeout,
- conn->conn_timeout);
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_EXT_CREATE_CONN);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ cs_le_create_conn(hdev, &cp->peer_addr, cp->peer_addr_type,
+ cp->own_addr_type, cp->filter_policy);
-unlock:
hci_dev_unlock(hdev);
}
@@ -2618,8 +2859,10 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb)
/* We should disregard the current RPA and generate a new one
* whenever the encryption procedure fails.
*/
- if (ev->status && conn->type == LE_LINK)
+ if (ev->status && conn->type == LE_LINK) {
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
+ }
clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->flags);
@@ -3015,6 +3258,14 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_le_write_def_data_len(hdev, skb);
break;
+ case HCI_OP_LE_CLEAR_RESOLV_LIST:
+ hci_cc_le_clear_resolv_list(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_RESOLV_LIST_SIZE:
+ hci_cc_le_read_resolv_list_size(hdev, skb);
+ break;
+
case HCI_OP_LE_READ_MAX_DATA_LEN:
hci_cc_le_read_max_data_len(hdev, skb);
break;
@@ -3039,6 +3290,34 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cc_write_ssp_debug_mode(hdev, skb);
break;
+ case HCI_OP_LE_SET_EXT_SCAN_PARAMS:
+ hci_cc_le_set_ext_scan_param(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_EXT_SCAN_ENABLE:
+ hci_cc_le_set_ext_scan_enable(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_DEFAULT_PHY:
+ hci_cc_le_set_default_phy(hdev, skb);
+ break;
+
+ case HCI_OP_LE_READ_NUM_SUPPORTED_ADV_SETS:
+ hci_cc_le_read_num_adv_sets(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_EXT_ADV_PARAMS:
+ hci_cc_set_ext_adv_param(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_EXT_ADV_ENABLE:
+ hci_cc_le_set_ext_adv_enable(hdev, skb);
+ break;
+
+ case HCI_OP_LE_SET_ADV_SET_RAND_ADDR:
+ hci_cc_le_set_adv_set_random_addr(hdev, skb);
+ break;
+
default:
BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
break;
@@ -3134,6 +3413,10 @@ static void hci_cmd_status_evt(struct hci_dev *hdev, struct sk_buff *skb,
hci_cs_le_start_enc(hdev, ev->status);
break;
+ case HCI_OP_LE_EXT_CREATE_CONN:
+ hci_cs_le_ext_create_conn(hdev, ev->status);
+ break;
+
default:
BT_DBG("%s opcode 0x%4.4x", hdev->name, *opcode);
break;
@@ -4460,16 +4743,15 @@ static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev,
}
#endif
-static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+static void le_conn_complete_evt(struct hci_dev *hdev, u8 status,
+ bdaddr_t *bdaddr, u8 bdaddr_type, u8 role, u16 handle,
+ u16 interval, u16 latency, u16 supervision_timeout)
{
- struct hci_ev_le_conn_complete *ev = (void *) skb->data;
struct hci_conn_params *params;
struct hci_conn *conn;
struct smp_irk *irk;
u8 addr_type;
- BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
-
hci_dev_lock(hdev);
/* All controllers implicitly stop advertising in the event of a
@@ -4479,13 +4761,13 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn = hci_lookup_le_connect(hdev);
if (!conn) {
- conn = hci_conn_add(hdev, LE_LINK, &ev->bdaddr, ev->role);
+ conn = hci_conn_add(hdev, LE_LINK, bdaddr, role);
if (!conn) {
bt_dev_err(hdev, "no memory for new connection");
goto unlock;
}
- conn->dst_type = ev->bdaddr_type;
+ conn->dst_type = bdaddr_type;
/* If we didn't have a hci_conn object previously
* but we're in master role this must be something
@@ -4496,8 +4778,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* initiator address based on the HCI_PRIVACY flag.
*/
if (conn->out) {
- conn->resp_addr_type = ev->bdaddr_type;
- bacpy(&conn->resp_addr, &ev->bdaddr);
+ conn->resp_addr_type = bdaddr_type;
+ bacpy(&conn->resp_addr, bdaddr);
if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
conn->init_addr_type = ADDR_LE_DEV_RANDOM;
bacpy(&conn->init_addr, &hdev->rpa);
@@ -4516,13 +4798,18 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* the advertising address type.
*/
conn->resp_addr_type = hdev->adv_addr_type;
- if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM)
- bacpy(&conn->resp_addr, &hdev->random_addr);
- else
+ if (hdev->adv_addr_type == ADDR_LE_DEV_RANDOM) {
+ /* In case of ext adv, resp_addr will be updated in
+ * Adv Terminated event.
+ */
+ if (!ext_adv_capable(hdev))
+ bacpy(&conn->resp_addr, &hdev->random_addr);
+ } else {
bacpy(&conn->resp_addr, &hdev->bdaddr);
+ }
- conn->init_addr_type = ev->bdaddr_type;
- bacpy(&conn->init_addr, &ev->bdaddr);
+ conn->init_addr_type = bdaddr_type;
+ bacpy(&conn->init_addr, bdaddr);
/* For incoming connections, set the default minimum
* and maximum connection interval. They will be used
@@ -4548,8 +4835,8 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
conn->dst_type = irk->addr_type;
}
- if (ev->status) {
- hci_le_conn_failed(conn, ev->status);
+ if (status) {
+ hci_le_conn_failed(conn, status);
goto unlock;
}
@@ -4568,17 +4855,17 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
mgmt_device_connected(hdev, conn, 0, NULL, 0);
conn->sec_level = BT_SECURITY_LOW;
- conn->handle = __le16_to_cpu(ev->handle);
+ conn->handle = handle;
conn->state = BT_CONFIG;
- conn->le_conn_interval = le16_to_cpu(ev->interval);
- conn->le_conn_latency = le16_to_cpu(ev->latency);
- conn->le_supv_timeout = le16_to_cpu(ev->supervision_timeout);
+ conn->le_conn_interval = interval;
+ conn->le_conn_latency = latency;
+ conn->le_supv_timeout = supervision_timeout;
hci_debugfs_create_conn(conn);
hci_conn_add_sysfs(conn);
- if (!ev->status) {
+ if (!status) {
/* The remote features procedure is defined for master
* role only. So only in case of an initiated connection
* request the remote features.
@@ -4600,10 +4887,10 @@ static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_conn_hold(conn);
} else {
conn->state = BT_CONNECTED;
- hci_connect_cfm(conn, ev->status);
+ hci_connect_cfm(conn, status);
}
} else {
- hci_connect_cfm(conn, ev->status);
+ hci_connect_cfm(conn, status);
}
params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst,
@@ -4622,6 +4909,61 @@ unlock:
hci_dev_unlock(hdev);
}
+static void hci_le_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_ev_le_conn_complete *ev = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
+ ev->role, le16_to_cpu(ev->handle),
+ le16_to_cpu(ev->interval),
+ le16_to_cpu(ev->latency),
+ le16_to_cpu(ev->supervision_timeout));
+}
+
+static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev,
+ struct sk_buff *skb)
+{
+ struct hci_ev_le_enh_conn_complete *ev = (void *) skb->data;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ le_conn_complete_evt(hdev, ev->status, &ev->bdaddr, ev->bdaddr_type,
+ ev->role, le16_to_cpu(ev->handle),
+ le16_to_cpu(ev->interval),
+ le16_to_cpu(ev->latency),
+ le16_to_cpu(ev->supervision_timeout));
+}
+
+static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ struct hci_evt_le_ext_adv_set_term *ev = (void *) skb->data;
+ struct hci_conn *conn;
+
+ BT_DBG("%s status 0x%2.2x", hdev->name, ev->status);
+
+ if (ev->status)
+ return;
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->conn_handle));
+ if (conn) {
+ struct adv_info *adv_instance;
+
+ if (hdev->adv_addr_type != ADDR_LE_DEV_RANDOM)
+ return;
+
+ if (!hdev->cur_adv_instance) {
+ bacpy(&conn->resp_addr, &hdev->random_addr);
+ return;
+ }
+
+ adv_instance = hci_find_adv_instance(hdev, hdev->cur_adv_instance);
+ if (adv_instance)
+ bacpy(&conn->resp_addr, &adv_instance->random_addr);
+ }
+}
+
static void hci_le_conn_update_complete_evt(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -4957,6 +5299,78 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_dev_unlock(hdev);
}
+static u8 ext_evt_type_to_legacy(u16 evt_type)
+{
+ if (evt_type & LE_EXT_ADV_LEGACY_PDU) {
+ switch (evt_type) {
+ case LE_LEGACY_ADV_IND:
+ return LE_ADV_IND;
+ case LE_LEGACY_ADV_DIRECT_IND:
+ return LE_ADV_DIRECT_IND;
+ case LE_LEGACY_ADV_SCAN_IND:
+ return LE_ADV_SCAN_IND;
+ case LE_LEGACY_NONCONN_IND:
+ return LE_ADV_NONCONN_IND;
+ case LE_LEGACY_SCAN_RSP_ADV:
+ case LE_LEGACY_SCAN_RSP_ADV_SCAN:
+ return LE_ADV_SCAN_RSP;
+ }
+
+ BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
+ evt_type);
+
+ return LE_ADV_INVALID;
+ }
+
+ if (evt_type & LE_EXT_ADV_CONN_IND) {
+ if (evt_type & LE_EXT_ADV_DIRECT_IND)
+ return LE_ADV_DIRECT_IND;
+
+ return LE_ADV_IND;
+ }
+
+ if (evt_type & LE_EXT_ADV_SCAN_RSP)
+ return LE_ADV_SCAN_RSP;
+
+ if (evt_type & LE_EXT_ADV_SCAN_IND)
+ return LE_ADV_SCAN_IND;
+
+ if (evt_type == LE_EXT_ADV_NON_CONN_IND ||
+ evt_type & LE_EXT_ADV_DIRECT_IND)
+ return LE_ADV_NONCONN_IND;
+
+ BT_ERR_RATELIMITED("Unknown advertising packet type: 0x%02x",
+ evt_type);
+
+ return LE_ADV_INVALID;
+}
+
+static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+ u8 num_reports = skb->data[0];
+ void *ptr = &skb->data[1];
+
+ hci_dev_lock(hdev);
+
+ while (num_reports--) {
+ struct hci_ev_le_ext_adv_report *ev = ptr;
+ u8 legacy_evt_type;
+ u16 evt_type;
+
+ evt_type = __le16_to_cpu(ev->evt_type);
+ legacy_evt_type = ext_evt_type_to_legacy(evt_type);
+ if (legacy_evt_type != LE_ADV_INVALID) {
+ process_adv_report(hdev, legacy_evt_type, &ev->bdaddr,
+ ev->bdaddr_type, NULL, 0, ev->rssi,
+ ev->data, ev->length);
+ }
+
+ ptr += sizeof(*ev) + ev->length + 1;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev,
struct sk_buff *skb)
{
@@ -5189,6 +5603,18 @@ static void hci_le_meta_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_le_direct_adv_report_evt(hdev, skb);
break;
+ case HCI_EV_LE_EXT_ADV_REPORT:
+ hci_le_ext_adv_report_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_ENHANCED_CONN_COMPLETE:
+ hci_le_enh_conn_complete_evt(hdev, skb);
+ break;
+
+ case HCI_EV_LE_EXT_ADV_SET_TERM:
+ hci_le_ext_adv_term_evt(hdev, skb);
+ break;
+
default:
break;
}
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index e44d34734834..e8c9ef1e1922 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -647,11 +647,22 @@ void __hci_req_update_eir(struct hci_request *req)
void hci_req_add_le_scan_disable(struct hci_request *req)
{
- struct hci_cp_le_set_scan_enable cp;
+ struct hci_dev *hdev = req->hdev;
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_DISABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ if (use_ext_scan(hdev)) {
+ struct hci_cp_le_set_ext_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_DISABLE;
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp),
+ &cp);
+ } else {
+ struct hci_cp_le_set_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_DISABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ }
}
static void add_to_white_list(struct hci_request *req,
@@ -767,10 +778,86 @@ static bool scan_use_rpa(struct hci_dev *hdev)
return hci_dev_test_flag(hdev, HCI_PRIVACY);
}
+static void hci_req_start_scan(struct hci_request *req, u8 type, u16 interval,
+ u16 window, u8 own_addr_type, u8 filter_policy)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ /* Use ext scanning if set ext scan param and ext scan enable is
+ * supported
+ */
+ if (use_ext_scan(hdev)) {
+ struct hci_cp_le_set_ext_scan_params *ext_param_cp;
+ struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
+ struct hci_cp_le_scan_phy_params *phy_params;
+ u8 data[sizeof(*ext_param_cp) + sizeof(*phy_params) * 2];
+ u32 plen;
+
+ ext_param_cp = (void *)data;
+ phy_params = (void *)ext_param_cp->data;
+
+ memset(ext_param_cp, 0, sizeof(*ext_param_cp));
+ ext_param_cp->own_addr_type = own_addr_type;
+ ext_param_cp->filter_policy = filter_policy;
+
+ plen = sizeof(*ext_param_cp);
+
+ if (scan_1m(hdev) || scan_2m(hdev)) {
+ ext_param_cp->scanning_phys |= LE_SCAN_PHY_1M;
+
+ memset(phy_params, 0, sizeof(*phy_params));
+ phy_params->type = type;
+ phy_params->interval = cpu_to_le16(interval);
+ phy_params->window = cpu_to_le16(window);
+
+ plen += sizeof(*phy_params);
+ phy_params++;
+ }
+
+ if (scan_coded(hdev)) {
+ ext_param_cp->scanning_phys |= LE_SCAN_PHY_CODED;
+
+ memset(phy_params, 0, sizeof(*phy_params));
+ phy_params->type = type;
+ phy_params->interval = cpu_to_le16(interval);
+ phy_params->window = cpu_to_le16(window);
+
+ plen += sizeof(*phy_params);
+ phy_params++;
+ }
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_PARAMS,
+ plen, ext_param_cp);
+
+ memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
+ ext_enable_cp.enable = LE_SCAN_ENABLE;
+ ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+ sizeof(ext_enable_cp), &ext_enable_cp);
+ } else {
+ struct hci_cp_le_set_scan_param param_cp;
+ struct hci_cp_le_set_scan_enable enable_cp;
+
+ memset(&param_cp, 0, sizeof(param_cp));
+ param_cp.type = type;
+ param_cp.interval = cpu_to_le16(interval);
+ param_cp.window = cpu_to_le16(window);
+ param_cp.own_address_type = own_addr_type;
+ param_cp.filter_policy = filter_policy;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
+ &param_cp);
+
+ memset(&enable_cp, 0, sizeof(enable_cp));
+ enable_cp.enable = LE_SCAN_ENABLE;
+ enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
+ &enable_cp);
+ }
+}
+
void hci_req_add_le_passive_scan(struct hci_request *req)
{
- struct hci_cp_le_set_scan_param param_cp;
- struct hci_cp_le_set_scan_enable enable_cp;
struct hci_dev *hdev = req->hdev;
u8 own_addr_type;
u8 filter_policy;
@@ -804,20 +891,26 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
(hdev->le_features[0] & HCI_LE_EXT_SCAN_POLICY))
filter_policy |= 0x02;
- memset(&param_cp, 0, sizeof(param_cp));
- param_cp.type = LE_SCAN_PASSIVE;
- param_cp.interval = cpu_to_le16(hdev->le_scan_interval);
- param_cp.window = cpu_to_le16(hdev->le_scan_window);
- param_cp.own_address_type = own_addr_type;
- param_cp.filter_policy = filter_policy;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
- &param_cp);
+ hci_req_start_scan(req, LE_SCAN_PASSIVE, hdev->le_scan_interval,
+ hdev->le_scan_window, own_addr_type, filter_policy);
+}
+
+static u8 get_adv_instance_scan_rsp_len(struct hci_dev *hdev, u8 instance)
+{
+ struct adv_info *adv_instance;
+
+ /* Ignore instance 0 */
+ if (instance == 0x00)
+ return 0;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return 0;
- memset(&enable_cp, 0, sizeof(enable_cp));
- enable_cp.enable = LE_SCAN_ENABLE;
- enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
- &enable_cp);
+ /* TODO: Take into account the "appearance" and "local-name" flags here.
+ * These are currently being ignored as they are not supported.
+ */
+ return adv_instance->scan_rsp_len;
}
static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
@@ -841,9 +934,19 @@ static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
void __hci_req_disable_advertising(struct hci_request *req)
{
- u8 enable = 0x00;
+ if (ext_adv_capable(req->hdev)) {
+ struct hci_cp_le_set_ext_adv_enable cp;
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+ cp.enable = 0x00;
+ /* Disable all sets since we only support one set at the moment */
+ cp.num_of_sets = 0x00;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, sizeof(cp), &cp);
+ } else {
+ u8 enable = 0x00;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+ }
}
static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
@@ -1081,29 +1184,58 @@ static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_rsp_data cp;
u8 len;
if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
return;
- memset(&cp, 0, sizeof(cp));
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_scan_rsp_data cp;
- if (instance)
- len = create_instance_scan_rsp_data(hdev, instance, cp.data);
- else
- len = create_default_scan_rsp_data(hdev, cp.data);
+ memset(&cp, 0, sizeof(cp));
- if (hdev->scan_rsp_data_len == len &&
- !memcmp(cp.data, hdev->scan_rsp_data, len))
- return;
+ if (instance)
+ len = create_instance_scan_rsp_data(hdev, instance,
+ cp.data);
+ else
+ len = create_default_scan_rsp_data(hdev, cp.data);
+
+ if (hdev->scan_rsp_data_len == len &&
+ !memcmp(cp.data, hdev->scan_rsp_data, len))
+ return;
- memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
- hdev->scan_rsp_data_len = len;
+ memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+ hdev->scan_rsp_data_len = len;
- cp.length = len;
+ cp.handle = 0;
+ cp.length = len;
+ cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
+ cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, sizeof(cp),
+ &cp);
+ } else {
+ struct hci_cp_le_set_scan_rsp_data cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (instance)
+ len = create_instance_scan_rsp_data(hdev, instance,
+ cp.data);
+ else
+ len = create_default_scan_rsp_data(hdev, cp.data);
+
+ if (hdev->scan_rsp_data_len == len &&
+ !memcmp(cp.data, hdev->scan_rsp_data, len))
+ return;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
+ memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+ hdev->scan_rsp_data_len = len;
+
+ cp.length = len;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
+ }
}
static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
@@ -1160,15 +1292,27 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
ptr += adv_instance->adv_data_len;
}
- /* Provide Tx Power only if we can provide a valid value for it */
- if (hdev->adv_tx_power != HCI_TX_POWER_INVALID &&
- (instance_flags & MGMT_ADV_FLAG_TX_POWER)) {
- ptr[0] = 0x02;
- ptr[1] = EIR_TX_POWER;
- ptr[2] = (u8)hdev->adv_tx_power;
+ if (instance_flags & MGMT_ADV_FLAG_TX_POWER) {
+ s8 adv_tx_power;
- ad_len += 3;
- ptr += 3;
+ if (ext_adv_capable(hdev)) {
+ if (adv_instance)
+ adv_tx_power = adv_instance->tx_power;
+ else
+ adv_tx_power = hdev->adv_tx_power;
+ } else {
+ adv_tx_power = hdev->adv_tx_power;
+ }
+
+ /* Provide Tx Power only if we can provide a valid value for it */
+ if (adv_tx_power != HCI_TX_POWER_INVALID) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8)adv_tx_power;
+
+ ad_len += 3;
+ ptr += 3;
+ }
}
return ad_len;
@@ -1177,27 +1321,51 @@ static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_adv_data cp;
u8 len;
if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
return;
- memset(&cp, 0, sizeof(cp));
+ if (ext_adv_capable(hdev)) {
+ struct hci_cp_le_set_ext_adv_data cp;
- len = create_instance_adv_data(hdev, instance, cp.data);
+ memset(&cp, 0, sizeof(cp));
- /* There's nothing to do if the data hasn't changed */
- if (hdev->adv_data_len == len &&
- memcmp(cp.data, hdev->adv_data, len) == 0)
- return;
+ len = create_instance_adv_data(hdev, instance, cp.data);
+
+ /* There's nothing to do if the data hasn't changed */
+ if (hdev->adv_data_len == len &&
+ memcmp(cp.data, hdev->adv_data, len) == 0)
+ return;
+
+ memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+ hdev->adv_data_len = len;
+
+ cp.length = len;
+ cp.handle = 0;
+ cp.operation = LE_SET_ADV_DATA_OP_COMPLETE;
+ cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, sizeof(cp), &cp);
+ } else {
+ struct hci_cp_le_set_adv_data cp;
+
+ memset(&cp, 0, sizeof(cp));
+
+ len = create_instance_adv_data(hdev, instance, cp.data);
+
+ /* There's nothing to do if the data hasn't changed */
+ if (hdev->adv_data_len == len &&
+ memcmp(cp.data, hdev->adv_data, len) == 0)
+ return;
- memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
- hdev->adv_data_len = len;
+ memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+ hdev->adv_data_len = len;
- cp.length = len;
+ cp.length = len;
- hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
+ hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
+ }
}
int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance)
@@ -1229,9 +1397,13 @@ void hci_req_reenable_advertising(struct hci_dev *hdev)
__hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance,
true);
} else {
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
- __hci_req_enable_advertising(&req);
+ if (ext_adv_capable(hdev)) {
+ __hci_req_start_ext_adv(&req, 0x00);
+ } else {
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ __hci_req_enable_advertising(&req);
+ }
}
hci_req_run(&req, adv_enable_complete);
@@ -1268,6 +1440,245 @@ unlock:
hci_dev_unlock(hdev);
}
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+ bool use_rpa, struct adv_info *adv_instance,
+ u8 *own_addr_type, bdaddr_t *rand_addr)
+{
+ int err;
+
+ bacpy(rand_addr, BDADDR_ANY);
+
+ /* If privacy is enabled use a resolvable private address. If
+ * current RPA has expired then generate a new one.
+ */
+ if (use_rpa) {
+ int to;
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+
+ if (adv_instance) {
+ if (!adv_instance->rpa_expired &&
+ !bacmp(&adv_instance->random_addr, &hdev->rpa))
+ return 0;
+
+ adv_instance->rpa_expired = false;
+ } else {
+ if (!hci_dev_test_and_clear_flag(hdev, HCI_RPA_EXPIRED) &&
+ !bacmp(&hdev->random_addr, &hdev->rpa))
+ return 0;
+ }
+
+ err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa);
+ if (err < 0) {
+ BT_ERR("%s failed to generate new RPA", hdev->name);
+ return err;
+ }
+
+ bacpy(rand_addr, &hdev->rpa);
+
+ to = msecs_to_jiffies(hdev->rpa_timeout * 1000);
+ if (adv_instance)
+ queue_delayed_work(hdev->workqueue,
+ &adv_instance->rpa_expired_cb, to);
+ else
+ queue_delayed_work(hdev->workqueue,
+ &hdev->rpa_expired, to);
+
+ return 0;
+ }
+
+ /* In case of required privacy without resolvable private address,
+ * use an non-resolvable private address. This is useful for
+ * non-connectable advertising.
+ */
+ if (require_privacy) {
+ bdaddr_t nrpa;
+
+ while (true) {
+ /* The non-resolvable private address is generated
+ * from random six bytes with the two most significant
+ * bits cleared.
+ */
+ get_random_bytes(&nrpa, 6);
+ nrpa.b[5] &= 0x3f;
+
+ /* The non-resolvable private address shall not be
+ * equal to the public address.
+ */
+ if (bacmp(&hdev->bdaddr, &nrpa))
+ break;
+ }
+
+ *own_addr_type = ADDR_LE_DEV_RANDOM;
+ bacpy(rand_addr, &nrpa);
+
+ return 0;
+ }
+
+ /* No privacy so use a public address. */
+ *own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ return 0;
+}
+
+void __hci_req_clear_ext_adv_sets(struct hci_request *req)
+{
+ hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL);
+}
+
+int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance)
+{
+ struct hci_cp_le_set_ext_adv_params cp;
+ struct hci_dev *hdev = req->hdev;
+ bool connectable;
+ u32 flags;
+ bdaddr_t random_addr;
+ u8 own_addr_type;
+ int err;
+ struct adv_info *adv_instance;
+ bool secondary_adv;
+ /* In ext adv set param interval is 3 octets */
+ const u8 adv_interval[3] = { 0x00, 0x08, 0x00 };
+
+ if (instance > 0) {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return -EINVAL;
+ } else {
+ adv_instance = NULL;
+ }
+
+ flags = get_adv_instance_flags(hdev, instance);
+
+ /* If the "connectable" instance flag was not set, then choose between
+ * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+ */
+ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+ mgmt_get_connectable(hdev);
+
+ if (!is_advertising_allowed(hdev, connectable))
+ return -EPERM;
+
+ /* Set require_privacy to true only when non-connectable
+ * advertising is used. In that case it is fine to use a
+ * non-resolvable private address.
+ */
+ err = hci_get_random_address(hdev, !connectable,
+ adv_use_rpa(hdev, flags), adv_instance,
+ &own_addr_type, &random_addr);
+ if (err < 0)
+ return err;
+
+ memset(&cp, 0, sizeof(cp));
+
+ memcpy(cp.min_interval, adv_interval, sizeof(cp.min_interval));
+ memcpy(cp.max_interval, adv_interval, sizeof(cp.max_interval));
+
+ secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK);
+
+ if (connectable) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND);
+ } else if (get_adv_instance_scan_rsp_len(hdev, instance)) {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND);
+ } else {
+ if (secondary_adv)
+ cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND);
+ else
+ cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND);
+ }
+
+ cp.own_addr_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+ cp.tx_power = 127;
+ cp.handle = 0;
+
+ if (flags & MGMT_ADV_FLAG_SEC_2M) {
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_2M;
+ } else if (flags & MGMT_ADV_FLAG_SEC_CODED) {
+ cp.primary_phy = HCI_ADV_PHY_CODED;
+ cp.secondary_phy = HCI_ADV_PHY_CODED;
+ } else {
+ /* In all other cases use 1M */
+ cp.primary_phy = HCI_ADV_PHY_1M;
+ cp.secondary_phy = HCI_ADV_PHY_1M;
+ }
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp);
+
+ if (own_addr_type == ADDR_LE_DEV_RANDOM &&
+ bacmp(&random_addr, BDADDR_ANY)) {
+ struct hci_cp_le_set_adv_set_rand_addr cp;
+
+ /* Check if random address need to be updated */
+ if (adv_instance) {
+ if (!bacmp(&random_addr, &adv_instance->random_addr))
+ return 0;
+ } else {
+ if (!bacmp(&random_addr, &hdev->random_addr))
+ return 0;
+ }
+
+ memset(&cp, 0, sizeof(cp));
+
+ cp.handle = 0;
+ bacpy(&cp.bdaddr, &random_addr);
+
+ hci_req_add(req,
+ HCI_OP_LE_SET_ADV_SET_RAND_ADDR,
+ sizeof(cp), &cp);
+ }
+
+ return 0;
+}
+
+void __hci_req_enable_ext_advertising(struct hci_request *req)
+{
+ struct hci_cp_le_set_ext_adv_enable *cp;
+ struct hci_cp_ext_adv_set *adv_set;
+ u8 data[sizeof(*cp) + sizeof(*adv_set) * 1];
+
+ cp = (void *) data;
+ adv_set = (void *) cp->data;
+
+ memset(cp, 0, sizeof(*cp));
+
+ cp->enable = 0x01;
+ cp->num_of_sets = 0x01;
+
+ memset(adv_set, 0, sizeof(*adv_set));
+
+ adv_set->handle = 0;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE,
+ sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets,
+ data);
+}
+
+int __hci_req_start_ext_adv(struct hci_request *req, u8 instance)
+{
+ struct hci_dev *hdev = req->hdev;
+ int err;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ __hci_req_disable_advertising(req);
+
+ err = __hci_req_setup_ext_adv_instance(req, instance);
+ if (err < 0)
+ return err;
+
+ __hci_req_update_scan_rsp_data(req, instance);
+ __hci_req_enable_ext_advertising(req);
+
+ return 0;
+}
+
int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
bool force)
{
@@ -1321,9 +1732,13 @@ int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
return 0;
hdev->cur_adv_instance = instance;
- __hci_req_update_adv_data(req, instance);
- __hci_req_update_scan_rsp_data(req, instance);
- __hci_req_enable_advertising(req);
+ if (ext_adv_capable(hdev)) {
+ __hci_req_start_ext_adv(req, instance);
+ } else {
+ __hci_req_update_adv_data(req, instance);
+ __hci_req_update_scan_rsp_data(req, instance);
+ __hci_req_enable_advertising(req);
+ }
return 0;
}
@@ -1594,8 +2009,12 @@ static int connectable_update(struct hci_request *req, unsigned long opt)
/* Update the advertising parameters if necessary */
if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- !list_empty(&hdev->adv_instances))
- __hci_req_enable_advertising(req);
+ !list_empty(&hdev->adv_instances)) {
+ if (ext_adv_capable(hdev))
+ __hci_req_start_ext_adv(req, hdev->cur_adv_instance);
+ else
+ __hci_req_enable_advertising(req);
+ }
__hci_update_background_scan(req);
@@ -1704,8 +2123,12 @@ static int discoverable_update(struct hci_request *req, unsigned long opt)
/* Discoverable mode affects the local advertising
* address in limited privacy mode.
*/
- if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
- __hci_req_enable_advertising(req);
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) {
+ if (ext_adv_capable(hdev))
+ __hci_req_start_ext_adv(req, 0x00);
+ else
+ __hci_req_enable_advertising(req);
+ }
}
hci_dev_unlock(hdev);
@@ -1940,7 +2363,6 @@ discov_stopped:
static int le_scan_restart(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_enable cp;
/* If controller is not scanning we are done. */
if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
@@ -1948,10 +2370,23 @@ static int le_scan_restart(struct hci_request *req, unsigned long opt)
hci_req_add_le_scan_disable(req);
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_ENABLE;
- cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ if (use_ext_scan(hdev)) {
+ struct hci_cp_le_set_ext_scan_enable ext_enable_cp;
+
+ memset(&ext_enable_cp, 0, sizeof(ext_enable_cp));
+ ext_enable_cp.enable = LE_SCAN_ENABLE;
+ ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+
+ hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE,
+ sizeof(ext_enable_cp), &ext_enable_cp);
+ } else {
+ struct hci_cp_le_set_scan_enable cp;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_ENABLE;
+ cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+ }
return 0;
}
@@ -2010,8 +2445,6 @@ static int active_scan(struct hci_request *req, unsigned long opt)
{
uint16_t interval = opt;
struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_param param_cp;
- struct hci_cp_le_set_scan_enable enable_cp;
u8 own_addr_type;
int err;
@@ -2050,22 +2483,8 @@ static int active_scan(struct hci_request *req, unsigned long opt)
if (err < 0)
own_addr_type = ADDR_LE_DEV_PUBLIC;
- memset(&param_cp, 0, sizeof(param_cp));
- param_cp.type = LE_SCAN_ACTIVE;
- param_cp.interval = cpu_to_le16(interval);
- param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN);
- param_cp.own_address_type = own_addr_type;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
- &param_cp);
-
- memset(&enable_cp, 0, sizeof(enable_cp));
- enable_cp.enable = LE_SCAN_ENABLE;
- enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
- &enable_cp);
-
+ hci_req_start_scan(req, LE_SCAN_ACTIVE, interval, DISCOV_LE_SCAN_WIN,
+ own_addr_type, 0);
return 0;
}
@@ -2302,11 +2721,26 @@ static int powered_update_hci(struct hci_request *req, unsigned long opt)
*/
if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
list_empty(&hdev->adv_instances)) {
- __hci_req_update_adv_data(req, 0x00);
- __hci_req_update_scan_rsp_data(req, 0x00);
-
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
- __hci_req_enable_advertising(req);
+ int err;
+
+ if (ext_adv_capable(hdev)) {
+ err = __hci_req_setup_ext_adv_instance(req,
+ 0x00);
+ if (!err)
+ __hci_req_update_scan_rsp_data(req,
+ 0x00);
+ } else {
+ err = 0;
+ __hci_req_update_adv_data(req, 0x00);
+ __hci_req_update_scan_rsp_data(req, 0x00);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ if (!ext_adv_capable(hdev))
+ __hci_req_enable_advertising(req);
+ else if (!err)
+ __hci_req_enable_ext_advertising(req);
+ }
} else if (!list_empty(&hdev->adv_instances)) {
struct adv_info *adv_instance;
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 702beb140d9f..692cc8b13368 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -80,6 +80,14 @@ void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk,
struct hci_request *req, u8 instance,
bool force);
+int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance);
+int __hci_req_start_ext_adv(struct hci_request *req, u8 instance);
+void __hci_req_enable_ext_advertising(struct hci_request *req);
+void __hci_req_clear_ext_adv_sets(struct hci_request *req);
+int hci_get_random_address(struct hci_dev *hdev, bool require_privacy,
+ bool use_rpa, struct adv_info *adv_instance,
+ u8 *own_addr_type, bdaddr_t *rand_addr);
+
void __hci_req_update_class(struct hci_request *req);
/* Returns true if HCI commands were queued */
diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c
index 1036e4fa1ea2..253975cce943 100644
--- a/net/bluetooth/hidp/core.c
+++ b/net/bluetooth/hidp/core.c
@@ -431,8 +431,8 @@ static void hidp_del_timer(struct hidp_session *session)
del_timer(&session->timer);
}
-static void hidp_process_report(struct hidp_session *session,
- int type, const u8 *data, int len, int intr)
+static void hidp_process_report(struct hidp_session *session, int type,
+ const u8 *data, unsigned int len, int intr)
{
if (len > HID_MAX_BUFFER_SIZE)
len = HID_MAX_BUFFER_SIZE;
@@ -775,7 +775,7 @@ static int hidp_setup_hid(struct hidp_session *session,
hid->version = req->version;
hid->country = req->country;
- strncpy(hid->name, req->name, sizeof(req->name) - 1);
+ strncpy(hid->name, req->name, sizeof(hid->name));
snprintf(hid->phys, sizeof(hid->phys), "%pMR",
&l2cap_pi(session->ctrl_sock->sk)->chan->src);
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 8a80d48d89c4..231602f7cb66 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -617,6 +617,127 @@ static int read_config_info(struct sock *sk, struct hci_dev *hdev,
&rp, sizeof(rp));
}
+static u32 get_supported_phys(struct hci_dev *hdev)
+{
+ u32 supported_phys = 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ supported_phys |= MGMT_PHY_BR_1M_1SLOT;
+
+ if (hdev->features[0][0] & LMP_3SLOT)
+ supported_phys |= MGMT_PHY_BR_1M_3SLOT;
+
+ if (hdev->features[0][0] & LMP_5SLOT)
+ supported_phys |= MGMT_PHY_BR_1M_5SLOT;
+
+ if (lmp_edr_2m_capable(hdev)) {
+ supported_phys |= MGMT_PHY_EDR_2M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_2M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_2M_5SLOT;
+
+ if (lmp_edr_3m_capable(hdev)) {
+ supported_phys |= MGMT_PHY_EDR_3M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_3M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev))
+ supported_phys |= MGMT_PHY_EDR_3M_5SLOT;
+ }
+ }
+ }
+
+ if (lmp_le_capable(hdev)) {
+ supported_phys |= MGMT_PHY_LE_1M_TX;
+ supported_phys |= MGMT_PHY_LE_1M_RX;
+
+ if (hdev->le_features[1] & HCI_LE_PHY_2M) {
+ supported_phys |= MGMT_PHY_LE_2M_TX;
+ supported_phys |= MGMT_PHY_LE_2M_RX;
+ }
+
+ if (hdev->le_features[1] & HCI_LE_PHY_CODED) {
+ supported_phys |= MGMT_PHY_LE_CODED_TX;
+ supported_phys |= MGMT_PHY_LE_CODED_RX;
+ }
+ }
+
+ return supported_phys;
+}
+
+static u32 get_selected_phys(struct hci_dev *hdev)
+{
+ u32 selected_phys = 0;
+
+ if (lmp_bredr_capable(hdev)) {
+ selected_phys |= MGMT_PHY_BR_1M_1SLOT;
+
+ if (hdev->pkt_type & (HCI_DM3 | HCI_DH3))
+ selected_phys |= MGMT_PHY_BR_1M_3SLOT;
+
+ if (hdev->pkt_type & (HCI_DM5 | HCI_DH5))
+ selected_phys |= MGMT_PHY_BR_1M_5SLOT;
+
+ if (lmp_edr_2m_capable(hdev)) {
+ if (!(hdev->pkt_type & HCI_2DH1))
+ selected_phys |= MGMT_PHY_EDR_2M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_2DH3))
+ selected_phys |= MGMT_PHY_EDR_2M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_2DH5))
+ selected_phys |= MGMT_PHY_EDR_2M_5SLOT;
+
+ if (lmp_edr_3m_capable(hdev)) {
+ if (!(hdev->pkt_type & HCI_3DH1))
+ selected_phys |= MGMT_PHY_EDR_3M_1SLOT;
+
+ if (lmp_edr_3slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_3DH3))
+ selected_phys |= MGMT_PHY_EDR_3M_3SLOT;
+
+ if (lmp_edr_5slot_capable(hdev) &&
+ !(hdev->pkt_type & HCI_3DH5))
+ selected_phys |= MGMT_PHY_EDR_3M_5SLOT;
+ }
+ }
+ }
+
+ if (lmp_le_capable(hdev)) {
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_1M)
+ selected_phys |= MGMT_PHY_LE_1M_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_1M)
+ selected_phys |= MGMT_PHY_LE_1M_RX;
+
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_2M)
+ selected_phys |= MGMT_PHY_LE_2M_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_2M)
+ selected_phys |= MGMT_PHY_LE_2M_RX;
+
+ if (hdev->le_tx_def_phys & HCI_LE_SET_PHY_CODED)
+ selected_phys |= MGMT_PHY_LE_CODED_TX;
+
+ if (hdev->le_rx_def_phys & HCI_LE_SET_PHY_CODED)
+ selected_phys |= MGMT_PHY_LE_CODED_RX;
+ }
+
+ return selected_phys;
+}
+
+static u32 get_configurable_phys(struct hci_dev *hdev)
+{
+ return (get_supported_phys(hdev) & ~MGMT_PHY_BR_1M_1SLOT &
+ ~MGMT_PHY_LE_1M_TX & ~MGMT_PHY_LE_1M_RX);
+}
+
static u32 get_supported_settings(struct hci_dev *hdev)
{
u32 settings = 0;
@@ -654,6 +775,8 @@ static u32 get_supported_settings(struct hci_dev *hdev)
hdev->set_bdaddr)
settings |= MGMT_SETTING_CONFIGURATION;
+ settings |= MGMT_SETTING_PHY_CONFIGURATION;
+
return settings;
}
@@ -817,7 +940,10 @@ static void rpa_expired(struct work_struct *work)
* function.
*/
hci_req_init(&req, hdev);
- __hci_req_enable_advertising(&req);
+ if (ext_adv_capable(hdev))
+ __hci_req_start_ext_adv(&req, hdev->cur_adv_instance);
+ else
+ __hci_req_enable_advertising(&req);
hci_req_run(&req, NULL);
}
@@ -1721,10 +1847,17 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
*/
if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
struct hci_request req;
-
hci_req_init(&req, hdev);
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
+ if (ext_adv_capable(hdev)) {
+ int err;
+
+ err = __hci_req_setup_ext_adv_instance(&req, 0x00);
+ if (!err)
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ } else {
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ }
hci_req_run(&req, NULL);
hci_update_background_scan(hdev);
}
@@ -1823,6 +1956,9 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
} else {
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
__hci_req_disable_advertising(&req);
+
+ if (ext_adv_capable(hdev))
+ __hci_req_clear_ext_adv_sets(&req);
}
hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp),
@@ -3184,6 +3320,228 @@ static int set_appearance(struct sock *sk, struct hci_dev *hdev, void *data,
return err;
}
+static int get_phy_configuration(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_rp_get_phy_confguration rp;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ hci_dev_lock(hdev);
+
+ memset(&rp, 0, sizeof(rp));
+
+ rp.supported_phys = cpu_to_le32(get_supported_phys(hdev));
+ rp.selected_phys = cpu_to_le32(get_selected_phys(hdev));
+ rp.configurable_phys = cpu_to_le32(get_configurable_phys(hdev));
+
+ hci_dev_unlock(hdev);
+
+ return mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_PHY_CONFIGURATION, 0,
+ &rp, sizeof(rp));
+}
+
+int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip)
+{
+ struct mgmt_ev_phy_configuration_changed ev;
+
+ memset(&ev, 0, sizeof(ev));
+
+ ev.selected_phys = cpu_to_le32(get_selected_phys(hdev));
+
+ return mgmt_event(MGMT_EV_PHY_CONFIGURATION_CHANGED, hdev, &ev,
+ sizeof(ev), skip);
+}
+
+static void set_default_phy_complete(struct hci_dev *hdev, u8 status,
+ u16 opcode, struct sk_buff *skb)
+{
+ struct mgmt_cp_set_phy_confguration *cp;
+ struct mgmt_pending_cmd *cmd;
+
+ BT_DBG("status 0x%02x", status);
+
+ hci_dev_lock(hdev);
+
+ cmd = pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev);
+ if (!cmd)
+ goto unlock;
+
+ cp = cmd->param;
+
+ if (status) {
+ mgmt_cmd_status(cmd->sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ mgmt_status(status));
+ } else {
+ mgmt_cmd_complete(cmd->sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION, 0,
+ NULL, 0);
+
+ mgmt_phy_configuration_changed(hdev, cmd->sk);
+ }
+
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ struct mgmt_cp_set_phy_confguration *cp = data;
+ struct hci_cp_le_set_default_phy cp_phy;
+ struct mgmt_pending_cmd *cmd;
+ struct hci_request req;
+ u32 selected_phys, configurable_phys, supported_phys, unconfigure_phys;
+ u16 pkt_type = (HCI_DH1 | HCI_DM1);
+ bool changed = false;
+ int err;
+
+ BT_DBG("sock %p %s", sk, hdev->name);
+
+ configurable_phys = get_configurable_phys(hdev);
+ supported_phys = get_supported_phys(hdev);
+ selected_phys = __le32_to_cpu(cp->selected_phys);
+
+ if (selected_phys & ~supported_phys)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ unconfigure_phys = supported_phys & ~configurable_phys;
+
+ if ((selected_phys & unconfigure_phys) != unconfigure_phys)
+ return mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (selected_phys == get_selected_phys(hdev))
+ return mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ 0, NULL, 0);
+
+ hci_dev_lock(hdev);
+
+ if (!hdev_is_powered(hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_REJECTED);
+ goto unlock;
+ }
+
+ if (pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) {
+ err = mgmt_cmd_status(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ MGMT_STATUS_BUSY);
+ goto unlock;
+ }
+
+ if (selected_phys & MGMT_PHY_BR_1M_3SLOT)
+ pkt_type |= (HCI_DH3 | HCI_DM3);
+ else
+ pkt_type &= ~(HCI_DH3 | HCI_DM3);
+
+ if (selected_phys & MGMT_PHY_BR_1M_5SLOT)
+ pkt_type |= (HCI_DH5 | HCI_DM5);
+ else
+ pkt_type &= ~(HCI_DH5 | HCI_DM5);
+
+ if (selected_phys & MGMT_PHY_EDR_2M_1SLOT)
+ pkt_type &= ~HCI_2DH1;
+ else
+ pkt_type |= HCI_2DH1;
+
+ if (selected_phys & MGMT_PHY_EDR_2M_3SLOT)
+ pkt_type &= ~HCI_2DH3;
+ else
+ pkt_type |= HCI_2DH3;
+
+ if (selected_phys & MGMT_PHY_EDR_2M_5SLOT)
+ pkt_type &= ~HCI_2DH5;
+ else
+ pkt_type |= HCI_2DH5;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_1SLOT)
+ pkt_type &= ~HCI_3DH1;
+ else
+ pkt_type |= HCI_3DH1;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_3SLOT)
+ pkt_type &= ~HCI_3DH3;
+ else
+ pkt_type |= HCI_3DH3;
+
+ if (selected_phys & MGMT_PHY_EDR_3M_5SLOT)
+ pkt_type &= ~HCI_3DH5;
+ else
+ pkt_type |= HCI_3DH5;
+
+ if (pkt_type != hdev->pkt_type) {
+ hdev->pkt_type = pkt_type;
+ changed = true;
+ }
+
+ if ((selected_phys & MGMT_PHY_LE_MASK) ==
+ (get_selected_phys(hdev) & MGMT_PHY_LE_MASK)) {
+ if (changed)
+ mgmt_phy_configuration_changed(hdev, sk);
+
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_SET_PHY_CONFIGURATION,
+ 0, NULL, 0);
+
+ goto unlock;
+ }
+
+ cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data,
+ len);
+ if (!cmd) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+
+ hci_req_init(&req, hdev);
+
+ memset(&cp_phy, 0, sizeof(cp_phy));
+
+ if (!(selected_phys & MGMT_PHY_LE_TX_MASK))
+ cp_phy.all_phys |= 0x01;
+
+ if (!(selected_phys & MGMT_PHY_LE_RX_MASK))
+ cp_phy.all_phys |= 0x02;
+
+ if (selected_phys & MGMT_PHY_LE_1M_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (selected_phys & MGMT_PHY_LE_2M_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (selected_phys & MGMT_PHY_LE_CODED_TX)
+ cp_phy.tx_phys |= HCI_LE_SET_PHY_CODED;
+
+ if (selected_phys & MGMT_PHY_LE_1M_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (selected_phys & MGMT_PHY_LE_2M_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (selected_phys & MGMT_PHY_LE_CODED_RX)
+ cp_phy.rx_phys |= HCI_LE_SET_PHY_CODED;
+
+ hci_req_add(&req, HCI_OP_LE_SET_DEFAULT_PHY, sizeof(cp_phy), &cp_phy);
+
+ err = hci_req_run_skb(&req, set_default_phy_complete);
+ if (err < 0)
+ mgmt_pending_remove(cmd);
+
+unlock:
+ hci_dev_unlock(hdev);
+
+ return err;
+}
+
static void read_local_oob_data_complete(struct hci_dev *hdev, u8 status,
u16 opcode, struct sk_buff *skb)
{
@@ -4037,9 +4395,14 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
* HCI_ADVERTISING flag is not yet set.
*/
hdev->cur_adv_instance = 0x00;
- __hci_req_update_adv_data(&req, 0x00);
- __hci_req_update_scan_rsp_data(&req, 0x00);
- __hci_req_enable_advertising(&req);
+
+ if (ext_adv_capable(hdev)) {
+ __hci_req_start_ext_adv(&req, 0x00);
+ } else {
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ __hci_req_enable_advertising(&req);
+ }
} else {
__hci_req_disable_advertising(&req);
}
@@ -4609,6 +4972,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, true);
if (cp->privacy == 0x02)
hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY);
else
@@ -4617,6 +4981,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
memset(hdev->irk, 0, sizeof(hdev->irk));
hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+ hci_adv_instances_set_rpa_expired(hdev, false);
hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
}
@@ -5967,9 +6332,23 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev)
flags |= MGMT_ADV_FLAG_APPEARANCE;
flags |= MGMT_ADV_FLAG_LOCAL_NAME;
- if (hdev->adv_tx_power != HCI_TX_POWER_INVALID)
+ /* In extended adv TX_POWER returned from Set Adv Param
+ * will be always valid.
+ */
+ if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) ||
+ ext_adv_capable(hdev))
flags |= MGMT_ADV_FLAG_TX_POWER;
+ if (ext_adv_capable(hdev)) {
+ flags |= MGMT_ADV_FLAG_SEC_1M;
+
+ if (hdev->le_features[1] & HCI_LE_PHY_2M)
+ flags |= MGMT_ADV_FLAG_SEC_2M;
+
+ if (hdev->le_features[1] & HCI_LE_PHY_CODED)
+ flags |= MGMT_ADV_FLAG_SEC_CODED;
+ }
+
return flags;
}
@@ -6175,7 +6554,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
struct mgmt_cp_add_advertising *cp = data;
struct mgmt_rp_add_advertising rp;
u32 flags;
- u32 supported_flags;
+ u32 supported_flags, phy_flags;
u8 status;
u16 timeout, duration;
unsigned int prev_instance_cnt = hdev->adv_instance_cnt;
@@ -6205,10 +6584,12 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
duration = __le16_to_cpu(cp->duration);
/* The current implementation only supports a subset of the specified
- * flags.
+ * flags. Also need to check mutual exclusiveness of sec flags.
*/
supported_flags = get_supported_adv_flags(hdev);
- if (flags & ~supported_flags)
+ phy_flags = flags & MGMT_ADV_FLAG_SEC_MASK;
+ if (flags & ~supported_flags ||
+ ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
@@ -6544,6 +6925,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ read_ext_controller_info,MGMT_READ_EXT_INFO_SIZE,
HCI_MGMT_UNTRUSTED },
{ set_appearance, MGMT_SET_APPEARANCE_SIZE },
+ { get_phy_configuration, MGMT_GET_PHY_CONFIGURATION_SIZE },
+ { set_phy_configuration, MGMT_SET_PHY_CONFIGURATION_SIZE },
};
void mgmt_index_added(struct hci_dev *hdev)
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index 413b8ee49fec..8f0f9279eac9 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -393,7 +393,8 @@ static void sco_sock_cleanup_listen(struct sock *parent)
*/
static void sco_sock_kill(struct sock *sk)
{
- if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket)
+ if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket ||
+ sock_flag(sk, SOCK_DEAD))
return;
BT_DBG("sk %p state %d", sk, sk->sk_state);
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 68c3578343b4..22a78eedf4b1 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -96,6 +96,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
u32 size = kattr->test.data_size_in;
u32 repeat = kattr->test.repeat;
u32 retval, duration;
+ int hh_len = ETH_HLEN;
struct sk_buff *skb;
void *data;
int ret;
@@ -131,12 +132,22 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
skb_reset_network_header(skb);
if (is_l2)
- __skb_push(skb, ETH_HLEN);
+ __skb_push(skb, hh_len);
if (is_direct_pkt_access)
bpf_compute_data_pointers(skb);
retval = bpf_test_run(prog, skb, repeat, &duration);
- if (!is_l2)
- __skb_push(skb, ETH_HLEN);
+ if (!is_l2) {
+ if (skb_headroom(skb) < hh_len) {
+ int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));
+
+ if (pskb_expand_head(skb, nhead, 0, GFP_USER)) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
+ }
+ memset(__skb_push(skb, hh_len), 0, hh_len);
+ }
+
size = skb->len;
/* bpf program can never convert linear skb to non-linear */
if (WARN_ON_ONCE(skb_is_nonlinear(skb)))
diff --git a/net/bpfilter/Kconfig b/net/bpfilter/Kconfig
index 76deb6615883..e558b46596c4 100644
--- a/net/bpfilter/Kconfig
+++ b/net/bpfilter/Kconfig
@@ -13,4 +13,3 @@ config BPFILTER_UMH
help
This builds bpfilter kernel module with embedded user mode helper
endif
-
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 9019f326fe81..5372e2042adf 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -142,7 +142,20 @@ static int deliver_clone(const struct net_bridge_port *prev,
void br_forward(const struct net_bridge_port *to,
struct sk_buff *skb, bool local_rcv, bool local_orig)
{
- if (to && should_deliver(to, skb)) {
+ if (unlikely(!to))
+ goto out;
+
+ /* redirect to backup link if the destination port is down */
+ if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
+ struct net_bridge_port *backup_port;
+
+ backup_port = rcu_dereference(to->backup_port);
+ if (unlikely(!backup_port))
+ goto out;
+ to = backup_port;
+ }
+
+ if (should_deliver(to, skb)) {
if (local_rcv)
deliver_clone(to, skb, local_orig);
else
@@ -150,6 +163,7 @@ void br_forward(const struct net_bridge_port *to,
return;
}
+out:
if (!local_rcv)
kfree_skb(skb);
}
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 05e42d86882d..0363f1bdc401 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -26,6 +26,7 @@
#include <net/sock.h>
#include <linux/if_vlan.h>
#include <net/switchdev.h>
+#include <net/net_namespace.h>
#include "br_private.h"
@@ -169,6 +170,58 @@ void br_manage_promisc(struct net_bridge *br)
}
}
+int nbp_backup_change(struct net_bridge_port *p,
+ struct net_device *backup_dev)
+{
+ struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port);
+ struct net_bridge_port *backup_p = NULL;
+
+ ASSERT_RTNL();
+
+ if (backup_dev) {
+ if (!br_port_exists(backup_dev))
+ return -ENOENT;
+
+ backup_p = br_port_get_rtnl(backup_dev);
+ if (backup_p->br != p->br)
+ return -EINVAL;
+ }
+
+ if (p == backup_p)
+ return -EINVAL;
+
+ if (old_backup == backup_p)
+ return 0;
+
+ /* if the backup link is already set, clear it */
+ if (old_backup)
+ old_backup->backup_redirected_cnt--;
+
+ if (backup_p)
+ backup_p->backup_redirected_cnt++;
+ rcu_assign_pointer(p->backup_port, backup_p);
+
+ return 0;
+}
+
+static void nbp_backup_clear(struct net_bridge_port *p)
+{
+ nbp_backup_change(p, NULL);
+ if (p->backup_redirected_cnt) {
+ struct net_bridge_port *cur_p;
+
+ list_for_each_entry(cur_p, &p->br->port_list, list) {
+ struct net_bridge_port *backup_p;
+
+ backup_p = rtnl_dereference(cur_p->backup_port);
+ if (backup_p == p)
+ nbp_backup_change(cur_p, NULL);
+ }
+ }
+
+ WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
+}
+
static void nbp_update_port_count(struct net_bridge *br)
{
struct net_bridge_port *p;
@@ -204,11 +257,19 @@ static void release_nbp(struct kobject *kobj)
kfree(p);
}
+static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid)
+{
+ struct net_bridge_port *p = kobj_to_brport(kobj);
+
+ net_ns_get_ownership(dev_net(p->dev), uid, gid);
+}
+
static struct kobj_type brport_ktype = {
#ifdef CONFIG_SYSFS
.sysfs_ops = &brport_sysfs_ops,
#endif
.release = release_nbp,
+ .get_ownership = brport_get_ownership,
};
static void destroy_nbp(struct net_bridge_port *p)
@@ -286,6 +347,7 @@ static void del_nbp(struct net_bridge_port *p)
nbp_vlan_flush(p);
br_fdb_delete_by_port(br, p, 0, 1);
switchdev_deferred_process();
+ nbp_backup_clear(p);
nbp_update_port_count(br);
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 9b16eaf33819..6e0dc6bcd32a 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -26,6 +26,7 @@
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <linux/netfilter_bridge.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_arp.h>
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 9f5eb05b0373..ec2b58a09f76 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -169,13 +169,15 @@ static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
+ nla_total_size(br_get_link_af_size_filtered(dev,
- filter_mask)); /* IFLA_AF_SPEC */
+ filter_mask)) /* IFLA_AF_SPEC */
+ + nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */
}
static int br_port_fill_attrs(struct sk_buff *skb,
const struct net_bridge_port *p)
{
u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
+ struct net_bridge_port *backup_p;
u64 timerval;
if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
@@ -237,6 +239,14 @@ static int br_port_fill_attrs(struct sk_buff *skb,
return -EMSGSIZE;
#endif
+ /* we might be called only with br->lock */
+ rcu_read_lock();
+ backup_p = rcu_dereference(p->backup_port);
+ if (backup_p)
+ nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT,
+ backup_p->dev->ifindex);
+ rcu_read_unlock();
+
return 0;
}
@@ -663,6 +673,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
[IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
+ [IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
};
/* Change the state of the port and notify spanning tree */
@@ -817,6 +828,23 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
if (err)
return err;
+ if (tb[IFLA_BRPORT_BACKUP_PORT]) {
+ struct net_device *backup_dev = NULL;
+ u32 backup_ifindex;
+
+ backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
+ if (backup_ifindex) {
+ backup_dev = __dev_get_by_index(dev_net(p->dev),
+ backup_ifindex);
+ if (!backup_dev)
+ return -ENOENT;
+ }
+
+ err = nbp_backup_change(p, backup_dev);
+ if (err)
+ return err;
+ }
+
br_port_flags_change(p, old_flags ^ p->flags);
return 0;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 5216a524b537..11ed2029985f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -237,6 +237,7 @@ struct net_bridge_port {
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
+ struct net_bridge_port __rcu *backup_port;
/* STP */
u8 priority;
@@ -281,8 +282,11 @@ struct net_bridge_port {
int offload_fwd_mark;
#endif
u16 group_fwd_mask;
+ u16 backup_redirected_cnt;
};
+#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
+
#define br_auto_port(p) ((p)->flags & BR_AUTO_MASK)
#define br_promisc_port(p) ((p)->flags & BR_PROMISC)
@@ -595,6 +599,7 @@ netdev_features_t br_features_recompute(struct net_bridge *br,
netdev_features_t features);
void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
void br_manage_promisc(struct net_bridge *br);
+int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
/* br_input.c */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index f99c5bf5c906..7c87a2fe5248 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -25,6 +25,15 @@ struct brport_attribute {
struct attribute attr;
ssize_t (*show)(struct net_bridge_port *, char *);
int (*store)(struct net_bridge_port *, unsigned long);
+ int (*store_raw)(struct net_bridge_port *, char *);
+};
+
+#define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \
+const struct brport_attribute brport_attr_##_name = { \
+ .attr = {.name = __stringify(_name), \
+ .mode = _mode }, \
+ .show = _show, \
+ .store_raw = _store, \
};
#define BRPORT_ATTR(_name, _mode, _show, _store) \
@@ -182,6 +191,38 @@ static int store_group_fwd_mask(struct net_bridge_port *p,
static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask,
store_group_fwd_mask);
+static ssize_t show_backup_port(struct net_bridge_port *p, char *buf)
+{
+ struct net_bridge_port *backup_p;
+ int ret = 0;
+
+ rcu_read_lock();
+ backup_p = rcu_dereference(p->backup_port);
+ if (backup_p)
+ ret = sprintf(buf, "%s\n", backup_p->dev->name);
+ rcu_read_unlock();
+
+ return ret;
+}
+
+static int store_backup_port(struct net_bridge_port *p, char *buf)
+{
+ struct net_device *backup_dev = NULL;
+ char *nl = strchr(buf, '\n');
+
+ if (nl)
+ *nl = '\0';
+
+ if (strlen(buf) > 0) {
+ backup_dev = __dev_get_by_name(dev_net(p->dev), buf);
+ if (!backup_dev)
+ return -ENOENT;
+ }
+
+ return nbp_backup_change(p, backup_dev);
+}
+static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port);
+
BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
@@ -245,17 +286,17 @@ static const struct brport_attribute *brport_attrs[] = {
&brport_attr_group_fwd_mask,
&brport_attr_neigh_suppress,
&brport_attr_isolated,
+ &brport_attr_backup_port,
NULL
};
#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr)
-#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
static ssize_t brport_show(struct kobject *kobj,
struct attribute *attr, char *buf)
{
struct brport_attribute *brport_attr = to_brport_attr(attr);
- struct net_bridge_port *p = to_brport(kobj);
+ struct net_bridge_port *p = kobj_to_brport(kobj);
if (!brport_attr->show)
return -EINVAL;
@@ -268,29 +309,48 @@ static ssize_t brport_store(struct kobject *kobj,
const char *buf, size_t count)
{
struct brport_attribute *brport_attr = to_brport_attr(attr);
- struct net_bridge_port *p = to_brport(kobj);
+ struct net_bridge_port *p = kobj_to_brport(kobj);
ssize_t ret = -EINVAL;
- char *endp;
unsigned long val;
+ char *endp;
if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
- val = simple_strtoul(buf, &endp, 0);
- if (endp != buf) {
- if (!rtnl_trylock())
- return restart_syscall();
- if (p->dev && p->br && brport_attr->store) {
- spin_lock_bh(&p->br->lock);
- ret = brport_attr->store(p, val);
- spin_unlock_bh(&p->br->lock);
- if (!ret) {
- br_ifinfo_notify(RTM_NEWLINK, NULL, p);
- ret = count;
- }
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ if (!p->dev || !p->br)
+ goto out_unlock;
+
+ if (brport_attr->store_raw) {
+ char *buf_copy;
+
+ buf_copy = kstrndup(buf, count, GFP_KERNEL);
+ if (!buf_copy) {
+ ret = -ENOMEM;
+ goto out_unlock;
}
- rtnl_unlock();
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store_raw(p, buf_copy);
+ spin_unlock_bh(&p->br->lock);
+ kfree(buf_copy);
+ } else if (brport_attr->store) {
+ val = simple_strtoul(buf, &endp, 0);
+ if (endp == buf)
+ goto out_unlock;
+ spin_lock_bh(&p->br->lock);
+ ret = brport_attr->store(p, val);
+ spin_unlock_bh(&p->br->lock);
}
+
+ if (!ret) {
+ br_ifinfo_notify(RTM_NEWLINK, NULL, p);
+ ret = count;
+ }
+out_unlock:
+ rtnl_unlock();
+
return ret;
}
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index c41da5fac84f..550324c516ee 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -9,6 +9,7 @@
*/
#include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/module.h>
#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index 08df7406ecb3..c0fb3ca518af 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -9,6 +9,7 @@
*/
#include <linux/netfilter_bridge/ebtables.h>
+#include <uapi/linux/netfilter_bridge.h>
#include <linux/module.h>
#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index 6de981270566..08cbed7d940e 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -89,8 +89,7 @@ static void nft_reject_br_send_v4_tcp_reset(struct net *net,
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
net->ipv4.sysctl_ip_default_ttl);
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- niph->ttl = net->ipv4.sysctl_ip_default_ttl;
- niph->tot_len = htons(nskb->len);
+ niph->tot_len = htons(nskb->len);
ip_send_check(niph);
nft_reject_br_push_etherhdr(oldskb, nskb);
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index e0adcd123f48..711d7156efd8 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -131,8 +131,10 @@ static void caif_flow_cb(struct sk_buff *skb)
caifd = caif_get(skb->dev);
WARN_ON(caifd == NULL);
- if (caifd == NULL)
+ if (!caifd) {
+ rcu_read_unlock();
return;
+ }
caifd_hold(caifd);
rcu_read_unlock();
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index a6fb1b3bcad9..d18965f3291f 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -941,7 +941,7 @@ static __poll_t caif_poll(struct file *file,
__poll_t mask;
struct caifsock *cf_sk = container_of(sk, struct caifsock, sk);
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
mask = 0;
/* exceptional events? */
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9938952c5c78..9aac0d63d53e 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -837,7 +837,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock,
struct sock *sk = sock->sk;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
mask = 0;
/* exceptional events? */
diff --git a/net/core/dev.c b/net/core/dev.c
index 14a748ee8cc9..f68122f0ab02 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4252,7 +4252,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb,
/* Reinjected packets coming from act_mirred or similar should
* not get XDP generic processing.
*/
- if (skb_cloned(skb))
+ if (skb_cloned(skb) || skb_is_tc_redirected(skb))
return XDP_PASS;
/* XDP packets must be linear and must have sufficient headroom
@@ -4602,6 +4602,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
return NULL;
+ case TC_ACT_REINSERT:
+ /* this does not scrub the packet, and updates stats on error */
+ skb_tc_reinsert(skb, &cl_res);
+ return NULL;
default:
break;
}
@@ -5042,7 +5046,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
break;
case XDP_QUERY_PROG:
- xdp->prog_attached = !!old;
xdp->prog_id = old ? old->aux->id : 0;
break;
@@ -5283,9 +5286,11 @@ static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
list_del(&skb->list);
skb->next = NULL;
napi_gro_complete(skb);
- napi->gro_count--;
napi->gro_hash[index].count--;
}
+
+ if (!napi->gro_hash[index].count)
+ __clear_bit(index, &napi->gro_bitmask);
}
/* napi->gro_hash[].list contains packets ordered by age.
@@ -5296,8 +5301,10 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
u32 i;
- for (i = 0; i < GRO_HASH_BUCKETS; i++)
- __napi_gro_flush_chain(napi, i, flush_old);
+ for (i = 0; i < GRO_HASH_BUCKETS; i++) {
+ if (test_bit(i, &napi->gro_bitmask))
+ __napi_gro_flush_chain(napi, i, flush_old);
+ }
}
EXPORT_SYMBOL(napi_gro_flush);
@@ -5389,8 +5396,8 @@ static void gro_flush_oldest(struct list_head *head)
if (WARN_ON_ONCE(!oldest))
return;
- /* Do not adjust napi->gro_count, caller is adding a new SKB to
- * the chain.
+ /* Do not adjust napi->gro_hash[].count, caller is adding a new
+ * SKB to the chain.
*/
list_del(&oldest->list);
napi_gro_complete(oldest);
@@ -5465,7 +5472,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
list_del(&pp->list);
pp->next = NULL;
napi_gro_complete(pp);
- napi->gro_count--;
napi->gro_hash[hash].count--;
}
@@ -5478,7 +5484,6 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
gro_flush_oldest(gro_head);
} else {
- napi->gro_count++;
napi->gro_hash[hash].count++;
}
NAPI_GRO_CB(skb)->count = 1;
@@ -5493,6 +5498,13 @@ pull:
if (grow > 0)
gro_pull_from_frag0(skb, grow);
ok:
+ if (napi->gro_hash[hash].count) {
+ if (!test_bit(hash, &napi->gro_bitmask))
+ __set_bit(hash, &napi->gro_bitmask);
+ } else if (test_bit(hash, &napi->gro_bitmask)) {
+ __clear_bit(hash, &napi->gro_bitmask);
+ }
+
return ret;
normal:
@@ -5891,7 +5903,7 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
NAPIF_STATE_IN_BUSY_POLL)))
return false;
- if (n->gro_count) {
+ if (n->gro_bitmask) {
unsigned long timeout = 0;
if (work_done)
@@ -6100,26 +6112,31 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
/* Note : we use a relaxed variant of napi_schedule_prep() not setting
* NAPI_STATE_MISSED, since we do not react to a device IRQ.
*/
- if (napi->gro_count && !napi_disable_pending(napi) &&
+ if (napi->gro_bitmask && !napi_disable_pending(napi) &&
!test_and_set_bit(NAPI_STATE_SCHED, &napi->state))
__napi_schedule_irqoff(napi);
return HRTIMER_NORESTART;
}
-void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
- int (*poll)(struct napi_struct *, int), int weight)
+static void init_gro_hash(struct napi_struct *napi)
{
int i;
- INIT_LIST_HEAD(&napi->poll_list);
- hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
- napi->timer.function = napi_watchdog;
- napi->gro_count = 0;
for (i = 0; i < GRO_HASH_BUCKETS; i++) {
INIT_LIST_HEAD(&napi->gro_hash[i].list);
napi->gro_hash[i].count = 0;
}
+ napi->gro_bitmask = 0;
+}
+
+void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
+ int (*poll)(struct napi_struct *, int), int weight)
+{
+ INIT_LIST_HEAD(&napi->poll_list);
+ hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+ napi->timer.function = napi_watchdog;
+ init_gro_hash(napi);
napi->skb = NULL;
napi->poll = poll;
if (weight > NAPI_POLL_WEIGHT)
@@ -6175,7 +6192,7 @@ void netif_napi_del(struct napi_struct *napi)
napi_free_frags(napi);
flush_gro_hash(napi);
- napi->gro_count = 0;
+ napi->gro_bitmask = 0;
}
EXPORT_SYMBOL(netif_napi_del);
@@ -6217,7 +6234,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll)
goto out_unlock;
}
- if (n->gro_count) {
+ if (n->gro_bitmask) {
/* flush too old packets
* If HZ < 1000, flush all packets.
*/
@@ -7510,13 +7527,15 @@ int __dev_set_mtu(struct net_device *dev, int new_mtu)
EXPORT_SYMBOL(__dev_set_mtu);
/**
- * dev_set_mtu - Change maximum transfer unit
+ * dev_set_mtu_ext - Change maximum transfer unit
* @dev: device
* @new_mtu: new transfer unit
+ * @extack: netlink extended ack
*
* Change the maximum transfer size of the network device.
*/
-int dev_set_mtu(struct net_device *dev, int new_mtu)
+int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
+ struct netlink_ext_ack *extack)
{
int err, orig_mtu;
@@ -7525,14 +7544,12 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
/* MTU must be positive, and in range */
if (new_mtu < 0 || new_mtu < dev->min_mtu) {
- net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
- dev->name, new_mtu, dev->min_mtu);
+ NL_SET_ERR_MSG(extack, "mtu less than device minimum");
return -EINVAL;
}
if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
- net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
- dev->name, new_mtu, dev->max_mtu);
+ NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
return -EINVAL;
}
@@ -7560,6 +7577,18 @@ int dev_set_mtu(struct net_device *dev, int new_mtu)
}
return err;
}
+
+int dev_set_mtu(struct net_device *dev, int new_mtu)
+{
+ struct netlink_ext_ack extack;
+ int err;
+
+ memset(&extack, 0, sizeof(extack));
+ err = dev_set_mtu_ext(dev, new_mtu, &extack);
+ if (err && extack._msg)
+ net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
+ return err;
+}
EXPORT_SYMBOL(dev_set_mtu);
/**
@@ -7579,16 +7608,19 @@ int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
dev->tx_queue_len = new_len;
res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
res = notifier_to_errno(res);
- if (res) {
- netdev_err(dev,
- "refused to change device tx_queue_len\n");
- dev->tx_queue_len = orig_len;
- return res;
- }
- return dev_qdisc_change_tx_queue_len(dev);
+ if (res)
+ goto err_rollback;
+ res = dev_qdisc_change_tx_queue_len(dev);
+ if (res)
+ goto err_rollback;
}
return 0;
+
+err_rollback:
+ netdev_err(dev, "refused to change device tx_queue_len\n");
+ dev->tx_queue_len = orig_len;
+ return res;
}
/**
@@ -7706,23 +7738,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
}
EXPORT_SYMBOL(dev_change_proto_down);
-void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
- struct netdev_bpf *xdp)
+u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op,
+ enum bpf_netdev_command cmd)
{
- memset(xdp, 0, sizeof(*xdp));
- xdp->command = XDP_QUERY_PROG;
+ struct netdev_bpf xdp;
- /* Query must always succeed. */
- WARN_ON(bpf_op(dev, xdp) < 0);
-}
+ if (!bpf_op)
+ return 0;
-static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op)
-{
- struct netdev_bpf xdp;
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = cmd;
- __dev_xdp_query(dev, bpf_op, &xdp);
+ /* Query must always succeed. */
+ WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG);
- return xdp.prog_attached;
+ return xdp.prog_id;
}
static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op,
@@ -7756,12 +7786,19 @@ static void dev_xdp_uninstall(struct net_device *dev)
if (!ndo_bpf)
return;
- __dev_xdp_query(dev, ndo_bpf, &xdp);
- if (xdp.prog_attached == XDP_ATTACHED_NONE)
- return;
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG;
+ WARN_ON(ndo_bpf(dev, &xdp));
+ if (xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
- /* Program removal should always succeed */
- WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL));
+ /* Remove HW offload */
+ memset(&xdp, 0, sizeof(xdp));
+ xdp.command = XDP_QUERY_PROG_HW;
+ if (!ndo_bpf(dev, &xdp) && xdp.prog_id)
+ WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags,
+ NULL));
}
/**
@@ -7777,12 +7814,15 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
int fd, u32 flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ enum bpf_netdev_command query;
struct bpf_prog *prog = NULL;
bpf_op_t bpf_op, bpf_chk;
int err;
ASSERT_RTNL();
+ query = flags & XDP_FLAGS_HW_MODE ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG;
+
bpf_op = bpf_chk = ops->ndo_bpf;
if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE)))
return -EOPNOTSUPP;
@@ -7792,10 +7832,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
bpf_chk = generic_xdp_install;
if (fd >= 0) {
- if (bpf_chk && __dev_xdp_attached(dev, bpf_chk))
+ if (__dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG) ||
+ __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG_HW))
return -EEXIST;
if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) &&
- __dev_xdp_attached(dev, bpf_op))
+ __dev_xdp_query(dev, bpf_op, query))
return -EBUSY;
prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
@@ -9264,6 +9305,9 @@ static struct hlist_head * __net_init netdev_create_hash(void)
/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
+ BUILD_BUG_ON(GRO_HASH_BUCKETS >
+ 8 * FIELD_SIZEOF(struct napi_struct, gro_bitmask));
+
if (net != &init_net)
INIT_LIST_HEAD(&net->dev_base_head);
@@ -9534,6 +9578,7 @@ static int __init net_dev_init(void)
sd->cpu = i;
#endif
+ init_gro_hash(&sd->backlog);
sd->backlog.poll = process_backlog;
sd->backlog.weight = weight_p;
}
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 50537ff961a7..90e8aa36881e 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -284,12 +284,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
case SIOCSIFTXQLEN:
if (ifr->ifr_qlen < 0)
return -EINVAL;
- if (dev->tx_queue_len ^ ifr->ifr_qlen) {
- err = dev_change_tx_queue_len(dev, ifr->ifr_qlen);
- if (err)
- return err;
- }
- return 0;
+ return dev_change_tx_queue_len(dev, ifr->ifr_qlen);
case SIOCSIFNAME:
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
diff --git a/net/core/dst.c b/net/core/dst.c
index 2d9b37f8944a..81ccf20e2826 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -307,6 +307,7 @@ void metadata_dst_free(struct metadata_dst *md_dst)
#endif
kfree(md_dst);
}
+EXPORT_SYMBOL_GPL(metadata_dst_free);
struct metadata_dst __percpu *
metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index e677a20180cf..c9993c6c2fd4 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -111,6 +111,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RX_UDP_TUNNEL_PORT_BIT] = "rx-udp_tunnel-port-offload",
[NETIF_F_HW_TLS_RECORD_BIT] = "tls-hw-record",
[NETIF_F_HW_TLS_TX_BIT] = "tls-hw-tx-offload",
+ [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
};
static const char
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index f64aa13811ea..0ff3953f64aa 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -924,8 +924,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh,
return 0;
errout:
- if (nlrule)
- kfree(nlrule);
+ kfree(nlrule);
rules_ops_put(ops);
return err;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index b9ec916f4e3a..7509bb7f0694 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -459,11 +459,21 @@ static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp)
(!unaligned_ok && offset >= 0 &&
offset + ip_align >= 0 &&
offset + ip_align % size == 0))) {
+ bool ldx_off_ok = offset <= S16_MAX;
+
*insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H);
*insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset);
- *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, size, 2 + endian);
- *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, BPF_REG_D,
- offset);
+ *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP,
+ size, 2 + endian + (!ldx_off_ok * 2));
+ if (ldx_off_ok) {
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_D, offset);
+ } else {
+ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D);
+ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset);
+ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A,
+ BPF_REG_TMP, 0);
+ }
if (endian)
*insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8);
*insn++ = BPF_JMP_A(8);
@@ -1702,24 +1712,26 @@ static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb,
u32, offset, void *, to, u32, len, u32, start_header)
{
+ u8 *end = skb_tail_pointer(skb);
+ u8 *net = skb_network_header(skb);
+ u8 *mac = skb_mac_header(skb);
u8 *ptr;
- if (unlikely(offset > 0xffff || len > skb_headlen(skb)))
+ if (unlikely(offset > 0xffff || len > (end - mac)))
goto err_clear;
switch (start_header) {
case BPF_HDR_START_MAC:
- ptr = skb_mac_header(skb) + offset;
+ ptr = mac + offset;
break;
case BPF_HDR_START_NET:
- ptr = skb_network_header(skb) + offset;
+ ptr = net + offset;
break;
default:
goto err_clear;
}
- if (likely(ptr >= skb_mac_header(skb) &&
- ptr + len <= skb_tail_pointer(skb))) {
+ if (likely(ptr >= mac && ptr + len <= end)) {
memcpy(to, ptr, len);
return 0;
}
@@ -1762,6 +1774,37 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
.arg2_type = ARG_ANYTHING,
};
+static inline int sk_skb_try_make_writable(struct sk_buff *skb,
+ unsigned int write_len)
+{
+ int err = __bpf_try_make_writable(skb, write_len);
+
+ bpf_compute_data_end_sk_skb(skb);
+ return err;
+}
+
+BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
+{
+ /* Idea is the following: should the needed direct read/write
+ * test fail during runtime, we can pull in more data and redo
+ * again, since implicitly, we invalidate previous checks here.
+ *
+ * Or, since we know how much we need to make read/writeable,
+ * this can be done once at the program beginning for direct
+ * access case. By this we overcome limitations of only current
+ * headroom being accessible.
+ */
+ return sk_skb_try_make_writable(skb, len ? : skb_headlen(skb));
+}
+
+static const struct bpf_func_proto sk_skb_pull_data_proto = {
+ .func = sk_skb_pull_data,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+};
+
BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset,
u64, from, u64, to, u64, flags)
{
@@ -2779,7 +2822,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 len_diff)
static u32 __bpf_skb_max_len(const struct sk_buff *skb)
{
- return skb->dev->mtu + skb->dev->hard_header_len;
+ return skb->dev ? skb->dev->mtu + skb->dev->hard_header_len :
+ SKB_MAX_ALLOC;
}
static int bpf_skb_adjust_net(struct sk_buff *skb, s32 len_diff)
@@ -2863,8 +2907,8 @@ static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len)
return __skb_trim_rcsum(skb, new_len);
}
-BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
- u64, flags)
+static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len,
+ u64 flags)
{
u32 max_len = __bpf_skb_max_len(skb);
u32 min_len = __bpf_skb_min_len(skb);
@@ -2900,6 +2944,13 @@ BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
if (!ret && skb_is_gso(skb))
skb_gso_reset(skb);
}
+ return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_tail(skb, new_len, flags);
bpf_compute_data_pointers(skb);
return ret;
@@ -2914,9 +2965,27 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = {
.arg3_type = ARG_ANYTHING,
};
-BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
u64, flags)
{
+ int ret = __bpf_skb_change_tail(skb, new_len, flags);
+
+ bpf_compute_data_end_sk_skb(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto sk_skb_change_tail_proto = {
+ .func = sk_skb_change_tail,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
+
+static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room,
+ u64 flags)
+{
u32 max_len = __bpf_skb_max_len(skb);
u32 new_len = skb->len + head_room;
int ret;
@@ -2941,8 +3010,16 @@ BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
skb_reset_mac_header(skb);
}
+ return ret;
+}
+
+BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_head(skb, head_room, flags);
+
bpf_compute_data_pointers(skb);
- return 0;
+ return ret;
}
static const struct bpf_func_proto bpf_skb_change_head_proto = {
@@ -2954,6 +3031,23 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = {
.arg3_type = ARG_ANYTHING,
};
+BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
+ u64, flags)
+{
+ int ret = __bpf_skb_change_head(skb, head_room, flags);
+
+ bpf_compute_data_end_sk_skb(skb);
+ return ret;
+}
+
+static const struct bpf_func_proto sk_skb_change_head_proto = {
+ .func = sk_skb_change_head,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_ANYTHING,
+};
static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
{
return xdp_data_meta_unsupported(xdp) ? 0 :
@@ -3046,12 +3140,16 @@ static int __bpf_tx_xdp(struct net_device *dev,
u32 index)
{
struct xdp_frame *xdpf;
- int sent;
+ int err, sent;
if (!dev->netdev_ops->ndo_xdp_xmit) {
return -EOPNOTSUPP;
}
+ err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
+ if (unlikely(err))
+ return err;
+
xdpf = convert_to_xdp_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
@@ -3285,7 +3383,8 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
goto err;
}
- if (unlikely((err = __xdp_generic_ok_fwd_dev(skb, fwd))))
+ err = xdp_ok_fwd_dev(fwd, skb->len);
+ if (unlikely(err))
goto err;
skb->dev = fwd;
@@ -4439,10 +4538,10 @@ static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
.arg4_type = ARG_CONST_SIZE
};
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
const void *, from, u32, len)
{
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
void *srh_tlvs, *srh_end, *ptr;
@@ -4468,9 +4567,6 @@ BPF_CALL_4(bpf_lwt_seg6_store_bytes, struct sk_buff *, skb, u32, offset,
memcpy(skb->data + offset, from, len);
return 0;
-#else /* CONFIG_IPV6_SEG6_BPF */
- return -EOPNOTSUPP;
-#endif
}
static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
@@ -4486,7 +4582,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_store_bytes_proto = {
BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
u32, action, void *, param, u32, param_len)
{
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
struct ipv6_sr_hdr *srh;
@@ -4534,9 +4629,6 @@ BPF_CALL_4(bpf_lwt_seg6_action, struct sk_buff *, skb,
default:
return -EINVAL;
}
-#else /* CONFIG_IPV6_SEG6_BPF */
- return -EOPNOTSUPP;
-#endif
}
static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
@@ -4552,7 +4644,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_action_proto = {
BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
s32, len)
{
-#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
struct seg6_bpf_srh_state *srh_state =
this_cpu_ptr(&seg6_bpf_srh_states);
void *srh_end, *srh_tlvs, *ptr;
@@ -4596,9 +4687,6 @@ BPF_CALL_3(bpf_lwt_seg6_adjust_srh, struct sk_buff *, skb, u32, offset,
srh_state->hdrlen += len;
srh_state->valid = 0;
return 0;
-#else /* CONFIG_IPV6_SEG6_BPF */
- return -EOPNOTSUPP;
-#endif
}
static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
@@ -4609,6 +4697,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_ANYTHING,
};
+#endif /* CONFIG_IPV6_SEG6_BPF */
bool bpf_helper_changes_pkt_data(void *func)
{
@@ -4617,9 +4706,12 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_skb_store_bytes ||
func == bpf_skb_change_proto ||
func == bpf_skb_change_head ||
+ func == sk_skb_change_head ||
func == bpf_skb_change_tail ||
+ func == sk_skb_change_tail ||
func == bpf_skb_adjust_room ||
func == bpf_skb_pull_data ||
+ func == sk_skb_pull_data ||
func == bpf_clone_redirect ||
func == bpf_l3_csum_replace ||
func == bpf_l4_csum_replace ||
@@ -4627,11 +4719,12 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_xdp_adjust_meta ||
func == bpf_msg_pull_data ||
func == bpf_xdp_adjust_tail ||
- func == bpf_lwt_push_encap ||
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
func == bpf_lwt_seg6_store_bytes ||
func == bpf_lwt_seg6_adjust_srh ||
- func == bpf_lwt_seg6_action
- )
+ func == bpf_lwt_seg6_action ||
+#endif
+ func == bpf_lwt_push_encap)
return true;
return false;
@@ -4872,11 +4965,11 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skb_load_bytes:
return &bpf_skb_load_bytes_proto;
case BPF_FUNC_skb_pull_data:
- return &bpf_skb_pull_data_proto;
+ return &sk_skb_pull_data_proto;
case BPF_FUNC_skb_change_tail:
- return &bpf_skb_change_tail_proto;
+ return &sk_skb_change_tail_proto;
case BPF_FUNC_skb_change_head:
- return &bpf_skb_change_head_proto;
+ return &sk_skb_change_head_proto;
case BPF_FUNC_get_socket_cookie:
return &bpf_get_socket_cookie_proto;
case BPF_FUNC_get_socket_uid:
@@ -4967,12 +5060,14 @@ static const struct bpf_func_proto *
lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
+#if IS_ENABLED(CONFIG_IPV6_SEG6_BPF)
case BPF_FUNC_lwt_seg6_store_bytes:
return &bpf_lwt_seg6_store_bytes_proto;
case BPF_FUNC_lwt_seg6_action:
return &bpf_lwt_seg6_action_proto;
case BPF_FUNC_lwt_seg6_adjust_srh:
return &bpf_lwt_seg6_adjust_srh_proto;
+#endif
default:
return lwt_out_func_proto(func_id, prog);
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index b555fc229e96..08a5184f4b34 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -152,7 +152,9 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
!dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_ENC_CONTROL) &&
!dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_ENC_PORTS))
+ FLOW_DISSECTOR_KEY_ENC_PORTS) &&
+ !dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP))
return;
info = skb_tunnel_info(skb);
@@ -212,6 +214,16 @@ skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
tp->src = key->tp_src;
tp->dst = key->tp_dst;
}
+
+ if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ENC_IP)) {
+ struct flow_dissector_key_ip *ip;
+
+ ip = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_ENC_IP,
+ target_container);
+ ip->tos = key->tos;
+ ip->ttl = key->ttl;
+ }
}
EXPORT_SYMBOL(skb_flow_dissect_tunnel_info);
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index b2b2323bdc84..188d693cb251 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -77,8 +77,20 @@ gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type,
d->lock = lock;
spin_lock_bh(lock);
}
- if (d->tail)
- return gnet_stats_copy(d, type, NULL, 0, padattr);
+ if (d->tail) {
+ int ret = gnet_stats_copy(d, type, NULL, 0, padattr);
+
+ /* The initial attribute added in gnet_stats_copy() may be
+ * preceded by a padding attribute, in which case d->tail will
+ * end up pointing at the padding instead of the real attribute.
+ * Fix this so gnet_stats_finish_copy() adjusts the length of
+ * the right attribute.
+ */
+ if (ret == 0 && d->tail->nla_type == padattr)
+ d->tail = (struct nlattr *)((char *)d->tail +
+ NLA_ALIGN(d->tail->nla_len));
+ return ret;
+ }
return 0;
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index e7e626fb87bb..e45098593dc0 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -217,7 +217,7 @@ static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
return -EINVAL;
- prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
+ prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC);
if (!prog->name)
return -ENOMEM;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index cbe85d8d4cc2..aa19d86937af 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -3274,4 +3274,3 @@ static int __init neigh_init(void)
}
subsys_initcall(neigh_init);
-
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index ffa1d18f2c2c..0a95bcf64cdc 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -905,11 +905,20 @@ static const void *rx_queue_namespace(struct kobject *kobj)
return ns;
}
+static void rx_queue_get_ownership(struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = rx_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
static struct kobj_type rx_queue_ktype __ro_after_init = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
.default_attrs = rx_queue_default_attrs,
- .namespace = rx_queue_namespace
+ .namespace = rx_queue_namespace,
+ .get_ownership = rx_queue_get_ownership,
};
static int rx_queue_add_kobject(struct net_device *dev, int index)
@@ -1087,6 +1096,9 @@ static ssize_t tx_maxrate_store(struct netdev_queue *queue,
int err, index = get_netdev_queue_index(queue);
u32 rate = 0;
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
err = kstrtou32(buf, 10, &rate);
if (err < 0)
return err;
@@ -1428,11 +1440,20 @@ static const void *netdev_queue_namespace(struct kobject *kobj)
return ns;
}
+static void netdev_queue_get_ownership(struct kobject *kobj,
+ kuid_t *uid, kgid_t *gid)
+{
+ const struct net *net = netdev_queue_namespace(kobj);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
static struct kobj_type netdev_queue_ktype __ro_after_init = {
.sysfs_ops = &netdev_queue_sysfs_ops,
.release = netdev_queue_release,
.default_attrs = netdev_queue_default_attrs,
.namespace = netdev_queue_namespace,
+ .get_ownership = netdev_queue_get_ownership,
};
static int netdev_queue_add_kobject(struct net_device *dev, int index)
@@ -1622,6 +1643,14 @@ static const void *net_namespace(struct device *d)
return dev_net(dev);
}
+static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid)
+{
+ struct net_device *dev = to_net_dev(d);
+ const struct net *net = dev_net(dev);
+
+ net_ns_get_ownership(net, uid, gid);
+}
+
static struct class net_class __ro_after_init = {
.name = "net",
.dev_release = netdev_release,
@@ -1629,6 +1658,7 @@ static struct class net_class __ro_after_init = {
.dev_uevent = netdev_uevent,
.ns_type = &net_ns_type_operations,
.namespace = net_namespace,
+ .get_ownership = net_get_ownership,
};
#ifdef CONFIG_OF_NET
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a11e03f920d3..738871af5efa 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -17,6 +17,7 @@
#include <linux/user_namespace.h>
#include <linux/net_namespace.h>
#include <linux/sched/task.h>
+#include <linux/uidgid.h>
#include <net/sock.h>
#include <net/netlink.h>
@@ -448,6 +449,33 @@ dec_ucounts:
return net;
}
+/**
+ * net_ns_get_ownership - get sysfs ownership data for @net
+ * @net: network namespace in question (can be NULL)
+ * @uid: kernel user ID for sysfs objects
+ * @gid: kernel group ID for sysfs objects
+ *
+ * Returns the uid/gid pair of root in the user namespace associated with the
+ * given network namespace.
+ */
+void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
+{
+ if (net) {
+ kuid_t ns_root_uid = make_kuid(net->user_ns, 0);
+ kgid_t ns_root_gid = make_kgid(net->user_ns, 0);
+
+ if (uid_valid(ns_root_uid))
+ *uid = ns_root_uid;
+
+ if (gid_valid(ns_root_gid))
+ *gid = ns_root_gid;
+ } else {
+ *uid = GLOBAL_ROOT_UID;
+ *gid = GLOBAL_ROOT_GID;
+ }
+}
+EXPORT_SYMBOL_GPL(net_ns_get_ownership);
+
static void unhash_nsid(struct net *net, struct net *last)
{
struct net *tmp;
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 68bf07206744..43a932cb609b 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -269,7 +269,7 @@ static void __page_pool_empty_ring(struct page_pool *pool)
struct page *page;
/* Empty recycle ring */
- while ((page = ptr_ring_consume(&pool->ring))) {
+ while ((page = ptr_ring_consume_bh(&pool->ring))) {
/* Verify the refcnt invariant of cached pages */
if (!(page_ref_count(page) == 1))
pr_crit("%s() page_pool refcnt %d violation\n",
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 49368e21d228..7f6938405fa1 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1265,7 +1265,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_min) != 0) {
memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min));
- strncpy(pkt_dev->dst_min, buf, len);
+ strcpy(pkt_dev->dst_min, buf);
pkt_dev->daddr_min = in_aton(pkt_dev->dst_min);
pkt_dev->cur_daddr = pkt_dev->daddr_min;
}
@@ -1280,14 +1280,12 @@ static ssize_t pktgen_if_write(struct file *file,
if (len < 0)
return len;
-
if (copy_from_user(buf, &user_buffer[i], len))
return -EFAULT;
-
buf[len] = 0;
if (strcmp(buf, pkt_dev->dst_max) != 0) {
memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max));
- strncpy(pkt_dev->dst_max, buf, len);
+ strcpy(pkt_dev->dst_max, buf);
pkt_dev->daddr_max = in_aton(pkt_dev->dst_max);
pkt_dev->cur_daddr = pkt_dev->daddr_max;
}
@@ -1396,7 +1394,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_min) != 0) {
memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min));
- strncpy(pkt_dev->src_min, buf, len);
+ strcpy(pkt_dev->src_min, buf);
pkt_dev->saddr_min = in_aton(pkt_dev->src_min);
pkt_dev->cur_saddr = pkt_dev->saddr_min;
}
@@ -1416,7 +1414,7 @@ static ssize_t pktgen_if_write(struct file *file,
buf[len] = 0;
if (strcmp(buf, pkt_dev->src_max) != 0) {
memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max));
- strncpy(pkt_dev->src_max, buf, len);
+ strcpy(pkt_dev->src_max, buf);
pkt_dev->saddr_max = in_aton(pkt_dev->src_max);
pkt_dev->cur_saddr = pkt_dev->saddr_max;
}
@@ -2255,7 +2253,7 @@ static void get_ipsec_sa(struct pktgen_dev *pkt_dev, int flow)
x = xfrm_state_lookup_byspi(pn->net, htonl(pkt_dev->spi), AF_INET);
} else {
/* slow path: we dont already have xfrm_state */
- x = xfrm_stateonly_find(pn->net, DUMMY_MARK,
+ x = xfrm_stateonly_find(pn->net, DUMMY_MARK, 0,
(xfrm_address_t *)&pkt_dev->cur_daddr,
(xfrm_address_t *)&pkt_dev->cur_saddr,
AF_INET,
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index e3f743c141b3..24431e578310 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -964,7 +964,8 @@ static size_t rtnl_xdp_size(void)
{
size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */
nla_total_size(1) + /* XDP_ATTACHED */
- nla_total_size(4); /* XDP_PROG_ID */
+ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */
+ nla_total_size(4); /* XDP_<mode>_PROG_ID */
return xdp_size;
}
@@ -1014,6 +1015,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_IF_NETNSID */
+ nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */
+ nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */
+ + nla_total_size(4) /* IFLA_MIN_MTU */
+ + nla_total_size(4) /* IFLA_MAX_MTU */
+ 0;
}
@@ -1353,27 +1356,51 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
return 0;
}
-static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id)
+static u32 rtnl_xdp_prog_skb(struct net_device *dev)
{
- const struct net_device_ops *ops = dev->netdev_ops;
const struct bpf_prog *generic_xdp_prog;
- struct netdev_bpf xdp;
ASSERT_RTNL();
- *prog_id = 0;
generic_xdp_prog = rtnl_dereference(dev->xdp_prog);
- if (generic_xdp_prog) {
- *prog_id = generic_xdp_prog->aux->id;
- return XDP_ATTACHED_SKB;
- }
- if (!ops->ndo_bpf)
- return XDP_ATTACHED_NONE;
+ if (!generic_xdp_prog)
+ return 0;
+ return generic_xdp_prog->aux->id;
+}
- __dev_xdp_query(dev, ops->ndo_bpf, &xdp);
- *prog_id = xdp.prog_id;
+static u32 rtnl_xdp_prog_drv(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG);
+}
- return xdp.prog_attached;
+static u32 rtnl_xdp_prog_hw(struct net_device *dev)
+{
+ return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf,
+ XDP_QUERY_PROG_HW);
+}
+
+static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev,
+ u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr,
+ u32 (*get_prog_id)(struct net_device *dev))
+{
+ u32 curr_id;
+ int err;
+
+ curr_id = get_prog_id(dev);
+ if (!curr_id)
+ return 0;
+
+ *prog_id = curr_id;
+ err = nla_put_u32(skb, attr, curr_id);
+ if (err)
+ return err;
+
+ if (*mode != XDP_ATTACHED_NONE)
+ *mode = XDP_ATTACHED_MULTI;
+ else
+ *mode = tgt_mode;
+
+ return 0;
}
static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
@@ -1381,17 +1408,32 @@ static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev)
struct nlattr *xdp;
u32 prog_id;
int err;
+ u8 mode;
xdp = nla_nest_start(skb, IFLA_XDP);
if (!xdp)
return -EMSGSIZE;
- err = nla_put_u8(skb, IFLA_XDP_ATTACHED,
- rtnl_xdp_attached_mode(dev, &prog_id));
+ prog_id = 0;
+ mode = XDP_ATTACHED_NONE;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB,
+ IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV,
+ IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv);
+ if (err)
+ goto err_cancel;
+ err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW,
+ IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw);
+ if (err)
+ goto err_cancel;
+
+ err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode);
if (err)
goto err_cancel;
- if (prog_id) {
+ if (prog_id && mode != XDP_ATTACHED_MULTI) {
err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id);
if (err)
goto err_cancel;
@@ -1561,6 +1603,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
netif_running(dev) ? dev->operstate : IF_OPER_DOWN) ||
nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) ||
nla_put_u32(skb, IFLA_MTU, dev->mtu) ||
+ nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) ||
+ nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) ||
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
@@ -1692,6 +1736,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_IF_NETNSID] = { .type = NLA_S32 },
[IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
+ [IFLA_MIN_MTU] = { .type = NLA_U32 },
+ [IFLA_MAX_MTU] = { .type = NLA_U32 },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2336,7 +2382,7 @@ static int do_setlink(const struct sk_buff *skb,
}
if (tb[IFLA_MTU]) {
- err = dev_set_mtu(dev, nla_get_u32(tb[IFLA_MTU]));
+ err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack);
if (err < 0)
goto errout;
status |= DO_SETLINK_MODIFIED;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c4e24ac27464..8d574a88125d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -858,6 +858,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
n->cloned = 1;
n->nohdr = 0;
n->peeked = 0;
+ C(pfmemalloc);
n->destructor = NULL;
C(tail);
C(end);
@@ -1714,7 +1715,7 @@ void *skb_push(struct sk_buff *skb, unsigned int len)
{
skb->data -= len;
skb->len += len;
- if (unlikely(skb->data<skb->head))
+ if (unlikely(skb->data < skb->head))
skb_under_panic(skb, len, __builtin_return_address(0));
return skb->data;
}
@@ -2857,23 +2858,27 @@ EXPORT_SYMBOL(skb_queue_purge);
/**
* skb_rbtree_purge - empty a skb rbtree
* @root: root of the rbtree to empty
+ * Return value: the sum of truesizes of all purged skbs.
*
* Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
* the list and one reference dropped. This function does not take
* any lock. Synchronization should be handled by the caller (e.g., TCP
* out-of-order queue is protected by the socket lock).
*/
-void skb_rbtree_purge(struct rb_root *root)
+unsigned int skb_rbtree_purge(struct rb_root *root)
{
struct rb_node *p = rb_first(root);
+ unsigned int sum = 0;
while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
p = rb_next(p);
rb_erase(&skb->rbnode, root);
+ sum += skb->truesize;
kfree_skb(skb);
}
+ return sum;
}
/**
@@ -3719,6 +3724,7 @@ normal:
net_warn_ratelimited(
"skb_segment: too many frags: %u %u\n",
pos, mss);
+ err = -EINVAL;
goto err;
}
@@ -3752,11 +3758,10 @@ skip_fraglist:
perform_csum_check:
if (!csum) {
- if (skb_has_shared_frag(nskb)) {
- err = __skb_linearize(nskb);
- if (err)
- goto err;
- }
+ if (skb_has_shared_frag(nskb) &&
+ __skb_linearize(nskb))
+ goto err;
+
if (!nskb->remcsum_offload)
nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
diff --git a/net/core/sock.c b/net/core/sock.c
index 03fdea5b0f57..e31233f5ba39 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -250,58 +250,13 @@ static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
- "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" ,
- "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK",
- "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" ,
- "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" ,
- "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" ,
- "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" ,
- "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" ,
- "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" ,
- "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" ,
- "rlock-27" , "rlock-28" , "rlock-AF_CAN" ,
- "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" ,
- "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" ,
- "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" ,
- "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" ,
- "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_XDP" ,
- "rlock-AF_MAX"
+ _sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
- "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" ,
- "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK",
- "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" ,
- "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" ,
- "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" ,
- "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" ,
- "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" ,
- "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" ,
- "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" ,
- "wlock-27" , "wlock-28" , "wlock-AF_CAN" ,
- "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" ,
- "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" ,
- "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" ,
- "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" ,
- "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_XDP" ,
- "wlock-AF_MAX"
+ _sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
- "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" ,
- "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK",
- "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" ,
- "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" ,
- "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" ,
- "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" ,
- "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" ,
- "elock-AF_RDS" , "elock-AF_SNA" , "elock-AF_IRDA" ,
- "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" ,
- "elock-27" , "elock-28" , "elock-AF_CAN" ,
- "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" ,
- "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" ,
- "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" ,
- "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" ,
- "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_XDP" ,
- "elock-AF_MAX"
+ _sock_locks("elock-")
};
/*
@@ -2316,9 +2271,9 @@ int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
pfrag->offset += use;
sge = sg + sg_curr - 1;
- if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
- sg->offset + sg->length == orig_offset) {
- sg->length += use;
+ if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
+ sge->offset + sge->length == orig_offset) {
+ sge->length += use;
} else {
sge = sg + sg_curr;
sg_unmark_end(sge);
diff --git a/net/core/utils.c b/net/core/utils.c
index d47863b07a60..2a597ac7808e 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -397,7 +397,7 @@ int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af,
break;
default:
pr_err("unexpected address family %d\n", af);
- };
+ }
return ret;
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 31c58719b5a9..c013b836006b 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -3,8 +3,11 @@
* Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc.
* Released under terms in GPL version 2. See COPYING.
*/
+#include <linux/bpf.h>
+#include <linux/filter.h>
#include <linux/types.h>
#include <linux/mm.h>
+#include <linux/netdevice.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/rhashtable.h>
@@ -345,7 +348,8 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
rcu_read_lock();
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
- xa->zc_alloc->free(xa->zc_alloc, handle);
+ if (!WARN_ON_ONCE(!xa))
+ xa->zc_alloc->free(xa->zc_alloc, handle);
rcu_read_unlock();
default:
/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
@@ -370,3 +374,34 @@ void xdp_return_buff(struct xdp_buff *xdp)
__xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
+
+int xdp_attachment_query(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ bpf->prog_id = info->prog ? info->prog->aux->id : 0;
+ bpf->prog_flags = info->prog ? info->flags : 0;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_query);
+
+bool xdp_attachment_flags_ok(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog && (bpf->flags ^ info->flags) & XDP_FLAGS_MODES) {
+ NL_SET_ERR_MSG(bpf->extack,
+ "program loaded with different flags");
+ return false;
+ }
+ return true;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_flags_ok);
+
+void xdp_attachment_setup(struct xdp_attachment_info *info,
+ struct netdev_bpf *bpf)
+{
+ if (info->prog)
+ bpf_prog_put(info->prog);
+ info->prog = bpf->prog;
+ info->flags = bpf->flags;
+}
+EXPORT_SYMBOL_GPL(xdp_attachment_setup);
diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c
index 2589a6b78aa1..a556cd708885 100644
--- a/net/dcb/dcbnl.c
+++ b/net/dcb/dcbnl.c
@@ -1786,7 +1786,7 @@ static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app,
if (itr->app.selector == app->selector &&
itr->app.protocol == app->protocol &&
itr->ifindex == ifindex &&
- (!prio || itr->app.priority == prio))
+ ((prio == -1) || itr->app.priority == prio))
return itr;
}
@@ -1821,7 +1821,8 @@ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app)
u8 prio = 0;
spin_lock_bh(&dcb_lock);
- if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+ itr = dcb_app_lookup(app, dev->ifindex, -1);
+ if (itr)
prio = itr->app.priority;
spin_unlock_bh(&dcb_lock);
@@ -1849,7 +1850,8 @@ int dcb_setapp(struct net_device *dev, struct dcb_app *new)
spin_lock_bh(&dcb_lock);
/* Search for existing match and replace */
- if ((itr = dcb_app_lookup(new, dev->ifindex, 0))) {
+ itr = dcb_app_lookup(new, dev->ifindex, -1);
+ if (itr) {
if (new->priority)
itr->app.priority = new->priority;
else {
@@ -1882,7 +1884,8 @@ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app)
u8 prio = 0;
spin_lock_bh(&dcb_lock);
- if ((itr = dcb_app_lookup(app, dev->ifindex, 0)))
+ itr = dcb_app_lookup(app, dev->ifindex, -1);
+ if (itr)
prio |= 1 << itr->app.priority;
spin_unlock_bh(&dcb_lock);
@@ -1955,6 +1958,92 @@ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del)
}
EXPORT_SYMBOL(dcb_ieee_delapp);
+/**
+ * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from
+ * priorities to the DSCP values assigned to that priority. Initialize p_map
+ * such that each map element holds a bit mask of DSCP values configured for
+ * that priority by APP entries.
+ */
+void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_prio_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 prio;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS) {
+ prio = itr->app.priority;
+ p_map->map[prio] |= 1ULL << itr->app.protocol;
+ }
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map);
+
+/**
+ * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from
+ * DSCP values to the priorities assigned to that DSCP value. Initialize p_map
+ * such that each map element holds a bit mask of priorities configured for a
+ * given DSCP value by APP entries.
+ */
+void
+dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev,
+ struct dcb_ieee_app_dscp_map *p_map)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+
+ memset(p_map->map, 0, sizeof(p_map->map));
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP &&
+ itr->app.protocol < 64 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS)
+ p_map->map[itr->app.protocol] |= 1 << itr->app.priority;
+ }
+ spin_unlock_bh(&dcb_lock);
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map);
+
+/**
+ * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet
+ * type, with valid PID values >= 1536. A special meaning is then assigned to
+ * protocol value of 0: "default priority. For use when priority is not
+ * otherwise specified".
+ *
+ * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries
+ * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default
+ * priorities set by these entries.
+ */
+u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev)
+{
+ int ifindex = dev->ifindex;
+ struct dcb_app_type *itr;
+ u8 mask = 0;
+
+ spin_lock_bh(&dcb_lock);
+ list_for_each_entry(itr, &dcb_app_list, list) {
+ if (itr->ifindex == ifindex &&
+ itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE &&
+ itr->app.protocol == 0 &&
+ itr->app.priority < IEEE_8021QAZ_MAX_TCS)
+ mask |= 1 << itr->app.priority;
+ }
+ spin_unlock_bh(&dcb_lock);
+
+ return mask;
+}
+EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask);
+
static int __init dcbnl_init(void)
{
INIT_LIST_HEAD(&dcb_app_list);
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 0d56e36a6db7..875858c8b059 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -325,7 +325,7 @@ __poll_t dccp_poll(struct file *file, struct socket *sock,
__poll_t mask;
struct sock *sk = sock->sk;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
if (sk->sk_state == DCCP_LISTEN)
return inet_csk_listen_poll(sk);
diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig
index f3393e154f0f..dcc74956badd 100644
--- a/net/decnet/Kconfig
+++ b/net/decnet/Kconfig
@@ -40,4 +40,3 @@ config DECNET_ROUTER
to work.
See <file:Documentation/networking/decnet.txt> for more information.
-
diff --git a/net/decnet/Makefile b/net/decnet/Makefile
index 9e38122d942b..07b38e441b2d 100644
--- a/net/decnet/Makefile
+++ b/net/decnet/Makefile
@@ -8,4 +8,3 @@ decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o
decnet-y += sysctl_net_decnet.o
obj-$(CONFIG_NETFILTER) += netfilter/
-
diff --git a/net/decnet/TODO b/net/decnet/TODO
index ebb5ac69d128..358e9eb49016 100644
--- a/net/decnet/TODO
+++ b/net/decnet/TODO
@@ -16,14 +16,14 @@ Steve's quick list of things that need finishing off:
o Verify errors etc. against POSIX 1003.1g (draft)
- o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
+ o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g)
[maybe this should be done at socket level... the control data in the
send/recvmsg() calls should simply be a vector of set/getsockopt()
calls]
o check MSG_CTRUNC is set where it should be.
- o Find all the commonality between DECnet and IPv4 routing code and extract
+ o Find all the commonality between DECnet and IPv4 routing code and extract
it into a small library of routines. [probably a project for 2.7.xx]
o Add perfect socket hashing - an idea suggested by Paul Koning. Currently
@@ -38,4 +38,3 @@ Steve's quick list of things that need finishing off:
o DECnet sendpages() function
o AIO for DECnet
-
diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c
index fce94cbd4378..f78fe58eafc8 100644
--- a/net/decnet/dn_fib.c
+++ b/net/decnet/dn_fib.c
@@ -797,5 +797,3 @@ void __init dn_fib_init(void)
rtnl_register_module(THIS_MODULE, PF_DECnet, RTM_DELROUTE,
dn_fib_rtm_delroute, NULL, 0);
}
-
-
diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c
index 34aba55ed573..2fb5e055ba25 100644
--- a/net/decnet/dn_nsp_in.c
+++ b/net/decnet/dn_nsp_in.c
@@ -912,4 +912,3 @@ free_out:
return NET_RX_SUCCESS;
}
-
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index 56a52a004c56..a1779de6bd9c 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -701,4 +701,3 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
dn_nsp_send(skb);
}
-
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index e74765024d88..3107a2e24e6b 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -1925,4 +1925,3 @@ void __exit dn_route_cleanup(void)
remove_proc_entry("decnet_cache", init_net.proc_net);
dst_entries_destroy(&dn_dst_ops);
}
-
diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c
index 72236695db3d..4a4e3c17740c 100644
--- a/net/decnet/dn_rules.c
+++ b/net/decnet/dn_rules.c
@@ -256,5 +256,3 @@ void __exit dn_fib_rules_cleanup(void)
rtnl_unlock();
rcu_barrier();
}
-
-
diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile
index 255c1ae9daeb..b579e52130aa 100644
--- a/net/decnet/netfilter/Makefile
+++ b/net/decnet/netfilter/Makefile
@@ -3,4 +3,3 @@
#
obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o
-
diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c
index ab395e55cd78..a4faacadd8a8 100644
--- a/net/decnet/netfilter/dn_rtmsg.c
+++ b/net/decnet/netfilter/dn_rtmsg.c
@@ -158,4 +158,3 @@ MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_DNRTMSG);
module_init(dn_rtmsg_init);
module_exit(dn_rtmsg_fini);
-
diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c
index 40c851693f77..7f4534828f6c 100644
--- a/net/dns_resolver/dns_key.c
+++ b/net/dns_resolver/dns_key.c
@@ -86,35 +86,39 @@ dns_resolver_preparse(struct key_preparsed_payload *prep)
opt++;
kdebug("options: '%s'", opt);
do {
+ int opt_len, opt_nlen;
const char *eq;
- int opt_len, opt_nlen, opt_vlen, tmp;
+ char optval[128];
next_opt = memchr(opt, '#', end - opt) ?: end;
opt_len = next_opt - opt;
- if (opt_len <= 0 || opt_len > 128) {
+ if (opt_len <= 0 || opt_len > sizeof(optval)) {
pr_warn_ratelimited("Invalid option length (%d) for dns_resolver key\n",
opt_len);
return -EINVAL;
}
- eq = memchr(opt, '=', opt_len) ?: end;
- opt_nlen = eq - opt;
- eq++;
- opt_vlen = next_opt - eq; /* will be -1 if no value */
+ eq = memchr(opt, '=', opt_len);
+ if (eq) {
+ opt_nlen = eq - opt;
+ eq++;
+ memcpy(optval, eq, next_opt - eq);
+ optval[next_opt - eq] = '\0';
+ } else {
+ opt_nlen = opt_len;
+ optval[0] = '\0';
+ }
- tmp = opt_vlen >= 0 ? opt_vlen : 0;
- kdebug("option '%*.*s' val '%*.*s'",
- opt_nlen, opt_nlen, opt, tmp, tmp, eq);
+ kdebug("option '%*.*s' val '%s'",
+ opt_nlen, opt_nlen, opt, optval);
/* see if it's an error number representing a DNS error
* that's to be recorded as the result in this key */
if (opt_nlen == sizeof(DNS_ERRORNO_OPTION) - 1 &&
memcmp(opt, DNS_ERRORNO_OPTION, opt_nlen) == 0) {
kdebug("dns error number option");
- if (opt_vlen <= 0)
- goto bad_option_value;
- ret = kstrtoul(eq, 10, &derrno);
+ ret = kstrtoul(optval, 10, &derrno);
if (ret < 0)
goto bad_option_value;
@@ -316,4 +320,3 @@ static void __exit exit_dns_resolver(void)
module_init(init_dns_resolver)
module_exit(exit_dns_resolver)
MODULE_LICENSE("GPL");
-
diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c
index dc5d9af3dc80..a1917025e155 100644
--- a/net/dsa/dsa2.c
+++ b/net/dsa/dsa2.c
@@ -775,6 +775,20 @@ struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n)
if (!ds)
return NULL;
+ /* We avoid allocating memory outside dsa_switch
+ * if it is not needed.
+ */
+ if (n <= sizeof(ds->_bitmap) * 8) {
+ ds->bitmap = &ds->_bitmap;
+ } else {
+ ds->bitmap = devm_kcalloc(dev,
+ BITS_TO_LONGS(n),
+ sizeof(unsigned long),
+ GFP_KERNEL);
+ if (unlikely(!ds->bitmap))
+ return NULL;
+ }
+
ds->dev = dev;
ds->num_ports = n;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 71536c435132..1ba3bde96b55 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1248,6 +1248,9 @@ int dsa_slave_suspend(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
+ if (!netif_running(slave_dev))
+ return 0;
+
netif_device_detach(slave_dev);
rtnl_lock();
@@ -1261,6 +1264,9 @@ int dsa_slave_resume(struct net_device *slave_dev)
{
struct dsa_port *dp = dsa_slave_to_port(slave_dev);
+ if (!netif_running(slave_dev))
+ return 0;
+
netif_device_attach(slave_dev);
rtnl_lock();
diff --git a/net/dsa/switch.c b/net/dsa/switch.c
index b93511726069..142b294d3446 100644
--- a/net/dsa/switch.c
+++ b/net/dsa/switch.c
@@ -136,21 +136,20 @@ static int dsa_switch_mdb_add(struct dsa_switch *ds,
{
const struct switchdev_obj_port_mdb *mdb = info->mdb;
struct switchdev_trans *trans = info->trans;
- DECLARE_BITMAP(group, ds->num_ports);
int port;
/* Build a mask of Multicast group members */
- bitmap_zero(group, ds->num_ports);
+ bitmap_zero(ds->bitmap, ds->num_ports);
if (ds->index == info->sw_index)
- set_bit(info->port, group);
+ set_bit(info->port, ds->bitmap);
for (port = 0; port < ds->num_ports; port++)
if (dsa_is_dsa_port(ds, port))
- set_bit(port, group);
+ set_bit(port, ds->bitmap);
if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_mdb_prepare_bitmap(ds, mdb, group);
+ return dsa_switch_mdb_prepare_bitmap(ds, mdb, ds->bitmap);
- dsa_switch_mdb_add_bitmap(ds, mdb, group);
+ dsa_switch_mdb_add_bitmap(ds, mdb, ds->bitmap);
return 0;
}
@@ -204,21 +203,20 @@ static int dsa_switch_vlan_add(struct dsa_switch *ds,
{
const struct switchdev_obj_port_vlan *vlan = info->vlan;
struct switchdev_trans *trans = info->trans;
- DECLARE_BITMAP(members, ds->num_ports);
int port;
/* Build a mask of VLAN members */
- bitmap_zero(members, ds->num_ports);
+ bitmap_zero(ds->bitmap, ds->num_ports);
if (ds->index == info->sw_index)
- set_bit(info->port, members);
+ set_bit(info->port, ds->bitmap);
for (port = 0; port < ds->num_ports; port++)
if (dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))
- set_bit(port, members);
+ set_bit(port, ds->bitmap);
if (switchdev_trans_ph_prepare(trans))
- return dsa_switch_vlan_prepare_bitmap(ds, vlan, members);
+ return dsa_switch_vlan_prepare_bitmap(ds, vlan, ds->bitmap);
- dsa_switch_vlan_add_bitmap(ds, vlan, members);
+ dsa_switch_vlan_add_bitmap(ds, vlan, ds->bitmap);
return 0;
}
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 275449b0d633..3297e7fa9945 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -90,12 +90,18 @@ static int lowpan_neigh_construct(struct net_device *dev, struct neighbour *n)
return 0;
}
+static int lowpan_get_iflink(const struct net_device *dev)
+{
+ return lowpan_802154_dev(dev)->wdev->ifindex;
+}
+
static const struct net_device_ops lowpan_netdev_ops = {
.ndo_init = lowpan_dev_init,
.ndo_start_xmit = lowpan_xmit,
.ndo_open = lowpan_open,
.ndo_stop = lowpan_stop,
.ndo_neigh_construct = lowpan_neigh_construct,
+ .ndo_get_iflink = lowpan_get_iflink,
};
static void lowpan_setup(struct net_device *ldev)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 2cc224106b69..ec7a5da56129 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -25,7 +25,7 @@
#include <net/ieee802154_netdev.h>
#include <net/6lowpan.h>
-#include <net/ipv6.h>
+#include <net/ipv6_frag.h>
#include <net/inet_frag.h>
#include "6lowpan_i.h"
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index cb7176cd4cd6..fe225d9a1877 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -400,4 +400,3 @@ module_exit(wpan_phy_class_exit);
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("IEEE 802.15.4 configuration interface");
MODULE_AUTHOR("Dmitry Eremin-Solenikov");
-
diff --git a/net/ieee802154/nl_policy.c b/net/ieee802154/nl_policy.c
index 35c432668454..78f6f1233194 100644
--- a/net/ieee802154/nl_policy.c
+++ b/net/ieee802154/nl_policy.c
@@ -75,4 +75,3 @@ const struct nla_policy ieee802154_policy[IEEE802154_ATTR_MAX + 1] = {
[IEEE802154_ATTR_LLSEC_DEV_OVERRIDE] = { .type = NLA_U8, },
[IEEE802154_ATTR_LLSEC_DEV_KEY_MODE] = { .type = NLA_U8, },
};
-
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 80dad301361d..32cae39cdff6 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -430,7 +430,7 @@ config INET_DIAG
Support for INET (TCP, DCCP, etc) socket monitoring interface used by
native Linux tools such as ss. ss is included in iproute2, currently
downloadable at:
-
+
http://www.linuxfoundation.org/collaborate/workgroups/networking/iproute2
If unsure, say Y.
@@ -600,7 +600,7 @@ config TCP_CONG_VENO
distinguishing to circumvent the difficult judgment of the packet loss
type. TCP Veno cuts down less congestion window in response to random
loss packets.
- See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
+ See <http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=1177186>
config TCP_CONG_YEAH
tristate "YeAH TCP"
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index eec9569ffa5c..7446b98661d8 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -43,7 +43,7 @@ obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
obj-$(CONFIG_IP_PNP) += ipconfig.o
obj-$(CONFIG_NETFILTER) += netfilter.o netfilter/
-obj-$(CONFIG_INET_DIAG) += inet_diag.o
+obj-$(CONFIG_INET_DIAG) += inet_diag.o
obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index c716be13d58c..20fda8fb8ffd 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -229,6 +229,7 @@ int inet_listen(struct socket *sock, int backlog)
err = inet_csk_listen_start(sk, backlog);
if (err)
goto out;
+ tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL);
}
sk->sk_max_ack_backlog = backlog;
err = 0;
@@ -485,8 +486,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
* is temporarily down)
*/
err = -EADDRNOTAVAIL;
- if (!net->ipv4.sysctl_ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
+ if (!inet_can_nonlocal_bind(net, inet) &&
addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
@@ -1801,6 +1801,7 @@ static __net_init int inet_init_net(struct net *net)
* We set them here, in case sysctl is not compiled.
*/
net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_fwd_update_priority = 1;
net->ipv4.sysctl_ip_dynaddr = 0;
net->ipv4.sysctl_ip_early_demux = 1;
net->ipv4.sysctl_udp_early_demux = 1;
diff --git a/net/ipv4/bpfilter/Makefile b/net/ipv4/bpfilter/Makefile
index ce262d76cc48..e9e42f99725e 100644
--- a/net/ipv4/bpfilter/Makefile
+++ b/net/ipv4/bpfilter/Makefile
@@ -1,2 +1 @@
obj-$(CONFIG_BPFILTER) += sockopt.o
-
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index d7585ab1a77a..ea4bd8a52422 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1827,6 +1827,8 @@ static int inet_netconf_msgsize_devconf(int type)
size += nla_total_size(4);
if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
+ if (all || type == NETCONFA_BC_FORWARDING)
+ size += nla_total_size(4);
if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
@@ -1873,6 +1875,10 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
+ if ((all || type == NETCONFA_BC_FORWARDING) &&
+ nla_put_s32(skb, NETCONFA_BC_FORWARDING,
+ IPV4_DEVCONF(*devconf, BC_FORWARDING)) < 0)
+ goto nla_put_failure;
if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
@@ -2143,6 +2149,10 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write,
if ((new_value == 0) && (old_value != 0))
rt_cache_flush(net);
+ if (i == IPV4_DEVCONF_BC_FORWARDING - 1 &&
+ new_value != old_value)
+ rt_cache_flush(net);
+
if (i == IPV4_DEVCONF_RP_FILTER - 1 &&
new_value != old_value) {
ifindex = devinet_conf_ifindex(net, cnf);
@@ -2259,6 +2269,7 @@ static struct devinet_sysctl_table {
DEVINET_SYSCTL_COMPLEX_ENTRY(FORWARDING, "forwarding",
devinet_sysctl_forward),
DEVINET_SYSCTL_RO_ENTRY(MC_FORWARDING, "mc_forwarding"),
+ DEVINET_SYSCTL_RW_ENTRY(BC_FORWARDING, "bc_forwarding"),
DEVINET_SYSCTL_RW_ENTRY(ACCEPT_REDIRECTS, "accept_redirects"),
DEVINET_SYSCTL_RW_ENTRY(SECURE_REDIRECTS, "secure_redirects"),
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index bbeecd13e534..58834a10c0be 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -135,8 +135,7 @@ static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev))
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -179,8 +178,7 @@ static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_
if (!xo)
return -EINVAL;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev)) {
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b21833651394..2998b0e47d4b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -292,18 +292,19 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
return ip_hdr(skb)->daddr;
in_dev = __in_dev_get_rcu(dev);
- BUG_ON(!in_dev);
net = dev_net(dev);
scope = RT_SCOPE_UNIVERSE;
if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
+ bool vmark = in_dev && IN_DEV_SRC_VMARK(in_dev);
struct flowi4 fl4 = {
.flowi4_iif = LOOPBACK_IFINDEX,
+ .flowi4_oif = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
.flowi4_scope = scope,
- .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
+ .flowi4_mark = vmark ? skb->mark : 0,
};
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 85b617b655bc..cf75f8944b05 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1200,13 +1200,13 @@ static void igmpv3_del_delrec(struct in_device *in_dev, struct ip_mc_list *im)
spin_lock_bh(&im->lock);
if (pmc) {
im->interface = pmc->interface;
- im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
- im->sfmode = pmc->sfmode;
- if (pmc->sfmode == MCAST_INCLUDE) {
+ if (im->sfmode == MCAST_INCLUDE) {
im->tomb = pmc->tomb;
im->sources = pmc->sources;
for (psf = im->sources; psf; psf = psf->sf_next)
- psf->sf_crcount = im->crcount;
+ psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ } else {
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
}
in_dev_put(pmc->interface);
kfree(pmc);
@@ -1316,7 +1316,13 @@ static void igmp_group_added(struct ip_mc_list *im)
}
/* else, v3 */
- im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+ /* Based on RFC3376 5.1, for newly added INCLUDE SSM, we should
+ * not send filter-mode change record as the mode should be from
+ * IN() to IN(A).
+ */
+ if (im->sfmode == MCAST_EXCLUDE)
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+
igmp_ifc_event(in_dev);
#endif
}
@@ -1381,8 +1387,8 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
/*
* A socket has joined a multicast group on device dev.
*/
-
-void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+static void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr,
+ unsigned int mode)
{
struct ip_mc_list *im;
#ifdef CONFIG_IP_MULTICAST
@@ -1394,7 +1400,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
for_each_pmc_rtnl(in_dev, im) {
if (im->multiaddr == addr) {
im->users++;
- ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0);
+ ip_mc_add_src(in_dev, &addr, mode, 0, NULL, 0);
goto out;
}
}
@@ -1408,8 +1414,8 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
in_dev_hold(in_dev);
im->multiaddr = addr;
/* initial mode is (EX, empty) */
- im->sfmode = MCAST_EXCLUDE;
- im->sfcount[MCAST_EXCLUDE] = 1;
+ im->sfmode = mode;
+ im->sfcount[mode] = 1;
refcount_set(&im->refcnt, 1);
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
@@ -1432,6 +1438,11 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
out:
return;
}
+
+void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
+{
+ __ip_mc_inc_group(in_dev, addr, MCAST_EXCLUDE);
+}
EXPORT_SYMBOL(ip_mc_inc_group);
static int ip_mc_check_iphdr(struct sk_buff *skb)
@@ -2130,8 +2141,8 @@ static void ip_mc_clear_src(struct ip_mc_list *pmc)
/* Join a multicast group
*/
-
-int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+static int __ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr,
+ unsigned int mode)
{
__be32 addr = imr->imr_multiaddr.s_addr;
struct ip_mc_socklist *iml, *i;
@@ -2172,15 +2183,30 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
memcpy(&iml->multi, imr, sizeof(*imr));
iml->next_rcu = inet->mc_list;
iml->sflist = NULL;
- iml->sfmode = MCAST_EXCLUDE;
+ iml->sfmode = mode;
rcu_assign_pointer(inet->mc_list, iml);
- ip_mc_inc_group(in_dev, addr);
+ __ip_mc_inc_group(in_dev, addr, mode);
err = 0;
done:
return err;
}
+
+/* Join ASM (Any-Source Multicast) group
+ */
+int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
+{
+ return __ip_mc_join_group(sk, imr, MCAST_EXCLUDE);
+}
EXPORT_SYMBOL(ip_mc_join_group);
+/* Join SSM (Source-Specific Multicast) group
+ */
+int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr,
+ unsigned int mode)
+{
+ return __ip_mc_join_group(sk, imr, mode);
+}
+
static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml,
struct in_device *in_dev)
{
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 316518f87294..6d258a5669e7 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -91,7 +91,7 @@ static void inet_frags_free_cb(void *ptr, void *arg)
void inet_frags_exit_net(struct netns_frags *nf)
{
- nf->low_thresh = 0; /* prevent creation of new frags */
+ nf->high_thresh = 0; /* prevent creation of new frags */
rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}
@@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
fp = q->fragments;
nf = q->net;
f = nf->f;
- while (fp) {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
+ if (fp) {
+ do {
+ struct sk_buff *xp = fp->next;
+
+ sum_truesize += fp->truesize;
+ kfree_skb(fp);
+ fp = xp;
+ } while (fp);
+ } else {
+ sum_truesize = skb_rbtree_purge(&q->rb_fragments);
}
sum = sum_truesize + f->qsize;
@@ -158,9 +162,6 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
{
struct inet_frag_queue *q;
- if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
- return NULL;
-
q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
if (!q)
return NULL;
@@ -205,6 +206,9 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
struct inet_frag_queue *fq;
+ if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
+ return NULL;
+
rcu_read_lock();
fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index b54b948b0596..32662e9e5d21 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -143,7 +143,8 @@ int ip_forward(struct sk_buff *skb)
!skb_sec_path(skb))
ip_rt_send_redirect(skb);
- skb->priority = rt_tos2priority(iph->tos);
+ if (net->ipv4.sysctl_ip_fwd_update_priority)
+ skb->priority = rt_tos2priority(iph->tos);
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
net, NULL, skb, skb->dev, rt->dst.dev,
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 8e9528ebaa8e..0e8f8de77e71 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
const struct iphdr *iph;
- struct sk_buff *head;
+ struct sk_buff *head = NULL;
struct net *net;
struct ipq *qp;
int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list *t)
ipq_kill(qp);
__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
- head = qp->q.fragments;
-
__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
- if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+ if (!qp->q.flags & INET_FRAG_FIRST_IN)
goto out;
+ /* sk_buff::dev and sk_buff::rbnode are unionized. So we
+ * pull the head out of the tree in order to be able to
+ * deal with head->dev.
+ */
+ if (qp->q.fragments) {
+ head = qp->q.fragments;
+ qp->q.fragments = head->next;
+ } else {
+ head = skb_rb_first(&qp->q.rb_fragments);
+ if (!head)
+ goto out;
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ memset(&head->rbnode, 0, sizeof(head->rbnode));
+ barrier();
+ }
+ if (head == qp->q.fragments_tail)
+ qp->q.fragments_tail = NULL;
+
+ sub_frag_mem_limit(qp->q.net, head->truesize);
+
head->dev = dev_get_by_index_rcu(net, qp->iif);
if (!head->dev)
goto out;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list *t)
(skb_rtable(head)->rt_type != RTN_LOCAL))
goto out;
- skb_get(head);
spin_unlock(&qp->q.lock);
icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
- kfree_skb(head);
goto out_rcu_unlock;
out:
spin_unlock(&qp->q.lock);
out_rcu_unlock:
rcu_read_unlock();
+ if (head)
+ kfree_skb(head);
ipq_put(qp);
}
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *qp)
end = atomic_inc_return(&peer->rid);
qp->rid = end;
- rc = qp->q.fragments && (end - start) > max;
+ rc = qp->q.fragments_tail && (end - start) > max;
if (rc) {
struct net *net;
@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *qp)
static int ip_frag_reinit(struct ipq *qp)
{
- struct sk_buff *fp;
unsigned int sum_truesize = 0;
if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp)
return -ETIMEDOUT;
}
- fp = qp->q.fragments;
- do {
- struct sk_buff *xp = fp->next;
-
- sum_truesize += fp->truesize;
- kfree_skb(fp);
- fp = xp;
- } while (fp);
+ sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
sub_frag_mem_limit(qp->q.net, sum_truesize);
qp->q.flags = 0;
qp->q.len = 0;
qp->q.meat = 0;
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
qp->iif = 0;
qp->ecn = 0;
@@ -277,7 +287,9 @@ static int ip_frag_reinit(struct ipq *qp)
/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
- struct sk_buff *prev, *next;
+ struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
+ struct rb_node **rbn, *parent;
+ struct sk_buff *skb1;
struct net_device *dev;
unsigned int fragsize;
int flags, offset;
@@ -340,95 +352,58 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (err)
goto err;
- /* Find out which fragments are in front and at the back of us
- * in the chain of fragments so far. We must know where to put
- * this fragment, right?
- */
- prev = qp->q.fragments_tail;
- if (!prev || prev->ip_defrag_offset < offset) {
- next = NULL;
- goto found;
- }
- prev = NULL;
- for (next = qp->q.fragments; next != NULL; next = next->next) {
- if (next->ip_defrag_offset >= offset)
- break; /* bingo! */
- prev = next;
- }
+ /* Note : skb->rbnode and skb->dev share the same location. */
+ dev = skb->dev;
+ /* Makes sure compiler wont do silly aliasing games */
+ barrier();
-found:
- /* We found where to put this one. Check for overlap with
- * preceding fragment, and, if needed, align things so that
- * any overlaps are eliminated.
+ /* RFC5722, Section 4, amended by Errata ID : 3089
+ * When reassembling an IPv6 datagram, if
+ * one or more its constituent fragments is determined to be an
+ * overlapping fragment, the entire datagram (and any constituent
+ * fragments) MUST be silently discarded.
+ *
+ * We do the same here for IPv4 (and increment an snmp counter).
*/
- if (prev) {
- int i = (prev->ip_defrag_offset + prev->len) - offset;
- if (i > 0) {
- offset += i;
- err = -EINVAL;
- if (end <= offset)
- goto err;
- err = -ENOMEM;
- if (!pskb_pull(skb, i))
- goto err;
- if (skb->ip_summed != CHECKSUM_UNNECESSARY)
- skb->ip_summed = CHECKSUM_NONE;
- }
- }
-
- err = -ENOMEM;
-
- while (next && next->ip_defrag_offset < end) {
- int i = end - next->ip_defrag_offset; /* overlap is 'i' bytes */
-
- if (i < next->len) {
- /* Eat head of the next overlapped fragment
- * and leave the loop. The next ones cannot overlap.
- */
- if (!pskb_pull(next, i))
- goto err;
- next->ip_defrag_offset += i;
- qp->q.meat -= i;
- if (next->ip_summed != CHECKSUM_UNNECESSARY)
- next->ip_summed = CHECKSUM_NONE;
- break;
- } else {
- struct sk_buff *free_it = next;
-
- /* Old fragment is completely overridden with
- * new one drop it.
- */
- next = next->next;
-
- if (prev)
- prev->next = next;
- else
- qp->q.fragments = next;
-
- qp->q.meat -= free_it->len;
- sub_frag_mem_limit(qp->q.net, free_it->truesize);
- kfree_skb(free_it);
- }
+ /* Find out where to put this fragment. */
+ skb1 = qp->q.fragments_tail;
+ if (!skb1) {
+ /* This is the first fragment we've received. */
+ rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+ qp->q.fragments_tail = skb;
+ } else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+ /* This is the common/special case: skb goes to the end. */
+ /* Detect and discard overlaps. */
+ if (offset < (skb1->ip_defrag_offset + skb1->len))
+ goto discard_qp;
+ /* Insert after skb1. */
+ rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+ qp->q.fragments_tail = skb;
+ } else {
+ /* Binary search. Note that skb can become the first fragment, but
+ * not the last (covered above). */
+ rbn = &qp->q.rb_fragments.rb_node;
+ do {
+ parent = *rbn;
+ skb1 = rb_to_skb(parent);
+ if (end <= skb1->ip_defrag_offset)
+ rbn = &parent->rb_left;
+ else if (offset >= skb1->ip_defrag_offset + skb1->len)
+ rbn = &parent->rb_right;
+ else /* Found an overlap with skb1. */
+ goto discard_qp;
+ } while (*rbn);
+ /* Here we have parent properly set, and rbn pointing to
+ * one of its NULL left/right children. Insert skb. */
+ rb_link_node(&skb->rbnode, parent, rbn);
}
+ rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
- /* Note : skb->ip_defrag_offset and skb->dev share the same location */
- dev = skb->dev;
if (dev)
qp->iif = dev->ifindex;
- /* Makes sure compiler wont do silly aliasing games */
- barrier();
skb->ip_defrag_offset = offset;
- /* Insert this fragment in the chain of fragments. */
- skb->next = next;
- if (!next)
- qp->q.fragments_tail = skb;
- if (prev)
- prev->next = skb;
- else
- qp->q.fragments = skb;
-
qp->q.stamp = skb->tstamp;
qp->q.meat += skb->len;
qp->ecn |= ecn;
@@ -450,7 +425,7 @@ found:
unsigned long orefdst = skb->_skb_refdst;
skb->_skb_refdst = 0UL;
- err = ip_frag_reasm(qp, prev, dev);
+ err = ip_frag_reasm(qp, skb, dev);
skb->_skb_refdst = orefdst;
return err;
}
@@ -458,20 +433,24 @@ found:
skb_dst_drop(skb);
return -EINPROGRESS;
+discard_qp:
+ inet_frag_kill(&qp->q);
+ err = -EINVAL;
+ __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
err:
kfree_skb(skb);
return err;
}
-
/* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
struct net_device *dev)
{
struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
struct iphdr *iph;
- struct sk_buff *fp, *head = qp->q.fragments;
+ struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+ struct sk_buff **nextp; /* To build frag_list. */
+ struct rb_node *rbn;
int len;
int ihlen;
int err;
@@ -485,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
goto out_fail;
}
/* Make the one we just received the head. */
- if (prev) {
- head = prev->next;
- fp = skb_clone(head, GFP_ATOMIC);
+ if (head != skb) {
+ fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
goto out_nomem;
-
- fp->next = head->next;
- if (!fp->next)
+ rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+ if (qp->q.fragments_tail == skb)
qp->q.fragments_tail = fp;
- prev->next = fp;
-
- skb_morph(head, qp->q.fragments);
- head->next = qp->q.fragments->next;
-
- consume_skb(qp->q.fragments);
- qp->q.fragments = head;
+ skb_morph(skb, head);
+ rb_replace_node(&head->rbnode, &skb->rbnode,
+ &qp->q.rb_fragments);
+ consume_skb(head);
+ head = skb;
}
- WARN_ON(!head);
WARN_ON(head->ip_defrag_offset != 0);
/* Allocate a new buffer for the datagram. */
@@ -528,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
goto out_nomem;
- clone->next = head->next;
- head->next = clone;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->len = clone->data_len = head->data_len - plen;
- head->data_len -= clone->len;
- head->len -= clone->len;
+ skb->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(qp->q.net, clone->truesize);
+ skb_shinfo(head)->frag_list = clone;
+ nextp = &clone->next;
+ } else {
+ nextp = &skb_shinfo(head)->frag_list;
}
- skb_shinfo(head)->frag_list = head->next;
skb_push(head, head->data - skb_network_header(head));
- for (fp=head->next; fp; fp = fp->next) {
+ /* Traverse the tree in order, to build frag_list. */
+ rbn = rb_next(&head->rbnode);
+ rb_erase(&head->rbnode, &qp->q.rb_fragments);
+ while (rbn) {
+ struct rb_node *rbnext = rb_next(rbn);
+ fp = rb_to_skb(rbn);
+ rb_erase(rbn, &qp->q.rb_fragments);
+ rbn = rbnext;
+ *nextp = fp;
+ nextp = &fp->next;
+ fp->prev = NULL;
+ memset(&fp->rbnode, 0, sizeof(fp->rbnode));
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
@@ -556,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
}
sub_frag_mem_limit(qp->q.net, head->truesize);
+ *nextp = NULL;
head->next = NULL;
+ head->prev = NULL;
head->dev = dev;
head->tstamp = qp->q.stamp;
IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -584,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
qp->q.fragments = NULL;
+ qp->q.rb_fragments = RB_ROOT;
qp->q.fragments_tail = NULL;
return 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index c8ca5d8f0f75..51a5d06085ac 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -985,7 +985,6 @@ static void ipgre_tunnel_setup(struct net_device *dev)
static void __gre_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel;
- int t_hlen;
tunnel = netdev_priv(dev);
tunnel->tun_hlen = gre_calc_hlen(tunnel->parms.o_flags);
@@ -993,8 +992,6 @@ static void __gre_tunnel_init(struct net_device *dev)
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
- t_hlen = tunnel->hlen + sizeof(struct iphdr);
-
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
@@ -1304,13 +1301,11 @@ static const struct net_device_ops gre_tap_netdev_ops = {
static int erspan_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- int t_hlen;
tunnel->tun_hlen = 8;
tunnel->parms.iph.protocol = IPPROTO_GRE;
tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen +
erspan_hdr_len(tunnel->erspan_ver);
- t_hlen = tunnel->hlen + sizeof(struct iphdr);
dev->features |= GRE_FEATURES;
dev->hw_features |= GRE_FEATURES;
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e2b6bd478afb..9c4e72e9c60a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -524,6 +524,8 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->dev = from->dev;
to->mark = from->mark;
+ skb_copy_hash(to, from);
+
/* Copy the flags to each fragment. */
IPCB(to)->flags = IPCB(from)->flags;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index fc32fdbeefa6..c0fe5ad996f2 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -150,15 +150,18 @@ static void ip_cmsg_recv_dstaddr(struct msghdr *msg, struct sk_buff *skb)
{
struct sockaddr_in sin;
const struct iphdr *iph = ip_hdr(skb);
- __be16 *ports = (__be16 *)skb_transport_header(skb);
+ __be16 *ports;
+ int end;
- if (skb_transport_offset(skb) + 4 > (int)skb->len)
+ end = skb_transport_offset(skb) + 4;
+ if (end > 0 && !pskb_may_pull(skb, end))
return;
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
+ ports = (__be16 *)skb_transport_header(skb);
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = iph->daddr;
@@ -984,7 +987,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr;
mreq.imr_address.s_addr = mreqs.imr_interface;
mreq.imr_ifindex = 0;
- err = ip_mc_join_group(sk, &mreq);
+ err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
if (err && err != -EADDRINUSE)
break;
omode = MCAST_INCLUDE;
@@ -1061,7 +1064,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
mreq.imr_multiaddr = psin->sin_addr;
mreq.imr_address.s_addr = 0;
mreq.imr_ifindex = greqs.gsr_interface;
- err = ip_mc_join_group(sk, &mreq);
+ err = ip_mc_join_group_ssm(sk, &mreq, MCAST_INCLUDE);
if (err && err != -EADDRINUSE)
break;
greqs.gsr_interface = mreq.imr_ifindex;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 82f914122f1b..5660adcf7a04 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1052,7 +1052,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *skb;
int ret;
- if (assert == IGMPMSG_WHOLEPKT)
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
skb = alloc_skb(128, GFP_ATOMIC);
@@ -1060,7 +1060,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
if (!skb)
return -ENOBUFS;
- if (assert == IGMPMSG_WHOLEPKT) {
+ if (assert == IGMPMSG_WHOLEPKT || assert == IGMPMSG_WRVIFWHOLE) {
/* Ugly, but we have no choice with this interface.
* Duplicate old header, fix ihl, length etc.
* And all this only to mangle msg->im_msgtype and
@@ -1071,9 +1071,12 @@ static int ipmr_cache_report(struct mr_table *mrt,
skb_reset_transport_header(skb);
msg = (struct igmpmsg *)skb_network_header(skb);
memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
- msg->im_msgtype = IGMPMSG_WHOLEPKT;
+ msg->im_msgtype = assert;
msg->im_mbz = 0;
- msg->im_vif = mrt->mroute_reg_vif_num;
+ if (assert == IGMPMSG_WRVIFWHOLE)
+ msg->im_vif = vifi;
+ else
+ msg->im_vif = mrt->mroute_reg_vif_num;
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
@@ -1372,6 +1375,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
struct mr_table *mrt;
struct vifctl vif;
struct mfcctl mfc;
+ bool do_wrvifwhole;
u32 uval;
/* There's one exception to the lock - MRT_DONE which needs to unlock */
@@ -1502,10 +1506,12 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
break;
}
+ do_wrvifwhole = (val == IGMPMSG_WRVIFWHOLE);
val = !!val;
if (val != mrt->mroute_do_pim) {
mrt->mroute_do_pim = val;
mrt->mroute_do_assert = val;
+ mrt->mroute_do_wrvifwhole = do_wrvifwhole;
}
break;
case MRT_TABLE:
@@ -1983,6 +1989,9 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
MFC_ASSERT_THRESH)) {
c->_c.mfc_un.res.last_assert = jiffies;
ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
+ if (mrt->mroute_do_wrvifwhole)
+ ipmr_cache_report(mrt, skb, true_vifi,
+ IGMPMSG_WRVIFWHOLE);
}
goto dont_forward;
}
@@ -2659,7 +2668,9 @@ static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
mrt->mroute_reg_vif_num) ||
nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
mrt->mroute_do_assert) ||
- nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim) ||
+ nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_WRVIFWHOLE,
+ mrt->mroute_do_wrvifwhole))
return false;
return true;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index e6774ccb7731..8d2e5dc9a827 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -98,59 +98,6 @@ int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry)
}
EXPORT_SYMBOL_GPL(nf_ip_reroute);
-__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol)
-{
- const struct iphdr *iph = ip_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
- break;
- if ((protocol == 0 && !csum_fold(skb->csum)) ||
- !csum_tcpudp_magic(iph->saddr, iph->daddr,
- skb->len - dataoff, protocol,
- skb->csum)) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
- /* fall through */
- case CHECKSUM_NONE:
- if (protocol == 0)
- skb->csum = 0;
- else
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- skb->len - dataoff,
- protocol, 0);
- csum = __skb_checksum_complete(skb);
- }
- return csum;
-}
-EXPORT_SYMBOL(nf_ip_checksum);
-
-__sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, unsigned int len,
- u_int8_t protocol)
-{
- const struct iphdr *iph = ip_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (len == skb->len - dataoff)
- return nf_ip_checksum(skb, hook, dataoff, protocol);
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
- skb->len - dataoff, 0);
- skb->ip_summed = CHECKSUM_NONE;
- return __skb_checksum_complete_head(skb, dataoff + len);
- }
- return csum;
-}
-EXPORT_SYMBOL_GPL(nf_ip_checksum_partial);
-
int nf_ip_route(struct net *net, struct dst_entry **dst, struct flowi *fl,
bool strict __always_unused)
{
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index bbfc356cb1b5..d9504adc47b3 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -9,22 +9,6 @@ config NF_DEFRAG_IPV4
tristate
default n
-config NF_CONNTRACK_IPV4
- tristate "IPv4 connection tracking support (required for NAT)"
- depends on NF_CONNTRACK
- default m if NETFILTER_ADVANCED=n
- select NF_DEFRAG_IPV4
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
-
- This is IPv4 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config NF_SOCKET_IPV4
tristate "IPv4 socket lookup support"
help
@@ -112,7 +96,7 @@ config NF_REJECT_IPV4
config NF_NAT_IPV4
tristate "IPv4 NAT"
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
default m if NETFILTER_ADVANCED=n
select NF_NAT
help
@@ -279,7 +263,7 @@ config IP_NF_TARGET_SYNPROXY
# NAT + specific targets: nf_conntrack
config IP_NF_NAT
tristate "iptables NAT support"
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
default m if NETFILTER_ADVANCED=n
select NF_NAT
select NF_NAT_IPV4
@@ -340,7 +324,7 @@ config IP_NF_MANGLE
config IP_NF_TARGET_CLUSTERIP
tristate "CLUSTERIP target support"
depends on IP_NF_MANGLE
- depends on NF_CONNTRACK_IPV4
+ depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_CONNTRACK_MARK
select NETFILTER_FAMILY_ARP
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index 8394c17c269f..367993adf4d3 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -3,12 +3,6 @@
# Makefile for the netfilter modules on top of IPv4.
#
-# objects for l3 independent conntrack
-nf_conntrack_ipv4-y := nf_conntrack_l3proto_ipv4.o nf_conntrack_proto_icmp.o
-
-# connection tracking
-obj-$(CONFIG_NF_CONNTRACK_IPV4) += nf_conntrack_ipv4.o
-
nf_nat_ipv4-y := nf_nat_l3proto_ipv4.o nf_nat_proto_icmp.o
nf_nat_ipv4-$(CONFIG_NF_NAT_MASQUERADE_IPV4) += nf_nat_masquerade_ipv4.o
obj-$(CONFIG_NF_NAT_IPV4) += nf_nat_ipv4.o
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index ca0dad90803a..e77872c93c20 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1898,6 +1898,7 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
.checkentry = icmp_checkentry,
.proto = IPPROTO_ICMP,
.family = NFPROTO_IPV4,
+ .me = THIS_MODULE,
},
};
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
deleted file mode 100644
index 9db988f9a4d7..000000000000
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ /dev/null
@@ -1,472 +0,0 @@
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/route.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
-#include <net/netfilter/nf_log.h>
-
-static int conntrack4_net_id __read_mostly;
-static DEFINE_MUTEX(register_ipv4_hooks);
-
-struct conntrack4_net {
- unsigned int users;
-};
-
-static bool ipv4_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- const __be32 *ap;
- __be32 _addrs[2];
- ap = skb_header_pointer(skb, nhoff + offsetof(struct iphdr, saddr),
- sizeof(u_int32_t) * 2, _addrs);
- if (ap == NULL)
- return false;
-
- tuple->src.u3.ip = ap[0];
- tuple->dst.u3.ip = ap[1];
-
- return true;
-}
-
-static bool ipv4_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u3.ip = orig->dst.u3.ip;
- tuple->dst.u3.ip = orig->src.u3.ip;
-
- return true;
-}
-
-static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
- unsigned int *dataoff, u_int8_t *protonum)
-{
- const struct iphdr *iph;
- struct iphdr _iph;
-
- iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
- if (iph == NULL)
- return -NF_ACCEPT;
-
- /* Conntrack defragments packets, we might still see fragments
- * inside ICMP packets though. */
- if (iph->frag_off & htons(IP_OFFSET))
- return -NF_ACCEPT;
-
- *dataoff = nhoff + (iph->ihl << 2);
- *protonum = iph->protocol;
-
- /* Check bogus IP headers */
- if (*dataoff > skb->len) {
- pr_debug("nf_conntrack_ipv4: bogus IPv4 packet: "
- "nhoff %u, ihl %u, skblen %u\n",
- nhoff, iph->ihl << 2, skb->len);
- return -NF_ACCEPT;
- }
-
- return NF_ACCEPT;
-}
-
-static unsigned int ipv4_helper(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
-
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
- ct, ctinfo);
-}
-
-static unsigned int ipv4_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
-
- /* adjust seqs for loopback traffic only in outgoing direction */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
- }
-out:
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb);
-}
-
-static unsigned int ipv4_conntrack_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-static unsigned int ipv4_conntrack_local(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
- enum ip_conntrack_info ctinfo;
- struct nf_conn *tmpl;
-
- tmpl = nf_ct_get(skb, &ctinfo);
- if (tmpl && nf_ct_is_template(tmpl)) {
- /* when skipping ct, clear templates to avoid fooling
- * later targets/matches
- */
- skb->_nfct = 0;
- nf_ct_put(tmpl);
- }
- return NF_ACCEPT;
- }
-
- return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
-}
-
-/* Connection tracking may drop packets, but never alters them, so
- make it the first hook. */
-static const struct nf_hook_ops ipv4_conntrack_ops[] = {
- {
- .hook = ipv4_conntrack_in,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_conntrack_local,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP_PRI_CONNTRACK,
- },
- {
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
- {
- .hook = ipv4_helper,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv4_confirm,
- .pf = NFPROTO_IPV4,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
- },
-};
-
-/* Fast function for those who don't want to parse /proc (and I don't
- blame them). */
-/* Reversing the socket's dst/src point of view gives us the reply
- mapping. */
-static int
-getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
- const struct inet_sock *inet = inet_sk(sk);
- const struct nf_conntrack_tuple_hash *h;
- struct nf_conntrack_tuple tuple;
-
- memset(&tuple, 0, sizeof(tuple));
-
- lock_sock(sk);
- tuple.src.u3.ip = inet->inet_rcv_saddr;
- tuple.src.u.tcp.port = inet->inet_sport;
- tuple.dst.u3.ip = inet->inet_daddr;
- tuple.dst.u.tcp.port = inet->inet_dport;
- tuple.src.l3num = PF_INET;
- tuple.dst.protonum = sk->sk_protocol;
- release_sock(sk);
-
- /* We only do TCP and SCTP at the moment: is there a better way? */
- if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP) {
- pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
- return -ENOPROTOOPT;
- }
-
- if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
- pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
- *len, sizeof(struct sockaddr_in));
- return -EINVAL;
- }
-
- h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (h) {
- struct sockaddr_in sin;
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
- sin.sin_family = AF_INET;
- sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u.tcp.port;
- sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
- .tuple.dst.u3.ip;
- memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
-
- pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
- &sin.sin_addr.s_addr, ntohs(sin.sin_port));
- nf_ct_put(ct);
- if (copy_to_user(user, &sin, sizeof(sin)) != 0)
- return -EFAULT;
- else
- return 0;
- }
- pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
- &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
- return -ENOENT;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
-{
- if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
- nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-static const struct nla_policy ipv4_nla_policy[CTA_IP_MAX+1] = {
- [CTA_IP_V4_SRC] = { .type = NLA_U32 },
- [CTA_IP_V4_DST] = { .type = NLA_U32 },
-};
-
-static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
-{
- if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
- return -EINVAL;
-
- t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
- t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
-
- return 0;
-}
-#endif
-
-static struct nf_sockopt_ops so_getorigdst = {
- .pf = PF_INET,
- .get_optmin = SO_ORIGINAL_DST,
- .get_optmax = SO_ORIGINAL_DST+1,
- .get = getorigdst,
- .owner = THIS_MODULE,
-};
-
-static int ipv4_hooks_register(struct net *net)
-{
- struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
- int err = 0;
-
- mutex_lock(&register_ipv4_hooks);
-
- cnet->users++;
- if (cnet->users > 1)
- goto out_unlock;
-
- err = nf_defrag_ipv4_enable(net);
- if (err) {
- cnet->users = 0;
- goto out_unlock;
- }
-
- err = nf_register_net_hooks(net, ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
-
- if (err)
- cnet->users = 0;
- out_unlock:
- mutex_unlock(&register_ipv4_hooks);
- return err;
-}
-
-static void ipv4_hooks_unregister(struct net *net)
-{
- struct conntrack4_net *cnet = net_generic(net, conntrack4_net_id);
-
- mutex_lock(&register_ipv4_hooks);
- if (cnet->users && (--cnet->users == 0))
- nf_unregister_net_hooks(net, ipv4_conntrack_ops,
- ARRAY_SIZE(ipv4_conntrack_ops));
- mutex_unlock(&register_ipv4_hooks);
-}
-
-const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 = {
- .l3proto = PF_INET,
- .pkt_to_tuple = ipv4_pkt_to_tuple,
- .invert_tuple = ipv4_invert_tuple,
- .get_l4proto = ipv4_get_l4proto,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = ipv4_tuple_to_nlattr,
- .nlattr_to_tuple = ipv4_nlattr_to_tuple,
- .nla_policy = ipv4_nla_policy,
- .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32)) + /* CTA_IP_V4_SRC */
- NLA_ALIGN(NLA_HDRLEN + sizeof(u32)), /* CTA_IP_V4_DST */
-#endif
- .net_ns_get = ipv4_hooks_register,
- .net_ns_put = ipv4_hooks_unregister,
- .me = THIS_MODULE,
-};
-
-module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
- &nf_conntrack_htable_size, 0600);
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
-MODULE_ALIAS("ip_conntrack");
-MODULE_LICENSE("GPL");
-
-static const struct nf_conntrack_l4proto * const builtin_l4proto4[] = {
- &nf_conntrack_l4proto_tcp4,
- &nf_conntrack_l4proto_udp4,
- &nf_conntrack_l4proto_icmp,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- &nf_conntrack_l4proto_dccp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
- &nf_conntrack_l4proto_sctp4,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
- &nf_conntrack_l4proto_udplite4,
-#endif
-};
-
-static int ipv4_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
-}
-
-static void ipv4_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
-}
-
-static struct pernet_operations ipv4_net_ops = {
- .init = ipv4_net_init,
- .exit = ipv4_net_exit,
- .id = &conntrack4_net_id,
- .size = sizeof(struct conntrack4_net),
-};
-
-static int __init nf_conntrack_l3proto_ipv4_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- if (WARN_ON(nla_policy_len(ipv4_nla_policy, CTA_IP_MAX + 1) !=
- nf_conntrack_l3proto_ipv4.nla_size))
- return -EINVAL;
-#endif
- ret = nf_register_sockopt(&so_getorigdst);
- if (ret < 0) {
- pr_err("Unable to register netfilter socket option\n");
- return ret;
- }
-
- ret = register_pernet_subsys(&ipv4_net_ops);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register pernet ops\n");
- goto cleanup_sockopt;
- }
-
- ret = nf_ct_l4proto_register(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- if (ret < 0)
- goto cleanup_pernet;
-
- ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv4);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv4: can't register ipv4 proto.\n");
- goto cleanup_l4proto;
- }
-
- return ret;
-cleanup_l4proto:
- nf_ct_l4proto_unregister(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- cleanup_pernet:
- unregister_pernet_subsys(&ipv4_net_ops);
- cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst);
- return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv4_fini(void)
-{
- synchronize_net();
- nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv4);
- nf_ct_l4proto_unregister(builtin_l4proto4,
- ARRAY_SIZE(builtin_l4proto4));
- unregister_pernet_subsys(&ipv4_net_ops);
- nf_unregister_sockopt(&so_getorigdst);
-}
-
-module_init(nf_conntrack_l3proto_ipv4_init);
-module_exit(nf_conntrack_l3proto_ipv4_fini);
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 805e83ec3ad9..164714104965 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -37,7 +37,7 @@ nf_tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+ sk2 = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
iph->saddr, laddr ? laddr : iph->daddr,
hp->source, lport ? lport : hp->dest,
skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -71,7 +71,7 @@ __be32 nf_tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
EXPORT_SYMBOL_GPL(nf_tproxy_laddr4);
struct sock *
-nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
const u8 protocol,
const __be32 saddr, const __be32 daddr,
const __be16 sport, const __be16 dport,
@@ -79,16 +79,21 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
const enum nf_tproxy_lookup_t lookup_type)
{
struct sock *sk;
- struct tcphdr *tcph;
switch (protocol) {
- case IPPROTO_TCP:
+ case IPPROTO_TCP: {
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(struct tcphdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- tcph = hp;
sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
ip_hdrlen(skb) +
- __tcp_hdrlen(tcph),
+ __tcp_hdrlen(hp),
saddr, sport,
daddr, dport,
in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
BUG();
}
break;
+ }
case IPPROTO_UDP:
sk = udp4_lib_lookup(net, saddr, sport, daddr, dport,
in->ifindex);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index b54c964ad925..8d7aaf118a30 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -320,8 +320,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
if (addr->sin_addr.s_addr == htonl(INADDR_ANY))
chk_addr_ret = RTN_LOCAL;
- if ((net->ipv4.sysctl_ip_nonlocal_bind == 0 &&
- isk->freebind == 0 && isk->transparent == 0 &&
+ if ((!inet_can_nonlocal_bind(net, isk) &&
chk_addr_ret != RTN_LOCAL) ||
chk_addr_ret == RTN_MULTICAST ||
chk_addr_ret == RTN_BROADCAST)
@@ -361,8 +360,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk,
scoped);
rcu_read_unlock();
- if (!(net->ipv6.sysctl.ip_nonlocal_bind ||
- isk->freebind || isk->transparent || has_addr ||
+ if (!(ipv6_can_nonlocal_bind(net, isk) || has_addr ||
addr_type == IPV6_ADDR_ANY))
return -EADDRNOTAVAIL;
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index b46e4cf9a55a..70289682a670 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+ SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
SNMP_MIB_SENTINEL
};
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1df6e97106d7..b678466da451 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1996,8 +1996,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto no_route;
}
- if (res->type == RTN_BROADCAST)
+ if (res->type == RTN_BROADCAST) {
+ if (IN_DEV_BFORWARD(in_dev))
+ goto make_route;
goto brd_input;
+ }
if (res->type == RTN_LOCAL) {
err = fib_validate_source(skb, saddr, daddr, tos,
@@ -2014,6 +2017,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
if (res->type != RTN_UNICAST)
goto martian_destination;
+make_route:
err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out: return err;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index af0a857d8352..b92f422f2fa8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -189,8 +189,9 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
if (write && ret == 0) {
low = make_kgid(user_ns, urange[0]);
high = make_kgid(user_ns, urange[1]);
- if (!gid_valid(low) || !gid_valid(high) ||
- (urange[1] < urange[0]) || gid_lt(high, low)) {
+ if (!gid_valid(low) || !gid_valid(high))
+ return -EINVAL;
+ if (urange[1] < urange[0] || gid_lt(high, low)) {
low = make_kgid(&init_user_ns, 1);
high = make_kgid(&init_user_ns, 0);
}
@@ -200,6 +201,23 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
return ret;
}
+static int ipv4_fwd_update_priority(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+ int ret;
+
+ net = container_of(table->data, struct net,
+ ipv4.sysctl_ip_fwd_update_priority);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (write && ret == 0)
+ call_netevent_notifiers(NETEVENT_IPV4_FWD_UPDATE_PRIORITY_UPDATE,
+ net);
+
+ return ret;
+}
+
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -663,6 +681,15 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec,
},
{
+ .procname = "ip_forward_update_priority",
+ .data = &init_net.ipv4.sysctl_ip_fwd_update_priority,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = ipv4_fwd_update_priority,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "ip_nonlocal_bind",
.data = &init_net.ipv4.sysctl_ip_nonlocal_bind,
.maxlen = sizeof(int),
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index e3704a49164b..b8af2fec5ad5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -507,7 +507,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
const struct tcp_sock *tp = tcp_sk(sk);
int state;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
state = inet_sk_state_load(sk);
if (state == TCP_LISTEN)
@@ -1994,7 +1994,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
* shouldn't happen.
*/
if (WARN(before(*seq, TCP_SKB_CB(skb)->seq),
- "recvmsg bug: copied %X seq %X rcvnxt %X fl %X\n",
+ "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt,
flags))
break;
@@ -2009,7 +2009,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto found_fin_ok;
WARN(!(flags & MSG_PEEK),
- "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+ "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n",
*seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
}
@@ -2531,7 +2531,6 @@ int tcp_disconnect(struct sock *sk, int flags)
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
- int err = 0;
int old_state = sk->sk_state;
if (old_state != TCP_CLOSE)
@@ -2555,6 +2554,8 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_clear_xmit_timers(sk);
__skb_queue_purge(&sk->sk_receive_queue);
+ tp->copied_seq = tp->rcv_nxt;
+ tp->urg_data = 0;
tcp_write_queue_purge(sk);
tcp_fastopen_active_disable_ofo_check(sk);
skb_rbtree_purge(&tp->out_of_order_queue);
@@ -2592,6 +2593,10 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_rx_dst = NULL;
tcp_saved_syn_free(tp);
tp->compressed_ack = 0;
+ tp->bytes_sent = 0;
+ tp->bytes_retrans = 0;
+ tp->dsack_dups = 0;
+ tp->reord_seen = 0;
/* Clean up fastopen related fields */
tcp_free_fastopen_req(tp);
@@ -2606,7 +2611,7 @@ int tcp_disconnect(struct sock *sk, int flags)
}
sk->sk_error_report(sk);
- return err;
+ return 0;
}
EXPORT_SYMBOL(tcp_disconnect);
@@ -2815,14 +2820,17 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_REPAIR:
if (!tcp_can_repair_sock(sk))
err = -EPERM;
- else if (val == 1) {
+ else if (val == TCP_REPAIR_ON) {
tp->repair = 1;
sk->sk_reuse = SK_FORCE_REUSE;
tp->repair_queue = TCP_NO_QUEUE;
- } else if (val == 0) {
+ } else if (val == TCP_REPAIR_OFF) {
tp->repair = 0;
sk->sk_reuse = SK_NO_REUSE;
tcp_send_window_probe(sk);
+ } else if (val == TCP_REPAIR_OFF_NO_WP) {
+ tp->repair = 0;
+ sk->sk_reuse = SK_NO_REUSE;
} else
err = -EINVAL;
@@ -2984,7 +2992,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
if (val < 0)
err = -EINVAL;
else
- icsk->icsk_user_timeout = msecs_to_jiffies(val);
+ icsk->icsk_user_timeout = val;
break;
case TCP_FASTOPEN:
@@ -3196,10 +3204,41 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_delivery_rate = rate64;
info->tcpi_delivered = tp->delivered;
info->tcpi_delivered_ce = tp->delivered_ce;
+ info->tcpi_bytes_sent = tp->bytes_sent;
+ info->tcpi_bytes_retrans = tp->bytes_retrans;
+ info->tcpi_dsack_dups = tp->dsack_dups;
+ info->tcpi_reord_seen = tp->reord_seen;
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
+static size_t tcp_opt_stats_get_size(void)
+{
+ return
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */
+ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */
+ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */
+ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */
+ 0;
+}
+
struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
{
const struct tcp_sock *tp = tcp_sk(sk);
@@ -3208,9 +3247,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
u64 rate64;
u32 rate;
- stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) +
- 7 * nla_total_size(sizeof(u32)) +
- 3 * nla_total_size(sizeof(u8)), GFP_ATOMIC);
+ stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC);
if (!stats)
return NULL;
@@ -3246,6 +3283,13 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk)
nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una);
nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent,
+ TCP_NLA_PAD);
+ nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans,
+ TCP_NLA_PAD);
+ nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups);
+ nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen);
+
return stats;
}
@@ -3440,7 +3484,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
break;
case TCP_USER_TIMEOUT:
- val = jiffies_to_msecs(icsk->icsk_user_timeout);
+ val = icsk->icsk_user_timeout;
break;
case TCP_FASTOPEN:
@@ -3714,8 +3758,7 @@ int tcp_abort(struct sock *sk, int err)
struct request_sock *req = inet_reqsk(sk);
local_bh_disable();
- inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
- req);
+ inet_csk_reqsk_queue_drop(req->rsk_listener, req);
local_bh_enable();
return 0;
}
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 3b5f45b9e81e..13d34427ca3d 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -358,6 +358,10 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
/* Reduce delayed ACKs by rounding up cwnd to the next even number. */
cwnd = (cwnd + 1) & ~1U;
+ /* Ensure gain cycling gets inflight above BDP even for small BDPs. */
+ if (bbr->mode == BBR_PROBE_BW && gain > BBR_UNIT)
+ cwnd += 2;
+
return cwnd;
}
diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
index 5f5e5936760e..8b637f9f23a2 100644
--- a/net/ipv4/tcp_dctcp.c
+++ b/net/ipv4/tcp_dctcp.c
@@ -55,7 +55,6 @@ struct dctcp {
u32 dctcp_alpha;
u32 next_seq;
u32 ce_state;
- u32 delayed_ack_reserved;
u32 loss_cwnd;
};
@@ -96,7 +95,6 @@ static void dctcp_init(struct sock *sk)
ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
- ca->delayed_ack_reserved = 0;
ca->loss_cwnd = 0;
ca->ce_state = 0;
@@ -131,23 +129,14 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* State has changed from CE=0 to CE=1 and delayed
- * ACK has not sent yet.
- */
- if (!ca->ce_state && ca->delayed_ack_reserved) {
- u32 tmp_rcv_nxt;
-
- /* Save current rcv_nxt. */
- tmp_rcv_nxt = tp->rcv_nxt;
-
- /* Generate previous ack with CE=0. */
- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
- tp->rcv_nxt = ca->prior_rcv_nxt;
-
- tcp_send_ack(sk);
-
- /* Recover current rcv_nxt. */
- tp->rcv_nxt = tmp_rcv_nxt;
+ if (!ca->ce_state) {
+ /* State has changed from CE=0 to CE=1, force an immediate
+ * ACK to reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ tcp_enter_quickack_mode(sk, 1);
}
ca->prior_rcv_nxt = tp->rcv_nxt;
@@ -161,23 +150,14 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
struct dctcp *ca = inet_csk_ca(sk);
struct tcp_sock *tp = tcp_sk(sk);
- /* State has changed from CE=1 to CE=0 and delayed
- * ACK has not sent yet.
- */
- if (ca->ce_state && ca->delayed_ack_reserved) {
- u32 tmp_rcv_nxt;
-
- /* Save current rcv_nxt. */
- tmp_rcv_nxt = tp->rcv_nxt;
-
- /* Generate previous ack with CE=1. */
- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
- tp->rcv_nxt = ca->prior_rcv_nxt;
-
- tcp_send_ack(sk);
-
- /* Recover current rcv_nxt. */
- tp->rcv_nxt = tmp_rcv_nxt;
+ if (ca->ce_state) {
+ /* State has changed from CE=1 to CE=0, force an immediate
+ * ACK to reflect the new CE state. If an ACK was delayed,
+ * send that first to reflect the prior CE state.
+ */
+ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+ tcp_enter_quickack_mode(sk, 1);
}
ca->prior_rcv_nxt = tp->rcv_nxt;
@@ -248,25 +228,6 @@ static void dctcp_state(struct sock *sk, u8 new_state)
}
}
-static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
-{
- struct dctcp *ca = inet_csk_ca(sk);
-
- switch (ev) {
- case CA_EVENT_DELAYED_ACK:
- if (!ca->delayed_ack_reserved)
- ca->delayed_ack_reserved = 1;
- break;
- case CA_EVENT_NON_DELAYED_ACK:
- if (ca->delayed_ack_reserved)
- ca->delayed_ack_reserved = 0;
- break;
- default:
- /* Don't care for the rest. */
- break;
- }
-}
-
static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
{
switch (ev) {
@@ -276,10 +237,6 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
case CA_EVENT_ECN_NO_CE:
dctcp_ce_state_1_to_0(sk);
break;
- case CA_EVENT_DELAYED_ACK:
- case CA_EVENT_NON_DELAYED_ACK:
- dctcp_update_ack_reserved(sk, ev);
- break;
default:
/* Don't care for the rest. */
break;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d3b6390ecf23..715d541b52dd 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -216,7 +216,7 @@ static void tcp_incr_quickack(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.quick = quickacks;
}
-static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
+void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -224,6 +224,7 @@ static void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks)
icsk->icsk_ack.pingpong = 0;
icsk->icsk_ack.ato = TCP_ATO_MIN;
}
+EXPORT_SYMBOL(tcp_enter_quickack_mode);
/* Send ACKs quickly, if "quick" count is not exhausted
* and the session is not interactive.
@@ -246,8 +247,15 @@ static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
static void tcp_ecn_accept_cwr(struct tcp_sock *tp, const struct sk_buff *skb)
{
- if (tcp_hdr(skb)->cwr)
+ if (tcp_hdr(skb)->cwr) {
tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+
+ /* If the sender is telling us it has entered CWR, then its
+ * cwnd may be very low (even just 1 packet), so we should ACK
+ * immediately.
+ */
+ tcp_enter_quickack_mode((struct sock *)tp, 2);
+ }
}
static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
@@ -873,6 +881,7 @@ static void tcp_dsack_seen(struct tcp_sock *tp)
{
tp->rx_opt.sack_ok |= TCP_DSACK_SEEN;
tp->rack.dsack_seen = 1;
+ tp->dsack_dups++;
}
/* It's reordering when higher sequence was delivered (i.e. sacked) before
@@ -904,8 +913,8 @@ static void tcp_check_sack_reordering(struct sock *sk, const u32 low_seq,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
}
- tp->rack.reord = 1;
/* This exciting event is worth to be remembered. 8) */
+ tp->reord_seen++;
NET_INC_STATS(sock_net(sk),
ts ? LINUX_MIB_TCPTSREORDER : LINUX_MIB_TCPSACKREORDER);
}
@@ -1869,6 +1878,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
tp->reordering = min_t(u32, tp->packets_out + addend,
sock_net(sk)->ipv4.sysctl_tcp_max_reordering);
+ tp->reord_seen++;
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRENOREORDER);
}
@@ -4343,6 +4353,11 @@ static bool tcp_try_coalesce(struct sock *sk,
if (TCP_SKB_CB(from)->seq != TCP_SKB_CB(to)->end_seq)
return false;
+#ifdef CONFIG_TLS_DEVICE
+ if (from->decrypted != to->decrypted)
+ return false;
+#endif
+
if (!skb_try_coalesce(to, from, fragstolen, &delta))
return false;
@@ -4361,6 +4376,23 @@ static bool tcp_try_coalesce(struct sock *sk,
return true;
}
+static bool tcp_ooo_try_coalesce(struct sock *sk,
+ struct sk_buff *to,
+ struct sk_buff *from,
+ bool *fragstolen)
+{
+ bool res = tcp_try_coalesce(sk, to, from, fragstolen);
+
+ /* In case tcp_drop() is called later, update to->gso_segs */
+ if (res) {
+ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
+ max_t(u16, 1, skb_shinfo(from)->gso_segs);
+
+ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
+ }
+ return res;
+}
+
static void tcp_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
@@ -4484,8 +4516,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
/* In the typical case, we are adding an skb to the end of the list.
* Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
*/
- if (tcp_try_coalesce(sk, tp->ooo_last_skb,
- skb, &fragstolen)) {
+ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
+ skb, &fragstolen)) {
coalesce_done:
tcp_grow_window(sk, skb);
kfree_skb_partial(skb, fragstolen);
@@ -4513,7 +4545,7 @@ coalesce_done:
/* All the bits are present. Drop. */
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb);
+ tcp_drop(sk, skb);
skb = NULL;
tcp_dsack_set(sk, seq, end_seq);
goto add_sack;
@@ -4532,11 +4564,11 @@ coalesce_done:
TCP_SKB_CB(skb1)->end_seq);
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPOFOMERGE);
- __kfree_skb(skb1);
+ tcp_drop(sk, skb1);
goto merge_right;
}
- } else if (tcp_try_coalesce(sk, skb1,
- skb, &fragstolen)) {
+ } else if (tcp_ooo_try_coalesce(sk, skb1,
+ skb, &fragstolen)) {
goto coalesce_done;
}
p = &parent->rb_right;
@@ -4695,7 +4727,6 @@ queue_and_out:
}
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
- tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
if (skb->len)
tcp_event_data_recv(sk, skb);
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -4872,6 +4903,9 @@ restart:
break;
memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+#ifdef CONFIG_TLS_DEVICE
+ nskb->decrypted = skb->decrypted;
+#endif
TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start;
if (list)
__skb_queue_before(list, skb, nskb);
@@ -4899,6 +4933,10 @@ restart:
skb == tail ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
goto end;
+#ifdef CONFIG_TLS_DEVICE
+ if (skb->decrypted != nskb->decrypted)
+ goto end;
+#endif
}
}
}
@@ -4913,6 +4951,7 @@ end:
static void tcp_collapse_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 range_truesize, sum_tiny = 0;
struct sk_buff *skb, *head;
u32 start, end;
@@ -4924,6 +4963,7 @@ new_range:
}
start = TCP_SKB_CB(skb)->seq;
end = TCP_SKB_CB(skb)->end_seq;
+ range_truesize = skb->truesize;
for (head = skb;;) {
skb = skb_rb_next(skb);
@@ -4934,11 +4974,20 @@ new_range:
if (!skb ||
after(TCP_SKB_CB(skb)->seq, end) ||
before(TCP_SKB_CB(skb)->end_seq, start)) {
- tcp_collapse(sk, NULL, &tp->out_of_order_queue,
- head, skb, start, end);
+ /* Do not attempt collapsing tiny skbs */
+ if (range_truesize != head->truesize ||
+ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
+ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+ head, skb, start, end);
+ } else {
+ sum_tiny += range_truesize;
+ if (sum_tiny > sk->sk_rcvbuf >> 3)
+ return;
+ }
goto new_range;
}
+ range_truesize += skb->truesize;
if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
start = TCP_SKB_CB(skb)->seq;
if (after(TCP_SKB_CB(skb)->end_seq, end))
@@ -4953,6 +5002,7 @@ new_range:
* 2) not add too big latencies if thousands of packets sit there.
* (But if application shrinks SO_RCVBUF, we could still end up
* freeing whole queue here)
+ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
*
* Return true if queue has shrunk.
*/
@@ -4960,20 +5010,26 @@ static bool tcp_prune_ofo_queue(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct rb_node *node, *prev;
+ int goal;
if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
return false;
NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
+ goal = sk->sk_rcvbuf >> 3;
node = &tp->ooo_last_skb->rbnode;
do {
prev = rb_prev(node);
rb_erase(node, &tp->out_of_order_queue);
+ goal -= rb_to_skb(node)->truesize;
tcp_drop(sk, rb_to_skb(node));
- sk_mem_reclaim(sk);
- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
- !tcp_under_memory_pressure(sk))
- break;
+ if (!prev || goal <= 0) {
+ sk_mem_reclaim(sk);
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+ !tcp_under_memory_pressure(sk))
+ break;
+ goal = sk->sk_rcvbuf >> 3;
+ }
node = prev;
} while (node);
tp->ooo_last_skb = rb_to_skb(prev);
@@ -5008,6 +5064,9 @@ static int tcp_prune_queue(struct sock *sk)
else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
+ return 0;
+
tcp_collapse_ofo_queue(sk);
if (!skb_queue_empty(&sk->sk_receive_queue))
tcp_collapse(sk, &sk->sk_receive_queue, NULL,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index dc415c66a33a..9e041fa5c545 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -157,11 +157,24 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
if (tcptw->tw_ts_recent_stamp &&
(!twp || (reuse && time_after32(ktime_get_seconds(),
tcptw->tw_ts_recent_stamp)))) {
- tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
- if (tp->write_seq == 0)
- tp->write_seq = 1;
- tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
- tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ /* In case of repair and re-using TIME-WAIT sockets we still
+ * want to be sure that it is safe as above but honor the
+ * sequence numbers and time stamps set as part of the repair
+ * process.
+ *
+ * Without this check re-using a TIME-WAIT socket with TCP
+ * repair would accumulate a -1 on the repair assigned
+ * sequence number. The first time it is reused the sequence
+ * is -1, the second time -2, etc. This fixes that issue
+ * without appearing to create any others.
+ */
+ if (likely(!tp->repair)) {
+ tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
+ if (tp->write_seq == 0)
+ tp->write_seq = 1;
+ tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
+ tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
+ }
sock_hold(sktw);
return 1;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index f5aee641f825..870b0a335061 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -262,6 +262,9 @@ found:
flush |= (len - 1) >= mss;
flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq);
+#ifdef CONFIG_TLS_DEVICE
+ flush |= p->decrypted ^ skb->decrypted;
+#endif
if (flush || skb_gro_receive(p, skb)) {
mss = 1;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index f8f6129160dd..597dbd749f05 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -160,7 +160,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
}
/* Account for an ACK we sent. */
-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
+ u32 rcv_nxt)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -171,6 +172,9 @@ static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
if (hrtimer_try_to_cancel(&tp->compressed_ack_timer) == 1)
__sock_put(sk);
}
+
+ if (unlikely(rcv_nxt != tp->rcv_nxt))
+ return; /* Special ACK sent by DCTCP to reflect ECN */
tcp_dec_quickack_mode(sk, pkts);
inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
}
@@ -1009,8 +1013,8 @@ static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
* We are working here with either a clone of the original
* SKB, or a fresh unique copy made by the retransmit engine.
*/
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
- gfp_t gfp_mask)
+static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
+ int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet;
@@ -1086,7 +1090,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
th->source = inet->inet_sport;
th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
- th->ack_seq = htonl(tp->rcv_nxt);
+ th->ack_seq = htonl(rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
tcb->tcp_flags);
@@ -1127,11 +1131,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
icsk->icsk_af_ops->send_check(sk, skb);
if (likely(tcb->tcp_flags & TCPHDR_ACK))
- tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+ tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
tp->data_segs_out += tcp_skb_pcount(skb);
+ tp->bytes_sent += skb->len - tcp_header_size;
tcp_internal_pacing(sk, skb);
}
@@ -1164,6 +1169,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
return err;
}
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+ gfp_t gfp_mask)
+{
+ return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
+ tcp_sk(sk)->rcv_nxt);
+}
+
/* This routine just queues the buffer for sending.
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
@@ -2686,9 +2698,8 @@ static bool tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = skb_rb_next(skb);
- int skb_size, next_skb_size;
+ int next_skb_size;
- skb_size = skb->len;
next_skb_size = next_skb->len;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
@@ -2859,6 +2870,7 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
tp->total_retrans += segs;
+ tp->bytes_retrans += skb->len;
/* make sure skb->data is aligned on arches that require it
* and check if ack-trimming & collapsing extended the headroom
@@ -3509,8 +3521,6 @@ void tcp_send_delayed_ack(struct sock *sk)
int ato = icsk->icsk_ack.ato;
unsigned long timeout;
- tcp_ca_event(sk, CA_EVENT_DELAYED_ACK);
-
if (ato > TCP_DELACK_MIN) {
const struct tcp_sock *tp = tcp_sk(sk);
int max_ato = HZ / 2;
@@ -3559,7 +3569,7 @@ void tcp_send_delayed_ack(struct sock *sk)
}
/* This routine sends an ack and also updates the window. */
-void tcp_send_ack(struct sock *sk)
+void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
{
struct sk_buff *buff;
@@ -3567,8 +3577,6 @@ void tcp_send_ack(struct sock *sk)
if (sk->sk_state == TCP_CLOSE)
return;
- tcp_ca_event(sk, CA_EVENT_NON_DELAYED_ACK);
-
/* We are not putting this on the write queue, so
* tcp_transmit_skb() will set the ownership to this
* sock.
@@ -3594,9 +3602,14 @@ void tcp_send_ack(struct sock *sk)
skb_set_tcp_pure_ack(buff);
/* Send it off, this clears delayed acks for us. */
- tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
+ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+}
+EXPORT_SYMBOL_GPL(__tcp_send_ack);
+
+void tcp_send_ack(struct sock *sk)
+{
+ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
}
-EXPORT_SYMBOL_GPL(tcp_send_ack);
/* This routine sends a packet with an out of date sequence
* number. It assumes the other end will try to ack it.
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index 71593e4400ab..c81aadff769b 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -25,7 +25,7 @@ static u32 tcp_rack_reo_wnd(const struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tp->rack.reord) {
+ if (!tp->reord_seen) {
/* If reordering has not been observed, be aggressive during
* the recovery or starting the recovery by DUPACK threshold.
*/
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 3b3611729928..7fdf222a0bdf 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,35 @@
#include <linux/gfp.h>
#include <net/tcp.h>
+static u32 tcp_retransmit_stamp(const struct sock *sk)
+{
+ u32 start_ts = tcp_sk(sk)->retrans_stamp;
+
+ if (unlikely(!start_ts)) {
+ struct sk_buff *head = tcp_rtx_queue_head(sk);
+
+ if (!head)
+ return 0;
+ start_ts = tcp_skb_timestamp(head);
+ }
+ return start_ts;
+}
+
+static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ u32 elapsed, start_ts;
+
+ start_ts = tcp_retransmit_stamp(sk);
+ if (!icsk->icsk_user_timeout || !start_ts)
+ return icsk->icsk_rto;
+ elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
+ if (elapsed >= icsk->icsk_user_timeout)
+ return 1; /* user timeout has passed; fire ASAP */
+ else
+ return min_t(u32, icsk->icsk_rto, msecs_to_jiffies(icsk->icsk_user_timeout - elapsed));
+}
+
/**
* tcp_write_err() - close socket and save error info
* @sk: The socket the error has appeared on.
@@ -166,14 +195,9 @@ static bool retransmits_timed_out(struct sock *sk,
if (!inet_csk(sk)->icsk_retransmits)
return false;
- start_ts = tcp_sk(sk)->retrans_stamp;
- if (unlikely(!start_ts)) {
- struct sk_buff *head = tcp_rtx_queue_head(sk);
-
- if (!head)
- return false;
- start_ts = tcp_skb_timestamp(head);
- }
+ start_ts = tcp_retransmit_stamp(sk);
+ if (!start_ts)
+ return false;
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
@@ -183,8 +207,9 @@ static bool retransmits_timed_out(struct sock *sk,
else
timeout = ((2 << linear_backoff_thresh) - 1) * rto_base +
(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+ timeout = jiffies_to_msecs(timeout);
}
- return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= jiffies_to_msecs(timeout);
+ return (tcp_time_stamp(tcp_sk(sk)) - start_ts) >= timeout;
}
/* A write timeout has occurred. Process the after effects. */
@@ -337,8 +362,7 @@ static void tcp_probe_timer(struct sock *sk)
if (!start_ts)
skb->skb_mstamp = tp->tcp_mstamp;
else if (icsk->icsk_user_timeout &&
- (s32)(tcp_time_stamp(tp) - start_ts) >
- jiffies_to_msecs(icsk->icsk_user_timeout))
+ (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
goto abort;
max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
@@ -535,7 +559,8 @@ out_reset_timer:
/* Use normal (exponential) backoff */
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
- inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
+ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+ tcp_clamp_rto_to_user_timeout(sk), TCP_RTO_MAX);
if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0))
__sk_dst_reset(sk);
@@ -672,7 +697,7 @@ static void tcp_keepalive_timer (struct timer_list *t)
* to determine when to timeout instead.
*/
if ((icsk->icsk_user_timeout != 0 &&
- elapsed >= icsk->icsk_user_timeout &&
+ elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 0eff75525da1..613282c65a10 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -15,7 +15,7 @@ menuconfig IPV6
Documentation/networking/ipv6.txt and read the HOWTO at
<http://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/>
- To compile this protocol support as a module, choose M here: the
+ To compile this protocol support as a module, choose M here: the
module will be called ipv6.
if IPV6
@@ -108,6 +108,7 @@ config IPV6_MIP6
config IPV6_ILA
tristate "IPv6: Identifier Locator Addressing (ILA)"
depends on NETFILTER
+ select DST_CACHE
select LWTUNNEL
---help---
Support for IPv6 Identifier Locator Addressing (ILA).
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 1659a6b3cf42..2fac4ad74867 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -2372,7 +2372,8 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
continue;
if ((rt->fib6_flags & noflags) != 0)
continue;
- fib6_info_hold(rt);
+ if (!fib6_info_hold_safe(rt))
+ continue;
break;
}
out:
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index c9535354149f..020f6e14a7af 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -322,8 +322,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
/* Reproduce AF_INET checks to make the bindings consistent */
v4addr = addr->sin6_addr.s6_addr32[3];
chk_addr_ret = inet_addr_type(net, v4addr);
- if (!net->ipv4.sysctl_ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
+ if (!inet_can_nonlocal_bind(net, inet) &&
v4addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
@@ -362,8 +361,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
*/
v4addr = LOOPBACK4_IPV6;
if (!(addr_type & IPV6_ADDR_MULTICAST)) {
- if (!net->ipv6.sysctl.ip_nonlocal_bind &&
- !(inet->freebind || inet->transparent) &&
+ if (!ipv6_can_nonlocal_bind(net, inet) &&
!ipv6_chk_addr(net, &addr->sin6_addr,
dev, 0)) {
err = -EADDRNOTAVAIL;
diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c
index 1323b9679cf7..1c0bb9fb76e6 100644
--- a/net/ipv6/calipso.c
+++ b/net/ipv6/calipso.c
@@ -799,8 +799,7 @@ static int calipso_opt_update(struct sock *sk, struct ipv6_opt_hdr *hop)
{
struct ipv6_txoptions *old = txopt_get(inet6_sk(sk)), *txopts;
- txopts = ipv6_renew_options_kern(sk, old, IPV6_HOPOPTS,
- hop, hop ? ipv6_optlen(hop) : 0);
+ txopts = ipv6_renew_options(sk, old, IPV6_HOPOPTS, hop);
txopt_put(old);
if (IS_ERR(txopts))
return PTR_ERR(txopts);
@@ -1222,8 +1221,7 @@ static int calipso_req_setattr(struct request_sock *req,
if (IS_ERR(new))
return PTR_ERR(new);
- txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
- new, new ? ipv6_optlen(new) : 0);
+ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
kfree(new);
@@ -1260,8 +1258,7 @@ static void calipso_req_delattr(struct request_sock *req)
if (calipso_opt_del(req_inet->ipv6_opt->hopopt, &new))
return; /* Nothing to do */
- txopts = ipv6_renew_options_kern(sk, req_inet->ipv6_opt, IPV6_HOPOPTS,
- new, new ? ipv6_optlen(new) : 0);
+ txopts = ipv6_renew_options(sk, req_inet->ipv6_opt, IPV6_HOPOPTS, new);
if (!IS_ERR(txopts)) {
txopts = xchg(&req_inet->ipv6_opt, txopts);
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 201306b9b5ea..1ede7a16a0be 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(struct sock *sk, struct msghdr *msg,
}
if (np->rxopt.bits.rxorigdstaddr) {
struct sockaddr_in6 sin6;
- __be16 *ports = (__be16 *) skb_transport_header(skb);
+ __be16 *ports;
+ int end;
- if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
+ end = skb_transport_offset(skb) + 4;
+ if (end <= 0 || pskb_may_pull(skb, end)) {
/* All current transport protocols have the port numbers in the
* first four bytes of the transport header and this function is
* written with this assumption in mind.
*/
+ ports = (__be16 *)skb_transport_header(skb);
sin6.sin6_family = AF_INET6;
sin6.sin6_addr = ipv6_hdr(skb)->daddr;
@@ -800,7 +803,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk,
if (addr_type != IPV6_ADDR_ANY) {
int strict = __ipv6_addr_src_scope(addr_type) <= IPV6_ADDR_SCOPE_LINKLOCAL;
- if (!(inet_sk(sk)->freebind || inet_sk(sk)->transparent) &&
+ if (!ipv6_can_nonlocal_bind(net, inet_sk(sk)) &&
!ipv6_chk_addr_and_flags(net, &src_info->ipi6_addr,
dev, !strict, 0,
IFA_F_TENTATIVE) &&
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 97513f35bcc5..88a7579c23bd 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -669,8 +669,10 @@ skip_cow:
sg_init_table(sg, nfrags);
ret = skb_to_sgvec(skb, sg, 0, skb->len);
- if (unlikely(ret < 0))
+ if (unlikely(ret < 0)) {
+ kfree(tmp);
goto out;
+ }
skb->ip_summed = CHECKSUM_NONE;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index ddfa533a84e5..6177e2171171 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -162,8 +162,7 @@ static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
skb->encap_hdr_csum = 1;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev))
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev)
esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK);
else if (!(features & NETIF_F_HW_ESP_TX_CSUM))
esp_features = features & ~NETIF_F_CSUM_MASK;
@@ -207,8 +206,7 @@ static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features
if (!xo)
return -EINVAL;
- if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle ||
- (x->xso.dev != skb->dev)) {
+ if (!(features & NETIF_F_HW_ESP) || x->xso.dev != skb->dev) {
xo->flags |= CRYPTO_FALLBACK;
hw_offload = false;
}
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 5bc2bf3733ab..20291c2036fc 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -1015,29 +1015,21 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
}
EXPORT_SYMBOL_GPL(ipv6_dup_options);
-static int ipv6_renew_option(void *ohdr,
- struct ipv6_opt_hdr __user *newopt, int newoptlen,
- int inherit,
- struct ipv6_opt_hdr **hdr,
- char **p)
+static void ipv6_renew_option(int renewtype,
+ struct ipv6_opt_hdr **dest,
+ struct ipv6_opt_hdr *old,
+ struct ipv6_opt_hdr *new,
+ int newtype, char **p)
{
- if (inherit) {
- if (ohdr) {
- memcpy(*p, ohdr, ipv6_optlen((struct ipv6_opt_hdr *)ohdr));
- *hdr = (struct ipv6_opt_hdr *)*p;
- *p += CMSG_ALIGN(ipv6_optlen(*hdr));
- }
- } else {
- if (newopt) {
- if (copy_from_user(*p, newopt, newoptlen))
- return -EFAULT;
- *hdr = (struct ipv6_opt_hdr *)*p;
- if (ipv6_optlen(*hdr) > newoptlen)
- return -EINVAL;
- *p += CMSG_ALIGN(newoptlen);
- }
- }
- return 0;
+ struct ipv6_opt_hdr *src;
+
+ src = (renewtype == newtype ? new : old);
+ if (!src)
+ return;
+
+ memcpy(*p, src, ipv6_optlen(src));
+ *dest = (struct ipv6_opt_hdr *)*p;
+ *p += CMSG_ALIGN(ipv6_optlen(*dest));
}
/**
@@ -1063,13 +1055,11 @@ static int ipv6_renew_option(void *ohdr,
*/
struct ipv6_txoptions *
ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
- int newtype,
- struct ipv6_opt_hdr __user *newopt, int newoptlen)
+ int newtype, struct ipv6_opt_hdr *newopt)
{
int tot_len = 0;
char *p;
struct ipv6_txoptions *opt2;
- int err;
if (opt) {
if (newtype != IPV6_HOPOPTS && opt->hopopt)
@@ -1082,8 +1072,8 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
tot_len += CMSG_ALIGN(ipv6_optlen(opt->dst1opt));
}
- if (newopt && newoptlen)
- tot_len += CMSG_ALIGN(newoptlen);
+ if (newopt)
+ tot_len += CMSG_ALIGN(ipv6_optlen(newopt));
if (!tot_len)
return NULL;
@@ -1098,29 +1088,19 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
opt2->tot_len = tot_len;
p = (char *)(opt2 + 1);
- err = ipv6_renew_option(opt ? opt->hopopt : NULL, newopt, newoptlen,
- newtype != IPV6_HOPOPTS,
- &opt2->hopopt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->dst0opt : NULL, newopt, newoptlen,
- newtype != IPV6_RTHDRDSTOPTS,
- &opt2->dst0opt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->srcrt : NULL, newopt, newoptlen,
- newtype != IPV6_RTHDR,
- (struct ipv6_opt_hdr **)&opt2->srcrt, &p);
- if (err)
- goto out;
-
- err = ipv6_renew_option(opt ? opt->dst1opt : NULL, newopt, newoptlen,
- newtype != IPV6_DSTOPTS,
- &opt2->dst1opt, &p);
- if (err)
- goto out;
+ ipv6_renew_option(IPV6_HOPOPTS, &opt2->hopopt,
+ (opt ? opt->hopopt : NULL),
+ newopt, newtype, &p);
+ ipv6_renew_option(IPV6_RTHDRDSTOPTS, &opt2->dst0opt,
+ (opt ? opt->dst0opt : NULL),
+ newopt, newtype, &p);
+ ipv6_renew_option(IPV6_RTHDR,
+ (struct ipv6_opt_hdr **)&opt2->srcrt,
+ (opt ? (struct ipv6_opt_hdr *)opt->srcrt : NULL),
+ newopt, newtype, &p);
+ ipv6_renew_option(IPV6_DSTOPTS, &opt2->dst1opt,
+ (opt ? opt->dst1opt : NULL),
+ newopt, newtype, &p);
opt2->opt_nflen = (opt2->hopopt ? ipv6_optlen(opt2->hopopt) : 0) +
(opt2->dst0opt ? ipv6_optlen(opt2->dst0opt) : 0) +
@@ -1128,37 +1108,6 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
opt2->opt_flen = (opt2->dst1opt ? ipv6_optlen(opt2->dst1opt) : 0);
return opt2;
-out:
- sock_kfree_s(sk, opt2, opt2->tot_len);
- return ERR_PTR(err);
-}
-
-/**
- * ipv6_renew_options_kern - replace a specific ext hdr with a new one.
- *
- * @sk: sock from which to allocate memory
- * @opt: original options
- * @newtype: option type to replace in @opt
- * @newopt: new option of type @newtype to replace (kernel-mem)
- * @newoptlen: length of @newopt
- *
- * See ipv6_renew_options(). The difference is that @newopt is
- * kernel memory, rather than user memory.
- */
-struct ipv6_txoptions *
-ipv6_renew_options_kern(struct sock *sk, struct ipv6_txoptions *opt,
- int newtype, struct ipv6_opt_hdr *newopt,
- int newoptlen)
-{
- struct ipv6_txoptions *ret_val;
- const mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret_val = ipv6_renew_options(sk, opt, newtype,
- (struct ipv6_opt_hdr __user *)newopt,
- newoptlen);
- set_fs(old_fs);
- return ret_val;
}
struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space,
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 24611c8b0562..7f6b1f81c200 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -92,7 +92,7 @@ static void icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct net *net = dev_net(skb->dev);
if (type == ICMPV6_PKT_TOOBIG)
- ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL));
else if (type == NDISC_REDIRECT)
ip6_redirect(skb, net, skb->dev->ifindex, 0,
sock_net_uid(net, NULL));
@@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buff *skb)
/* for local traffic to local address, skb dev is the loopback
* device. Check if there is a dst attached to the skb and if so
- * get the real device index.
+ * get the real device index. Same is needed for replies to a link
+ * local address on a device enslaved to an L3 master device
*/
- if (unlikely(iif == LOOPBACK_IFINDEX)) {
+ if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
const struct rt6_info *rt6 = skb_rt6_info(skb);
if (rt6)
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
index 579310466eac..95e9146918cc 100644
--- a/net/ipv6/ila/ila_common.c
+++ b/net/ipv6/ila/ila_common.c
@@ -153,4 +153,3 @@ void ila_update_ipv6_locator(struct sk_buff *skb, struct ila_params *p,
/* Now change destination address */
iaddr->loc = p->locator;
}
-
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
index 51a15ce50a64..17c455ff69ff 100644
--- a/net/ipv6/ila/ila_xlat.c
+++ b/net/ipv6/ila/ila_xlat.c
@@ -663,4 +663,3 @@ static int ila_xlat_addr(struct sk_buff *skb, bool sir2ila)
return 0;
}
-
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 1fb2f3118d60..d212738e9d10 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -935,20 +935,19 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
{
struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
lockdep_is_held(&rt->fib6_table->tb6_lock));
- enum fib_event_type event = FIB_EVENT_ENTRY_ADD;
- struct fib6_info *iter = NULL, *match = NULL;
+ struct fib6_info *iter = NULL;
struct fib6_info __rcu **ins;
+ struct fib6_info __rcu **fallback_ins = NULL;
int replace = (info->nlh &&
(info->nlh->nlmsg_flags & NLM_F_REPLACE));
- int append = (info->nlh &&
- (info->nlh->nlmsg_flags & NLM_F_APPEND));
int add = (!info->nlh ||
(info->nlh->nlmsg_flags & NLM_F_CREATE));
int found = 0;
+ bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
u16 nlflags = NLM_F_EXCL;
int err;
- if (append)
+ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
nlflags |= NLM_F_APPEND;
ins = &fn->leaf;
@@ -970,8 +969,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
nlflags &= ~NLM_F_EXCL;
if (replace) {
- found++;
- break;
+ if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
+ found++;
+ break;
+ }
+ if (rt_can_ecmp)
+ fallback_ins = fallback_ins ?: ins;
+ goto next_iter;
}
if (rt6_duplicate_nexthop(iter, rt)) {
@@ -986,51 +990,71 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu);
return -EEXIST;
}
-
- /* first route that matches */
- if (!match)
- match = iter;
+ /* If we have the same destination and the same metric,
+ * but not the same gateway, then the route we try to
+ * add is sibling to this route, increment our counter
+ * of siblings, and later we will add our route to the
+ * list.
+ * Only static routes (which don't have flag
+ * RTF_EXPIRES) are used for ECMPv6.
+ *
+ * To avoid long list, we only had siblings if the
+ * route have a gateway.
+ */
+ if (rt_can_ecmp &&
+ rt6_qualify_for_ecmp(iter))
+ rt->fib6_nsiblings++;
}
if (iter->fib6_metric > rt->fib6_metric)
break;
+next_iter:
ins = &iter->fib6_next;
}
+ if (fallback_ins && !found) {
+ /* No ECMP-able route found, replace first non-ECMP one */
+ ins = fallback_ins;
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ found++;
+ }
+
/* Reset round-robin state, if necessary */
if (ins == &fn->leaf)
fn->rr_ptr = NULL;
/* Link this route to others same route. */
- if (append && match) {
+ if (rt->fib6_nsiblings) {
+ unsigned int fib6_nsiblings;
struct fib6_info *sibling, *temp_sibling;
- if (rt->fib6_flags & RTF_REJECT) {
- NL_SET_ERR_MSG(extack,
- "Can not append a REJECT route");
- return -EINVAL;
- } else if (match->fib6_flags & RTF_REJECT) {
- NL_SET_ERR_MSG(extack,
- "Can not append to a REJECT route");
- return -EINVAL;
+ /* Find the first route that have the same metric */
+ sibling = leaf;
+ while (sibling) {
+ if (sibling->fib6_metric == rt->fib6_metric &&
+ rt6_qualify_for_ecmp(sibling)) {
+ list_add_tail(&rt->fib6_siblings,
+ &sibling->fib6_siblings);
+ break;
+ }
+ sibling = rcu_dereference_protected(sibling->fib6_next,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
- event = FIB_EVENT_ENTRY_APPEND;
- rt->fib6_nsiblings = match->fib6_nsiblings;
- list_add_tail(&rt->fib6_siblings, &match->fib6_siblings);
- match->fib6_nsiblings++;
-
/* For each sibling in the list, increment the counter of
* siblings. BUG() if counters does not match, list of siblings
* is broken!
*/
+ fib6_nsiblings = 0;
list_for_each_entry_safe(sibling, temp_sibling,
- &match->fib6_siblings, fib6_siblings) {
+ &rt->fib6_siblings, fib6_siblings) {
sibling->fib6_nsiblings++;
- BUG_ON(sibling->fib6_nsiblings != match->fib6_nsiblings);
+ BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
+ fib6_nsiblings++;
}
-
- rt6_multipath_rebalance(match);
+ BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
+ rt6_multipath_rebalance(temp_sibling);
}
/*
@@ -1043,8 +1067,9 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
add:
nlflags |= NLM_F_CREATE;
- err = call_fib6_entry_notifiers(info->nl_net, event, rt,
- extack);
+ err = call_fib6_entry_notifiers(info->nl_net,
+ FIB_EVENT_ENTRY_ADD,
+ rt, extack);
if (err)
return err;
@@ -1062,7 +1087,7 @@ add:
}
} else {
- struct fib6_info *tmp;
+ int nsiblings;
if (!found) {
if (add)
@@ -1077,57 +1102,48 @@ add:
if (err)
return err;
- /* if route being replaced has siblings, set tmp to
- * last one, otherwise tmp is current route. this is
- * used to set fib6_next for new route
- */
- if (iter->fib6_nsiblings)
- tmp = list_last_entry(&iter->fib6_siblings,
- struct fib6_info,
- fib6_siblings);
- else
- tmp = iter;
-
- /* insert new route */
atomic_inc(&rt->fib6_ref);
rcu_assign_pointer(rt->fib6_node, fn);
- rt->fib6_next = tmp->fib6_next;
+ rt->fib6_next = iter->fib6_next;
rcu_assign_pointer(*ins, rt);
-
if (!info->skip_notify)
inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
if (!(fn->fn_flags & RTN_RTINFO)) {
info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
fn->fn_flags |= RTN_RTINFO;
}
+ nsiblings = iter->fib6_nsiblings;
+ iter->fib6_node = NULL;
+ fib6_purge_rt(iter, fn, info->nl_net);
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
+ fn->rr_ptr = NULL;
+ fib6_info_release(iter);
- /* delete old route */
- rt = iter;
-
- if (rt->fib6_nsiblings) {
- struct fib6_info *tmp;
-
+ if (nsiblings) {
/* Replacing an ECMP route, remove all siblings */
- list_for_each_entry_safe(iter, tmp, &rt->fib6_siblings,
- fib6_siblings) {
- iter->fib6_node = NULL;
- fib6_purge_rt(iter, fn, info->nl_net);
- if (rcu_access_pointer(fn->rr_ptr) == iter)
- fn->rr_ptr = NULL;
- fib6_info_release(iter);
-
- rt->fib6_nsiblings--;
- info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
+ ins = &rt->fib6_next;
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
+ while (iter) {
+ if (iter->fib6_metric > rt->fib6_metric)
+ break;
+ if (rt6_qualify_for_ecmp(iter)) {
+ *ins = iter->fib6_next;
+ iter->fib6_node = NULL;
+ fib6_purge_rt(iter, fn, info->nl_net);
+ if (rcu_access_pointer(fn->rr_ptr) == iter)
+ fn->rr_ptr = NULL;
+ fib6_info_release(iter);
+ nsiblings--;
+ info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
+ } else {
+ ins = &iter->fib6_next;
+ }
+ iter = rcu_dereference_protected(*ins,
+ lockdep_is_held(&rt->fib6_table->tb6_lock));
}
+ WARN_ON(nsiblings != 0);
}
-
- WARN_ON(rt->fib6_nsiblings != 0);
-
- rt->fib6_node = NULL;
- fib6_purge_rt(rt, fn, info->nl_net);
- if (rcu_access_pointer(fn->rr_ptr) == rt)
- fn->rr_ptr = NULL;
- fib6_info_release(rt);
}
return 0;
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 367177786e34..fc7dd3a04360 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -927,7 +927,6 @@ tx_err:
static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
struct net_device *dev)
{
- struct ipv6hdr *ipv6h = ipv6_hdr(skb);
struct ip6_tnl *t = netdev_priv(dev);
struct dst_entry *dst = skb_dst(skb);
struct net_device_stats *stats;
@@ -1012,6 +1011,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
goto tx_err;
}
} else {
+ struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
switch (skb->protocol) {
case htons(ETH_P_IP):
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 8047fd41ba88..16f200f06500 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -570,6 +570,8 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
to->dev = from->dev;
to->mark = from->mark;
+ skb_copy_hash(to, from);
+
#ifdef CONFIG_NET_SCHED
to->tc_index = from->tc_index;
#endif
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index b7f28deddaea..c72ae3a4fe09 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -480,10 +480,6 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
goto tx_err_dst_release;
}
- skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
- skb_dst_set(skb, dst);
- skb->dev = skb_dst(skb)->dev;
-
mtu = dst_mtu(dst);
if (!skb->ignore_df && skb->len > mtu) {
skb_dst_update_pmtu(skb, mtu);
@@ -498,9 +494,14 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
htonl(mtu));
}
- return -EMSGSIZE;
+ err = -EMSGSIZE;
+ goto tx_err_dst_release;
}
+ skb_scrub_packet(skb, !net_eq(t->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = skb_dst(skb)->dev;
+
err = dst_output(t->net, skb->sk, skb);
if (net_xmit_eval(err) == 0) {
struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index fabe3ba1bddc..c0cac9cc3a28 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -398,6 +398,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
case IPV6_DSTOPTS:
{
struct ipv6_txoptions *opt;
+ struct ipv6_opt_hdr *new = NULL;
+
+ /* hop-by-hop / destination options are privileged option */
+ retv = -EPERM;
+ if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
+ break;
/* remove any sticky options header with a zero option
* length, per RFC3542.
@@ -409,17 +415,22 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
else if (optlen < sizeof(struct ipv6_opt_hdr) ||
optlen & 0x7 || optlen > 8 * 255)
goto e_inval;
-
- /* hop-by-hop / destination options are privileged option */
- retv = -EPERM;
- if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
- break;
+ else {
+ new = memdup_user(optval, optlen);
+ if (IS_ERR(new)) {
+ retv = PTR_ERR(new);
+ break;
+ }
+ if (unlikely(ipv6_optlen(new) > optlen)) {
+ kfree(new);
+ goto e_inval;
+ }
+ }
opt = rcu_dereference_protected(np->opt,
lockdep_sock_is_held(sk));
- opt = ipv6_renew_options(sk, opt, optname,
- (struct ipv6_opt_hdr __user *)optval,
- optlen);
+ opt = ipv6_renew_options(sk, opt, optname, new);
+ kfree(new);
if (IS_ERR(opt)) {
retv = PTR_ERR(opt);
break;
@@ -717,8 +728,9 @@ done:
struct sockaddr_in6 *psin6;
psin6 = (struct sockaddr_in6 *)&greqs.gsr_group;
- retv = ipv6_sock_mc_join(sk, greqs.gsr_interface,
- &psin6->sin6_addr);
+ retv = ipv6_sock_mc_join_ssm(sk, greqs.gsr_interface,
+ &psin6->sin6_addr,
+ MCAST_INCLUDE);
/* prior join w/ different source is ok */
if (retv && retv != -EADDRINUSE)
break;
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index c0c74088f2af..4ae54aaca373 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -95,6 +95,8 @@ static int ip6_mc_add_src(struct inet6_dev *idev, const struct in6_addr *pmca,
int delta);
static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml,
struct inet6_dev *idev);
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+ const struct in6_addr *addr, unsigned int mode);
#define MLD_QRV_DEFAULT 2
/* RFC3810, 9.2. Query Interval */
@@ -132,7 +134,8 @@ static int unsolicited_report_interval(struct inet6_dev *idev)
return iv > 0 ? iv : 1;
}
-int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
+ const struct in6_addr *addr, unsigned int mode)
{
struct net_device *dev = NULL;
struct ipv6_mc_socklist *mc_lst;
@@ -179,7 +182,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
}
mc_lst->ifindex = dev->ifindex;
- mc_lst->sfmode = MCAST_EXCLUDE;
+ mc_lst->sfmode = mode;
rwlock_init(&mc_lst->sflock);
mc_lst->sflist = NULL;
@@ -187,7 +190,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
* now add/increase the group membership on the device
*/
- err = ipv6_dev_mc_inc(dev, addr);
+ err = __ipv6_dev_mc_inc(dev, addr, mode);
if (err) {
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
@@ -199,8 +202,19 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
return 0;
}
+
+int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
+{
+ return __ipv6_sock_mc_join(sk, ifindex, addr, MCAST_EXCLUDE);
+}
EXPORT_SYMBOL(ipv6_sock_mc_join);
+int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex,
+ const struct in6_addr *addr, unsigned int mode)
+{
+ return __ipv6_sock_mc_join(sk, ifindex, addr, mode);
+}
+
/*
* socket leave on multicast group
*/
@@ -672,7 +686,13 @@ static void igmp6_group_added(struct ifmcaddr6 *mc)
}
/* else v2 */
- mc->mca_crcount = mc->idev->mc_qrv;
+ /* Based on RFC3810 6.1, for newly added INCLUDE SSM, we
+ * should not send filter-mode change record as the mode
+ * should be from IN() to IN(A).
+ */
+ if (mc->mca_sfmode == MCAST_EXCLUDE)
+ mc->mca_crcount = mc->idev->mc_qrv;
+
mld_ifc_event(mc->idev);
}
@@ -770,13 +790,13 @@ static void mld_del_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im)
spin_lock_bh(&im->mca_lock);
if (pmc) {
im->idev = pmc->idev;
- im->mca_crcount = idev->mc_qrv;
- im->mca_sfmode = pmc->mca_sfmode;
- if (pmc->mca_sfmode == MCAST_INCLUDE) {
+ if (im->mca_sfmode == MCAST_INCLUDE) {
im->mca_tomb = pmc->mca_tomb;
im->mca_sources = pmc->mca_sources;
for (psf = im->mca_sources; psf; psf = psf->sf_next)
- psf->sf_crcount = im->mca_crcount;
+ psf->sf_crcount = idev->mc_qrv;
+ } else {
+ im->mca_crcount = idev->mc_qrv;
}
in6_dev_put(pmc->idev);
kfree(pmc);
@@ -831,7 +851,8 @@ static void ma_put(struct ifmcaddr6 *mc)
}
static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
- const struct in6_addr *addr)
+ const struct in6_addr *addr,
+ unsigned int mode)
{
struct ifmcaddr6 *mc;
@@ -849,9 +870,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
refcount_set(&mc->mca_refcnt, 1);
spin_lock_init(&mc->mca_lock);
- /* initial mode is (EX, empty) */
- mc->mca_sfmode = MCAST_EXCLUDE;
- mc->mca_sfcount[MCAST_EXCLUDE] = 1;
+ mc->mca_sfmode = mode;
+ mc->mca_sfcount[mode] = 1;
if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) ||
IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL)
@@ -863,7 +883,8 @@ static struct ifmcaddr6 *mca_alloc(struct inet6_dev *idev,
/*
* device multicast group inc (add if not found)
*/
-int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+static int __ipv6_dev_mc_inc(struct net_device *dev,
+ const struct in6_addr *addr, unsigned int mode)
{
struct ifmcaddr6 *mc;
struct inet6_dev *idev;
@@ -887,14 +908,13 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
if (ipv6_addr_equal(&mc->mca_addr, addr)) {
mc->mca_users++;
write_unlock_bh(&idev->lock);
- ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0,
- NULL, 0);
+ ip6_mc_add_src(idev, &mc->mca_addr, mode, 0, NULL, 0);
in6_dev_put(idev);
return 0;
}
}
- mc = mca_alloc(idev, addr);
+ mc = mca_alloc(idev, addr, mode);
if (!mc) {
write_unlock_bh(&idev->lock);
in6_dev_put(idev);
@@ -916,6 +936,11 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
return 0;
}
+int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr)
+{
+ return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE);
+}
+
/*
* device multicast group del
*/
@@ -1751,7 +1776,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
psf_next = psf->sf_next;
- if (!is_in(pmc, psf, type, gdeleted, sdeleted)) {
+ if (!is_in(pmc, psf, type, gdeleted, sdeleted) && !crsend) {
psf_prev = psf;
continue;
}
@@ -2066,7 +2091,7 @@ static void mld_send_initial_cr(struct inet6_dev *idev)
if (pmc->mca_sfcount[MCAST_EXCLUDE])
type = MLD2_CHANGE_TO_EXCLUDE;
else
- type = MLD2_CHANGE_TO_INCLUDE;
+ type = MLD2_ALLOW_NEW_SOURCES;
skb = add_grec(skb, pmc, type, 0, 0, 1);
spin_unlock_bh(&pmc->mca_lock);
}
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index e640d2f3c55c..0ec273997d1d 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -811,7 +811,7 @@ static void ndisc_recv_ns(struct sk_buff *skb)
return;
}
}
- if (ndopts.nd_opts_nonce)
+ if (ndopts.nd_opts_nonce && ndopts.nd_opts_nonce->nd_opt_len == 1)
memcpy(&nonce, (u8 *)(ndopts.nd_opts_nonce + 1), 6);
inc = ipv6_addr_is_multicast(daddr);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 531d6957af36..5ae8e1c51079 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -15,7 +15,6 @@
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/xfrm.h>
-#include <net/ip6_checksum.h>
#include <net/netfilter/nf_queue.h>
int ip6_route_me_harder(struct net *net, struct sk_buff *skb)
@@ -106,71 +105,10 @@ static int nf_ip6_route(struct net *net, struct dst_entry **dst,
return err;
}
-__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol)
-{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
- break;
- if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- skb->len - dataoff, protocol,
- csum_sub(skb->csum,
- skb_checksum(skb, 0,
- dataoff, 0)))) {
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- break;
- }
- /* fall through */
- case CHECKSUM_NONE:
- skb->csum = ~csum_unfold(
- csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
- skb->len - dataoff,
- protocol,
- csum_sub(0,
- skb_checksum(skb, 0,
- dataoff, 0))));
- csum = __skb_checksum_complete(skb);
- }
- return csum;
-}
-EXPORT_SYMBOL(nf_ip6_checksum);
-
-static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, unsigned int len,
- u_int8_t protocol)
-{
- const struct ipv6hdr *ip6h = ipv6_hdr(skb);
- __wsum hsum;
- __sum16 csum = 0;
-
- switch (skb->ip_summed) {
- case CHECKSUM_COMPLETE:
- if (len == skb->len - dataoff)
- return nf_ip6_checksum(skb, hook, dataoff, protocol);
- /* fall through */
- case CHECKSUM_NONE:
- hsum = skb_checksum(skb, 0, dataoff, 0);
- skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
- &ip6h->daddr,
- skb->len - dataoff,
- protocol,
- csum_sub(0, hsum)));
- skb->ip_summed = CHECKSUM_NONE;
- return __skb_checksum_complete_head(skb, dataoff + len);
- }
- return csum;
-};
-
static const struct nf_ipv6_ops ipv6ops = {
.chk_addr = ipv6_chk_addr,
.route_input = ip6_route_input,
.fragment = ip6_fragment,
- .checksum = nf_ip6_checksum,
- .checksum_partial = nf_ip6_checksum_partial,
.route = nf_ip6_route,
.reroute = nf_ip6_reroute,
};
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 37b14dc9d863..339d0762b027 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -5,26 +5,6 @@
menu "IPv6: Netfilter Configuration"
depends on INET && IPV6 && NETFILTER
-config NF_DEFRAG_IPV6
- tristate
- default n
-
-config NF_CONNTRACK_IPV6
- tristate "IPv6 connection tracking support"
- depends on INET && IPV6 && NF_CONNTRACK
- default m if NETFILTER_ADVANCED=n
- select NF_DEFRAG_IPV6
- ---help---
- Connection tracking keeps a record of what packets have passed
- through your machine, in order to figure out how they are related
- into connections.
-
- This is IPv6 support on Layer 3 independent connection tracking.
- Layer 3 independent connection tracking is experimental scheme
- which generalize ip_conntrack to support other layer 3 protocols.
-
- To compile it as a module, choose M here. If unsure, say N.
-
config NF_SOCKET_IPV6
tristate "IPv6 socket lookup support"
help
@@ -128,7 +108,7 @@ config NF_LOG_IPV6
config NF_NAT_IPV6
tristate "IPv6 NAT"
- depends on NF_CONNTRACK_IPV6
+ depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_NAT
help
@@ -328,7 +308,7 @@ config IP6_NF_SECURITY
config IP6_NF_NAT
tristate "ip6tables NAT support"
- depends on NF_CONNTRACK_IPV6
+ depends on NF_CONNTRACK
depends on NETFILTER_ADVANCED
select NF_NAT
select NF_NAT_IPV6
@@ -365,6 +345,7 @@ config IP6_NF_TARGET_NPT
endif # IP6_NF_NAT
endif # IP6_NF_IPTABLES
-
endmenu
+config NF_DEFRAG_IPV6
+ tristate
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 10a5a1c87320..200c0c235565 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -11,12 +11,6 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o
obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o
obj-$(CONFIG_IP6_NF_NAT) += ip6table_nat.o
-# objects for l3 independent conntrack
-nf_conntrack_ipv6-y := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
-
-# l3 independent conntrack
-obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
-
nf_nat_ipv6-y := nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
nf_nat_ipv6-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 7eab959734bc..daf2e9e9193d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1909,6 +1909,7 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = {
.checkentry = icmp6_checkentry,
.proto = IPPROTO_ICMPV6,
.family = NFPROTO_IPV6,
+ .me = THIS_MODULE,
},
};
diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
deleted file mode 100644
index 663827ee3cf8..000000000000
--- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/*
- * Copyright (C)2004 USAGI/WIDE Project
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- */
-
-#include <linux/types.h>
-#include <linux/ipv6.h>
-#include <linux/in6.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <net/ipv6.h>
-#include <net/inet_frag.h>
-
-#include <linux/netfilter_bridge.h>
-#include <linux/netfilter_ipv6.h>
-#include <linux/netfilter_ipv6/ip6_tables.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_helper.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_zones.h>
-#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
-#include <net/netfilter/nf_nat_helper.h>
-#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
-#include <net/netfilter/nf_log.h>
-
-static int conntrack6_net_id;
-static DEFINE_MUTEX(register_ipv6_hooks);
-
-struct conntrack6_net {
- unsigned int users;
-};
-
-static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- const u_int32_t *ap;
- u_int32_t _addrs[8];
-
- ap = skb_header_pointer(skb, nhoff + offsetof(struct ipv6hdr, saddr),
- sizeof(_addrs), _addrs);
- if (ap == NULL)
- return false;
-
- memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
- memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
-
- return true;
-}
-
-static bool ipv6_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- memcpy(tuple->src.u3.ip6, orig->dst.u3.ip6, sizeof(tuple->src.u3.ip6));
- memcpy(tuple->dst.u3.ip6, orig->src.u3.ip6, sizeof(tuple->dst.u3.ip6));
-
- return true;
-}
-
-static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
- unsigned int *dataoff, u_int8_t *protonum)
-{
- unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
- __be16 frag_off;
- int protoff;
- u8 nexthdr;
-
- if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
- &nexthdr, sizeof(nexthdr)) != 0) {
- pr_debug("ip6_conntrack_core: can't get nexthdr\n");
- return -NF_ACCEPT;
- }
- protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
- /*
- * (protoff == skb->len) means the packet has not data, just
- * IPv6 and possibly extensions headers, but it is tracked anyway
- */
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("ip6_conntrack_core: can't find proto in pkt\n");
- return -NF_ACCEPT;
- }
-
- *dataoff = protoff;
- *protonum = nexthdr;
- return NF_ACCEPT;
-}
-
-static unsigned int ipv6_helper(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- const struct nf_conn_help *help;
- const struct nf_conntrack_helper *helper;
- enum ip_conntrack_info ctinfo;
- __be16 frag_off;
- int protoff;
- u8 nexthdr;
-
- /* This is where we call the helper: as the packet goes out. */
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
- /* rcu_read_lock()ed by nf_hook_thresh */
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- nexthdr = ipv6_hdr(skb)->nexthdr;
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return NF_ACCEPT;
- }
-
- return helper->help(skb, protoff, ct, ctinfo);
-}
-
-static unsigned int ipv6_confirm(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- struct nf_conn *ct;
- enum ip_conntrack_info ctinfo;
- unsigned char pnum = ipv6_hdr(skb)->nexthdr;
- int protoff;
- __be16 frag_off;
-
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- goto out;
-
- protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
- &frag_off);
- if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- goto out;
- }
-
- /* adjust seqs for loopback traffic only in outgoing direction */
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_is_loopback_packet(skb)) {
- if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
- NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
- return NF_DROP;
- }
- }
-out:
- /* We've seen it coming out the other side: confirm it */
- return nf_conntrack_confirm(skb);
-}
-
-static unsigned int ipv6_conntrack_in(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
-}
-
-static unsigned int ipv6_conntrack_local(void *priv,
- struct sk_buff *skb,
- const struct nf_hook_state *state)
-{
- return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
-}
-
-static const struct nf_hook_ops ipv6_conntrack_ops[] = {
- {
- .hook = ipv6_conntrack_in,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_PRE_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK,
- },
- {
- .hook = ipv6_conntrack_local,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_OUT,
- .priority = NF_IP6_PRI_CONNTRACK,
- },
- {
- .hook = ipv6_helper,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv6_confirm,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_POST_ROUTING,
- .priority = NF_IP6_PRI_LAST,
- },
- {
- .hook = ipv6_helper,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP6_PRI_CONNTRACK_HELPER,
- },
- {
- .hook = ipv6_confirm,
- .pf = NFPROTO_IPV6,
- .hooknum = NF_INET_LOCAL_IN,
- .priority = NF_IP6_PRI_LAST-1,
- },
-};
-
-static int
-ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
-{
- struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
- const struct ipv6_pinfo *inet6 = inet6_sk(sk);
- const struct inet_sock *inet = inet_sk(sk);
- const struct nf_conntrack_tuple_hash *h;
- struct sockaddr_in6 sin6;
- struct nf_conn *ct;
- __be32 flow_label;
- int bound_dev_if;
-
- lock_sock(sk);
- tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
- tuple.src.u.tcp.port = inet->inet_sport;
- tuple.dst.u3.in6 = sk->sk_v6_daddr;
- tuple.dst.u.tcp.port = inet->inet_dport;
- tuple.dst.protonum = sk->sk_protocol;
- bound_dev_if = sk->sk_bound_dev_if;
- flow_label = inet6->flow_label;
- release_sock(sk);
-
- if (tuple.dst.protonum != IPPROTO_TCP &&
- tuple.dst.protonum != IPPROTO_SCTP)
- return -ENOPROTOOPT;
-
- if (*len < 0 || (unsigned int) *len < sizeof(sin6))
- return -EINVAL;
-
- h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
- if (!h) {
- pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
- &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
- &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
- return -ENOENT;
- }
-
- ct = nf_ct_tuplehash_to_ctrack(h);
-
- sin6.sin6_family = AF_INET6;
- sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
- sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
- memcpy(&sin6.sin6_addr,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
- sizeof(sin6.sin6_addr));
-
- nf_ct_put(ct);
- sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
- return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
-}
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-
-static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple)
-{
- if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
- nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
- goto nla_put_failure;
- return 0;
-
-nla_put_failure:
- return -1;
-}
-
-static const struct nla_policy ipv6_nla_policy[CTA_IP_MAX+1] = {
- [CTA_IP_V6_SRC] = { .len = sizeof(u_int32_t)*4 },
- [CTA_IP_V6_DST] = { .len = sizeof(u_int32_t)*4 },
-};
-
-static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
- struct nf_conntrack_tuple *t)
-{
- if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
- return -EINVAL;
-
- t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
- t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
-
- return 0;
-}
-#endif
-
-static int ipv6_hooks_register(struct net *net)
-{
- struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
- int err = 0;
-
- mutex_lock(&register_ipv6_hooks);
- cnet->users++;
- if (cnet->users > 1)
- goto out_unlock;
-
- err = nf_defrag_ipv6_enable(net);
- if (err < 0) {
- cnet->users = 0;
- goto out_unlock;
- }
-
- err = nf_register_net_hooks(net, ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- if (err)
- cnet->users = 0;
- out_unlock:
- mutex_unlock(&register_ipv6_hooks);
- return err;
-}
-
-static void ipv6_hooks_unregister(struct net *net)
-{
- struct conntrack6_net *cnet = net_generic(net, conntrack6_net_id);
-
- mutex_lock(&register_ipv6_hooks);
- if (cnet->users && (--cnet->users == 0))
- nf_unregister_net_hooks(net, ipv6_conntrack_ops,
- ARRAY_SIZE(ipv6_conntrack_ops));
- mutex_unlock(&register_ipv6_hooks);
-}
-
-const struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 = {
- .l3proto = PF_INET6,
- .pkt_to_tuple = ipv6_pkt_to_tuple,
- .invert_tuple = ipv6_invert_tuple,
- .get_l4proto = ipv6_get_l4proto,
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- .tuple_to_nlattr = ipv6_tuple_to_nlattr,
- .nlattr_to_tuple = ipv6_nlattr_to_tuple,
- .nla_policy = ipv6_nla_policy,
- .nla_size = NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])) +
- NLA_ALIGN(NLA_HDRLEN + sizeof(u32[4])),
-#endif
- .net_ns_get = ipv6_hooks_register,
- .net_ns_put = ipv6_hooks_unregister,
- .me = THIS_MODULE,
-};
-
-MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Yasuyuki KOZAKAI @USAGI <yasuyuki.kozakai@toshiba.co.jp>");
-
-static struct nf_sockopt_ops so_getorigdst6 = {
- .pf = NFPROTO_IPV6,
- .get_optmin = IP6T_SO_ORIGINAL_DST,
- .get_optmax = IP6T_SO_ORIGINAL_DST + 1,
- .get = ipv6_getorigdst,
- .owner = THIS_MODULE,
-};
-
-static const struct nf_conntrack_l4proto * const builtin_l4proto6[] = {
- &nf_conntrack_l4proto_tcp6,
- &nf_conntrack_l4proto_udp6,
- &nf_conntrack_l4proto_icmpv6,
-#ifdef CONFIG_NF_CT_PROTO_DCCP
- &nf_conntrack_l4proto_dccp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_SCTP
- &nf_conntrack_l4proto_sctp6,
-#endif
-#ifdef CONFIG_NF_CT_PROTO_UDPLITE
- &nf_conntrack_l4proto_udplite6,
-#endif
-};
-
-static int ipv6_net_init(struct net *net)
-{
- return nf_ct_l4proto_pernet_register(net, builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
-}
-
-static void ipv6_net_exit(struct net *net)
-{
- nf_ct_l4proto_pernet_unregister(net, builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
-}
-
-static struct pernet_operations ipv6_net_ops = {
- .init = ipv6_net_init,
- .exit = ipv6_net_exit,
- .id = &conntrack6_net_id,
- .size = sizeof(struct conntrack6_net),
-};
-
-static int __init nf_conntrack_l3proto_ipv6_init(void)
-{
- int ret = 0;
-
- need_conntrack();
-
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- if (WARN_ON(nla_policy_len(ipv6_nla_policy, CTA_IP_MAX + 1) !=
- nf_conntrack_l3proto_ipv6.nla_size))
- return -EINVAL;
-#endif
-
- ret = nf_register_sockopt(&so_getorigdst6);
- if (ret < 0) {
- pr_err("Unable to register netfilter socket option\n");
- return ret;
- }
-
- ret = register_pernet_subsys(&ipv6_net_ops);
- if (ret < 0)
- goto cleanup_sockopt;
-
- ret = nf_ct_l4proto_register(builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
- if (ret < 0)
- goto cleanup_pernet;
-
- ret = nf_ct_l3proto_register(&nf_conntrack_l3proto_ipv6);
- if (ret < 0) {
- pr_err("nf_conntrack_ipv6: can't register ipv6 proto.\n");
- goto cleanup_l4proto;
- }
- return ret;
-cleanup_l4proto:
- nf_ct_l4proto_unregister(builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
- cleanup_pernet:
- unregister_pernet_subsys(&ipv6_net_ops);
- cleanup_sockopt:
- nf_unregister_sockopt(&so_getorigdst6);
- return ret;
-}
-
-static void __exit nf_conntrack_l3proto_ipv6_fini(void)
-{
- synchronize_net();
- nf_ct_l3proto_unregister(&nf_conntrack_l3proto_ipv6);
- nf_ct_l4proto_unregister(builtin_l4proto6,
- ARRAY_SIZE(builtin_l4proto6));
- unregister_pernet_subsys(&ipv6_net_ops);
- nf_unregister_sockopt(&so_getorigdst6);
-}
-
-module_init(nf_conntrack_l3proto_ipv6_init);
-module_exit(nf_conntrack_l3proto_ipv6_fini);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index a452d99c9f52..2a14d8b65924 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -33,9 +33,8 @@
#include <net/sock.h>
#include <net/snmp.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
-#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/rawv6.h>
@@ -151,7 +150,7 @@ static void nf_ct_frag6_expire(struct timer_list *t)
fq = container_of(frag, struct frag_queue, q);
net = container_of(fq->q.net, struct net, nf_frag.frags);
- ip6_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(net, fq);
}
/* Creation primitives. */
@@ -464,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
head->csum);
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return true;
@@ -558,6 +558,10 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
hdr = ipv6_hdr(skb);
fhdr = (struct frag_hdr *)skb_transport_header(skb);
+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+ fhdr->frag_off & htons(IP6_MF))
+ return -EINVAL;
+
skb_orphan(skb);
fq = fq_find(net, fhdr->identification, user, hdr,
skb->dev ? skb->dev->ifindex : 0);
@@ -585,6 +589,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
fq->q.meat == fq->q.len &&
nf_ct_frag6_reasm(fq, skb, dev))
ret = 0;
+ else
+ skb_dst_drop(skb);
out_unlock:
spin_unlock_bh(&fq->q.lock);
@@ -622,16 +628,24 @@ static struct pernet_operations nf_ct_net_ops = {
.exit = nf_ct_net_exit,
};
+static const struct rhashtable_params nfct_rhash_params = {
+ .head_offset = offsetof(struct inet_frag_queue, node),
+ .hashfn = ip6frag_key_hashfn,
+ .obj_hashfn = ip6frag_obj_hashfn,
+ .obj_cmpfn = ip6frag_obj_cmpfn,
+ .automatic_shrinking = true,
+};
+
int nf_ct_frag6_init(void)
{
int ret = 0;
- nf_frags.constructor = ip6_frag_init;
+ nf_frags.constructor = ip6frag_init;
nf_frags.destructor = NULL;
nf_frags.qsize = sizeof(struct frag_queue);
nf_frags.frag_expire = nf_ct_frag6_expire;
nf_frags.frags_cache_name = nf_frags_cache_name;
- nf_frags.rhash_params = ip6_rhash_params;
+ nf_frags.rhash_params = nfct_rhash_params;
ret = inet_frags_init(&nf_frags);
if (ret)
goto out;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index c87b48359e8f..72dd3e202375 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -14,8 +14,7 @@
#include <linux/skbuff.h>
#include <linux/icmp.h>
#include <linux/sysctl.h>
-#include <net/ipv6.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
@@ -23,7 +22,6 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
#endif
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index bf1d6c421e3b..5dfd33af6451 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -55,7 +55,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, hp, tproto,
+ sk2 = nf_tproxy_get_sock_v6(net, skb, thoff, tproto,
&iph->saddr,
nf_tproxy_laddr6(skb, laddr, &iph->daddr),
hp->source,
@@ -72,7 +72,7 @@ nf_tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
EXPORT_SYMBOL_GPL(nf_tproxy_handle_time_wait6);
struct sock *
-nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
const u8 protocol,
const struct in6_addr *saddr, const struct in6_addr *daddr,
const __be16 sport, const __be16 dport,
@@ -80,15 +80,20 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
const enum nf_tproxy_lookup_t lookup_type)
{
struct sock *sk;
- struct tcphdr *tcph;
switch (protocol) {
- case IPPROTO_TCP:
+ case IPPROTO_TCP: {
+ struct tcphdr _hdr, *hp;
+
+ hp = skb_header_pointer(skb, thoff,
+ sizeof(struct tcphdr), &_hdr);
+ if (hp == NULL)
+ return NULL;
+
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- tcph = hp;
sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
- thoff + __tcp_hdrlen(tcph),
+ thoff + __tcp_hdrlen(hp),
saddr, sport,
daddr, ntohs(dport),
in->ifindex, 0);
@@ -110,6 +115,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
BUG();
}
break;
+ }
case IPPROTO_UDP:
sk = udp6_lib_lookup(net, saddr, sport, daddr, dport,
in->ifindex);
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index b939b94e7e91..5c5b4f79296e 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -57,7 +57,7 @@
#include <net/rawv6.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
-#include <net/inet_frag.h>
+#include <net/ipv6_frag.h>
#include <net/inet_ecn.h>
static const char ip6_frag_cache_name[] = "ip6-frags";
@@ -72,61 +72,6 @@ static struct inet_frags ip6_frags;
static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
struct net_device *dev);
-void ip6_frag_init(struct inet_frag_queue *q, const void *a)
-{
- struct frag_queue *fq = container_of(q, struct frag_queue, q);
- const struct frag_v6_compare_key *key = a;
-
- q->key.v6 = *key;
- fq->ecn = 0;
-}
-EXPORT_SYMBOL(ip6_frag_init);
-
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
-{
- struct net_device *dev = NULL;
- struct sk_buff *head;
-
- rcu_read_lock();
- spin_lock(&fq->q.lock);
-
- if (fq->q.flags & INET_FRAG_COMPLETE)
- goto out;
-
- inet_frag_kill(&fq->q);
-
- dev = dev_get_by_index_rcu(net, fq->iif);
- if (!dev)
- goto out;
-
- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
- __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
-
- /* Don't send error if the first segment did not arrive. */
- head = fq->q.fragments;
- if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
- goto out;
-
- /* But use as source device on which LAST ARRIVED
- * segment was received. And do not use fq->dev
- * pointer directly, device might already disappeared.
- */
- head->dev = dev;
- skb_get(head);
- spin_unlock(&fq->q.lock);
-
- icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0);
- kfree_skb(head);
- goto out_rcu_unlock;
-
-out:
- spin_unlock(&fq->q.lock);
-out_rcu_unlock:
- rcu_read_unlock();
- inet_frag_put(&fq->q);
-}
-EXPORT_SYMBOL(ip6_expire_frag_queue);
-
static void ip6_frag_expire(struct timer_list *t)
{
struct inet_frag_queue *frag = from_timer(frag, t, timer);
@@ -136,7 +81,7 @@ static void ip6_frag_expire(struct timer_list *t)
fq = container_of(frag, struct frag_queue, q);
net = container_of(fq->q.net, struct net, ipv6.frags);
- ip6_expire_frag_queue(net, fq);
+ ip6frag_expire_frag_queue(net, fq);
}
static struct frag_queue *
@@ -460,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
rcu_read_unlock();
fq->q.fragments = NULL;
+ fq->q.rb_fragments = RB_ROOT;
fq->q.fragments_tail = NULL;
return 1;
@@ -510,6 +456,10 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
return 1;
}
+ if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU &&
+ fhdr->frag_off & htons(IP6_MF))
+ goto fail_hdr;
+
iif = skb->dev ? skb->dev->ifindex : 0;
fq = fq_find(net, fhdr->identification, hdr, iif);
if (fq) {
@@ -696,42 +646,19 @@ static struct pernet_operations ip6_frags_ops = {
.exit = ipv6_frags_exit_net,
};
-static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
-{
- return jhash2(data,
- sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
-}
-
-static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
-{
- const struct inet_frag_queue *fq = data;
-
- return jhash2((const u32 *)&fq->key.v6,
- sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
-}
-
-static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
-{
- const struct frag_v6_compare_key *key = arg->key;
- const struct inet_frag_queue *fq = ptr;
-
- return !!memcmp(&fq->key, key, sizeof(*key));
-}
-
-const struct rhashtable_params ip6_rhash_params = {
+static const struct rhashtable_params ip6_rhash_params = {
.head_offset = offsetof(struct inet_frag_queue, node),
- .hashfn = ip6_key_hashfn,
- .obj_hashfn = ip6_obj_hashfn,
- .obj_cmpfn = ip6_obj_cmpfn,
+ .hashfn = ip6frag_key_hashfn,
+ .obj_hashfn = ip6frag_obj_hashfn,
+ .obj_cmpfn = ip6frag_obj_cmpfn,
.automatic_shrinking = true,
};
-EXPORT_SYMBOL(ip6_rhash_params);
int __init ipv6_frag_init(void)
{
int ret;
- ip6_frags.constructor = ip6_frag_init;
+ ip6_frags.constructor = ip6frag_init;
ip6_frags.destructor = NULL;
ip6_frags.qsize = sizeof(struct frag_queue);
ip6_frags.frag_expire = ip6_frag_expire;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 86a0e4333d42..ec18b3ce8b6d 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -972,10 +972,10 @@ static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
rt->dst.lastuse = jiffies;
}
+/* Caller must already hold reference to @from */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
rt->rt6i_flags &= ~RTF_EXPIRES;
- fib6_info_hold(from);
rcu_assign_pointer(rt->from, from);
dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
if (from->fib6_metrics != &dst_default_metrics) {
@@ -984,6 +984,7 @@ static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
}
}
+/* Caller must already hold reference to @ort */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
struct net_device *dev = fib6_info_nh_dev(ort);
@@ -1044,9 +1045,14 @@ static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
struct net_device *dev = rt->fib6_nh.nh_dev;
struct rt6_info *nrt;
+ if (!fib6_info_hold_safe(rt))
+ return NULL;
+
nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
if (nrt)
ip6_rt_copy_init(nrt, rt);
+ else
+ fib6_info_release(rt);
return nrt;
}
@@ -1178,10 +1184,15 @@ static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
* Clone the route.
*/
+ if (!fib6_info_hold_safe(ort))
+ return NULL;
+
dev = ip6_rt_get_dev_rcu(ort);
rt = ip6_dst_alloc(dev_net(dev), dev, 0);
- if (!rt)
+ if (!rt) {
+ fib6_info_release(ort);
return NULL;
+ }
ip6_rt_copy_init(rt, ort);
rt->rt6i_flags |= RTF_CACHE;
@@ -1210,12 +1221,17 @@ static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
struct net_device *dev;
struct rt6_info *pcpu_rt;
+ if (!fib6_info_hold_safe(rt))
+ return NULL;
+
rcu_read_lock();
dev = ip6_rt_get_dev_rcu(rt);
pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
rcu_read_unlock();
- if (!pcpu_rt)
+ if (!pcpu_rt) {
+ fib6_info_release(rt);
return NULL;
+ }
ip6_rt_copy_init(pcpu_rt, rt);
pcpu_rt->rt6i_flags |= RTF_PCPU;
return pcpu_rt;
@@ -2486,7 +2502,7 @@ restart:
out:
if (ret)
- dst_hold(&ret->dst);
+ ip6_hold_safe(net, &ret, true);
else
ret = ip6_create_rt_rcu(rt);
@@ -3303,7 +3319,8 @@ static int ip6_route_del(struct fib6_config *cfg,
continue;
if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
continue;
- fib6_info_hold(rt);
+ if (!fib6_info_hold_safe(rt))
+ continue;
rcu_read_unlock();
/* if gateway was specified only delete the one hop */
@@ -3409,6 +3426,9 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
rcu_read_lock();
from = rcu_dereference(rt->from);
+ /* This fib6_info_hold() is safe here because we hold reference to rt
+ * and rt already holds reference to fib6_info.
+ */
fib6_info_hold(from);
rcu_read_unlock();
@@ -3470,7 +3490,8 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
continue;
if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
continue;
- fib6_info_hold(rt);
+ if (!fib6_info_hold_safe(rt))
+ continue;
break;
}
out:
@@ -3530,8 +3551,8 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
break;
}
- if (rt)
- fib6_info_hold(rt);
+ if (rt && !fib6_info_hold_safe(rt))
+ rt = NULL;
rcu_read_unlock();
return rt;
}
@@ -3579,8 +3600,8 @@ restart:
struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
- (!idev || idev->cnf.accept_ra != 2)) {
- fib6_info_hold(rt);
+ (!idev || idev->cnf.accept_ra != 2) &&
+ fib6_info_hold_safe(rt)) {
rcu_read_unlock();
ip6_del_rt(net, rt);
goto restart;
@@ -3842,7 +3863,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
lockdep_is_held(&rt->fib6_table->tb6_lock));
while (iter) {
if (iter->fib6_metric == rt->fib6_metric &&
- iter->fib6_nsiblings)
+ rt6_qualify_for_ecmp(iter))
return iter;
iter = rcu_dereference_protected(iter->fib6_next,
lockdep_is_held(&rt->fib6_table->tb6_lock));
@@ -4388,6 +4409,13 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rt = NULL;
goto cleanup;
}
+ if (!rt6_qualify_for_ecmp(rt)) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack,
+ "Device only routes can not be added for IPv6 using the multipath API.");
+ fib6_info_release(rt);
+ goto cleanup;
+ }
rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
@@ -4439,7 +4467,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
*/
cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
NLM_F_REPLACE);
- cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_APPEND;
nhn++;
}
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 19ccf0dc996c..a8854dd3e9c5 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -101,7 +101,7 @@ static __be32 seg6_make_flowlabel(struct net *net, struct sk_buff *skb,
if (do_flowlabel > 0) {
hash = skb_get_hash(skb);
- rol32(hash, 16);
+ hash = rol32(hash, 16);
flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK;
} else if (!do_flowlabel && skb->protocol == htons(ETH_P_IPV6)) {
flowlabel = ip6_flowlabel(inner_hdr);
diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
index cd6e4cab63f6..e1025b493a18 100644
--- a/net/ipv6/seg6_local.c
+++ b/net/ipv6/seg6_local.c
@@ -637,12 +637,10 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt)
if (!seg6_validate_srh(srh, len))
return -EINVAL;
- slwt->srh = kmalloc(len, GFP_KERNEL);
+ slwt->srh = kmemdup(srh, len, GFP_KERNEL);
if (!slwt->srh)
return -ENOMEM;
- memcpy(slwt->srh, srh, len);
-
slwt->headroom += len;
return 0;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 7efa9fd7e109..03e6b7a2bc53 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -938,7 +938,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
&tcp_hashinfo, NULL, 0,
&ipv6h->saddr,
th->source, &ipv6h->daddr,
- ntohs(th->source), tcp_v6_iif(skb),
+ ntohs(th->source),
+ tcp_v6_iif_l3_slave(skb),
tcp_v6_sdif(skb));
if (!sk1)
goto out;
@@ -1609,7 +1610,8 @@ do_time_wait:
skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
- ntohs(th->dest), tcp_v6_iif(skb),
+ ntohs(th->dest),
+ tcp_v6_iif_l3_slave(skb),
sdif);
if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk);
diff --git a/net/ipv6/xfrm6_mode_ro.c b/net/ipv6/xfrm6_mode_ro.c
index 07d36573f50b..da28e4407b8f 100644
--- a/net/ipv6/xfrm6_mode_ro.c
+++ b/net/ipv6/xfrm6_mode_ro.c
@@ -55,7 +55,7 @@ static int xfrm6_ro_output(struct xfrm_state *x, struct sk_buff *skb)
__skb_pull(skb, hdr_len);
memmove(ipv6_hdr(skb), iph, hdr_len);
- x->lastused = get_seconds();
+ x->lastused = ktime_get_real_seconds();
return 0;
}
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index 893a022f9620..92ee91e34395 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1494,7 +1494,7 @@ __poll_t iucv_sock_poll(struct file *file, struct socket *sock,
struct sock *sk = sock->sk;
__poll_t mask = 0;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
if (sk->sk_state == IUCV_LISTEN)
return iucv_accept_poll(sk);
@@ -2515,4 +2515,3 @@ MODULE_DESCRIPTION("IUCV Sockets ver " VERSION);
MODULE_VERSION(VERSION);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_IUCV);
-
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
index 87fca36e6c47..9ca83f2ade6f 100644
--- a/net/kcm/Kconfig
+++ b/net/kcm/Kconfig
@@ -8,4 +8,3 @@ config AF_KCM
KCM (Kernel Connection Multiplexor) sockets provide a method
for multiplexing messages of a message based application
protocol over kernel connectons (e.g. TCP connections).
-
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
index d3601d421571..571d824e4e24 100644
--- a/net/kcm/kcmsock.c
+++ b/net/kcm/kcmsock.c
@@ -2104,4 +2104,3 @@ module_exit(kcm_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_KCM);
-
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 5e1d2946ffbf..9d61266526e7 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -1383,7 +1383,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_
}
if (!x)
- x = xfrm_find_acq(net, &dummy_mark, mode, reqid, proto, xdaddr, xsaddr, 1, family);
+ x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family);
if (x == NULL)
return -ENOENT;
@@ -2414,7 +2414,7 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sa
return err;
}
- xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_bysel_ctx(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
pol->sadb_x_policy_dir - 1, &sel, pol_ctx,
1, &err);
security_xfrm_policy_free(pol_ctx);
@@ -2663,7 +2663,7 @@ static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_
return -EINVAL;
delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2);
- xp = xfrm_policy_byid(net, DUMMY_MARK, XFRM_POLICY_TYPE_MAIN,
+ xp = xfrm_policy_byid(net, DUMMY_MARK, 0, XFRM_POLICY_TYPE_MAIN,
dir, pol->sadb_x_policy_id, delete, &err);
if (xp == NULL)
return -ENOENT;
diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c
index 1ea285bad84b..ac6a00bcec71 100644
--- a/net/l2tp/l2tp_core.c
+++ b/net/l2tp/l2tp_core.c
@@ -619,7 +619,7 @@ discard:
*/
void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
unsigned char *ptr, unsigned char *optr, u16 hdrflags,
- int length, int (*payload_hook)(struct sk_buff *skb))
+ int length)
{
struct l2tp_tunnel *tunnel = session->tunnel;
int offset;
@@ -740,13 +740,6 @@ void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
__skb_pull(skb, offset);
- /* If caller wants to process the payload before we queue the
- * packet, do so now.
- */
- if (payload_hook)
- if ((*payload_hook)(skb))
- goto discard;
-
/* Prepare skb for adding to the session's reorder_q. Hold
* packets for max reorder_timeout or 1 second if not
* reordering.
@@ -800,8 +793,7 @@ static int l2tp_session_queue_purge(struct l2tp_session *session)
* Returns 1 if the packet was not a good data packet and could not be
* forwarded. All such packets are passed up to userspace to deal with.
*/
-static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
- int (*payload_hook)(struct sk_buff *skb))
+static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb)
{
struct l2tp_session *session = NULL;
unsigned char *ptr, *optr;
@@ -892,7 +884,7 @@ static int l2tp_udp_recv_core(struct l2tp_tunnel *tunnel, struct sk_buff *skb,
goto error;
}
- l2tp_recv_common(session, skb, ptr, optr, hdrflags, length, payload_hook);
+ l2tp_recv_common(session, skb, ptr, optr, hdrflags, length);
l2tp_session_dec_refcount(session);
return 0;
@@ -921,7 +913,7 @@ int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
l2tp_dbg(tunnel, L2TP_MSG_DATA, "%s: received %d bytes\n",
tunnel->name, skb->len);
- if (l2tp_udp_recv_core(tunnel, skb, tunnel->recv_payload_hook))
+ if (l2tp_udp_recv_core(tunnel, skb))
goto pass_up;
return 0;
@@ -1682,8 +1674,6 @@ struct l2tp_session *l2tp_session_create(int priv_size, struct l2tp_tunnel *tunn
if (cfg) {
session->pwtype = cfg->pw_type;
session->debug = cfg->debug;
- session->mtu = cfg->mtu;
- session->mru = cfg->mru;
session->send_seq = cfg->send_seq;
session->recv_seq = cfg->recv_seq;
session->lns_mode = cfg->lns_mode;
@@ -1795,4 +1785,3 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
MODULE_DESCRIPTION("L2TP core");
MODULE_LICENSE("GPL");
MODULE_VERSION(L2TP_DRV_VERSION);
-
diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h
index a5c09d3a5698..5804065dfbfb 100644
--- a/net/l2tp/l2tp_core.h
+++ b/net/l2tp/l2tp_core.h
@@ -12,6 +12,9 @@
#ifndef _L2TP_CORE_H_
#define _L2TP_CORE_H_
+#include <net/dst.h>
+#include <net/sock.h>
+
/* Just some random numbers */
#define L2TP_TUNNEL_MAGIC 0x42114DDA
#define L2TP_SESSION_MAGIC 0x0C04EB7D
@@ -45,10 +48,6 @@ struct l2tp_tunnel;
*/
struct l2tp_session_cfg {
enum l2tp_pwtype pw_type;
- unsigned int data_seq:2; /* data sequencing level
- * 0 => none, 1 => IP only,
- * 2 => all
- */
unsigned int recv_seq:1; /* expect receive packets with
* sequence numbers? */
unsigned int send_seq:1; /* send packets with sequence
@@ -58,7 +57,6 @@ struct l2tp_session_cfg {
* control of LNS. */
int debug; /* bitmask of debug message
* categories */
- u16 vlan_id; /* VLAN pseudowire only */
u16 l2specific_type; /* Layer 2 specific type */
u8 cookie[8]; /* optional cookie */
int cookie_len; /* 0, 4 or 8 bytes */
@@ -66,8 +64,6 @@ struct l2tp_session_cfg {
int peer_cookie_len; /* 0, 4 or 8 bytes */
int reorder_timeout; /* configured reorder timeout
* (in jiffies) */
- int mtu;
- int mru;
char *ifname;
};
@@ -99,10 +95,6 @@ struct l2tp_session {
char name[32]; /* for logging */
char ifname[IFNAMSIZ];
- unsigned int data_seq:2; /* data sequencing level
- * 0 => none, 1 => IP only,
- * 2 => all
- */
unsigned int recv_seq:1; /* expect receive packets with
* sequence numbers? */
unsigned int send_seq:1; /* send packets with sequence
@@ -115,8 +107,6 @@ struct l2tp_session {
int reorder_timeout; /* configured reorder timeout
* (in jiffies) */
int reorder_skip; /* set if skip to next nr */
- int mtu;
- int mru;
enum l2tp_pwtype pwtype;
struct l2tp_stats stats;
struct hlist_node global_hlist; /* Global hash list node */
@@ -180,7 +170,6 @@ struct l2tp_tunnel {
struct net *l2tp_net; /* the net we belong to */
refcount_t ref_count;
- int (*recv_payload_hook)(struct sk_buff *skb);
void (*old_sk_destruct)(struct sock *);
struct sock *sock; /* Parent socket */
int fd; /* Parent fd, if tunnel socket
@@ -232,7 +221,7 @@ int l2tp_session_delete(struct l2tp_session *session);
void l2tp_session_free(struct l2tp_session *session);
void l2tp_recv_common(struct l2tp_session *session, struct sk_buff *skb,
unsigned char *ptr, unsigned char *optr, u16 hdrflags,
- int length, int (*payload_hook)(struct sk_buff *skb));
+ int length);
int l2tp_udp_encap_recv(struct sock *sk, struct sk_buff *skb);
void l2tp_session_set_header_len(struct l2tp_session *session, int version);
@@ -280,6 +269,21 @@ static inline int l2tp_get_l2specific_len(struct l2tp_session *session)
}
}
+static inline u32 l2tp_tunnel_dst_mtu(const struct l2tp_tunnel *tunnel)
+{
+ struct dst_entry *dst;
+ u32 mtu;
+
+ dst = sk_dst_get(tunnel->sock);
+ if (!dst)
+ return 0;
+
+ mtu = dst_mtu(dst);
+ dst_release(dst);
+
+ return mtu;
+}
+
#define l2tp_printk(ptr, type, func, fmt, ...) \
do { \
if (((ptr)->debug) & (type)) \
diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c
index b5d7dde003ef..9821a1458555 100644
--- a/net/l2tp/l2tp_debugfs.c
+++ b/net/l2tp/l2tp_debugfs.c
@@ -191,12 +191,9 @@ static void l2tp_dfs_seq_session_show(struct seq_file *m, void *v)
if (session->send_seq || session->recv_seq)
seq_printf(m, " nr %hu, ns %hu\n", session->nr, session->ns);
seq_printf(m, " refcnt %d\n", refcount_read(&session->ref_count));
- seq_printf(m, " config %d/%d/%c/%c/%s/%s %08x %u\n",
- session->mtu, session->mru,
+ seq_printf(m, " config 0/0/%c/%c/-/%s %08x %u\n",
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
- session->data_seq == 1 ? "IPSEQ" :
- session->data_seq == 2 ? "DATASEQ" : "-",
session->lns_mode ? "LNS" : "LAC",
session->debug,
jiffies_to_msecs(session->reorder_timeout));
diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c
index 5c366ecfa1cb..3728986ec885 100644
--- a/net/l2tp/l2tp_eth.c
+++ b/net/l2tp/l2tp_eth.c
@@ -226,22 +226,19 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
struct net_device *dev)
{
unsigned int overhead = 0;
- struct dst_entry *dst;
u32 l3_overhead = 0;
+ u32 mtu;
/* if the encap is UDP, account for UDP header size */
if (tunnel->encap == L2TP_ENCAPTYPE_UDP) {
overhead += sizeof(struct udphdr);
dev->needed_headroom += sizeof(struct udphdr);
}
- if (session->mtu != 0) {
- dev->mtu = session->mtu;
- dev->needed_headroom += session->hdr_len;
- return;
- }
+
lock_sock(tunnel->sock);
l3_overhead = kernel_sock_ip_overhead(tunnel->sock);
release_sock(tunnel->sock);
+
if (l3_overhead == 0) {
/* L3 Overhead couldn't be identified, this could be
* because tunnel->sock was NULL or the socket's
@@ -255,18 +252,12 @@ static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel,
*/
overhead += session->hdr_len + ETH_HLEN + l3_overhead;
- /* If PMTU discovery was enabled, use discovered MTU on L2TP device */
- dst = sk_dst_get(tunnel->sock);
- if (dst) {
- /* dst_mtu will use PMTU if found, else fallback to intf MTU */
- u32 pmtu = dst_mtu(dst);
+ mtu = l2tp_tunnel_dst_mtu(tunnel) - overhead;
+ if (mtu < dev->min_mtu || mtu > dev->max_mtu)
+ dev->mtu = ETH_DATA_LEN - overhead;
+ else
+ dev->mtu = mtu;
- if (pmtu != 0)
- dev->mtu = pmtu;
- dst_release(dst);
- }
- session->mtu = dev->mtu - overhead;
- dev->mtu = session->mtu;
dev->needed_headroom += session->hdr_len;
}
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index a9c05b2bc1b0..0bc39cc20a3f 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -165,7 +165,7 @@ static int l2tp_ip_recv(struct sk_buff *skb)
print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
}
- l2tp_recv_common(session, skb, ptr, optr, 0, skb->len, tunnel->recv_payload_hook);
+ l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
l2tp_session_dec_refcount(session);
return 0;
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 672e5b753738..42f828cf62fb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -178,8 +178,7 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
}
- l2tp_recv_common(session, skb, ptr, optr, 0, skb->len,
- tunnel->recv_payload_hook);
+ l2tp_recv_common(session, skb, ptr, optr, 0, skb->len);
l2tp_session_dec_refcount(session);
return 0;
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index 5b9900889e31..2e1e92651545 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -560,9 +560,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
}
if (tunnel->version > 2) {
- if (info->attrs[L2TP_ATTR_DATA_SEQ])
- cfg.data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
-
if (info->attrs[L2TP_ATTR_L2SPEC_TYPE]) {
cfg.l2specific_type = nla_get_u8(info->attrs[L2TP_ATTR_L2SPEC_TYPE]);
if (cfg.l2specific_type != L2TP_L2SPECTYPE_DEFAULT &&
@@ -594,9 +591,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
}
if (info->attrs[L2TP_ATTR_IFNAME])
cfg.ifname = nla_data(info->attrs[L2TP_ATTR_IFNAME]);
-
- if (info->attrs[L2TP_ATTR_VLAN_ID])
- cfg.vlan_id = nla_get_u16(info->attrs[L2TP_ATTR_VLAN_ID]);
}
if (info->attrs[L2TP_ATTR_DEBUG])
@@ -614,12 +608,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
cfg.reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
- if (info->attrs[L2TP_ATTR_MTU])
- cfg.mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
-
- if (info->attrs[L2TP_ATTR_MRU])
- cfg.mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
#ifdef CONFIG_MODULES
if (l2tp_nl_cmd_ops[cfg.pw_type] == NULL) {
genl_unlock();
@@ -693,9 +681,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_DEBUG])
session->debug = nla_get_u32(info->attrs[L2TP_ATTR_DEBUG]);
- if (info->attrs[L2TP_ATTR_DATA_SEQ])
- session->data_seq = nla_get_u8(info->attrs[L2TP_ATTR_DATA_SEQ]);
-
if (info->attrs[L2TP_ATTR_RECV_SEQ])
session->recv_seq = nla_get_u8(info->attrs[L2TP_ATTR_RECV_SEQ]);
@@ -710,12 +695,6 @@ static int l2tp_nl_cmd_session_modify(struct sk_buff *skb, struct genl_info *inf
if (info->attrs[L2TP_ATTR_RECV_TIMEOUT])
session->reorder_timeout = nla_get_msecs(info->attrs[L2TP_ATTR_RECV_TIMEOUT]);
- if (info->attrs[L2TP_ATTR_MTU])
- session->mtu = nla_get_u16(info->attrs[L2TP_ATTR_MTU]);
-
- if (info->attrs[L2TP_ATTR_MRU])
- session->mru = nla_get_u16(info->attrs[L2TP_ATTR_MRU]);
-
ret = l2tp_session_notify(&l2tp_nl_family, info,
session, L2TP_CMD_SESSION_MODIFY);
@@ -745,10 +724,7 @@ static int l2tp_nl_session_send(struct sk_buff *skb, u32 portid, u32 seq, int fl
nla_put_u32(skb, L2TP_ATTR_PEER_SESSION_ID,
session->peer_session_id) ||
nla_put_u32(skb, L2TP_ATTR_DEBUG, session->debug) ||
- nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype) ||
- nla_put_u16(skb, L2TP_ATTR_MTU, session->mtu) ||
- (session->mru &&
- nla_put_u16(skb, L2TP_ATTR_MRU, session->mru)))
+ nla_put_u16(skb, L2TP_ATTR_PW_TYPE, session->pwtype))
goto nla_put_failure;
if ((session->ifname[0] &&
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 9ac02c93df98..6e2c8e7595e0 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -93,7 +93,6 @@
#include <linux/nsproxy.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#include <net/dst.h>
#include <net/ip.h>
#include <net/udp.h>
#include <net/xfrm.h>
@@ -127,8 +126,6 @@ struct pppol2tp_session {
* PPPoX socket */
struct sock *__sk; /* Copy of .sk, for cleanup */
struct rcu_head rcu; /* For asynchronous release */
- int flags; /* accessed by PPPIOCGFLAGS.
- * Unused. */
};
static int pppol2tp_xmit(struct ppp_channel *chan, struct sk_buff *skb);
@@ -183,25 +180,6 @@ out:
* Receive data handling
*****************************************************************************/
-static int pppol2tp_recv_payload_hook(struct sk_buff *skb)
-{
- /* Skip PPP header, if present. In testing, Microsoft L2TP clients
- * don't send the PPP header (PPP header compression enabled), but
- * other clients can include the header. So we cope with both cases
- * here. The PPP header is always FF03 when using L2TP.
- *
- * Note that skb->data[] isn't dereferenced from a u16 ptr here since
- * the field may be unaligned.
- */
- if (!pskb_may_pull(skb, 2))
- return 1;
-
- if ((skb->data[0] == PPP_ALLSTATIONS) && (skb->data[1] == PPP_UI))
- skb_pull(skb, 2);
-
- return 0;
-}
-
/* Receive message. This is the recvmsg for the PPPoL2TP socket.
*/
static int pppol2tp_recvmsg(struct socket *sock, struct msghdr *msg,
@@ -248,6 +226,17 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk == NULL)
goto no_sock;
+ /* If the first two bytes are 0xFF03, consider that it is the PPP's
+ * Address and Control fields and skip them. The L2TP module has always
+ * worked this way, although, in theory, the use of these fields should
+ * be negociated and handled at the PPP layer. These fields are
+ * constant: 0xFF is the All-Stations Address and 0x03 the Unnumbered
+ * Information command with Poll/Final bit set to zero (RFC 1662).
+ */
+ if (pskb_may_pull(skb, 2) && skb->data[0] == PPP_ALLSTATIONS &&
+ skb->data[1] == PPP_UI)
+ skb_pull(skb, 2);
+
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
@@ -564,7 +553,6 @@ static void pppol2tp_show(struct seq_file *m, void *arg)
static void pppol2tp_session_init(struct l2tp_session *session)
{
struct pppol2tp_session *ps;
- struct dst_entry *dst;
session->recv_skb = pppol2tp_recv;
#if IS_ENABLED(CONFIG_L2TP_DEBUGFS)
@@ -574,18 +562,6 @@ static void pppol2tp_session_init(struct l2tp_session *session)
ps = l2tp_session_priv(session);
mutex_init(&ps->sk_lock);
ps->owner = current->pid;
-
- /* If PMTU discovery was enabled, use the MTU that was discovered */
- dst = sk_dst_get(session->tunnel->sock);
- if (dst) {
- u32 pmtu = dst_mtu(dst);
-
- if (pmtu) {
- session->mtu = pmtu - PPPOL2TP_HEADER_OVERHEAD;
- session->mru = pmtu - PPPOL2TP_HEADER_OVERHEAD;
- }
- dst_release(dst);
- }
}
struct l2tp_connect_info {
@@ -672,6 +648,22 @@ static int pppol2tp_sockaddr_get_info(const void *sa, int sa_len,
return 0;
}
+/* Rough estimation of the maximum payload size a tunnel can transmit without
+ * fragmenting at the lower IP layer. Assumes L2TPv2 with sequence
+ * numbers and no IP option. Not quite accurate, but the result is mostly
+ * unused anyway.
+ */
+static int pppol2tp_tunnel_mtu(const struct l2tp_tunnel *tunnel)
+{
+ int mtu;
+
+ mtu = l2tp_tunnel_dst_mtu(tunnel);
+ if (mtu <= PPPOL2TP_HEADER_OVERHEAD)
+ return 1500 - PPPOL2TP_HEADER_OVERHEAD;
+
+ return mtu - PPPOL2TP_HEADER_OVERHEAD;
+}
+
/* connect() handler. Attach a PPPoX socket to a tunnel UDP socket
*/
static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
@@ -763,9 +755,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
goto end;
}
- if (tunnel->recv_payload_hook == NULL)
- tunnel->recv_payload_hook = pppol2tp_recv_payload_hook;
-
if (tunnel->peer_tunnel_id == 0)
tunnel->peer_tunnel_id = info.peer_tunnel_id;
@@ -792,9 +781,6 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
goto end;
}
} else {
- /* Default MTU must allow space for UDP/L2TP/PPP headers */
- cfg.mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
- cfg.mru = cfg.mtu;
cfg.pw_type = L2TP_PWTYPE_PPP;
session = l2tp_session_create(sizeof(struct pppol2tp_session),
@@ -839,7 +825,7 @@ static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr,
po->chan.private = sk;
po->chan.ops = &pppol2tp_chan_ops;
- po->chan.mtu = session->mtu;
+ po->chan.mtu = pppol2tp_tunnel_mtu(tunnel);
error = ppp_register_net_channel(sock_net(sk), &po->chan);
if (error) {
@@ -895,12 +881,6 @@ static int pppol2tp_session_create(struct net *net, struct l2tp_tunnel *tunnel,
goto err;
}
- /* Default MTU values. */
- if (cfg->mtu == 0)
- cfg->mtu = 1500 - PPPOL2TP_HEADER_OVERHEAD;
- if (cfg->mru == 0)
- cfg->mru = cfg->mtu;
-
/* Allocate and initialize a new session context. */
session = l2tp_session_create(sizeof(struct pppol2tp_session),
tunnel, session_id,
@@ -1064,11 +1044,9 @@ static void pppol2tp_copy_stats(struct pppol2tp_ioc_stats *dest,
static int pppol2tp_session_ioctl(struct l2tp_session *session,
unsigned int cmd, unsigned long arg)
{
- struct ifreq ifr;
int err = 0;
struct sock *sk;
int val = (int) arg;
- struct pppol2tp_session *ps = l2tp_session_priv(session);
struct l2tp_tunnel *tunnel = session->tunnel;
struct pppol2tp_ioc_stats stats;
@@ -1081,85 +1059,19 @@ static int pppol2tp_session_ioctl(struct l2tp_session *session,
return -EBADR;
switch (cmd) {
- case SIOCGIFMTU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
-
- err = -EFAULT;
- if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
- break;
- ifr.ifr_mtu = session->mtu;
- if (copy_to_user((void __user *) arg, &ifr, sizeof(struct ifreq)))
- break;
-
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mtu=%d\n",
- session->name, session->mtu);
- err = 0;
- break;
-
- case SIOCSIFMTU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
-
- err = -EFAULT;
- if (copy_from_user(&ifr, (void __user *) arg, sizeof(struct ifreq)))
- break;
-
- session->mtu = ifr.ifr_mtu;
-
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mtu=%d\n",
- session->name, session->mtu);
- err = 0;
- break;
-
case PPPIOCGMRU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
-
- err = -EFAULT;
- if (put_user(session->mru, (int __user *) arg))
- break;
-
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get mru=%d\n",
- session->name, session->mru);
- err = 0;
- break;
-
- case PPPIOCSMRU:
- err = -ENXIO;
- if (!(sk->sk_state & PPPOX_CONNECTED))
- break;
-
- err = -EFAULT;
- if (get_user(val, (int __user *) arg))
- break;
-
- session->mru = val;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set mru=%d\n",
- session->name, session->mru);
- err = 0;
- break;
-
case PPPIOCGFLAGS:
err = -EFAULT;
- if (put_user(ps->flags, (int __user *) arg))
+ if (put_user(0, (int __user *)arg))
break;
-
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: get flags=%d\n",
- session->name, ps->flags);
err = 0;
break;
+ case PPPIOCSMRU:
case PPPIOCSFLAGS:
err = -EFAULT;
- if (get_user(val, (int __user *) arg))
+ if (get_user(val, (int __user *)arg))
break;
- ps->flags = val;
- l2tp_info(session, L2TP_MSG_CONTROL, "%s: set flags=%d\n",
- session->name, ps->flags);
err = 0;
break;
@@ -1227,13 +1139,18 @@ static int pppol2tp_tunnel_ioctl(struct l2tp_tunnel *tunnel,
l2tp_session_get(sock_net(sk), tunnel,
stats.session_id);
- if (session && session->pwtype == L2TP_PWTYPE_PPP) {
- err = pppol2tp_session_ioctl(session, cmd,
- arg);
+ if (!session) {
+ err = -EBADR;
+ break;
+ }
+ if (session->pwtype != L2TP_PWTYPE_PPP) {
l2tp_session_dec_refcount(session);
- } else {
err = -EBADR;
+ break;
}
+
+ err = pppol2tp_session_ioctl(session, cmd, arg);
+ l2tp_session_dec_refcount(session);
break;
}
#ifdef CONFIG_XFRM
@@ -1743,8 +1660,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v)
tunnel->peer_tunnel_id,
session->peer_session_id,
state, user_data_ok);
- seq_printf(m, " %d/%d/%c/%c/%s %08x %u\n",
- session->mtu, session->mru,
+ seq_printf(m, " 0/0/%c/%c/%s %08x %u\n",
session->recv_seq ? 'R' : '-',
session->send_seq ? 'S' : '-',
session->lns_mode ? "LNS" : "LAC",
diff --git a/net/llc/Kconfig b/net/llc/Kconfig
index b91c65108162..176a6c1521a5 100644
--- a/net/llc/Kconfig
+++ b/net/llc/Kconfig
@@ -6,5 +6,5 @@ config LLC2
tristate "ANSI/IEEE 802.2 LLC type 2 Support"
select LLC
help
- This is a Logical Link Layer type 2, connection oriented support.
+ This is a Logical Link Layer type 2, connection oriented support.
Select this if you want to have support for PF_LLC sockets.
diff --git a/net/llc/Makefile b/net/llc/Makefile
index 4e260cff3c5d..5e0ef436daae 100644
--- a/net/llc/Makefile
+++ b/net/llc/Makefile
@@ -4,7 +4,7 @@
# Copyright (c) 1997 by Procom Technology,Inc.
# 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br>
#
-# This program can be redistributed or modified under the terms of the
+# This program can be redistributed or modified under the terms of the
# GNU General Public License as published by the Free Software Foundation.
# This program is distributed without any warranty or implied warranty
# of merchantability or fitness for a particular purpose.
diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c
index 6daf391b3e84..8db03c2d5440 100644
--- a/net/llc/llc_if.c
+++ b/net/llc/llc_if.c
@@ -151,4 +151,3 @@ out:
sock_put(sk);
return rc;
}
-
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 02f3672e7b5e..d25da0e66da1 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -495,7 +495,7 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev,
goto out_unlock;
}
- ieee80211_key_free(key, true);
+ ieee80211_key_free(key, sdata->vif.type == NL80211_IFTYPE_STATION);
ret = 0;
out_unlock:
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index ee0d0cc8dc3b..c054ac85793c 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -656,11 +656,15 @@ int ieee80211_key_link(struct ieee80211_key *key,
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_key *old_key;
- int idx, ret;
- bool pairwise;
-
- pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
- idx = key->conf.keyidx;
+ int idx = key->conf.keyidx;
+ bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE;
+ /*
+ * We want to delay tailroom updates only for station - in that
+ * case it helps roaming speed, but in other cases it hurts and
+ * can cause warnings to appear.
+ */
+ bool delay_tailroom = sdata->vif.type == NL80211_IFTYPE_STATION;
+ int ret;
mutex_lock(&sdata->local->key_mtx);
@@ -688,14 +692,14 @@ int ieee80211_key_link(struct ieee80211_key *key,
increment_tailroom_need_count(sdata);
ieee80211_key_replace(sdata, sta, pairwise, old_key, key);
- ieee80211_key_destroy(old_key, true);
+ ieee80211_key_destroy(old_key, delay_tailroom);
ieee80211_debugfs_key_add(key);
if (!local->wowlan) {
ret = ieee80211_key_enable_hw_accel(key);
if (ret)
- ieee80211_key_free(key, true);
+ ieee80211_key_free(key, delay_tailroom);
} else {
ret = 0;
}
@@ -930,7 +934,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local,
ieee80211_key_replace(key->sdata, key->sta,
key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
key, NULL);
- __ieee80211_key_destroy(key, true);
+ __ieee80211_key_destroy(key, key->sdata->vif.type ==
+ NL80211_IFTYPE_STATION);
}
for (i = 0; i < NUM_DEFAULT_KEYS; i++) {
@@ -940,7 +945,8 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local,
ieee80211_key_replace(key->sdata, key->sta,
key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE,
key, NULL);
- __ieee80211_key_destroy(key, true);
+ __ieee80211_key_destroy(key, key->sdata->vif.type ==
+ NL80211_IFTYPE_STATION);
}
mutex_unlock(&local->key_mtx);
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 76048b53c5b2..07fb219327d6 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -751,4 +751,3 @@ rc80211_minstrel_exit(void)
{
ieee80211_rate_control_unregister(&mac80211_minstrel);
}
-
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index a16ba568e2a3..64742f2765c4 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -2370,11 +2370,8 @@ static void ieee80211_deliver_skb_to_local_stack(struct sk_buff *skb,
sdata->control_port_over_nl80211)) {
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
bool noencrypt = status->flag & RX_FLAG_DECRYPTED;
- struct ethhdr *ehdr = eth_hdr(skb);
- cfg80211_rx_control_port(dev, skb->data, skb->len,
- ehdr->h_source,
- be16_to_cpu(skb->protocol), noencrypt);
+ cfg80211_rx_control_port(dev, skb, noencrypt);
dev_kfree_skb(skb);
} else {
/* deliver to local stack */
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 6a79d564de35..cd332e3e1134 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -1249,7 +1249,7 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local,
(info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
return NULL;
- if (!ieee80211_is_data(hdr->frame_control))
+ if (!ieee80211_is_data_present(hdr->frame_control))
return NULL;
if (sta) {
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 3e68132a41fa..88efda7c9f8a 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -2140,7 +2140,8 @@ int ieee80211_reconfig(struct ieee80211_local *local)
if (!sta->uploaded)
continue;
- if (sta->sdata->vif.type != NL80211_IFTYPE_AP)
+ if (sta->sdata->vif.type != NL80211_IFTYPE_AP &&
+ sta->sdata->vif.type != NL80211_IFTYPE_AP_VLAN)
continue;
for (state = IEEE80211_STA_NOTEXIST;
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 6e558a419f60..94f53a9b7d1a 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -224,7 +224,7 @@ static int mpls_fill_encap_info(struct sk_buff *skb,
struct lwtunnel_state *lwtstate)
{
struct mpls_iptunnel_encap *tun_encap_info;
-
+
tun_encap_info = mpls_lwtunnel_encap(lwtstate);
if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index dbd7d1fad277..71709c104081 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -46,9 +46,19 @@ config NETFILTER_NETLINK_LOG
and is also scheduled to replace the old syslog-based ipt_LOG
and ip6t_LOG modules.
+config NETFILTER_NETLINK_OSF
+ tristate "Netfilter OSF over NFNETLINK interface"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK
+ help
+ If this option is enabled, the kernel will include support
+ for passive OS fingerprint via NFNETLINK.
+
config NF_CONNTRACK
tristate "Netfilter connection tracking support"
default m if NETFILTER_ADVANCED=n
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if IPV6 != n
help
Connection tracking keeps a record of what packets have passed
through your machine, in order to figure out how they are related
@@ -96,7 +106,6 @@ config NF_CONNTRACK_SECMARK
config NF_CONNTRACK_ZONES
bool 'Connection tracking zones'
depends on NETFILTER_ADVANCED
- depends on NETFILTER_XT_TARGET_CT
help
This option enables support for connection tracking zones.
Normally, each connection needs to have a unique system wide
@@ -148,10 +157,11 @@ config NF_CONNTRACK_TIMESTAMP
If unsure, say `N'.
config NF_CONNTRACK_LABELS
- bool
+ bool "Connection tracking labels"
help
This option enables support for assigning user-defined flag bits
- to connection tracking entries. It selected by the connlabel match.
+ to connection tracking entries. It can be used with xtables connlabel
+ match and the nftables ct expression.
config NF_CT_PROTO_DCCP
bool 'DCCP protocol connection tracking support'
@@ -355,6 +365,7 @@ config NF_CT_NETLINK_TIMEOUT
tristate 'Connection tracking timeout tuning via Netlink'
select NETFILTER_NETLINK
depends on NETFILTER_ADVANCED
+ depends on NF_CONNTRACK_TIMEOUT
help
This option enables support for connection tracking timeout
fine-grain tuning. This allows you to attach specific timeout
@@ -440,9 +451,6 @@ config NETFILTER_SYNPROXY
endif # NF_CONNTRACK
-config NF_OSF
- tristate
-
config NF_TABLES
select NETFILTER_NETLINK
tristate "Netfilter nf_tables support"
@@ -460,6 +468,13 @@ config NF_TABLES
if NF_TABLES
+config NF_TABLES_SET
+ tristate "Netfilter nf_tables set infrastructure"
+ help
+ This option enables the nf_tables set infrastructure that allows to
+ look up for elements in a set and to build one-way mappings between
+ matchings and actions.
+
config NF_TABLES_INET
depends on IPV6
select NF_TABLES_IPV4
@@ -493,24 +508,6 @@ config NFT_FLOW_OFFLOAD
This option adds the "flow_offload" expression that you can use to
choose what flows are placed into the hardware.
-config NFT_SET_RBTREE
- tristate "Netfilter nf_tables rbtree set module"
- help
- This option adds the "rbtree" set type (Red Black tree) that is used
- to build interval-based sets.
-
-config NFT_SET_HASH
- tristate "Netfilter nf_tables hash set module"
- help
- This option adds the "hash" set type that is used to build one-way
- mappings between matchings and actions.
-
-config NFT_SET_BITMAP
- tristate "Netfilter nf_tables bitmap set module"
- help
- This option adds the "bitmap" set type that is used to build sets
- whose keys are smaller or equal to 16 bits.
-
config NFT_COUNTER
tristate "Netfilter nf_tables counter module"
help
@@ -562,6 +559,12 @@ config NFT_NAT
This option adds the "nat" expression that you can use to perform
typical Network Address Translation (NAT) packet transformations.
+config NFT_TUNNEL
+ tristate "Netfilter nf_tables tunnel module"
+ help
+ This option adds the "tunnel" expression that you can use to set
+ tunneling policies.
+
config NFT_OBJREF
tristate "Netfilter nf_tables stateful object reference module"
help
@@ -626,11 +629,28 @@ config NFT_SOCKET
tristate "Netfilter nf_tables socket match support"
depends on IPV6 || IPV6=n
select NF_SOCKET_IPV4
- select NF_SOCKET_IPV6 if IPV6
+ select NF_SOCKET_IPV6 if NF_TABLES_IPV6
help
This option allows matching for the presence or absence of a
corresponding socket and its attributes.
+config NFT_OSF
+ tristate "Netfilter nf_tables passive OS fingerprint support"
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_OSF
+ help
+ This option allows matching packets from an specific OS.
+
+config NFT_TPROXY
+ tristate "Netfilter nf_tables tproxy support"
+ depends on IPV6 || IPV6=n
+ select NF_DEFRAG_IPV4
+ select NF_DEFRAG_IPV6 if NF_TABLES_IPV6
+ select NF_TPROXY_IPV4
+ select NF_TPROXY_IPV6 if NF_TABLES_IPV6
+ help
+ This makes transparent proxy support available in nftables.
+
if NF_TABLES_NETDEV
config NF_DUP_NETDEV
@@ -892,7 +912,7 @@ config NETFILTER_XT_TARGET_LOG
tristate "LOG target support"
select NF_LOG_COMMON
select NF_LOG_IPV4
- select NF_LOG_IPV6 if IPV6
+ select NF_LOG_IPV6 if IP6_NF_IPTABLES
default m if NETFILTER_ADVANCED=n
help
This option adds a `LOG' target, which allows you to create rules in
@@ -984,7 +1004,7 @@ config NETFILTER_XT_TARGET_TEE
depends on IPV6 || IPV6=n
depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV4
- select NF_DUP_IPV6 if IPV6
+ select NF_DUP_IPV6 if IP6_NF_IPTABLES
---help---
This option adds a "TEE" target with which a packet can be cloned and
this clone be rerouted to another nexthop.
@@ -1377,8 +1397,8 @@ config NETFILTER_XT_MATCH_NFACCT
config NETFILTER_XT_MATCH_OSF
tristate '"osf" Passive OS fingerprint match'
- depends on NETFILTER_ADVANCED && NETFILTER_NETLINK
- select NF_OSF
+ depends on NETFILTER_ADVANCED
+ select NETFILTER_NETLINK_OSF
help
This option selects the Passive OS Fingerprinting match module
that allows to passively match the remote operating system by
@@ -1492,8 +1512,8 @@ config NETFILTER_XT_MATCH_SOCKET
depends on NETFILTER_ADVANCED
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
- depends on NF_SOCKET_IPV4
- depends on NF_SOCKET_IPV6
+ select NF_SOCKET_IPV4
+ select NF_SOCKET_IPV6 if IP6_NF_IPTABLES
select NF_DEFRAG_IPV4
select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 44449389e527..16895e045b66 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -1,7 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
netfilter-objs := core.o nf_log.o nf_queue.o nf_sockopt.o utils.o
-nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o nf_conntrack_proto.o nf_conntrack_l3proto_generic.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+nf_conntrack-y := nf_conntrack_core.o nf_conntrack_standalone.o nf_conntrack_expect.o nf_conntrack_helper.o \
+ nf_conntrack_proto.o nf_conntrack_proto_generic.o nf_conntrack_proto_tcp.o nf_conntrack_proto_udp.o \
+ nf_conntrack_proto_icmp.o \
+ nf_conntrack_extend.o nf_conntrack_acct.o nf_conntrack_seqadj.o
+
+nf_conntrack-$(subst m,y,$(CONFIG_IPV6)) += nf_conntrack_proto_icmpv6.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMEOUT) += nf_conntrack_timeout.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_TIMESTAMP) += nf_conntrack_timestamp.o
nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o
@@ -15,6 +20,7 @@ obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o
obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
+obj-$(CONFIG_NETFILTER_NETLINK_OSF) += nfnetlink_osf.o
# connection tracking
obj-$(CONFIG_NF_CONNTRACK) += nf_conntrack.o
@@ -78,7 +84,11 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \
nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \
nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o
+nf_tables_set-objs := nf_tables_set_core.o \
+ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o
+
obj-$(CONFIG_NF_TABLES) += nf_tables.o
+obj-$(CONFIG_NF_TABLES_SET) += nf_tables_set.o
obj-$(CONFIG_NFT_COMPAT) += nft_compat.o
obj-$(CONFIG_NFT_CONNLIMIT) += nft_connlimit.o
obj-$(CONFIG_NFT_NUMGEN) += nft_numgen.o
@@ -91,9 +101,7 @@ obj-$(CONFIG_NFT_QUEUE) += nft_queue.o
obj-$(CONFIG_NFT_QUOTA) += nft_quota.o
obj-$(CONFIG_NFT_REJECT) += nft_reject.o
obj-$(CONFIG_NFT_REJECT_INET) += nft_reject_inet.o
-obj-$(CONFIG_NFT_SET_RBTREE) += nft_set_rbtree.o
-obj-$(CONFIG_NFT_SET_HASH) += nft_set_hash.o
-obj-$(CONFIG_NFT_SET_BITMAP) += nft_set_bitmap.o
+obj-$(CONFIG_NFT_TUNNEL) += nft_tunnel.o
obj-$(CONFIG_NFT_COUNTER) += nft_counter.o
obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
@@ -102,8 +110,9 @@ obj-$(CONFIG_NFT_HASH) += nft_hash.o
obj-$(CONFIG_NFT_FIB) += nft_fib.o
obj-$(CONFIG_NFT_FIB_INET) += nft_fib_inet.o
obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o
-obj-$(CONFIG_NF_OSF) += nf_osf.o
obj-$(CONFIG_NFT_SOCKET) += nft_socket.o
+obj-$(CONFIG_NFT_OSF) += nft_osf.o
+obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o
# nf_tables netdev
obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 99e0aa350dc5..0edc62910ebf 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -825,12 +825,23 @@ static void ip_vs_conn_expire(struct timer_list *t)
/* Unlink conn if not referenced anymore */
if (likely(ip_vs_conn_unlink(cp))) {
+ struct ip_vs_conn *ct = cp->control;
+
/* delete the timer if it is activated by other users */
del_timer(&cp->timer);
/* does anybody control me? */
- if (cp->control)
+ if (ct) {
ip_vs_control_del(cp);
+ /* Drop CTL or non-assured TPL if not used anymore */
+ if (!cp->timeout && !atomic_read(&ct->n_control) &&
+ (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
+ !(ct->state & IP_VS_CTPL_S_ASSURED))) {
+ IP_VS_DBG(4, "drop controlling connection\n");
+ ct->timeout = 0;
+ ip_vs_conn_expire_now(ct);
+ }
+ }
if ((cp->flags & IP_VS_CONN_F_NFCT) &&
!(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
@@ -872,6 +883,10 @@ static void ip_vs_conn_expire(struct timer_list *t)
/* Modify timer, so that it expires as soon as possible.
* Can be called without reference only if under RCU lock.
+ * We can have such chain of conns linked with ->control: DATA->CTL->TPL
+ * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
+ * - cp->timeout=0 indicates all conns from chain should be dropped but
+ * TPL is not dropped if in assured state
*/
void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
{
@@ -1107,7 +1122,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_state_name(cp),
(cp->timer.expires-jiffies)/HZ, pe_data);
else
#endif
@@ -1118,7 +1133,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_state_name(cp),
(cp->timer.expires-jiffies)/HZ, pe_data);
}
return 0;
@@ -1169,7 +1184,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
&cp->caddr.in6, ntohs(cp->cport),
&cp->vaddr.in6, ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_state_name(cp),
ip_vs_origin_name(cp->flags),
(cp->timer.expires-jiffies)/HZ);
else
@@ -1181,7 +1196,7 @@ static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
ntohl(cp->caddr.ip), ntohs(cp->cport),
ntohl(cp->vaddr.ip), ntohs(cp->vport),
dbuf, ntohs(cp->dport),
- ip_vs_state_name(cp->protocol, cp->state),
+ ip_vs_state_name(cp),
ip_vs_origin_name(cp->flags),
(cp->timer.expires-jiffies)/HZ);
}
@@ -1197,8 +1212,11 @@ static const struct seq_operations ip_vs_conn_sync_seq_ops = {
#endif
-/*
- * Randomly drop connection entries before running out of memory
+/* Randomly drop connection entries before running out of memory
+ * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
+ * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
+ * - traffic for services not in OPS mode does not increase ct->in_pkts in
+ * all cases, so it is not supported
*/
static inline int todrop_entry(struct ip_vs_conn *cp)
{
@@ -1242,7 +1260,7 @@ static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
int idx;
- struct ip_vs_conn *cp, *cp_c;
+ struct ip_vs_conn *cp;
rcu_read_lock();
/*
@@ -1254,13 +1272,15 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->ipvs != ipvs)
continue;
+ if (atomic_read(&cp->n_control))
+ continue;
if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
- if (atomic_read(&cp->n_control) ||
- !ip_vs_conn_ops_mode(cp))
- continue;
- else
- /* connection template of OPS */
+ /* connection template of OPS */
+ if (ip_vs_conn_ops_mode(cp))
goto try_drop;
+ if (!(cp->state & IP_VS_CTPL_S_ASSURED))
+ goto drop;
+ continue;
}
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
@@ -1294,15 +1314,10 @@ try_drop:
continue;
}
- IP_VS_DBG(4, "del connection\n");
+drop:
+ IP_VS_DBG(4, "drop connection\n");
+ cp->timeout = 0;
ip_vs_conn_expire_now(cp);
- cp_c = cp->control;
- /* cp->control is valid only with reference to cp */
- if (cp_c && __ip_vs_conn_get(cp)) {
- IP_VS_DBG(4, "del conn template\n");
- ip_vs_conn_expire_now(cp_c);
- __ip_vs_conn_put(cp);
- }
}
cond_resched_rcu();
}
@@ -1325,15 +1340,19 @@ flush_again:
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[idx], c_list) {
if (cp->ipvs != ipvs)
continue;
- IP_VS_DBG(4, "del connection\n");
- ip_vs_conn_expire_now(cp);
+ /* As timers are expired in LIFO order, restart
+ * the timer of controlling connection first, so
+ * that it is expired after us.
+ */
cp_c = cp->control;
/* cp->control is valid only with reference to cp */
if (cp_c && __ip_vs_conn_get(cp)) {
- IP_VS_DBG(4, "del conn template\n");
+ IP_VS_DBG(4, "del controlling connection\n");
ip_vs_conn_expire_now(cp_c);
__ip_vs_conn_put(cp);
}
+ IP_VS_DBG(4, "del connection\n");
+ ip_vs_conn_expire_now(cp);
}
cond_resched_rcu();
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index dd21782e2f12..62eefea48973 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -134,7 +134,7 @@ static void update_defense_level(struct netns_ipvs *ipvs)
} else {
atomic_set(&ipvs->dropentry, 0);
ipvs->sysctl_drop_entry = 1;
- };
+ }
break;
case 3:
atomic_set(&ipvs->dropentry, 1);
diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c
index ca880a3ad033..54ee84adf0bd 100644
--- a/net/netfilter/ipvs/ip_vs_proto.c
+++ b/net/netfilter/ipvs/ip_vs_proto.c
@@ -42,6 +42,11 @@
static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE];
+/* States for conn templates: NONE or words separated with ",", max 15 chars */
+static const char *ip_vs_ctpl_state_name_table[IP_VS_CTPL_S_LAST] = {
+ [IP_VS_CTPL_S_NONE] = "NONE",
+ [IP_VS_CTPL_S_ASSURED] = "ASSURED",
+};
/*
* register an ipvs protocol
@@ -193,12 +198,20 @@ ip_vs_create_timeout_table(int *table, int size)
}
-const char * ip_vs_state_name(__u16 proto, int state)
+const char *ip_vs_state_name(const struct ip_vs_conn *cp)
{
- struct ip_vs_protocol *pp = ip_vs_proto_get(proto);
+ unsigned int state = cp->state;
+ struct ip_vs_protocol *pp;
+
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
+ if (state >= IP_VS_CTPL_S_LAST)
+ return "ERR!";
+ return ip_vs_ctpl_state_name_table[state] ? : "?";
+ }
+ pp = ip_vs_proto_get(cp->protocol);
if (pp == NULL || pp->state_name == NULL)
- return (IPPROTO_IP == proto) ? "NONE" : "ERR!";
+ return (cp->protocol == IPPROTO_IP) ? "NONE" : "ERR!";
return pp->state_name(state);
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 3250c4a1111e..b0cd7d08f2a7 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -461,6 +461,8 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
+ if (next_state == IP_VS_SCTP_S_ESTABLISHED)
+ ip_vs_control_assure_ct(cp);
}
if (likely(pd))
cp->timeout = pd->timeout_table[cp->state = next_state];
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index 80d10ad12a15..1770fc6ce960 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -569,6 +569,8 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
cp->flags &= ~IP_VS_CONN_F_INACTIVE;
}
}
+ if (new_state == IP_VS_TCP_S_ESTABLISHED)
+ ip_vs_control_assure_ct(cp);
}
if (likely(pd))
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index e0ef11c3691e..0f53c49025f8 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -460,6 +460,8 @@ udp_state_transition(struct ip_vs_conn *cp, int direction,
}
cp->timeout = pd->timeout_table[IP_VS_UDP_S_NORMAL];
+ if (direction == IP_VS_DIR_OUTPUT)
+ ip_vs_control_assure_ct(cp);
}
static int __udp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 001501e25625..d4020c5e831d 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -1003,12 +1003,9 @@ static void ip_vs_process_message_v0(struct netns_ipvs *ipvs, const char *buffer
continue;
}
} else {
- /* protocol in templates is not used for state/timeout */
- if (state > 0) {
- IP_VS_DBG(2, "BACKUP v0, Invalid template state %u\n",
- state);
- state = 0;
- }
+ if (state >= IP_VS_CTPL_S_LAST)
+ IP_VS_DBG(7, "BACKUP v0, Invalid tpl state %u\n",
+ state);
}
ip_vs_conn_fill_param(ipvs, AF_INET, s->protocol,
@@ -1166,12 +1163,9 @@ static inline int ip_vs_proc_sync_conn(struct netns_ipvs *ipvs, __u8 *p, __u8 *m
goto out;
}
} else {
- /* protocol in templates is not used for state/timeout */
- if (state > 0) {
- IP_VS_DBG(3, "BACKUP, Invalid template state %u\n",
- state);
- state = 0;
- }
+ if (state >= IP_VS_CTPL_S_LAST)
+ IP_VS_DBG(7, "BACKUP, Invalid tpl state %u\n",
+ state);
}
if (ip_vs_conn_fill_param_sync(ipvs, af, s, &param, pe_data,
pe_data_len, pe_name, pe_name_len)) {
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 510039862aa9..02ca7df793f5 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -44,17 +44,19 @@
/* we will save the tuples of all connections we care about */
struct nf_conncount_tuple {
- struct hlist_node node;
+ struct list_head node;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_zone zone;
int cpu;
u32 jiffies32;
+ struct rcu_head rcu_head;
};
struct nf_conncount_rb {
struct rb_node node;
- struct hlist_head hhead; /* connections/hosts in same subnet */
+ struct nf_conncount_list list;
u32 key[MAX_KEYLEN];
+ struct rcu_head rcu_head;
};
static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp;
@@ -62,6 +64,10 @@ static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_i
struct nf_conncount_data {
unsigned int keylen;
struct rb_root root[CONNCOUNT_SLOTS];
+ struct net *net;
+ struct work_struct gc_work;
+ unsigned long pending_trees[BITS_TO_LONGS(CONNCOUNT_SLOTS)];
+ unsigned int gc_tree;
};
static u_int32_t conncount_rnd __read_mostly;
@@ -82,26 +88,70 @@ static int key_diff(const u32 *a, const u32 *b, unsigned int klen)
return memcmp(a, b, klen * sizeof(u32));
}
-bool nf_conncount_add(struct hlist_head *head,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+enum nf_conncount_list_add
+nf_conncount_add(struct nf_conncount_list *list,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
struct nf_conncount_tuple *conn;
+ if (WARN_ON_ONCE(list->count > INT_MAX))
+ return NF_CONNCOUNT_ERR;
+
conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
if (conn == NULL)
- return false;
+ return NF_CONNCOUNT_ERR;
+
conn->tuple = *tuple;
conn->zone = *zone;
conn->cpu = raw_smp_processor_id();
conn->jiffies32 = (u32)jiffies;
- hlist_add_head(&conn->node, head);
- return true;
+ spin_lock(&list->list_lock);
+ if (list->dead == true) {
+ kmem_cache_free(conncount_conn_cachep, conn);
+ spin_unlock(&list->list_lock);
+ return NF_CONNCOUNT_SKIP;
+ }
+ list_add_tail(&conn->node, &list->head);
+ list->count++;
+ spin_unlock(&list->list_lock);
+ return NF_CONNCOUNT_ADDED;
}
EXPORT_SYMBOL_GPL(nf_conncount_add);
+static void __conn_free(struct rcu_head *h)
+{
+ struct nf_conncount_tuple *conn;
+
+ conn = container_of(h, struct nf_conncount_tuple, rcu_head);
+ kmem_cache_free(conncount_conn_cachep, conn);
+}
+
+static bool conn_free(struct nf_conncount_list *list,
+ struct nf_conncount_tuple *conn)
+{
+ bool free_entry = false;
+
+ spin_lock(&list->list_lock);
+
+ if (list->count == 0) {
+ spin_unlock(&list->list_lock);
+ return free_entry;
+ }
+
+ list->count--;
+ list_del_rcu(&conn->node);
+ if (list->count == 0)
+ free_entry = true;
+
+ spin_unlock(&list->list_lock);
+ call_rcu(&conn->rcu_head, __conn_free);
+ return free_entry;
+}
+
static const struct nf_conntrack_tuple_hash *
-find_or_evict(struct net *net, struct nf_conncount_tuple *conn)
+find_or_evict(struct net *net, struct nf_conncount_list *list,
+ struct nf_conncount_tuple *conn, bool *free_entry)
{
const struct nf_conntrack_tuple_hash *found;
unsigned long a, b;
@@ -121,34 +171,37 @@ find_or_evict(struct net *net, struct nf_conncount_tuple *conn)
*/
age = a - b;
if (conn->cpu == cpu || age >= 2) {
- hlist_del(&conn->node);
- kmem_cache_free(conncount_conn_cachep, conn);
+ *free_entry = conn_free(list, conn);
return ERR_PTR(-ENOENT);
}
return ERR_PTR(-EAGAIN);
}
-unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone,
- bool *addit)
+void nf_conncount_lookup(struct net *net,
+ struct nf_conncount_list *list,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone,
+ bool *addit)
{
const struct nf_conntrack_tuple_hash *found;
- struct nf_conncount_tuple *conn;
+ struct nf_conncount_tuple *conn, *conn_n;
struct nf_conn *found_ct;
- struct hlist_node *n;
- unsigned int length = 0;
+ unsigned int collect = 0;
+ bool free_entry = false;
+ /* best effort only */
*addit = tuple ? true : false;
/* check the saved connections */
- hlist_for_each_entry_safe(conn, n, head, node) {
- found = find_or_evict(net, conn);
+ list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+ if (collect > CONNCOUNT_GC_MAX_NODES)
+ break;
+
+ found = find_or_evict(net, list, conn, &free_entry);
if (IS_ERR(found)) {
/* Not found, but might be about to be confirmed */
if (PTR_ERR(found) == -EAGAIN) {
- length++;
if (!tuple)
continue;
@@ -156,7 +209,8 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
nf_ct_zone_id(&conn->zone, conn->zone.dir) ==
nf_ct_zone_id(zone, zone->dir))
*addit = false;
- }
+ } else if (PTR_ERR(found) == -ENOENT)
+ collect++;
continue;
}
@@ -165,9 +219,10 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) &&
nf_ct_zone_equal(found_ct, zone, zone->dir)) {
/*
- * Just to be sure we have it only once in the list.
* We should not see tuples twice unless someone hooks
* this into a table without "-p tcp --syn".
+ *
+ * Attempt to avoid a re-add in this case.
*/
*addit = false;
} else if (already_closed(found_ct)) {
@@ -176,19 +231,75 @@ unsigned int nf_conncount_lookup(struct net *net, struct hlist_head *head,
* closed already -> ditch it
*/
nf_ct_put(found_ct);
- hlist_del(&conn->node);
- kmem_cache_free(conncount_conn_cachep, conn);
+ conn_free(list, conn);
+ collect++;
continue;
}
nf_ct_put(found_ct);
- length++;
}
-
- return length;
}
EXPORT_SYMBOL_GPL(nf_conncount_lookup);
+void nf_conncount_list_init(struct nf_conncount_list *list)
+{
+ spin_lock_init(&list->list_lock);
+ INIT_LIST_HEAD(&list->head);
+ list->count = 1;
+ list->dead = false;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_list_init);
+
+/* Return true if the list is empty */
+bool nf_conncount_gc_list(struct net *net,
+ struct nf_conncount_list *list)
+{
+ const struct nf_conntrack_tuple_hash *found;
+ struct nf_conncount_tuple *conn, *conn_n;
+ struct nf_conn *found_ct;
+ unsigned int collected = 0;
+ bool free_entry = false;
+
+ list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+ found = find_or_evict(net, list, conn, &free_entry);
+ if (IS_ERR(found)) {
+ if (PTR_ERR(found) == -ENOENT) {
+ if (free_entry)
+ return true;
+ collected++;
+ }
+ continue;
+ }
+
+ found_ct = nf_ct_tuplehash_to_ctrack(found);
+ if (already_closed(found_ct)) {
+ /*
+ * we do not care about connections which are
+ * closed already -> ditch it
+ */
+ nf_ct_put(found_ct);
+ if (conn_free(list, conn))
+ return true;
+ collected++;
+ continue;
+ }
+
+ nf_ct_put(found_ct);
+ if (collected > CONNCOUNT_GC_MAX_NODES)
+ return false;
+ }
+ return false;
+}
+EXPORT_SYMBOL_GPL(nf_conncount_gc_list);
+
+static void __tree_nodes_free(struct rcu_head *h)
+{
+ struct nf_conncount_rb *rbconn;
+
+ rbconn = container_of(h, struct nf_conncount_rb, rcu_head);
+ kmem_cache_free(conncount_rb_cachep, rbconn);
+}
+
static void tree_nodes_free(struct rb_root *root,
struct nf_conncount_rb *gc_nodes[],
unsigned int gc_count)
@@ -197,32 +308,46 @@ static void tree_nodes_free(struct rb_root *root,
while (gc_count) {
rbconn = gc_nodes[--gc_count];
- rb_erase(&rbconn->node, root);
- kmem_cache_free(conncount_rb_cachep, rbconn);
+ spin_lock(&rbconn->list.list_lock);
+ if (rbconn->list.count == 0 && rbconn->list.dead == false) {
+ rbconn->list.dead = true;
+ rb_erase(&rbconn->node, root);
+ call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+ }
+ spin_unlock(&rbconn->list.list_lock);
}
}
+static void schedule_gc_worker(struct nf_conncount_data *data, int tree)
+{
+ set_bit(tree, data->pending_trees);
+ schedule_work(&data->gc_work);
+}
+
static unsigned int
-count_tree(struct net *net, struct rb_root *root,
- const u32 *key, u8 keylen,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+insert_tree(struct net *net,
+ struct nf_conncount_data *data,
+ struct rb_root *root,
+ unsigned int hash,
+ const u32 *key,
+ u8 keylen,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
+ enum nf_conncount_list_add ret;
struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES];
struct rb_node **rbnode, *parent;
struct nf_conncount_rb *rbconn;
struct nf_conncount_tuple *conn;
- unsigned int gc_count;
- bool no_gc = false;
+ unsigned int count = 0, gc_count = 0;
+ bool node_found = false;
+
+ spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
- restart:
- gc_count = 0;
parent = NULL;
rbnode = &(root->rb_node);
while (*rbnode) {
int diff;
- bool addit;
-
rbconn = rb_entry(*rbnode, struct nf_conncount_rb, node);
parent = *rbnode;
@@ -232,33 +357,30 @@ count_tree(struct net *net, struct rb_root *root,
} else if (diff > 0) {
rbnode = &((*rbnode)->rb_right);
} else {
- /* same source network -> be counted! */
- unsigned int count;
-
- count = nf_conncount_lookup(net, &rbconn->hhead, tuple,
- zone, &addit);
-
- tree_nodes_free(root, gc_nodes, gc_count);
- if (!addit)
- return count;
-
- if (!nf_conncount_add(&rbconn->hhead, tuple, zone))
- return 0; /* hotdrop */
-
- return count + 1;
+ /* unlikely: other cpu added node already */
+ node_found = true;
+ ret = nf_conncount_add(&rbconn->list, tuple, zone);
+ if (ret == NF_CONNCOUNT_ERR) {
+ count = 0; /* hotdrop */
+ } else if (ret == NF_CONNCOUNT_ADDED) {
+ count = rbconn->list.count;
+ } else {
+ /* NF_CONNCOUNT_SKIP, rbconn is already
+ * reclaimed by gc, insert a new tree node
+ */
+ node_found = false;
+ }
+ break;
}
- if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+ if (gc_count >= ARRAY_SIZE(gc_nodes))
continue;
- /* only used for GC on hhead, retval and 'addit' ignored */
- nf_conncount_lookup(net, &rbconn->hhead, tuple, zone, &addit);
- if (hlist_empty(&rbconn->hhead))
+ if (nf_conncount_gc_list(net, &rbconn->list))
gc_nodes[gc_count++] = rbconn;
}
if (gc_count) {
- no_gc = true;
tree_nodes_free(root, gc_nodes, gc_count);
/* tree_node_free before new allocation permits
* allocator to re-use newly free'd object.
@@ -266,58 +388,146 @@ count_tree(struct net *net, struct rb_root *root,
* This is a rare event; in most cases we will find
* existing node to re-use. (or gc_count is 0).
*/
- goto restart;
+
+ if (gc_count >= ARRAY_SIZE(gc_nodes))
+ schedule_gc_worker(data, hash);
}
- if (!tuple)
- return 0;
+ if (node_found)
+ goto out_unlock;
- /* no match, need to insert new node */
+ /* expected case: match, insert new node */
rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC);
if (rbconn == NULL)
- return 0;
+ goto out_unlock;
conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC);
if (conn == NULL) {
kmem_cache_free(conncount_rb_cachep, rbconn);
- return 0;
+ goto out_unlock;
}
conn->tuple = *tuple;
conn->zone = *zone;
memcpy(rbconn->key, key, sizeof(u32) * keylen);
- INIT_HLIST_HEAD(&rbconn->hhead);
- hlist_add_head(&conn->node, &rbconn->hhead);
+ nf_conncount_list_init(&rbconn->list);
+ list_add(&conn->node, &rbconn->list.head);
+ count = 1;
rb_link_node(&rbconn->node, parent, rbnode);
rb_insert_color(&rbconn->node, root);
- return 1;
+out_unlock:
+ spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+ return count;
}
-/* Count and return number of conntrack entries in 'net' with particular 'key'.
- * If 'tuple' is not null, insert it into the accounting data structure.
- */
-unsigned int nf_conncount_count(struct net *net,
- struct nf_conncount_data *data,
- const u32 *key,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_zone *zone)
+static unsigned int
+count_tree(struct net *net,
+ struct nf_conncount_data *data,
+ const u32 *key,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
{
+ enum nf_conncount_list_add ret;
struct rb_root *root;
- int count;
- u32 hash;
+ struct rb_node *parent;
+ struct nf_conncount_rb *rbconn;
+ unsigned int hash;
+ u8 keylen = data->keylen;
hash = jhash2(key, data->keylen, conncount_rnd) % CONNCOUNT_SLOTS;
root = &data->root[hash];
- spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+ parent = rcu_dereference_raw(root->rb_node);
+ while (parent) {
+ int diff;
+ bool addit;
- count = count_tree(net, root, key, data->keylen, tuple, zone);
+ rbconn = rb_entry(parent, struct nf_conncount_rb, node);
- spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]);
+ diff = key_diff(key, rbconn->key, keylen);
+ if (diff < 0) {
+ parent = rcu_dereference_raw(parent->rb_left);
+ } else if (diff > 0) {
+ parent = rcu_dereference_raw(parent->rb_right);
+ } else {
+ /* same source network -> be counted! */
+ nf_conncount_lookup(net, &rbconn->list, tuple, zone,
+ &addit);
- return count;
+ if (!addit)
+ return rbconn->list.count;
+
+ ret = nf_conncount_add(&rbconn->list, tuple, zone);
+ if (ret == NF_CONNCOUNT_ERR) {
+ return 0; /* hotdrop */
+ } else if (ret == NF_CONNCOUNT_ADDED) {
+ return rbconn->list.count;
+ } else {
+ /* NF_CONNCOUNT_SKIP, rbconn is already
+ * reclaimed by gc, insert a new tree node
+ */
+ break;
+ }
+ }
+ }
+
+ if (!tuple)
+ return 0;
+
+ return insert_tree(net, data, root, hash, key, keylen, tuple, zone);
+}
+
+static void tree_gc_worker(struct work_struct *work)
+{
+ struct nf_conncount_data *data = container_of(work, struct nf_conncount_data, gc_work);
+ struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES], *rbconn;
+ struct rb_root *root;
+ struct rb_node *node;
+ unsigned int tree, next_tree, gc_count = 0;
+
+ tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;
+ root = &data->root[tree];
+
+ rcu_read_lock();
+ for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+ rbconn = rb_entry(node, struct nf_conncount_rb, node);
+ if (nf_conncount_gc_list(data->net, &rbconn->list))
+ gc_nodes[gc_count++] = rbconn;
+ }
+ rcu_read_unlock();
+
+ spin_lock_bh(&nf_conncount_locks[tree]);
+
+ if (gc_count) {
+ tree_nodes_free(root, gc_nodes, gc_count);
+ }
+
+ clear_bit(tree, data->pending_trees);
+
+ next_tree = (tree + 1) % CONNCOUNT_SLOTS;
+ next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS);
+
+ if (next_tree < CONNCOUNT_SLOTS) {
+ data->gc_tree = next_tree;
+ schedule_work(work);
+ }
+
+ spin_unlock_bh(&nf_conncount_locks[tree]);
+}
+
+/* Count and return number of conntrack entries in 'net' with particular 'key'.
+ * If 'tuple' is not null, insert it into the accounting data structure.
+ * Call with RCU read lock.
+ */
+unsigned int nf_conncount_count(struct net *net,
+ struct nf_conncount_data *data,
+ const u32 *key,
+ const struct nf_conntrack_tuple *tuple,
+ const struct nf_conntrack_zone *zone)
+{
+ return count_tree(net, data, key, tuple, zone);
}
EXPORT_SYMBOL_GPL(nf_conncount_count);
@@ -348,17 +558,18 @@ struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family
data->root[i] = RB_ROOT;
data->keylen = keylen / sizeof(u32);
+ data->net = net;
+ INIT_WORK(&data->gc_work, tree_gc_worker);
return data;
}
EXPORT_SYMBOL_GPL(nf_conncount_init);
-void nf_conncount_cache_free(struct hlist_head *hhead)
+void nf_conncount_cache_free(struct nf_conncount_list *list)
{
- struct nf_conncount_tuple *conn;
- struct hlist_node *n;
+ struct nf_conncount_tuple *conn, *conn_n;
- hlist_for_each_entry_safe(conn, n, hhead, node)
+ list_for_each_entry_safe(conn, conn_n, &list->head, node)
kmem_cache_free(conncount_conn_cachep, conn);
}
EXPORT_SYMBOL_GPL(nf_conncount_cache_free);
@@ -373,7 +584,7 @@ static void destroy_tree(struct rb_root *r)
rb_erase(node, r);
- nf_conncount_cache_free(&rbconn->hhead);
+ nf_conncount_cache_free(&rbconn->list);
kmem_cache_free(conncount_rb_cachep, rbconn);
}
@@ -384,6 +595,7 @@ void nf_conncount_destroy(struct net *net, unsigned int family,
{
unsigned int i;
+ cancel_work_sync(&data->gc_work);
nf_ct_netns_put(net, family);
for (i = 0; i < ARRAY_SIZE(data->root); ++i)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 85ab2fd6a665..a676d5f76bdc 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -37,7 +37,6 @@
#include <linux/rculist_nulls.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -55,6 +54,7 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_helper.h>
#include <net/netns/hash.h>
+#include <net/ip.h>
#include "nf_internals.h"
@@ -222,7 +222,7 @@ static u32 hash_conntrack(const struct net *net,
return scale_hash(hash_conntrack_raw(tuple, net));
}
-bool
+static bool
nf_ct_get_tuple(const struct sk_buff *skb,
unsigned int nhoff,
unsigned int dataoff,
@@ -230,37 +230,151 @@ nf_ct_get_tuple(const struct sk_buff *skb,
u_int8_t protonum,
struct net *net,
struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
+ unsigned int size;
+ const __be32 *ap;
+ __be32 _addrs[8];
+ struct {
+ __be16 sport;
+ __be16 dport;
+ } _inet_hdr, *inet_hdr;
+
memset(tuple, 0, sizeof(*tuple));
tuple->src.l3num = l3num;
- if (l3proto->pkt_to_tuple(skb, nhoff, tuple) == 0)
+ switch (l3num) {
+ case NFPROTO_IPV4:
+ nhoff += offsetof(struct iphdr, saddr);
+ size = 2 * sizeof(__be32);
+ break;
+ case NFPROTO_IPV6:
+ nhoff += offsetof(struct ipv6hdr, saddr);
+ size = sizeof(_addrs);
+ break;
+ default:
+ return true;
+ }
+
+ ap = skb_header_pointer(skb, nhoff, size, _addrs);
+ if (!ap)
return false;
+ switch (l3num) {
+ case NFPROTO_IPV4:
+ tuple->src.u3.ip = ap[0];
+ tuple->dst.u3.ip = ap[1];
+ break;
+ case NFPROTO_IPV6:
+ memcpy(tuple->src.u3.ip6, ap, sizeof(tuple->src.u3.ip6));
+ memcpy(tuple->dst.u3.ip6, ap + 4, sizeof(tuple->dst.u3.ip6));
+ break;
+ }
+
tuple->dst.protonum = protonum;
tuple->dst.dir = IP_CT_DIR_ORIGINAL;
- return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
+ if (unlikely(l4proto->pkt_to_tuple))
+ return l4proto->pkt_to_tuple(skb, dataoff, net, tuple);
+
+ /* Actually only need first 4 bytes to get ports. */
+ inet_hdr = skb_header_pointer(skb, dataoff, sizeof(_inet_hdr), &_inet_hdr);
+ if (!inet_hdr)
+ return false;
+
+ tuple->src.u.udp.port = inet_hdr->sport;
+ tuple->dst.u.udp.port = inet_hdr->dport;
+ return true;
+}
+
+static int ipv4_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ u_int8_t *protonum)
+{
+ int dataoff = -1;
+ const struct iphdr *iph;
+ struct iphdr _iph;
+
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return -1;
+
+ /* Conntrack defragments packets, we might still see fragments
+ * inside ICMP packets though.
+ */
+ if (iph->frag_off & htons(IP_OFFSET))
+ return -1;
+
+ dataoff = nhoff + (iph->ihl << 2);
+ *protonum = iph->protocol;
+
+ /* Check bogus IP headers */
+ if (dataoff > skb->len) {
+ pr_debug("bogus IPv4 packet: nhoff %u, ihl %u, skblen %u\n",
+ nhoff, iph->ihl << 2, skb->len);
+ return -1;
+ }
+ return dataoff;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int ipv6_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
+ u8 *protonum)
+{
+ int protoff = -1;
+ unsigned int extoff = nhoff + sizeof(struct ipv6hdr);
+ __be16 frag_off;
+ u8 nexthdr;
+
+ if (skb_copy_bits(skb, nhoff + offsetof(struct ipv6hdr, nexthdr),
+ &nexthdr, sizeof(nexthdr)) != 0) {
+ pr_debug("can't get nexthdr\n");
+ return -1;
+ }
+ protoff = ipv6_skip_exthdr(skb, extoff, &nexthdr, &frag_off);
+ /*
+ * (protoff == skb->len) means the packet has not data, just
+ * IPv6 and possibly extensions headers, but it is tracked anyway
+ */
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("can't find proto in pkt\n");
+ return -1;
+ }
+
+ *protonum = nexthdr;
+ return protoff;
+}
+#endif
+
+static int get_l4proto(const struct sk_buff *skb,
+ unsigned int nhoff, u8 pf, u8 *l4num)
+{
+ switch (pf) {
+ case NFPROTO_IPV4:
+ return ipv4_get_l4proto(skb, nhoff, l4num);
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6:
+ return ipv6_get_l4proto(skb, nhoff, l4num);
+#endif
+ default:
+ *l4num = 0;
+ break;
+ }
+ return -1;
}
-EXPORT_SYMBOL_GPL(nf_ct_get_tuple);
bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
u_int16_t l3num,
struct net *net, struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
- unsigned int protoff;
- u_int8_t protonum;
+ u8 protonum;
+ int protoff;
int ret;
rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(l3num);
- ret = l3proto->get_l4proto(skb, nhoff, &protoff, &protonum);
- if (ret != NF_ACCEPT) {
+ protoff = get_l4proto(skb, nhoff, l3num, &protonum);
+ if (protoff <= 0) {
rcu_read_unlock();
return false;
}
@@ -268,7 +382,7 @@ bool nf_ct_get_tuplepr(const struct sk_buff *skb, unsigned int nhoff,
l4proto = __nf_ct_l4proto_find(l3num, protonum);
ret = nf_ct_get_tuple(skb, nhoff, protoff, l3num, protonum, net, tuple,
- l3proto, l4proto);
+ l4proto);
rcu_read_unlock();
return ret;
@@ -278,19 +392,35 @@ EXPORT_SYMBOL_GPL(nf_ct_get_tuplepr);
bool
nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
const struct nf_conntrack_tuple *orig,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
memset(inverse, 0, sizeof(*inverse));
inverse->src.l3num = orig->src.l3num;
- if (l3proto->invert_tuple(inverse, orig) == 0)
- return false;
+
+ switch (orig->src.l3num) {
+ case NFPROTO_IPV4:
+ inverse->src.u3.ip = orig->dst.u3.ip;
+ inverse->dst.u3.ip = orig->src.u3.ip;
+ break;
+ case NFPROTO_IPV6:
+ inverse->src.u3.in6 = orig->dst.u3.in6;
+ inverse->dst.u3.in6 = orig->src.u3.in6;
+ break;
+ default:
+ break;
+ }
inverse->dst.dir = !orig->dst.dir;
inverse->dst.protonum = orig->dst.protonum;
- return l4proto->invert_tuple(inverse, orig);
+
+ if (unlikely(l4proto->invert_tuple))
+ return l4proto->invert_tuple(inverse, orig);
+
+ inverse->src.u.all = orig->dst.u.all;
+ inverse->dst.u.all = orig->src.u.all;
+ return true;
}
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
@@ -502,6 +632,18 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
net_eq(net, nf_ct_net(ct));
}
+static inline bool
+nf_ct_match(const struct nf_conn *ct1, const struct nf_conn *ct2)
+{
+ return nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+ &ct2->tuplehash[IP_CT_DIR_ORIGINAL].tuple) &&
+ nf_ct_tuple_equal(&ct1->tuplehash[IP_CT_DIR_REPLY].tuple,
+ &ct2->tuplehash[IP_CT_DIR_REPLY].tuple) &&
+ nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_ORIGINAL) &&
+ nf_ct_zone_equal(ct1, nf_ct_zone(ct2), IP_CT_DIR_REPLY) &&
+ net_eq(nf_ct_net(ct1), nf_ct_net(ct2));
+}
+
/* caller must hold rcu readlock and none of the nf_conntrack_locks */
static void nf_ct_gc_expired(struct nf_conn *ct)
{
@@ -695,19 +837,21 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb,
/* This is the conntrack entry already in hashes that won race. */
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
const struct nf_conntrack_l4proto *l4proto;
+ enum ip_conntrack_info oldinfo;
+ struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
if (l4proto->allow_clash &&
- ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
!nf_ct_is_dying(ct) &&
atomic_inc_not_zero(&ct->ct_general.use)) {
- enum ip_conntrack_info oldinfo;
- struct nf_conn *loser_ct = nf_ct_get(skb, &oldinfo);
-
- nf_ct_acct_merge(ct, ctinfo, loser_ct);
- nf_conntrack_put(&loser_ct->ct_general);
- nf_ct_set(skb, ct, oldinfo);
- return NF_ACCEPT;
+ if (((ct->status & IPS_NAT_DONE_MASK) == 0) ||
+ nf_ct_match(ct, loser_ct)) {
+ nf_ct_acct_merge(ct, ctinfo, loser_ct);
+ nf_conntrack_put(&loser_ct->ct_general);
+ nf_ct_set(skb, ct, oldinfo);
+ return NF_ACCEPT;
+ }
+ nf_ct_put(ct);
}
NF_CT_STAT_INC(net, drop);
return NF_DROP;
@@ -1195,7 +1339,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free);
static noinline struct nf_conntrack_tuple_hash *
init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto,
struct sk_buff *skb,
unsigned int dataoff, u32 hash)
@@ -1208,9 +1351,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
const struct nf_conntrack_zone *zone;
struct nf_conn_timeout *timeout_ext;
struct nf_conntrack_zone tmp;
- unsigned int *timeouts;
- if (!nf_ct_invert_tuple(&repl_tuple, tuple, l3proto, l4proto)) {
+ if (!nf_ct_invert_tuple(&repl_tuple, tuple, l4proto)) {
pr_debug("Can't invert tuple.\n");
return NULL;
}
@@ -1227,15 +1369,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
}
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
- if (timeout_ext) {
- timeouts = nf_ct_timeout_data(timeout_ext);
- if (unlikely(!timeouts))
- timeouts = l4proto->get_timeouts(net);
- } else {
- timeouts = l4proto->get_timeouts(net);
- }
- if (!l4proto->new(ct, skb, dataoff, timeouts)) {
+ if (!l4proto->new(ct, skb, dataoff)) {
nf_conntrack_free(ct);
pr_debug("can't track with proto module\n");
return NULL;
@@ -1266,8 +1401,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
/* exp->master safe, refcnt bumped in nf_ct_find_expectation */
ct->master = exp->master;
if (exp->helper) {
- help = nf_ct_helper_ext_add(ct, exp->helper,
- GFP_ATOMIC);
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help)
rcu_assign_pointer(help->helper, exp->helper);
}
@@ -1307,7 +1441,6 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
unsigned int dataoff,
u_int16_t l3num,
u_int8_t protonum,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
const struct nf_conntrack_zone *zone;
@@ -1319,8 +1452,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
u32 hash;
if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
- dataoff, l3num, protonum, net, &tuple, l3proto,
- l4proto)) {
+ dataoff, l3num, protonum, net, &tuple, l4proto)) {
pr_debug("Can't get tuple\n");
return 0;
}
@@ -1330,7 +1462,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
hash = hash_conntrack_raw(&tuple, net);
h = __nf_conntrack_find_get(net, zone, &tuple, hash);
if (!h) {
- h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto,
+ h = init_conntrack(net, tmpl, &tuple, l4proto,
skb, dataoff, hash);
if (!h)
return 0;
@@ -1363,14 +1495,11 @@ unsigned int
nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
struct sk_buff *skb)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct nf_conn *ct, *tmpl;
enum ip_conntrack_info ctinfo;
- unsigned int *timeouts;
- unsigned int dataoff;
u_int8_t protonum;
- int ret;
+ int dataoff, ret;
tmpl = nf_ct_get(skb, &ctinfo);
if (tmpl || ctinfo == IP_CT_UNTRACKED) {
@@ -1384,14 +1513,12 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
}
/* rcu_read_lock()ed by nf_hook_thresh */
- l3proto = __nf_ct_l3proto_find(pf);
- ret = l3proto->get_l4proto(skb, skb_network_offset(skb),
- &dataoff, &protonum);
- if (ret <= 0) {
+ dataoff = get_l4proto(skb, skb_network_offset(skb), pf, &protonum);
+ if (dataoff <= 0) {
pr_debug("not prepared to track yet or error occurred\n");
NF_CT_STAT_INC_ATOMIC(net, error);
NF_CT_STAT_INC_ATOMIC(net, invalid);
- ret = -ret;
+ ret = NF_ACCEPT;
goto out;
}
@@ -1413,8 +1540,7 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum,
goto out;
}
repeat:
- ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum,
- l3proto, l4proto);
+ ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, l4proto);
if (ret < 0) {
/* Too stressed to deal. */
NF_CT_STAT_INC_ATOMIC(net, drop);
@@ -1430,10 +1556,7 @@ repeat:
goto out;
}
- /* Decide what timeout policy we want to apply to this flow. */
- timeouts = nf_ct_timeout_lookup(net, ct, l4proto);
-
- ret = l4proto->packet(ct, skb, dataoff, ctinfo, timeouts);
+ ret = l4proto->packet(ct, skb, dataoff, ctinfo);
if (ret <= 0) {
/* Invalid: inverse of the return code tells
* the netfilter core what to do */
@@ -1471,7 +1594,6 @@ bool nf_ct_invert_tuplepr(struct nf_conntrack_tuple *inverse,
rcu_read_lock();
ret = nf_ct_invert_tuple(inverse, orig,
- __nf_ct_l3proto_find(orig->src.l3num),
__nf_ct_l4proto_find(orig->src.l3num,
orig->dst.protonum));
rcu_read_unlock();
@@ -1609,14 +1731,14 @@ static void nf_conntrack_attach(struct sk_buff *nskb, const struct sk_buff *skb)
static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
enum ip_conntrack_info ctinfo;
struct nf_nat_hook *nat_hook;
- unsigned int dataoff, status;
+ unsigned int status;
struct nf_conn *ct;
+ int dataoff;
u16 l3num;
u8 l4num;
@@ -1625,16 +1747,15 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
return 0;
l3num = nf_ct_l3num(ct);
- l3proto = nf_ct_l3proto_find_get(l3num);
- if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
- &l4num) <= 0)
+ dataoff = get_l4proto(skb, skb_network_offset(skb), l3num, &l4num);
+ if (dataoff <= 0)
return -1;
l4proto = nf_ct_l4proto_find_get(l3num, l4num);
if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- l4num, net, &tuple, l3proto, l4proto))
+ l4num, net, &tuple, l4proto))
return -1;
if (ct->status & IPS_SRC_NAT) {
@@ -1901,16 +2022,6 @@ static int kill_all(struct nf_conn *i, void *data)
return net_eq(nf_ct_net(i), data);
}
-void nf_ct_free_hashtable(void *hash, unsigned int size)
-{
- if (is_vmalloc_addr(hash))
- vfree(hash);
- else
- free_pages((unsigned long)hash,
- get_order(sizeof(struct hlist_head) * size));
-}
-EXPORT_SYMBOL_GPL(nf_ct_free_hashtable);
-
void nf_conntrack_cleanup_start(void)
{
conntrack_gc_work.exiting = true;
@@ -1921,7 +2032,7 @@ void nf_conntrack_cleanup_end(void)
{
RCU_INIT_POINTER(nf_ct_hook, NULL);
cancel_delayed_work_sync(&conntrack_gc_work.dwork);
- nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+ kvfree(nf_conntrack_hash);
nf_conntrack_proto_fini();
nf_conntrack_seqadj_fini();
@@ -1987,7 +2098,6 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
{
struct hlist_nulls_head *hash;
unsigned int nr_slots, i;
- size_t sz;
if (*sizep > (UINT_MAX / sizeof(struct hlist_nulls_head)))
return NULL;
@@ -1995,14 +2105,8 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
- if (nr_slots > (UINT_MAX / sizeof(struct hlist_nulls_head)))
- return NULL;
-
- sz = nr_slots * sizeof(struct hlist_nulls_head);
- hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
- get_order(sz));
- if (!hash)
- hash = vzalloc(sz);
+ hash = kvmalloc_array(nr_slots, sizeof(struct hlist_nulls_head),
+ GFP_KERNEL | __GFP_ZERO);
if (hash && nulls)
for (i = 0; i < nr_slots; i++)
@@ -2029,7 +2133,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
old_size = nf_conntrack_htable_size;
if (old_size == hashsize) {
- nf_ct_free_hashtable(hash, hashsize);
+ kvfree(hash);
return 0;
}
@@ -2065,7 +2169,7 @@ int nf_conntrack_hash_resize(unsigned int hashsize)
local_bh_enable();
synchronize_net();
- nf_ct_free_hashtable(old_hash, old_size);
+ kvfree(old_hash);
return 0;
}
@@ -2078,7 +2182,7 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
return -EOPNOTSUPP;
/* On boot, we can set this without any fancy locking. */
- if (!nf_conntrack_htable_size)
+ if (!nf_conntrack_hash)
return param_set_uint(val, kp);
rc = kstrtouint(val, 0, &hashsize);
@@ -2089,9 +2193,6 @@ int nf_conntrack_set_hashsize(const char *val, const struct kernel_param *kp)
}
EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize);
-module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
- &nf_conntrack_htable_size, 0600);
-
static __always_inline unsigned int total_extension_size(void)
{
/* remember to add new extensions below */
@@ -2232,7 +2333,7 @@ err_acct:
err_expect:
kmem_cache_destroy(nf_conntrack_cachep);
err_cachep:
- nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size);
+ kvfree(nf_conntrack_hash);
return ret;
}
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index 853b23206bb7..27b84231db10 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -610,7 +610,6 @@ static int exp_seq_show(struct seq_file *s, void *v)
expect->tuple.src.l3num,
expect->tuple.dst.protonum);
print_tuple(s, &expect->tuple,
- __nf_ct_l3proto_find(expect->tuple.src.l3num),
__nf_ct_l4proto_find(expect->tuple.src.l3num,
expect->tuple.dst.protonum));
@@ -713,5 +712,5 @@ void nf_conntrack_expect_fini(void)
{
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
- nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize);
+ kvfree(nf_ct_expect_hash);
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index a75b11c39312..e24b762ffa1d 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -24,7 +24,6 @@
#include <linux/rtnetlink.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -193,8 +192,7 @@ void nf_conntrack_helper_put(struct nf_conntrack_helper *helper)
EXPORT_SYMBOL_GPL(nf_conntrack_helper_put);
struct nf_conn_help *
-nf_ct_helper_ext_add(struct nf_conn *ct,
- struct nf_conntrack_helper *helper, gfp_t gfp)
+nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp)
{
struct nf_conn_help *help;
@@ -263,7 +261,7 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl,
}
if (help == NULL) {
- help = nf_ct_helper_ext_add(ct, helper, flags);
+ help = nf_ct_helper_ext_add(ct, flags);
if (help == NULL)
return -ENOMEM;
} else {
@@ -564,12 +562,12 @@ int nf_conntrack_helper_init(void)
return 0;
out_extend:
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ kvfree(nf_ct_helper_hash);
return ret;
}
void nf_conntrack_helper_fini(void)
{
nf_ct_extend_unregister(&helper_extend);
- nf_ct_free_hashtable(nf_ct_helper_hash, nf_ct_helper_hsize);
+ kvfree(nf_ct_helper_hash);
}
diff --git a/net/netfilter/nf_conntrack_l3proto_generic.c b/net/netfilter/nf_conntrack_l3proto_generic.c
deleted file mode 100644
index 397e6911214f..000000000000
--- a/net/netfilter/nf_conntrack_l3proto_generic.c
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
- *
- * Based largely upon the original ip_conntrack code which
- * had the following copyright information:
- *
- * (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * Author:
- * Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp>
- */
-
-#include <linux/types.h>
-#include <linux/ip.h>
-#include <linux/netfilter.h>
-#include <linux/module.h>
-#include <linux/skbuff.h>
-#include <linux/icmp.h>
-#include <linux/sysctl.h>
-#include <net/ip.h>
-
-#include <linux/netfilter_ipv4.h>
-#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l4proto.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
-#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
-
-static bool generic_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff,
- struct nf_conntrack_tuple *tuple)
-{
- memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
- memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
-
- return true;
-}
-
-static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- memset(&tuple->src.u3, 0, sizeof(tuple->src.u3));
- memset(&tuple->dst.u3, 0, sizeof(tuple->dst.u3));
-
- return true;
-}
-
-static int generic_get_l4proto(const struct sk_buff *skb, unsigned int nhoff,
- unsigned int *dataoff, u_int8_t *protonum)
-{
- /* Never track !!! */
- return -NF_ACCEPT;
-}
-
-
-struct nf_conntrack_l3proto nf_conntrack_l3proto_generic __read_mostly = {
- .l3proto = PF_UNSPEC,
- .pkt_to_tuple = generic_pkt_to_tuple,
- .invert_tuple = generic_invert_tuple,
- .get_l4proto = generic_get_l4proto,
-};
-EXPORT_SYMBOL_GPL(nf_conntrack_l3proto_generic);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 20a2e37c76d1..f981bfa8db72 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -38,7 +38,6 @@
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -81,9 +80,26 @@ nla_put_failure:
return -1;
}
+static int ipv4_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_in_addr(skb, CTA_IP_V4_SRC, tuple->src.u3.ip) ||
+ nla_put_in_addr(skb, CTA_IP_V4_DST, tuple->dst.u3.ip))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int ipv6_tuple_to_nlattr(struct sk_buff *skb,
+ const struct nf_conntrack_tuple *tuple)
+{
+ if (nla_put_in6_addr(skb, CTA_IP_V6_SRC, &tuple->src.u3.in6) ||
+ nla_put_in6_addr(skb, CTA_IP_V6_DST, &tuple->dst.u3.in6))
+ return -EMSGSIZE;
+ return 0;
+}
+
static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
- const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto)
+ const struct nf_conntrack_tuple *tuple)
{
int ret = 0;
struct nlattr *nest_parms;
@@ -92,8 +108,14 @@ static int ctnetlink_dump_tuples_ip(struct sk_buff *skb,
if (!nest_parms)
goto nla_put_failure;
- if (likely(l3proto->tuple_to_nlattr))
- ret = l3proto->tuple_to_nlattr(skb, tuple);
+ switch (tuple->src.l3num) {
+ case NFPROTO_IPV4:
+ ret = ipv4_tuple_to_nlattr(skb, tuple);
+ break;
+ case NFPROTO_IPV6:
+ ret = ipv6_tuple_to_nlattr(skb, tuple);
+ break;
+ }
nla_nest_end(skb, nest_parms);
@@ -106,13 +128,11 @@ nla_put_failure:
static int ctnetlink_dump_tuples(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
int ret;
rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
- ret = ctnetlink_dump_tuples_ip(skb, tuple, l3proto);
+ ret = ctnetlink_dump_tuples_ip(skb, tuple);
if (ret >= 0) {
l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
@@ -556,15 +576,20 @@ nla_put_failure:
return -1;
}
+static const struct nla_policy cta_ip_nla_policy[CTA_IP_MAX + 1] = {
+ [CTA_IP_V4_SRC] = { .type = NLA_U32 },
+ [CTA_IP_V4_DST] = { .type = NLA_U32 },
+ [CTA_IP_V6_SRC] = { .len = sizeof(__be32) * 4 },
+ [CTA_IP_V6_DST] = { .len = sizeof(__be32) * 4 },
+};
+
#if defined(CONFIG_NETFILTER_NETLINK_GLUE_CT) || defined(CONFIG_NF_CONNTRACK_EVENTS)
static size_t ctnetlink_proto_size(const struct nf_conn *ct)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
size_t len, len4 = 0;
- l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
- len = l3proto->nla_size;
+ len = nla_policy_len(cta_ip_nla_policy, CTA_IP_MAX + 1);
len *= 3u; /* ORIG, REPLY, MASTER */
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
@@ -936,29 +961,54 @@ out:
return skb->len;
}
+static int ipv4_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_IP_V4_SRC] || !tb[CTA_IP_V4_DST])
+ return -EINVAL;
+
+ t->src.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_SRC]);
+ t->dst.u3.ip = nla_get_in_addr(tb[CTA_IP_V4_DST]);
+
+ return 0;
+}
+
+static int ipv6_nlattr_to_tuple(struct nlattr *tb[],
+ struct nf_conntrack_tuple *t)
+{
+ if (!tb[CTA_IP_V6_SRC] || !tb[CTA_IP_V6_DST])
+ return -EINVAL;
+
+ t->src.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_SRC]);
+ t->dst.u3.in6 = nla_get_in6_addr(tb[CTA_IP_V6_DST]);
+
+ return 0;
+}
+
static int ctnetlink_parse_tuple_ip(struct nlattr *attr,
struct nf_conntrack_tuple *tuple)
{
struct nlattr *tb[CTA_IP_MAX+1];
- struct nf_conntrack_l3proto *l3proto;
int ret = 0;
ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL);
if (ret < 0)
return ret;
- rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
+ ret = nla_validate_nested(attr, CTA_IP_MAX,
+ cta_ip_nla_policy, NULL);
+ if (ret)
+ return ret;
- if (likely(l3proto->nlattr_to_tuple)) {
- ret = nla_validate_nested(attr, CTA_IP_MAX,
- l3proto->nla_policy, NULL);
- if (ret == 0)
- ret = l3proto->nlattr_to_tuple(tb, tuple);
+ switch (tuple->src.l3num) {
+ case NFPROTO_IPV4:
+ ret = ipv4_nlattr_to_tuple(tb, tuple);
+ break;
+ case NFPROTO_IPV6:
+ ret = ipv6_nlattr_to_tuple(tb, tuple);
+ break;
}
- rcu_read_unlock();
-
return ret;
}
@@ -1897,7 +1947,7 @@ ctnetlink_create_conntrack(struct net *net,
} else {
struct nf_conn_help *help;
- help = nf_ct_helper_ext_add(ct, helper, GFP_ATOMIC);
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help == NULL) {
err = -ENOMEM;
goto err2;
@@ -2581,7 +2631,6 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple_mask *mask)
{
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple m;
struct nlattr *nest_parms;
@@ -2597,8 +2646,7 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
goto nla_put_failure;
rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(tuple->src.l3num);
- ret = ctnetlink_dump_tuples_ip(skb, &m, l3proto);
+ ret = ctnetlink_dump_tuples_ip(skb, &m);
if (ret >= 0) {
l4proto = __nf_ct_l4proto_find(tuple->src.l3num,
tuple->dst.protonum);
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index d88841fbc560..30070732ee50 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -1,14 +1,4 @@
-/* L3/L4 protocol support for nf_conntrack. */
-
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org>
- * (C) 2006-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
+// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/netfilter.h>
@@ -24,14 +14,36 @@
#include <linux/netdevice.h>
#include <net/netfilter/nf_conntrack.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_log.h>
+#include <linux/ip.h>
+#include <linux/icmp.h>
+#include <linux/sysctl.h>
+#include <net/route.h>
+#include <net/ip.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <net/netfilter/nf_conntrack_helper.h>
+#include <net/netfilter/nf_conntrack_zones.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
+#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
+#include <net/netfilter/nf_nat_helper.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+
+#include <linux/ipv6.h>
+#include <linux/in6.h>
+#include <net/ipv6.h>
+#include <net/inet_frag.h>
+
+extern unsigned int nf_conntrack_net_id;
+
static struct nf_conntrack_l4proto __rcu **nf_ct_protos[NFPROTO_NUMPROTO] __read_mostly;
-struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[NFPROTO_NUMPROTO] __read_mostly;
-EXPORT_SYMBOL_GPL(nf_ct_l3protos);
static DEFINE_MUTEX(nf_ct_proto_mutex);
@@ -122,137 +134,6 @@ __nf_ct_l4proto_find(u_int16_t l3proto, u_int8_t l4proto)
}
EXPORT_SYMBOL_GPL(__nf_ct_l4proto_find);
-/* this is guaranteed to always return a valid protocol helper, since
- * it falls back to generic_protocol */
-const struct nf_conntrack_l3proto *
-nf_ct_l3proto_find_get(u_int16_t l3proto)
-{
- struct nf_conntrack_l3proto *p;
-
- rcu_read_lock();
- p = __nf_ct_l3proto_find(l3proto);
- if (!try_module_get(p->me))
- p = &nf_conntrack_l3proto_generic;
- rcu_read_unlock();
-
- return p;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_find_get);
-
-int
-nf_ct_l3proto_try_module_get(unsigned short l3proto)
-{
- const struct nf_conntrack_l3proto *p;
- int ret;
-
-retry: p = nf_ct_l3proto_find_get(l3proto);
- if (p == &nf_conntrack_l3proto_generic) {
- ret = request_module("nf_conntrack-%d", l3proto);
- if (!ret)
- goto retry;
-
- return -EPROTOTYPE;
- }
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_try_module_get);
-
-void nf_ct_l3proto_module_put(unsigned short l3proto)
-{
- struct nf_conntrack_l3proto *p;
-
- /* rcu_read_lock not necessary since the caller holds a reference, but
- * taken anyways to avoid lockdep warnings in __nf_ct_l3proto_find()
- */
- rcu_read_lock();
- p = __nf_ct_l3proto_find(l3proto);
- module_put(p->me);
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_module_put);
-
-static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
-{
- const struct nf_conntrack_l3proto *l3proto;
- int ret;
-
- might_sleep();
-
- ret = nf_ct_l3proto_try_module_get(nfproto);
- if (ret < 0)
- return ret;
-
- /* we already have a reference, can't fail */
- rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(nfproto);
- rcu_read_unlock();
-
- if (!l3proto->net_ns_get)
- return 0;
-
- ret = l3proto->net_ns_get(net);
- if (ret < 0)
- nf_ct_l3proto_module_put(nfproto);
-
- return ret;
-}
-
-int nf_ct_netns_get(struct net *net, u8 nfproto)
-{
- int err;
-
- if (nfproto == NFPROTO_INET) {
- err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
- if (err < 0)
- goto err1;
- err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
- if (err < 0)
- goto err2;
- } else {
- err = nf_ct_netns_do_get(net, nfproto);
- if (err < 0)
- goto err1;
- }
- return 0;
-
-err2:
- nf_ct_netns_put(net, NFPROTO_IPV4);
-err1:
- return err;
-}
-EXPORT_SYMBOL_GPL(nf_ct_netns_get);
-
-static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
-{
- const struct nf_conntrack_l3proto *l3proto;
-
- might_sleep();
-
- /* same as nf_conntrack_netns_get(), reference assumed */
- rcu_read_lock();
- l3proto = __nf_ct_l3proto_find(nfproto);
- rcu_read_unlock();
-
- if (WARN_ON(!l3proto))
- return;
-
- if (l3proto->net_ns_put)
- l3proto->net_ns_put(net);
-
- nf_ct_l3proto_module_put(nfproto);
-}
-
-void nf_ct_netns_put(struct net *net, uint8_t nfproto)
-{
- if (nfproto == NFPROTO_INET) {
- nf_ct_netns_do_put(net, NFPROTO_IPV4);
- nf_ct_netns_do_put(net, NFPROTO_IPV6);
- } else
- nf_ct_netns_do_put(net, nfproto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_netns_put);
-
const struct nf_conntrack_l4proto *
nf_ct_l4proto_find_get(u_int16_t l3num, u_int8_t l4num)
{
@@ -274,11 +155,6 @@ void nf_ct_l4proto_put(const struct nf_conntrack_l4proto *p)
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_put);
-static int kill_l3proto(struct nf_conn *i, void *data)
-{
- return nf_ct_l3num(i) == ((const struct nf_conntrack_l3proto *)data)->l3proto;
-}
-
static int kill_l4proto(struct nf_conn *i, void *data)
{
const struct nf_conntrack_l4proto *l4proto;
@@ -287,52 +163,6 @@ static int kill_l4proto(struct nf_conn *i, void *data)
nf_ct_l3num(i) == l4proto->l3proto;
}
-int nf_ct_l3proto_register(const struct nf_conntrack_l3proto *proto)
-{
- int ret = 0;
- struct nf_conntrack_l3proto *old;
-
- if (proto->l3proto >= NFPROTO_NUMPROTO)
- return -EBUSY;
-#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
- if (proto->tuple_to_nlattr && proto->nla_size == 0)
- return -EINVAL;
-#endif
- mutex_lock(&nf_ct_proto_mutex);
- old = rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
- lockdep_is_held(&nf_ct_proto_mutex));
- if (old != &nf_conntrack_l3proto_generic) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- rcu_assign_pointer(nf_ct_l3protos[proto->l3proto], proto);
-
-out_unlock:
- mutex_unlock(&nf_ct_proto_mutex);
- return ret;
-
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_register);
-
-void nf_ct_l3proto_unregister(const struct nf_conntrack_l3proto *proto)
-{
- BUG_ON(proto->l3proto >= NFPROTO_NUMPROTO);
-
- mutex_lock(&nf_ct_proto_mutex);
- BUG_ON(rcu_dereference_protected(nf_ct_l3protos[proto->l3proto],
- lockdep_is_held(&nf_ct_proto_mutex)
- ) != proto);
- rcu_assign_pointer(nf_ct_l3protos[proto->l3proto],
- &nf_conntrack_l3proto_generic);
- mutex_unlock(&nf_ct_proto_mutex);
-
- synchronize_rcu();
- /* Remove all contrack entries for this protocol */
- nf_ct_iterate_destroy(kill_l3proto, (void*)proto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l3proto_unregister);
-
static struct nf_proto_net *nf_ct_l4proto_net(struct net *net,
const struct nf_conntrack_l4proto *l4proto)
{
@@ -499,8 +329,23 @@ void nf_ct_l4proto_pernet_unregister_one(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister_one);
-int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
- unsigned int num_proto)
+static void
+nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
+ unsigned int num_proto)
+{
+ mutex_lock(&nf_ct_proto_mutex);
+ while (num_proto-- != 0)
+ __nf_ct_l4proto_unregister_one(l4proto[num_proto]);
+ mutex_unlock(&nf_ct_proto_mutex);
+
+ synchronize_net();
+ /* Remove all contrack entries for this protocol */
+ nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
+}
+
+static int
+nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
+ unsigned int num_proto)
{
int ret = -EINVAL, ver;
unsigned int i;
@@ -518,7 +363,6 @@ int nf_ct_l4proto_register(const struct nf_conntrack_l4proto * const l4proto[],
}
return ret;
}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_register);
int nf_ct_l4proto_pernet_register(struct net *net,
const struct nf_conntrack_l4proto *const l4proto[],
@@ -542,20 +386,6 @@ int nf_ct_l4proto_pernet_register(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_register);
-void nf_ct_l4proto_unregister(const struct nf_conntrack_l4proto * const l4proto[],
- unsigned int num_proto)
-{
- mutex_lock(&nf_ct_proto_mutex);
- while (num_proto-- != 0)
- __nf_ct_l4proto_unregister_one(l4proto[num_proto]);
- mutex_unlock(&nf_ct_proto_mutex);
-
- synchronize_net();
- /* Remove all contrack entries for this protocol */
- nf_ct_iterate_destroy(kill_l4proto, (void *)l4proto);
-}
-EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister);
-
void nf_ct_l4proto_pernet_unregister(struct net *net,
const struct nf_conntrack_l4proto *const l4proto[],
unsigned int num_proto)
@@ -565,6 +395,562 @@ void nf_ct_l4proto_pernet_unregister(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_ct_l4proto_pernet_unregister);
+static unsigned int ipv4_helper(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+
+ /* rcu_read_lock()ed by nf_hook_thresh */
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ return helper->help(skb, skb_network_offset(skb) + ip_hdrlen(skb),
+ ct, ctinfo);
+}
+
+static unsigned int ipv4_confirm(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, ip_hdrlen(skb))) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
+out:
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv4_conntrack_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
+}
+
+static unsigned int ipv4_conntrack_local(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ if (ip_is_fragment(ip_hdr(skb))) { /* IP_NODEFRAG setsockopt set */
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *tmpl;
+
+ tmpl = nf_ct_get(skb, &ctinfo);
+ if (tmpl && nf_ct_is_template(tmpl)) {
+ /* when skipping ct, clear templates to avoid fooling
+ * later targets/matches
+ */
+ skb->_nfct = 0;
+ nf_ct_put(tmpl);
+ }
+ return NF_ACCEPT;
+ }
+
+ return nf_conntrack_in(state->net, PF_INET, state->hook, skb);
+}
+
+/* Connection tracking may drop packets, but never alters them, so
+ * make it the first hook.
+ */
+static const struct nf_hook_ops ipv4_conntrack_ops[] = {
+ {
+ .hook = ipv4_conntrack_in,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_conntrack_local,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv4_helper,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+ {
+ .hook = ipv4_helper,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv4_confirm,
+ .pf = NFPROTO_IPV4,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
+ },
+};
+
+/* Fast function for those who don't want to parse /proc (and I don't
+ * blame them).
+ * Reversing the socket's dst/src point of view gives us the reply
+ * mapping.
+ */
+static int
+getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+
+ memset(&tuple, 0, sizeof(tuple));
+
+ lock_sock(sk);
+ tuple.src.u3.ip = inet->inet_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.ip = inet->inet_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
+ tuple.src.l3num = PF_INET;
+ tuple.dst.protonum = sk->sk_protocol;
+ release_sock(sk);
+
+ /* We only do TCP and SCTP at the moment: is there a better way? */
+ if (tuple.dst.protonum != IPPROTO_TCP &&
+ tuple.dst.protonum != IPPROTO_SCTP) {
+ pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n");
+ return -ENOPROTOOPT;
+ }
+
+ if ((unsigned int)*len < sizeof(struct sockaddr_in)) {
+ pr_debug("SO_ORIGINAL_DST: len %d not %zu\n",
+ *len, sizeof(struct sockaddr_in));
+ return -EINVAL;
+ }
+
+ h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
+ if (h) {
+ struct sockaddr_in sin;
+ struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin.sin_family = AF_INET;
+ sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u.tcp.port;
+ sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
+ .tuple.dst.u3.ip;
+ memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
+
+ pr_debug("SO_ORIGINAL_DST: %pI4 %u\n",
+ &sin.sin_addr.s_addr, ntohs(sin.sin_port));
+ nf_ct_put(ct);
+ if (copy_to_user(user, &sin, sizeof(sin)) != 0)
+ return -EFAULT;
+ else
+ return 0;
+ }
+ pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n",
+ &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port),
+ &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+}
+
+static struct nf_sockopt_ops so_getorigdst = {
+ .pf = PF_INET,
+ .get_optmin = SO_ORIGINAL_DST,
+ .get_optmax = SO_ORIGINAL_DST + 1,
+ .get = getorigdst,
+ .owner = THIS_MODULE,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int
+ipv6_getorigdst(struct sock *sk, int optval, void __user *user, int *len)
+{
+ struct nf_conntrack_tuple tuple = { .src.l3num = NFPROTO_IPV6 };
+ const struct ipv6_pinfo *inet6 = inet6_sk(sk);
+ const struct inet_sock *inet = inet_sk(sk);
+ const struct nf_conntrack_tuple_hash *h;
+ struct sockaddr_in6 sin6;
+ struct nf_conn *ct;
+ __be32 flow_label;
+ int bound_dev_if;
+
+ lock_sock(sk);
+ tuple.src.u3.in6 = sk->sk_v6_rcv_saddr;
+ tuple.src.u.tcp.port = inet->inet_sport;
+ tuple.dst.u3.in6 = sk->sk_v6_daddr;
+ tuple.dst.u.tcp.port = inet->inet_dport;
+ tuple.dst.protonum = sk->sk_protocol;
+ bound_dev_if = sk->sk_bound_dev_if;
+ flow_label = inet6->flow_label;
+ release_sock(sk);
+
+ if (tuple.dst.protonum != IPPROTO_TCP &&
+ tuple.dst.protonum != IPPROTO_SCTP)
+ return -ENOPROTOOPT;
+
+ if (*len < 0 || (unsigned int)*len < sizeof(sin6))
+ return -EINVAL;
+
+ h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple);
+ if (!h) {
+ pr_debug("IP6T_SO_ORIGINAL_DST: Can't find %pI6c/%u-%pI6c/%u.\n",
+ &tuple.src.u3.ip6, ntohs(tuple.src.u.tcp.port),
+ &tuple.dst.u3.ip6, ntohs(tuple.dst.u.tcp.port));
+ return -ENOENT;
+ }
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_port = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u.tcp.port;
+ sin6.sin6_flowinfo = flow_label & IPV6_FLOWINFO_MASK;
+ memcpy(&sin6.sin6_addr,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.in6,
+ sizeof(sin6.sin6_addr));
+
+ nf_ct_put(ct);
+ sin6.sin6_scope_id = ipv6_iface_scope_id(&sin6.sin6_addr, bound_dev_if);
+ return copy_to_user(user, &sin6, sizeof(sin6)) ? -EFAULT : 0;
+}
+
+static struct nf_sockopt_ops so_getorigdst6 = {
+ .pf = NFPROTO_IPV6,
+ .get_optmin = IP6T_SO_ORIGINAL_DST,
+ .get_optmax = IP6T_SO_ORIGINAL_DST + 1,
+ .get = ipv6_getorigdst,
+ .owner = THIS_MODULE,
+};
+
+static unsigned int ipv6_confirm(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ enum ip_conntrack_info ctinfo;
+ unsigned char pnum = ipv6_hdr(skb)->nexthdr;
+ int protoff;
+ __be16 frag_off;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ goto out;
+
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ goto out;
+ }
+
+ /* adjust seqs for loopback traffic only in outgoing direction */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_is_loopback_packet(skb)) {
+ if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) {
+ NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop);
+ return NF_DROP;
+ }
+ }
+out:
+ /* We've seen it coming out the other side: confirm it */
+ return nf_conntrack_confirm(skb);
+}
+
+static unsigned int ipv6_conntrack_in(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
+}
+
+static unsigned int ipv6_conntrack_local(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ return nf_conntrack_in(state->net, PF_INET6, state->hook, skb);
+}
+
+static unsigned int ipv6_helper(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ struct nf_conn *ct;
+ const struct nf_conn_help *help;
+ const struct nf_conntrack_helper *helper;
+ enum ip_conntrack_info ctinfo;
+ __be16 frag_off;
+ int protoff;
+ u8 nexthdr;
+
+ /* This is where we call the helper: as the packet goes out. */
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct || ctinfo == IP_CT_RELATED_REPLY)
+ return NF_ACCEPT;
+
+ help = nfct_help(ct);
+ if (!help)
+ return NF_ACCEPT;
+ /* rcu_read_lock()ed by nf_hook_thresh */
+ helper = rcu_dereference(help->helper);
+ if (!helper)
+ return NF_ACCEPT;
+
+ nexthdr = ipv6_hdr(skb)->nexthdr;
+ protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
+ &frag_off);
+ if (protoff < 0 || (frag_off & htons(~0x7)) != 0) {
+ pr_debug("proto header not found\n");
+ return NF_ACCEPT;
+ }
+
+ return helper->help(skb, protoff, ct, ctinfo);
+}
+
+static const struct nf_hook_ops ipv6_conntrack_ops[] = {
+ {
+ .hook = ipv6_conntrack_in,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv6_conntrack_local,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_OUT,
+ .priority = NF_IP6_PRI_CONNTRACK,
+ },
+ {
+ .hook = ipv6_helper,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv6_confirm,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP6_PRI_LAST,
+ },
+ {
+ .hook = ipv6_helper,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_CONNTRACK_HELPER,
+ },
+ {
+ .hook = ipv6_confirm,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = NF_IP6_PRI_LAST - 1,
+ },
+};
+#endif
+
+static int nf_ct_netns_do_get(struct net *net, u8 nfproto)
+{
+ struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+ int err = 0;
+
+ mutex_lock(&nf_ct_proto_mutex);
+
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ cnet->users4++;
+ if (cnet->users4 > 1)
+ goto out_unlock;
+ err = nf_defrag_ipv4_enable(net);
+ if (err) {
+ cnet->users4 = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ if (err)
+ cnet->users4 = 0;
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6:
+ cnet->users6++;
+ if (cnet->users6 > 1)
+ goto out_unlock;
+ err = nf_defrag_ipv6_enable(net);
+ if (err < 0) {
+ cnet->users6 = 0;
+ goto out_unlock;
+ }
+
+ err = nf_register_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ if (err)
+ cnet->users6 = 0;
+ break;
+#endif
+ default:
+ err = -EPROTO;
+ break;
+ }
+ out_unlock:
+ mutex_unlock(&nf_ct_proto_mutex);
+ return err;
+}
+
+static void nf_ct_netns_do_put(struct net *net, u8 nfproto)
+{
+ struct nf_conntrack_net *cnet = net_generic(net, nf_conntrack_net_id);
+
+ mutex_lock(&nf_ct_proto_mutex);
+ switch (nfproto) {
+ case NFPROTO_IPV4:
+ if (cnet->users4 && (--cnet->users4 == 0))
+ nf_unregister_net_hooks(net, ipv4_conntrack_ops,
+ ARRAY_SIZE(ipv4_conntrack_ops));
+ break;
+#if IS_ENABLED(CONFIG_IPV6)
+ case NFPROTO_IPV6:
+ if (cnet->users6 && (--cnet->users6 == 0))
+ nf_unregister_net_hooks(net, ipv6_conntrack_ops,
+ ARRAY_SIZE(ipv6_conntrack_ops));
+ break;
+#endif
+ }
+
+ mutex_unlock(&nf_ct_proto_mutex);
+}
+
+int nf_ct_netns_get(struct net *net, u8 nfproto)
+{
+ int err;
+
+ if (nfproto == NFPROTO_INET) {
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV4);
+ if (err < 0)
+ goto err1;
+ err = nf_ct_netns_do_get(net, NFPROTO_IPV6);
+ if (err < 0)
+ goto err2;
+ } else {
+ err = nf_ct_netns_do_get(net, nfproto);
+ if (err < 0)
+ goto err1;
+ }
+ return 0;
+
+err2:
+ nf_ct_netns_put(net, NFPROTO_IPV4);
+err1:
+ return err;
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_get);
+
+void nf_ct_netns_put(struct net *net, uint8_t nfproto)
+{
+ if (nfproto == NFPROTO_INET) {
+ nf_ct_netns_do_put(net, NFPROTO_IPV4);
+ nf_ct_netns_do_put(net, NFPROTO_IPV6);
+ } else {
+ nf_ct_netns_do_put(net, nfproto);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_ct_netns_put);
+
+static const struct nf_conntrack_l4proto * const builtin_l4proto[] = {
+ &nf_conntrack_l4proto_tcp4,
+ &nf_conntrack_l4proto_udp4,
+ &nf_conntrack_l4proto_icmp,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp4,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite4,
+#endif
+#if IS_ENABLED(CONFIG_IPV6)
+ &nf_conntrack_l4proto_tcp6,
+ &nf_conntrack_l4proto_udp6,
+ &nf_conntrack_l4proto_icmpv6,
+#ifdef CONFIG_NF_CT_PROTO_DCCP
+ &nf_conntrack_l4proto_dccp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_SCTP
+ &nf_conntrack_l4proto_sctp6,
+#endif
+#ifdef CONFIG_NF_CT_PROTO_UDPLITE
+ &nf_conntrack_l4proto_udplite6,
+#endif
+#endif /* CONFIG_IPV6 */
+};
+
+int nf_conntrack_proto_init(void)
+{
+ int ret = 0;
+
+ ret = nf_register_sockopt(&so_getorigdst);
+ if (ret < 0)
+ return ret;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = nf_register_sockopt(&so_getorigdst6);
+ if (ret < 0)
+ goto cleanup_sockopt;
+#endif
+ ret = nf_ct_l4proto_register(builtin_l4proto,
+ ARRAY_SIZE(builtin_l4proto));
+ if (ret < 0)
+ goto cleanup_sockopt2;
+
+ return ret;
+cleanup_sockopt2:
+ nf_unregister_sockopt(&so_getorigdst);
+#if IS_ENABLED(CONFIG_IPV6)
+cleanup_sockopt:
+ nf_unregister_sockopt(&so_getorigdst6);
+#endif
+ return ret;
+}
+
+void nf_conntrack_proto_fini(void)
+{
+ unsigned int i;
+
+ nf_unregister_sockopt(&so_getorigdst);
+#if IS_ENABLED(CONFIG_IPV6)
+ nf_unregister_sockopt(&so_getorigdst6);
+#endif
+ /* No need to call nf_ct_l4proto_unregister(), the register
+ * tables are free'd here anyway.
+ */
+ for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
+ kfree(nf_ct_protos[i]);
+}
+
int nf_conntrack_proto_pernet_init(struct net *net)
{
int err;
@@ -581,6 +967,14 @@ int nf_conntrack_proto_pernet_init(struct net *net)
if (err < 0)
return err;
+ err = nf_ct_l4proto_pernet_register(net, builtin_l4proto,
+ ARRAY_SIZE(builtin_l4proto));
+ if (err < 0) {
+ nf_ct_l4proto_unregister_sysctl(net, pn,
+ &nf_conntrack_l4proto_generic);
+ return err;
+ }
+
pn->users++;
return 0;
}
@@ -590,25 +984,19 @@ void nf_conntrack_proto_pernet_fini(struct net *net)
struct nf_proto_net *pn = nf_ct_l4proto_net(net,
&nf_conntrack_l4proto_generic);
+ nf_ct_l4proto_pernet_unregister(net, builtin_l4proto,
+ ARRAY_SIZE(builtin_l4proto));
pn->users--;
nf_ct_l4proto_unregister_sysctl(net,
pn,
&nf_conntrack_l4proto_generic);
}
-int nf_conntrack_proto_init(void)
-{
- unsigned int i;
- for (i = 0; i < NFPROTO_NUMPROTO; i++)
- rcu_assign_pointer(nf_ct_l3protos[i],
- &nf_conntrack_l3proto_generic);
- return 0;
-}
-void nf_conntrack_proto_fini(void)
-{
- unsigned int i;
- /* free l3proto protocol tables */
- for (i = 0; i < ARRAY_SIZE(nf_ct_protos); i++)
- kfree(nf_ct_protos[i]);
-}
+module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint,
+ &nf_conntrack_htable_size, 0600);
+
+MODULE_ALIAS("ip_conntrack");
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET));
+MODULE_ALIAS("nf_conntrack-" __stringify(AF_INET6));
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c
index abe647d5b8c6..8c58f96b59e7 100644
--- a/net/netfilter/nf_conntrack_proto_dccp.c
+++ b/net/netfilter/nf_conntrack_proto_dccp.c
@@ -23,6 +23,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_log.h>
/* Timeouts are based on values from RFC4340:
@@ -243,14 +244,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
* We currently ignore Sync packets
*
* sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
},
[DCCP_PKT_SYNCACK] = {
/*
* We currently ignore SyncAck packets
*
* sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
},
},
[CT_DCCP_ROLE_SERVER] = {
@@ -371,14 +372,14 @@ dccp_state_table[CT_DCCP_ROLE_MAX + 1][DCCP_PKT_SYNCACK + 1][CT_DCCP_MAX + 1] =
* We currently ignore Sync packets
*
* sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
},
[DCCP_PKT_SYNCACK] = {
/*
* We currently ignore SyncAck packets
*
* sNO, sRQ, sRS, sPO, sOP, sCR, sCG, sTW */
- sIG, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
+ sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG,
},
},
};
@@ -388,31 +389,8 @@ static inline struct nf_dccp_net *dccp_pernet(struct net *net)
return &net->ct.nf_ct_proto.dccp;
}
-static bool dccp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- struct dccp_hdr _hdr, *dh;
-
- /* Actually only need first 4 bytes to get ports. */
- dh = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (dh == NULL)
- return false;
-
- tuple->src.u.dccp.port = dh->dccph_sport;
- tuple->dst.u.dccp.port = dh->dccph_dport;
- return true;
-}
-
-static bool dccp_invert_tuple(struct nf_conntrack_tuple *inv,
- const struct nf_conntrack_tuple *tuple)
-{
- inv->src.u.dccp.port = tuple->dst.u.dccp.port;
- inv->dst.u.dccp.port = tuple->src.u.dccp.port;
- return true;
-}
-
static bool dccp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
struct net *net = nf_ct_net(ct);
struct nf_dccp_net *dn;
@@ -460,19 +438,14 @@ static u64 dccp_ack_seq(const struct dccp_hdr *dh)
ntohl(dhack->dccph_ack_nr_low);
}
-static unsigned int *dccp_get_timeouts(struct net *net)
-{
- return dccp_pernet(net)->dccp_timeout;
-}
-
static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ unsigned int dataoff, enum ip_conntrack_info ctinfo)
{
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
struct dccp_hdr _dh, *dh;
u_int8_t type, old_state, new_state;
enum ct_dccp_roles role;
+ unsigned int *timeouts;
dh = skb_header_pointer(skb, dataoff, sizeof(_dh), &_dh);
BUG_ON(dh == NULL);
@@ -546,6 +519,9 @@ static int dccp_packet(struct nf_conn *ct, const struct sk_buff *skb,
if (new_state != old_state)
nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = dccp_pernet(nf_ct_net(ct))->dccp_timeout;
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
return NF_ACCEPT;
@@ -864,11 +840,8 @@ static struct nf_proto_net *dccp_get_net_proto(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 = {
.l3proto = AF_INET,
.l4proto = IPPROTO_DCCP,
- .pkt_to_tuple = dccp_pkt_to_tuple,
- .invert_tuple = dccp_invert_tuple,
.new = dccp_new,
.packet = dccp_packet,
- .get_timeouts = dccp_get_timeouts,
.error = dccp_error,
.can_early_drop = dccp_can_early_drop,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
@@ -900,11 +873,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_dccp4);
const struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 = {
.l3proto = AF_INET6,
.l4proto = IPPROTO_DCCP,
- .pkt_to_tuple = dccp_pkt_to_tuple,
- .invert_tuple = dccp_invert_tuple,
.new = dccp_new,
.packet = dccp_packet,
- .get_timeouts = dccp_get_timeouts,
.error = dccp_error,
.can_early_drop = dccp_can_early_drop,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index 6c6896d21cd7..ac4a0b296dcd 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -11,6 +11,7 @@
#include <linux/timer.h>
#include <linux/netfilter.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
static const unsigned int nf_ct_generic_timeout = 600*HZ;
@@ -41,34 +42,24 @@ static bool generic_pkt_to_tuple(const struct sk_buff *skb,
return true;
}
-static bool generic_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.all = 0;
- tuple->dst.u.all = 0;
-
- return true;
-}
-
-static unsigned int *generic_get_timeouts(struct net *net)
-{
- return &(generic_pernet(net)->timeout);
-}
-
/* Returns verdict for packet, or -1 for invalid. */
static int generic_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeout)
+ enum ip_conntrack_info ctinfo)
{
+ const unsigned int *timeout = nf_ct_timeout_lookup(ct);
+
+ if (!timeout)
+ timeout = &generic_pernet(nf_ct_net(ct))->timeout;
+
nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
return NF_ACCEPT;
}
/* Called when a new connection for this protocol found. */
static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
bool ret;
@@ -87,8 +78,11 @@ static bool generic_new(struct nf_conn *ct, const struct sk_buff *skb,
static int generic_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
- unsigned int *timeout = data;
struct nf_generic_net *gn = generic_pernet(net);
+ unsigned int *timeout = data;
+
+ if (!timeout)
+ timeout = &gn->timeout;
if (tb[CTA_TIMEOUT_GENERIC_TIMEOUT])
*timeout =
@@ -168,9 +162,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
.l3proto = PF_UNSPEC,
.l4proto = 255,
.pkt_to_tuple = generic_pkt_to_tuple,
- .invert_tuple = generic_invert_tuple,
.packet = generic_packet,
- .get_timeouts = generic_get_timeouts,
.new = generic_new,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK_TIMEOUT)
.ctnl_timeout = {
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index d049ea5a3770..d1632252bf5b 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -39,6 +39,7 @@
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
#include <linux/netfilter/nf_conntrack_pptp.h>
@@ -179,15 +180,6 @@ EXPORT_SYMBOL_GPL(nf_ct_gre_keymap_destroy);
/* PUBLIC CONNTRACK PROTO HELPER FUNCTIONS */
-/* invert gre part of tuple */
-static bool gre_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->dst.u.gre.key = orig->src.u.gre.key;
- tuple->src.u.gre.key = orig->dst.u.gre.key;
- return true;
-}
-
/* gre hdr info to tuple */
static bool gre_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
struct net *net, struct nf_conntrack_tuple *tuple)
@@ -243,8 +235,7 @@ static unsigned int *gre_get_timeouts(struct net *net)
static int gre_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ enum ip_conntrack_info ctinfo)
{
/* If we've seen traffic both ways, this is a GRE connection.
* Extend timeout. */
@@ -263,8 +254,13 @@ static int gre_packet(struct nf_conn *ct,
/* Called when a new connection for this protocol found. */
static bool gre_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
+ unsigned int *timeouts = nf_ct_timeout_lookup(ct);
+
+ if (!timeouts)
+ timeouts = gre_get_timeouts(nf_ct_net(ct));
+
pr_debug(": ");
nf_ct_dump_tuple(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
@@ -300,6 +296,8 @@ static int gre_timeout_nlattr_to_obj(struct nlattr *tb[],
unsigned int *timeouts = data;
struct netns_proto_gre *net_gre = gre_pernet(net);
+ if (!timeouts)
+ timeouts = gre_get_timeouts(net);
/* set default timeouts for GRE. */
timeouts[GRE_CT_UNREPLIED] = net_gre->gre_timeouts[GRE_CT_UNREPLIED];
timeouts[GRE_CT_REPLIED] = net_gre->gre_timeouts[GRE_CT_REPLIED];
@@ -356,11 +354,9 @@ static const struct nf_conntrack_l4proto nf_conntrack_l4proto_gre4 = {
.l3proto = AF_INET,
.l4proto = IPPROTO_GRE,
.pkt_to_tuple = gre_pkt_to_tuple,
- .invert_tuple = gre_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = gre_print_conntrack,
#endif
- .get_timeouts = gre_get_timeouts,
.packet = gre_packet,
.new = gre_new,
.destroy = gre_destroy,
diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index 5c15beafa711..036670b38282 100644
--- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -19,6 +19,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_log.h>
@@ -80,12 +81,16 @@ static unsigned int *icmp_get_timeouts(struct net *net)
static int icmp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeout)
+ enum ip_conntrack_info ctinfo)
{
/* Do not immediately delete the connection after the first
successful reply to avoid excessive conntrackd traffic
and also to handle correctly ICMP echo reply duplicates. */
+ unsigned int *timeout = nf_ct_timeout_lookup(ct);
+
+ if (!timeout)
+ timeout = icmp_get_timeouts(nf_ct_net(ct));
+
nf_ct_refresh_acct(ct, ctinfo, skb, *timeout);
return NF_ACCEPT;
@@ -93,7 +98,7 @@ static int icmp_packet(struct nf_conn *ct,
/* Called when a new connection for this protocol found. */
static bool icmp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
static const u_int8_t valid_new[] = {
[ICMP_ECHO] = 1,
@@ -142,8 +147,7 @@ icmp_error_message(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb,
/* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&innertuple, &origtuple,
- &nf_conntrack_l3proto_ipv4, innerproto)) {
+ if (!nf_ct_invert_tuple(&innertuple, &origtuple, innerproto)) {
pr_debug("icmp_error_message: no match\n");
return -NF_ACCEPT;
}
@@ -281,9 +285,11 @@ static int icmp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct nf_icmp_net *in = icmp_pernet(net);
if (tb[CTA_TIMEOUT_ICMP_TIMEOUT]) {
+ if (!timeout)
+ timeout = &in->timeout;
*timeout =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMP_TIMEOUT])) * HZ;
- } else {
+ } else if (timeout) {
/* Set default ICMP timeout. */
*timeout = in->timeout;
}
@@ -358,7 +364,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
.pkt_to_tuple = icmp_pkt_to_tuple,
.invert_tuple = icmp_invert_tuple,
.packet = icmp_packet,
- .get_timeouts = icmp_get_timeouts,
.new = icmp_new,
.error = icmp_error,
.destroy = NULL,
diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 2548e2c8aedd..bed07b998a10 100644
--- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -23,6 +23,7 @@
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_conntrack_icmpv6.h>
#include <net/netfilter/nf_log.h>
@@ -93,9 +94,13 @@ static unsigned int *icmpv6_get_timeouts(struct net *net)
static int icmpv6_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeout)
+ enum ip_conntrack_info ctinfo)
{
+ unsigned int *timeout = nf_ct_timeout_lookup(ct);
+
+ if (!timeout)
+ timeout = icmpv6_get_timeouts(nf_ct_net(ct));
+
/* Do not immediately delete the connection after the first
successful reply to avoid excessive conntrackd traffic
and also to handle correctly ICMP echo reply duplicates. */
@@ -106,7 +111,7 @@ static int icmpv6_packet(struct nf_conn *ct,
/* Called when a new connection for this protocol found. */
static bool icmpv6_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
static const u_int8_t valid_new[] = {
[ICMPV6_ECHO_REQUEST - 128] = 1,
@@ -152,8 +157,7 @@ icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
/* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */
- if (!nf_ct_invert_tuple(&intuple, &origtuple,
- &nf_conntrack_l3proto_ipv6, inproto)) {
+ if (!nf_ct_invert_tuple(&intuple, &origtuple, inproto)) {
pr_debug("icmpv6_error: Can't invert tuple\n");
return -NF_ACCEPT;
}
@@ -281,6 +285,8 @@ static int icmpv6_timeout_nlattr_to_obj(struct nlattr *tb[],
unsigned int *timeout = data;
struct nf_icmp_net *in = icmpv6_pernet(net);
+ if (!timeout)
+ timeout = icmpv6_get_timeouts(net);
if (tb[CTA_TIMEOUT_ICMPV6_TIMEOUT]) {
*timeout =
ntohl(nla_get_be32(tb[CTA_TIMEOUT_ICMPV6_TIMEOUT])) * HZ;
@@ -359,7 +365,6 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
.pkt_to_tuple = icmpv6_pkt_to_tuple,
.invert_tuple = icmpv6_invert_tuple,
.packet = icmpv6_packet,
- .get_timeouts = icmpv6_get_timeouts,
.new = icmpv6_new,
.error = icmpv6_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c
index fb9a35d16069..8d1e085fc14a 100644
--- a/net/netfilter/nf_conntrack_proto_sctp.c
+++ b/net/netfilter/nf_conntrack_proto_sctp.c
@@ -28,6 +28,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
/* FIXME: Examine ipfilter's timeouts and conntrack transitions more
closely. They're more complex. --RR
@@ -150,30 +151,6 @@ static inline struct nf_sctp_net *sctp_pernet(struct net *net)
return &net->ct.nf_ct_proto.sctp;
}
-static bool sctp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- const struct sctphdr *hp;
- struct sctphdr _hdr;
-
- /* Actually only need first 4 bytes to get ports. */
- hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->src.u.sctp.port = hp->source;
- tuple->dst.u.sctp.port = hp->dest;
- return true;
-}
-
-static bool sctp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.sctp.port = orig->dst.u.sctp.port;
- tuple->dst.u.sctp.port = orig->src.u.sctp.port;
- return true;
-}
-
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/* Print out the private part of the conntrack. */
static void sctp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -296,17 +273,11 @@ static int sctp_new_state(enum ip_conntrack_dir dir,
return sctp_conntracks[dir][i][cur_state];
}
-static unsigned int *sctp_get_timeouts(struct net *net)
-{
- return sctp_pernet(net)->timeouts;
-}
-
/* Returns verdict for packet, or -NF_ACCEPT for invalid. */
static int sctp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ enum ip_conntrack_info ctinfo)
{
enum sctp_conntrack new_state, old_state;
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
@@ -315,6 +286,7 @@ static int sctp_packet(struct nf_conn *ct,
const struct sctp_chunkhdr *sch;
struct sctp_chunkhdr _sch;
u_int32_t offset, count;
+ unsigned int *timeouts;
unsigned long map[256 / sizeof(unsigned long)] = { 0 };
sh = skb_header_pointer(skb, dataoff, sizeof(_sctph), &_sctph);
@@ -403,6 +375,10 @@ static int sctp_packet(struct nf_conn *ct,
}
spin_unlock_bh(&ct->lock);
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = sctp_pernet(nf_ct_net(ct))->timeouts;
+
nf_ct_refresh_acct(ct, ctinfo, skb, timeouts[new_state]);
if (old_state == SCTP_CONNTRACK_COOKIE_ECHOED &&
@@ -423,7 +399,7 @@ out:
/* Called when a new connection for this protocol found. */
static bool sctp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
enum sctp_conntrack new_state;
const struct sctphdr *sh;
@@ -780,13 +756,10 @@ static struct nf_proto_net *sctp_get_net_proto(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 = {
.l3proto = PF_INET,
.l4proto = IPPROTO_SCTP,
- .pkt_to_tuple = sctp_pkt_to_tuple,
- .invert_tuple = sctp_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = sctp_print_conntrack,
#endif
.packet = sctp_packet,
- .get_timeouts = sctp_get_timeouts,
.new = sctp_new,
.error = sctp_error,
.can_early_drop = sctp_can_early_drop,
@@ -817,13 +790,10 @@ EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_sctp4);
const struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 = {
.l3proto = PF_INET6,
.l4proto = IPPROTO_SCTP,
- .pkt_to_tuple = sctp_pkt_to_tuple,
- .invert_tuple = sctp_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = sctp_print_conntrack,
#endif
.packet = sctp_packet,
- .get_timeouts = sctp_get_timeouts,
.new = sctp_new,
.error = sctp_error,
.can_early_drop = sctp_can_early_drop,
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 8e67910185a0..d80d322b9d8b 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -29,6 +29,7 @@
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_log.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -276,31 +277,6 @@ static inline struct nf_tcp_net *tcp_pernet(struct net *net)
return &net->ct.nf_ct_proto.tcp;
}
-static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
- struct net *net, struct nf_conntrack_tuple *tuple)
-{
- const struct tcphdr *hp;
- struct tcphdr _hdr;
-
- /* Actually only need first 4 bytes to get ports. */
- hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->src.u.tcp.port = hp->source;
- tuple->dst.u.tcp.port = hp->dest;
-
- return true;
-}
-
-static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.tcp.port = orig->dst.u.tcp.port;
- tuple->dst.u.tcp.port = orig->src.u.tcp.port;
- return true;
-}
-
#ifdef CONFIG_NF_CONNTRACK_PROCFS
/* Print out the private part of the conntrack. */
static void tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
@@ -793,27 +769,21 @@ static int tcp_error(struct net *net, struct nf_conn *tmpl,
return NF_ACCEPT;
}
-static unsigned int *tcp_get_timeouts(struct net *net)
-{
- return tcp_pernet(net)->timeouts;
-}
-
/* Returns verdict for packet, or -1 for invalid. */
static int tcp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ enum ip_conntrack_info ctinfo)
{
struct net *net = nf_ct_net(ct);
struct nf_tcp_net *tn = tcp_pernet(net);
struct nf_conntrack_tuple *tuple;
enum tcp_conntrack new_state, old_state;
+ unsigned int index, *timeouts;
enum ip_conntrack_dir dir;
const struct tcphdr *th;
struct tcphdr _tcph;
unsigned long timeout;
- unsigned int index;
th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
BUG_ON(th == NULL);
@@ -1046,6 +1016,10 @@ static int tcp_packet(struct nf_conn *ct,
&& new_state == TCP_CONNTRACK_FIN_WAIT)
ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = tn->timeouts;
+
if (ct->proto.tcp.retrans >= tn->tcp_max_retrans &&
timeouts[new_state] > timeouts[TCP_CONNTRACK_RETRANS])
timeout = timeouts[TCP_CONNTRACK_RETRANS];
@@ -1095,7 +1069,7 @@ static int tcp_packet(struct nf_conn *ct,
/* Called when a new connection for this protocol found. */
static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
enum tcp_conntrack new_state;
const struct tcphdr *th;
@@ -1313,10 +1287,12 @@ static unsigned int tcp_nlattr_tuple_size(void)
static int tcp_timeout_nlattr_to_obj(struct nlattr *tb[],
struct net *net, void *data)
{
- unsigned int *timeouts = data;
struct nf_tcp_net *tn = tcp_pernet(net);
+ unsigned int *timeouts = data;
int i;
+ if (!timeouts)
+ timeouts = tn->timeouts;
/* set default TCP timeouts. */
for (i=0; i<TCP_CONNTRACK_TIMEOUT_MAX; i++)
timeouts[i] = tn->timeouts[i];
@@ -1559,13 +1535,10 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 =
{
.l3proto = PF_INET,
.l4proto = IPPROTO_TCP,
- .pkt_to_tuple = tcp_pkt_to_tuple,
- .invert_tuple = tcp_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = tcp_print_conntrack,
#endif
.packet = tcp_packet,
- .get_timeouts = tcp_get_timeouts,
.new = tcp_new,
.error = tcp_error,
.can_early_drop = tcp_can_early_drop,
@@ -1597,13 +1570,10 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 =
{
.l3proto = PF_INET6,
.l4proto = IPPROTO_TCP,
- .pkt_to_tuple = tcp_pkt_to_tuple,
- .invert_tuple = tcp_invert_tuple,
#ifdef CONFIG_NF_CONNTRACK_PROCFS
.print_conntrack = tcp_print_conntrack,
#endif
.packet = tcp_packet,
- .get_timeouts = tcp_get_timeouts,
.new = tcp_new,
.error = tcp_error,
.can_early_drop = tcp_can_early_drop,
diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c
index fe7243970aa4..7a1b8988a931 100644
--- a/net/netfilter/nf_conntrack_proto_udp.c
+++ b/net/netfilter/nf_conntrack_proto_udp.c
@@ -22,6 +22,7 @@
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_ecache.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_log.h>
#include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
#include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
@@ -36,33 +37,6 @@ static inline struct nf_udp_net *udp_pernet(struct net *net)
return &net->ct.nf_ct_proto.udp;
}
-static bool udp_pkt_to_tuple(const struct sk_buff *skb,
- unsigned int dataoff,
- struct net *net,
- struct nf_conntrack_tuple *tuple)
-{
- const struct udphdr *hp;
- struct udphdr _hdr;
-
- /* Actually only need first 4 bytes to get ports. */
- hp = skb_header_pointer(skb, dataoff, 4, &_hdr);
- if (hp == NULL)
- return false;
-
- tuple->src.u.udp.port = hp->source;
- tuple->dst.u.udp.port = hp->dest;
-
- return true;
-}
-
-static bool udp_invert_tuple(struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_tuple *orig)
-{
- tuple->src.u.udp.port = orig->dst.u.udp.port;
- tuple->dst.u.udp.port = orig->src.u.udp.port;
- return true;
-}
-
static unsigned int *udp_get_timeouts(struct net *net)
{
return udp_pernet(net)->timeouts;
@@ -72,9 +46,14 @@ static unsigned int *udp_get_timeouts(struct net *net)
static int udp_packet(struct nf_conn *ct,
const struct sk_buff *skb,
unsigned int dataoff,
- enum ip_conntrack_info ctinfo,
- unsigned int *timeouts)
+ enum ip_conntrack_info ctinfo)
{
+ unsigned int *timeouts;
+
+ timeouts = nf_ct_timeout_lookup(ct);
+ if (!timeouts)
+ timeouts = udp_get_timeouts(nf_ct_net(ct));
+
/* If we've seen traffic both ways, this is some kind of UDP
stream. Extend timeout. */
if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
@@ -92,7 +71,7 @@ static int udp_packet(struct nf_conn *ct,
/* Called when a new connection for this protocol found. */
static bool udp_new(struct nf_conn *ct, const struct sk_buff *skb,
- unsigned int dataoff, unsigned int *timeouts)
+ unsigned int dataoff)
{
return true;
}
@@ -203,6 +182,9 @@ static int udp_timeout_nlattr_to_obj(struct nlattr *tb[],
unsigned int *timeouts = data;
struct nf_udp_net *un = udp_pernet(net);
+ if (!timeouts)
+ timeouts = un->timeouts;
+
/* set default timeouts for UDP. */
timeouts[UDP_CT_UNREPLIED] = un->timeouts[UDP_CT_UNREPLIED];
timeouts[UDP_CT_REPLIED] = un->timeouts[UDP_CT_REPLIED];
@@ -301,10 +283,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDP,
.allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
.packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
.new = udp_new,
.error = udp_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -333,10 +312,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
.allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
.packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
.new = udp_new,
.error = udplite_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -365,10 +341,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDP,
.allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
.packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
.new = udp_new,
.error = udp_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -397,10 +370,7 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
.allow_clash = true,
- .pkt_to_tuple = udp_pkt_to_tuple,
- .invert_tuple = udp_invert_tuple,
.packet = udp_packet,
- .get_timeouts = udp_get_timeouts,
.new = udp_new,
.error = udplite_error,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
@@ -423,3 +393,4 @@ const struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 =
};
EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_udplite6);
#endif
+#include <net/netfilter/nf_conntrack_timeout.h>
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index b642c0b2495c..13279f683da9 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -1,12 +1,4 @@
-/* (C) 1999-2001 Paul `Rusty' Russell
- * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
- * (C) 2005-2012 Patrick McHardy <kaber@trash.net>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- */
-
+// SPDX-License-Identifier: GPL-2.0
#include <linux/types.h>
#include <linux/netfilter.h>
#include <linux/slab.h>
@@ -24,7 +16,6 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_expect.h>
#include <net/netfilter/nf_conntrack_helper.h>
@@ -33,15 +24,14 @@
#include <net/netfilter/nf_conntrack_timestamp.h>
#include <linux/rculist_nulls.h>
-MODULE_LICENSE("GPL");
+unsigned int nf_conntrack_net_id __read_mostly;
#ifdef CONFIG_NF_CONNTRACK_PROCFS
void
print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
- const struct nf_conntrack_l3proto *l3proto,
const struct nf_conntrack_l4proto *l4proto)
{
- switch (l3proto->l3proto) {
+ switch (tuple->src.l3num) {
case NFPROTO_IPV4:
seq_printf(s, "src=%pI4 dst=%pI4 ",
&tuple->src.u3.ip, &tuple->dst.u3.ip);
@@ -282,7 +272,6 @@ static int ct_seq_show(struct seq_file *s, void *v)
{
struct nf_conntrack_tuple_hash *hash = v;
struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(hash);
- const struct nf_conntrack_l3proto *l3proto;
const struct nf_conntrack_l4proto *l4proto;
struct net *net = seq_file_net(s);
int ret = 0;
@@ -303,14 +292,12 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (!net_eq(nf_ct_net(ct), net))
goto release;
- l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
- WARN_ON(!l3proto);
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
WARN_ON(!l4proto);
ret = -ENOSPC;
seq_printf(s, "%-8s %u %-8s %u ",
- l3proto_name(l3proto->l3proto), nf_ct_l3num(ct),
+ l3proto_name(nf_ct_l3num(ct)), nf_ct_l3num(ct),
l4proto_name(l4proto->l4proto), nf_ct_protonum(ct));
if (!test_bit(IPS_OFFLOAD_BIT, &ct->status))
@@ -320,7 +307,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
l4proto->print_conntrack(s, ct);
print_tuple(s, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- l3proto, l4proto);
+ l4proto);
ct_show_zone(s, ct, NF_CT_ZONE_DIR_ORIG);
@@ -333,8 +320,7 @@ static int ct_seq_show(struct seq_file *s, void *v)
if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status)))
seq_puts(s, "[UNREPLIED] ");
- print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple,
- l3proto, l4proto);
+ print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l4proto);
ct_show_zone(s, ct, NF_CT_ZONE_DIR_REPL);
@@ -680,6 +666,8 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
static struct pernet_operations nf_conntrack_net_ops = {
.init = nf_conntrack_pernet_init,
.exit_batch = nf_conntrack_pernet_exit,
+ .id = &nf_conntrack_net_id,
+ .size = sizeof(struct nf_conntrack_net),
};
static int __init nf_conntrack_standalone_init(void)
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index eb0d1658ac05..d8125616edc7 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -107,11 +107,12 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp)
tcp->seen[1].td_maxwin = 0;
}
+#define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ)
+#define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ)
+
static void flow_offload_fixup_ct_state(struct nf_conn *ct)
{
const struct nf_conntrack_l4proto *l4proto;
- struct net *net = nf_ct_net(ct);
- unsigned int *timeouts;
unsigned int timeout;
int l4num;
@@ -123,14 +124,10 @@ static void flow_offload_fixup_ct_state(struct nf_conn *ct)
if (!l4proto)
return;
- timeouts = l4proto->get_timeouts(net);
- if (!timeouts)
- return;
-
if (l4num == IPPROTO_TCP)
- timeout = timeouts[TCP_CONNTRACK_ESTABLISHED];
+ timeout = NF_FLOWTABLE_TCP_PICKUP_TIMEOUT;
else if (l4num == IPPROTO_UDP)
- timeout = timeouts[UDP_CT_REPLIED];
+ timeout = NF_FLOWTABLE_UDP_PICKUP_TIMEOUT;
else
return;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 86df2a1666fd..e2b196054dfc 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -28,7 +28,6 @@
#include <net/netfilter/nf_nat_helper.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
@@ -743,12 +742,6 @@ EXPORT_SYMBOL_GPL(nf_nat_l4proto_unregister);
int nf_nat_l3proto_register(const struct nf_nat_l3proto *l3proto)
{
- int err;
-
- err = nf_ct_l3proto_try_module_get(l3proto->l3proto);
- if (err < 0)
- return err;
-
mutex_lock(&nf_nat_proto_mutex);
RCU_INIT_POINTER(nf_nat_l4protos[l3proto->l3proto][IPPROTO_TCP],
&nf_nat_l4proto_tcp);
@@ -781,7 +774,6 @@ void nf_nat_l3proto_unregister(const struct nf_nat_l3proto *l3proto)
synchronize_rcu();
nf_nat_l3proto_clean(l3proto->l3proto);
- nf_ct_l3proto_module_put(l3proto->l3proto);
}
EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister);
@@ -1064,7 +1056,7 @@ static int __init nf_nat_init(void)
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ kvfree(nf_nat_bysource);
pr_err("Unable to register extension\n");
return ret;
}
@@ -1102,7 +1094,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
- nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
+ kvfree(nf_nat_bysource);
unregister_pernet_subsys(&nat_net_ops);
}
diff --git a/net/netfilter/nf_osf.c b/net/netfilter/nf_osf.c
deleted file mode 100644
index 5ba5c7bef2f9..000000000000
--- a/net/netfilter/nf_osf.c
+++ /dev/null
@@ -1,218 +0,0 @@
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
-#include <linux/kernel.h>
-
-#include <linux/capability.h>
-#include <linux/if.h>
-#include <linux/inetdevice.h>
-#include <linux/ip.h>
-#include <linux/list.h>
-#include <linux/rculist.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/tcp.h>
-
-#include <net/ip.h>
-#include <net/tcp.h>
-
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/x_tables.h>
-#include <net/netfilter/nf_log.h>
-#include <linux/netfilter/nf_osf.h>
-
-static inline int nf_osf_ttl(const struct sk_buff *skb,
- const struct nf_osf_info *info,
- unsigned char f_ttl)
-{
- const struct iphdr *ip = ip_hdr(skb);
-
- if (info->flags & NF_OSF_TTL) {
- if (info->ttl == NF_OSF_TTL_TRUE)
- return ip->ttl == f_ttl;
- if (info->ttl == NF_OSF_TTL_NOCHECK)
- return 1;
- else if (ip->ttl <= f_ttl)
- return 1;
- else {
- struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
- int ret = 0;
-
- for_ifa(in_dev) {
- if (inet_ifa_match(ip->saddr, ifa)) {
- ret = (ip->ttl == f_ttl);
- break;
- }
- }
- endfor_ifa(in_dev);
-
- return ret;
- }
- }
-
- return ip->ttl == f_ttl;
-}
-
-bool
-nf_osf_match(const struct sk_buff *skb, u_int8_t family,
- int hooknum, struct net_device *in, struct net_device *out,
- const struct nf_osf_info *info, struct net *net,
- const struct list_head *nf_osf_fingers)
-{
- const unsigned char *optp = NULL, *_optp = NULL;
- unsigned int optsize = 0, check_WSS = 0;
- int fmatch = FMATCH_WRONG, fcount = 0;
- const struct iphdr *ip = ip_hdr(skb);
- const struct nf_osf_user_finger *f;
- unsigned char opts[MAX_IPOPTLEN];
- const struct nf_osf_finger *kf;
- u16 window, totlen, mss = 0;
- const struct tcphdr *tcp;
- struct tcphdr _tcph;
- bool df;
-
- tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
- if (!tcp)
- return false;
-
- if (!tcp->syn)
- return false;
-
- totlen = ntohs(ip->tot_len);
- df = ntohs(ip->frag_off) & IP_DF;
- window = ntohs(tcp->window);
-
- if (tcp->doff * 4 > sizeof(struct tcphdr)) {
- optsize = tcp->doff * 4 - sizeof(struct tcphdr);
-
- _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) +
- sizeof(struct tcphdr), optsize, opts);
- }
-
- list_for_each_entry_rcu(kf, &nf_osf_fingers[df], finger_entry) {
- int foptsize, optnum;
-
- f = &kf->finger;
-
- if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
- continue;
-
- optp = _optp;
- fmatch = FMATCH_WRONG;
-
- if (totlen != f->ss || !nf_osf_ttl(skb, info, f->ttl))
- continue;
-
- /*
- * Should not happen if userspace parser was written correctly.
- */
- if (f->wss.wc >= OSF_WSS_MAX)
- continue;
-
- /* Check options */
-
- foptsize = 0;
- for (optnum = 0; optnum < f->opt_num; ++optnum)
- foptsize += f->opt[optnum].length;
-
- if (foptsize > MAX_IPOPTLEN ||
- optsize > MAX_IPOPTLEN ||
- optsize != foptsize)
- continue;
-
- check_WSS = f->wss.wc;
-
- for (optnum = 0; optnum < f->opt_num; ++optnum) {
- if (f->opt[optnum].kind == (*optp)) {
- __u32 len = f->opt[optnum].length;
- const __u8 *optend = optp + len;
-
- fmatch = FMATCH_OK;
-
- switch (*optp) {
- case OSFOPT_MSS:
- mss = optp[3];
- mss <<= 8;
- mss |= optp[2];
-
- mss = ntohs((__force __be16)mss);
- break;
- case OSFOPT_TS:
- break;
- }
-
- optp = optend;
- } else
- fmatch = FMATCH_OPT_WRONG;
-
- if (fmatch != FMATCH_OK)
- break;
- }
-
- if (fmatch != FMATCH_OPT_WRONG) {
- fmatch = FMATCH_WRONG;
-
- switch (check_WSS) {
- case OSF_WSS_PLAIN:
- if (f->wss.val == 0 || window == f->wss.val)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MSS:
- /*
- * Some smart modems decrease mangle MSS to
- * SMART_MSS_2, so we check standard, decreased
- * and the one provided in the fingerprint MSS
- * values.
- */
-#define SMART_MSS_1 1460
-#define SMART_MSS_2 1448
- if (window == f->wss.val * mss ||
- window == f->wss.val * SMART_MSS_1 ||
- window == f->wss.val * SMART_MSS_2)
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MTU:
- if (window == f->wss.val * (mss + 40) ||
- window == f->wss.val * (SMART_MSS_1 + 40) ||
- window == f->wss.val * (SMART_MSS_2 + 40))
- fmatch = FMATCH_OK;
- break;
- case OSF_WSS_MODULO:
- if ((window % f->wss.val) == 0)
- fmatch = FMATCH_OK;
- break;
- }
- }
-
- if (fmatch != FMATCH_OK)
- continue;
-
- fcount++;
-
- if (info->flags & NF_OSF_LOG)
- nf_log_packet(net, family, hooknum, skb,
- in, out, NULL,
- "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
- f->genre, f->version, f->subtype,
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest),
- f->ttl - ip->ttl);
-
- if ((info->flags & NF_OSF_LOG) &&
- info->loglevel == NF_OSF_LOGLEVEL_FIRST)
- break;
- }
-
- if (!fcount && (info->flags & NF_OSF_LOG))
- nf_log_packet(net, family, hooknum, skb, in, out, NULL,
- "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
- &ip->saddr, ntohs(tcp->source),
- &ip->daddr, ntohs(tcp->dest));
-
- if (fcount)
- fmatch = FMATCH_OK;
-
- return fmatch == FMATCH_OK;
-}
-EXPORT_SYMBOL_GPL(nf_osf_match);
-
-MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 3f211e1025c1..67cdd5c4f4f5 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -76,6 +76,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
{
ctx->net = net;
ctx->family = family;
+ ctx->level = 0;
ctx->table = table;
ctx->chain = chain;
ctx->nla = nla;
@@ -455,20 +456,59 @@ __nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family)
return NULL;
}
+/*
+ * Loading a module requires dropping mutex that guards the
+ * transaction.
+ * We first need to abort any pending transactions as once
+ * mutex is unlocked a different client could start a new
+ * transaction. It must not see any 'future generation'
+ * changes * as these changes will never happen.
+ */
+#ifdef CONFIG_MODULES
+static int __nf_tables_abort(struct net *net);
+
+static void nft_request_module(struct net *net, const char *fmt, ...)
+{
+ char module_name[MODULE_NAME_LEN];
+ va_list args;
+ int ret;
+
+ __nf_tables_abort(net);
+
+ va_start(args, fmt);
+ ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+ va_end(args);
+ if (WARN(ret >= MODULE_NAME_LEN, "truncated: '%s' (len %d)", module_name, ret))
+ return;
+
+ mutex_unlock(&net->nft.commit_mutex);
+ request_module("%s", module_name);
+ mutex_lock(&net->nft.commit_mutex);
+}
+#endif
+
+static void lockdep_nfnl_nft_mutex_not_held(void)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ WARN_ON_ONCE(lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+#endif
+}
+
static const struct nft_chain_type *
-nf_tables_chain_type_lookup(const struct nlattr *nla, u8 family, bool autoload)
+nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla,
+ u8 family, bool autoload)
{
const struct nft_chain_type *type;
type = __nf_tables_chain_type_lookup(nla, family);
if (type != NULL)
return type;
+
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (autoload) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-chain-%u-%.*s", family,
- nla_len(nla), (const char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-chain-%u-%.*s", family,
+ nla_len(nla), (const char *)nla_data(nla));
type = __nf_tables_chain_type_lookup(nla, family);
if (type != NULL)
return ERR_PTR(-EAGAIN);
@@ -772,6 +812,7 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
int err;
+ lockdep_assert_held(&net->nft.commit_mutex);
attr = nla[NFTA_TABLE_NAME];
table = nft_table_lookup(net, attr, family, genmask);
if (IS_ERR(table)) {
@@ -1012,7 +1053,17 @@ nft_chain_lookup_byhandle(const struct nft_table *table, u64 handle, u8 genmask)
return ERR_PTR(-ENOENT);
}
-static struct nft_chain *nft_chain_lookup(struct nft_table *table,
+static bool lockdep_commit_lock_is_held(struct net *net)
+{
+#ifdef CONFIG_PROVE_LOCKING
+ return lockdep_is_held(&net->nft.commit_mutex);
+#else
+ return true;
+#endif
+}
+
+static struct nft_chain *nft_chain_lookup(struct net *net,
+ struct nft_table *table,
const struct nlattr *nla, u8 genmask)
{
char search[NFT_CHAIN_MAXNAMELEN + 1];
@@ -1025,7 +1076,7 @@ static struct nft_chain *nft_chain_lookup(struct nft_table *table,
nla_strlcpy(search, nla, sizeof(search));
WARN_ON(!rcu_read_lock_held() &&
- !lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ !lockdep_commit_lock_is_held(net));
chain = ERR_PTR(-ENOENT);
rcu_read_lock();
@@ -1265,7 +1316,7 @@ static int nf_tables_getchain(struct net *net, struct sock *nlsk,
return PTR_ERR(table);
}
- chain = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_CHAIN_NAME], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]);
return PTR_ERR(chain);
@@ -1391,13 +1442,16 @@ struct nft_chain_hook {
static int nft_chain_parse_hook(struct net *net,
const struct nlattr * const nla[],
struct nft_chain_hook *hook, u8 family,
- bool create)
+ bool autoload)
{
struct nlattr *ha[NFTA_HOOK_MAX + 1];
const struct nft_chain_type *type;
struct net_device *dev;
int err;
+ lockdep_assert_held(&net->nft.commit_mutex);
+ lockdep_nfnl_nft_mutex_not_held();
+
err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK],
nft_hook_policy, NULL);
if (err < 0)
@@ -1412,8 +1466,8 @@ static int nft_chain_parse_hook(struct net *net,
type = chain_type[family][NFT_CHAIN_T_DEFAULT];
if (nla[NFTA_CHAIN_TYPE]) {
- type = nf_tables_chain_type_lookup(nla[NFTA_CHAIN_TYPE],
- family, create);
+ type = nf_tables_chain_type_lookup(net, nla[NFTA_CHAIN_TYPE],
+ family, autoload);
if (IS_ERR(type))
return PTR_ERR(type);
}
@@ -1480,7 +1534,7 @@ static struct nft_rule **nf_tables_chain_alloc_rules(const struct nft_chain *cha
}
static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
- u8 policy, bool create)
+ u8 policy)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -1498,7 +1552,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask,
struct nft_chain_hook hook;
struct nf_hook_ops *ops;
- err = nft_chain_parse_hook(net, nla, &hook, family, create);
+ err = nft_chain_parse_hook(net, nla, &hook, family, true);
if (err < 0)
return err;
@@ -1589,8 +1643,7 @@ err1:
return err;
}
-static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
- bool create)
+static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy)
{
const struct nlattr * const *nla = ctx->nla;
struct nft_table *table = ctx->table;
@@ -1598,7 +1651,6 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
struct nft_base_chain *basechain;
struct nft_stats *stats = NULL;
struct nft_chain_hook hook;
- const struct nlattr *name;
struct nf_hook_ops *ops;
struct nft_trans *trans;
int err;
@@ -1608,7 +1660,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
return -EBUSY;
err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family,
- create);
+ false);
if (err < 0)
return err;
@@ -1632,7 +1684,8 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
nla[NFTA_CHAIN_NAME]) {
struct nft_chain *chain2;
- chain2 = nft_chain_lookup(table, nla[NFTA_CHAIN_NAME], genmask);
+ chain2 = nft_chain_lookup(ctx->net, table,
+ nla[NFTA_CHAIN_NAME], genmask);
if (!IS_ERR(chain2))
return -EEXIST;
}
@@ -1646,12 +1699,11 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
return PTR_ERR(stats);
}
+ err = -ENOMEM;
trans = nft_trans_alloc(ctx, NFT_MSG_NEWCHAIN,
sizeof(struct nft_trans_chain));
- if (trans == NULL) {
- free_percpu(stats);
- return -ENOMEM;
- }
+ if (trans == NULL)
+ goto err;
nft_trans_chain_stats(trans) = stats;
nft_trans_chain_update(trans) = true;
@@ -1661,19 +1713,37 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy,
else
nft_trans_chain_policy(trans) = -1;
- name = nla[NFTA_CHAIN_NAME];
- if (nla[NFTA_CHAIN_HANDLE] && name) {
- nft_trans_chain_name(trans) =
- nla_strdup(name, GFP_KERNEL);
- if (!nft_trans_chain_name(trans)) {
- kfree(trans);
- free_percpu(stats);
- return -ENOMEM;
+ if (nla[NFTA_CHAIN_HANDLE] &&
+ nla[NFTA_CHAIN_NAME]) {
+ struct nft_trans *tmp;
+ char *name;
+
+ err = -ENOMEM;
+ name = nla_strdup(nla[NFTA_CHAIN_NAME], GFP_KERNEL);
+ if (!name)
+ goto err;
+
+ err = -EEXIST;
+ list_for_each_entry(tmp, &ctx->net->nft.commit_list, list) {
+ if (tmp->msg_type == NFT_MSG_NEWCHAIN &&
+ tmp->ctx.table == table &&
+ nft_trans_chain_update(tmp) &&
+ nft_trans_chain_name(tmp) &&
+ strcmp(name, nft_trans_chain_name(tmp)) == 0) {
+ kfree(name);
+ goto err;
+ }
}
+
+ nft_trans_chain_name(trans) = name;
}
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
+err:
+ free_percpu(stats);
+ kfree(trans);
+ return err;
}
static int nf_tables_newchain(struct net *net, struct sock *nlsk,
@@ -1690,9 +1760,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
u8 policy = NF_ACCEPT;
struct nft_ctx ctx;
u64 handle = 0;
- bool create;
- create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+ lockdep_assert_held(&net->nft.commit_mutex);
table = nft_table_lookup(net, nla[NFTA_CHAIN_TABLE], family, genmask);
if (IS_ERR(table)) {
@@ -1712,7 +1781,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
}
attr = nla[NFTA_CHAIN_HANDLE];
} else {
- chain = nft_chain_lookup(table, attr, genmask);
+ chain = nft_chain_lookup(net, table, attr, genmask);
if (IS_ERR(chain)) {
if (PTR_ERR(chain) != -ENOENT) {
NL_SET_BAD_ATTR(extack, attr);
@@ -1755,10 +1824,10 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- return nf_tables_updchain(&ctx, genmask, policy, create);
+ return nf_tables_updchain(&ctx, genmask, policy);
}
- return nf_tables_addchain(&ctx, family, genmask, policy, create);
+ return nf_tables_addchain(&ctx, family, genmask, policy);
}
static int nf_tables_delchain(struct net *net, struct sock *nlsk,
@@ -1790,7 +1859,7 @@ static int nf_tables_delchain(struct net *net, struct sock *nlsk,
chain = nft_chain_lookup_byhandle(table, handle, genmask);
} else {
attr = nla[NFTA_CHAIN_NAME];
- chain = nft_chain_lookup(table, attr, genmask);
+ chain = nft_chain_lookup(net, table, attr, genmask);
}
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, attr);
@@ -1875,7 +1944,8 @@ static const struct nft_expr_type *__nft_expr_type_get(u8 family,
return NULL;
}
-static const struct nft_expr_type *nft_expr_type_get(u8 family,
+static const struct nft_expr_type *nft_expr_type_get(struct net *net,
+ u8 family,
struct nlattr *nla)
{
const struct nft_expr_type *type;
@@ -1887,19 +1957,16 @@ static const struct nft_expr_type *nft_expr_type_get(u8 family,
if (type != NULL && try_module_get(type->owner))
return type;
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-expr-%u-%.*s", family,
- nla_len(nla), (char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-expr-%u-%.*s", family,
+ nla_len(nla), (char *)nla_data(nla));
if (__nft_expr_type_get(family, nla))
return ERR_PTR(-EAGAIN);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-expr-%.*s",
- nla_len(nla), (char *)nla_data(nla));
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-expr-%.*s",
+ nla_len(nla), (char *)nla_data(nla));
if (__nft_expr_type_get(family, nla))
return ERR_PTR(-EAGAIN);
}
@@ -1968,7 +2035,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx,
if (err < 0)
return err;
- type = nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]);
+ type = nft_expr_type_get(ctx->net, ctx->family, tb[NFTA_EXPR_NAME]);
if (IS_ERR(type))
return PTR_ERR(type);
@@ -2255,6 +2322,39 @@ done:
return skb->len;
}
+static int nf_tables_dump_rules_start(struct netlink_callback *cb)
+{
+ const struct nlattr * const *nla = cb->data;
+ struct nft_rule_dump_ctx *ctx = NULL;
+
+ if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
+ ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (nla[NFTA_RULE_TABLE]) {
+ ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
+ GFP_ATOMIC);
+ if (!ctx->table) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ }
+ if (nla[NFTA_RULE_CHAIN]) {
+ ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
+ GFP_ATOMIC);
+ if (!ctx->chain) {
+ kfree(ctx->table);
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ }
+ }
+
+ cb->data = ctx;
+ return 0;
+}
+
static int nf_tables_dump_rules_done(struct netlink_callback *cb)
{
struct nft_rule_dump_ctx *ctx = cb->data;
@@ -2284,38 +2384,13 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
+ .start= nf_tables_dump_rules_start,
.dump = nf_tables_dump_rules,
.done = nf_tables_dump_rules_done,
.module = THIS_MODULE,
+ .data = (void *)nla,
};
- if (nla[NFTA_RULE_TABLE] || nla[NFTA_RULE_CHAIN]) {
- struct nft_rule_dump_ctx *ctx;
-
- ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
- if (!ctx)
- return -ENOMEM;
-
- if (nla[NFTA_RULE_TABLE]) {
- ctx->table = nla_strdup(nla[NFTA_RULE_TABLE],
- GFP_ATOMIC);
- if (!ctx->table) {
- kfree(ctx);
- return -ENOMEM;
- }
- }
- if (nla[NFTA_RULE_CHAIN]) {
- ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN],
- GFP_ATOMIC);
- if (!ctx->chain) {
- kfree(ctx->table);
- kfree(ctx);
- return -ENOMEM;
- }
- }
- c.data = ctx;
- }
-
return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
@@ -2325,7 +2400,7 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk,
return PTR_ERR(table);
}
- chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
@@ -2359,6 +2434,7 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
{
struct nft_expr *expr;
+ lockdep_assert_held(&ctx->net->nft.commit_mutex);
/*
* Careful: some expressions might not be initialized in case this
* is called on error from nf_tables_newrule().
@@ -2385,6 +2461,9 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain)
struct nft_rule *rule;
int err;
+ if (ctx->level == NFT_JUMP_STACK_SIZE)
+ return -EMLINK;
+
list_for_each_entry(rule, &chain->rules, list) {
if (!nft_is_active_next(ctx->net, rule))
continue;
@@ -2427,8 +2506,6 @@ static int nft_table_validate(struct net *net, const struct nft_table *table)
#define NFT_RULE_MAXEXPRS 128
-static struct nft_expr_info *info;
-
static int nf_tables_newrule(struct net *net, struct sock *nlsk,
struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[],
@@ -2436,6 +2513,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u8 genmask = nft_genmask_next(net);
+ struct nft_expr_info *info = NULL;
int family = nfmsg->nfgen_family;
struct nft_table *table;
struct nft_chain *chain;
@@ -2447,10 +2525,9 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
struct nlattr *tmp;
unsigned int size, i, n, ulen = 0, usize = 0;
int err, rem;
- bool create;
u64 handle, pos_handle;
- create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
+ lockdep_assert_held(&net->nft.commit_mutex);
table = nft_table_lookup(net, nla[NFTA_RULE_TABLE], family, genmask);
if (IS_ERR(table)) {
@@ -2458,7 +2535,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
return PTR_ERR(table);
}
- chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN], genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
@@ -2481,7 +2558,8 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
else
return -EOPNOTSUPP;
} else {
- if (!create || nlh->nlmsg_flags & NLM_F_REPLACE)
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE) ||
+ nlh->nlmsg_flags & NLM_F_REPLACE)
return -EINVAL;
handle = nf_tables_alloc_handle(table);
@@ -2506,6 +2584,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
n = 0;
size = 0;
if (nla[NFTA_RULE_EXPRESSIONS]) {
+ info = kvmalloc_array(NFT_RULE_MAXEXPRS,
+ sizeof(struct nft_expr_info),
+ GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+
nla_for_each_nested(tmp, nla[NFTA_RULE_EXPRESSIONS], rem) {
err = -EINVAL;
if (nla_type(tmp) != NFTA_LIST_ELEM)
@@ -2598,6 +2682,7 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk,
list_add_rcu(&rule->list, &chain->rules);
}
}
+ kvfree(info);
chain->use++;
if (net->nft.validate_state == NFT_VALIDATE_DO)
@@ -2611,6 +2696,7 @@ err1:
if (info[i].ops != NULL)
module_put(info[i].ops->type->owner);
}
+ kvfree(info);
return err;
}
@@ -2650,7 +2736,8 @@ static int nf_tables_delrule(struct net *net, struct sock *nlsk,
}
if (nla[NFTA_RULE_CHAIN]) {
- chain = nft_chain_lookup(table, nla[NFTA_RULE_CHAIN], genmask);
+ chain = nft_chain_lookup(net, table, nla[NFTA_RULE_CHAIN],
+ genmask);
if (IS_ERR(chain)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_CHAIN]);
return PTR_ERR(chain);
@@ -2742,11 +2829,11 @@ nft_select_set_ops(const struct nft_ctx *ctx,
const struct nft_set_type *type;
u32 flags = 0;
+ lockdep_assert_held(&ctx->net->nft.commit_mutex);
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (list_empty(&nf_tables_set_types)) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-set");
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(ctx->net, "nft-set");
if (!list_empty(&nf_tables_set_types))
return ERR_PTR(-EAGAIN);
}
@@ -3162,6 +3249,18 @@ done:
return skb->len;
}
+static int nf_tables_dump_sets_start(struct netlink_callback *cb)
+{
+ struct nft_ctx *ctx_dump = NULL;
+
+ ctx_dump = kmemdup(cb->data, sizeof(*ctx_dump), GFP_ATOMIC);
+ if (ctx_dump == NULL)
+ return -ENOMEM;
+
+ cb->data = ctx_dump;
+ return 0;
+}
+
static int nf_tables_dump_sets_done(struct netlink_callback *cb)
{
kfree(cb->data);
@@ -3189,18 +3288,12 @@ static int nf_tables_getset(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
+ .start = nf_tables_dump_sets_start,
.dump = nf_tables_dump_sets,
.done = nf_tables_dump_sets_done,
+ .data = &ctx,
.module = THIS_MODULE,
};
- struct nft_ctx *ctx_dump;
-
- ctx_dump = kmalloc(sizeof(*ctx_dump), GFP_ATOMIC);
- if (ctx_dump == NULL)
- return -ENOMEM;
-
- *ctx_dump = ctx;
- c.data = ctx_dump;
return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
@@ -3262,7 +3355,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
struct nft_ctx ctx;
char *name;
unsigned int size;
- bool create;
u64 timeout;
u32 ktype, dtype, flags, policy, gc_int, objtype;
struct nft_set_desc desc;
@@ -3363,8 +3455,6 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
return err;
}
- create = nlh->nlmsg_flags & NLM_F_CREATE ? true : false;
-
table = nft_table_lookup(net, nla[NFTA_SET_TABLE], family, genmask);
if (IS_ERR(table)) {
NL_SET_BAD_ATTR(extack, nla[NFTA_SET_TABLE]);
@@ -3850,6 +3940,15 @@ nla_put_failure:
return -ENOSPC;
}
+static int nf_tables_dump_set_start(struct netlink_callback *cb)
+{
+ struct nft_set_dump_ctx *dump_ctx = cb->data;
+
+ cb->data = kmemdup(dump_ctx, sizeof(*dump_ctx), GFP_ATOMIC);
+
+ return cb->data ? 0 : -ENOMEM;
+}
+
static int nf_tables_dump_set_done(struct netlink_callback *cb)
{
kfree(cb->data);
@@ -3921,7 +4020,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
- const struct nft_set_ext *ext;
struct nft_data_desc desc;
struct nft_set_elem elem;
struct sk_buff *skb;
@@ -3955,7 +4053,6 @@ static int nft_get_set_elem(struct nft_ctx *ctx, struct nft_set *set,
return PTR_ERR(priv);
elem.priv = priv;
- ext = nft_set_elem_ext(set, &elem);
err = -ENOMEM;
skb = nlmsg_new(NLMSG_GOODSIZE, GFP_ATOMIC);
@@ -4003,20 +4100,17 @@ static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
+ .start = nf_tables_dump_set_start,
.dump = nf_tables_dump_set,
.done = nf_tables_dump_set_done,
.module = THIS_MODULE,
};
- struct nft_set_dump_ctx *dump_ctx;
-
- dump_ctx = kmalloc(sizeof(*dump_ctx), GFP_ATOMIC);
- if (!dump_ctx)
- return -ENOMEM;
-
- dump_ctx->set = set;
- dump_ctx->ctx = ctx;
+ struct nft_set_dump_ctx dump_ctx = {
+ .set = set,
+ .ctx = ctx,
+ };
- c.data = dump_ctx;
+ c.data = &dump_ctx;
return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
@@ -4779,7 +4873,8 @@ static const struct nft_object_type *__nft_obj_type_get(u32 objtype)
return NULL;
}
-static const struct nft_object_type *nft_obj_type_get(u32 objtype)
+static const struct nft_object_type *
+nft_obj_type_get(struct net *net, u32 objtype)
{
const struct nft_object_type *type;
@@ -4787,11 +4882,10 @@ static const struct nft_object_type *nft_obj_type_get(u32 objtype)
if (type != NULL && try_module_get(type->owner))
return type;
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nft-obj-%u", objtype);
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nft-obj-%u", objtype);
if (__nft_obj_type_get(objtype))
return ERR_PTR(-EAGAIN);
}
@@ -4843,7 +4937,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk,
nft_ctx_init(&ctx, net, skb, nlh, family, table, NULL, nla);
- type = nft_obj_type_get(objtype);
+ type = nft_obj_type_get(net, objtype);
if (IS_ERR(type))
return PTR_ERR(type);
@@ -4976,38 +5070,42 @@ done:
return skb->len;
}
-static int nf_tables_dump_obj_done(struct netlink_callback *cb)
+static int nf_tables_dump_obj_start(struct netlink_callback *cb)
{
- struct nft_obj_filter *filter = cb->data;
+ const struct nlattr * const *nla = cb->data;
+ struct nft_obj_filter *filter = NULL;
- if (filter) {
- kfree(filter->table);
- kfree(filter);
+ if (nla[NFTA_OBJ_TABLE] || nla[NFTA_OBJ_TYPE]) {
+ filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
+ if (!filter)
+ return -ENOMEM;
+
+ if (nla[NFTA_OBJ_TABLE]) {
+ filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
+ if (!filter->table) {
+ kfree(filter);
+ return -ENOMEM;
+ }
+ }
+
+ if (nla[NFTA_OBJ_TYPE])
+ filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
}
+ cb->data = filter;
return 0;
}
-static struct nft_obj_filter *
-nft_obj_filter_alloc(const struct nlattr * const nla[])
+static int nf_tables_dump_obj_done(struct netlink_callback *cb)
{
- struct nft_obj_filter *filter;
-
- filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
- if (!filter)
- return ERR_PTR(-ENOMEM);
+ struct nft_obj_filter *filter = cb->data;
- if (nla[NFTA_OBJ_TABLE]) {
- filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_ATOMIC);
- if (!filter->table) {
- kfree(filter);
- return ERR_PTR(-ENOMEM);
- }
+ if (filter) {
+ kfree(filter->table);
+ kfree(filter);
}
- if (nla[NFTA_OBJ_TYPE])
- filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE]));
- return filter;
+ return 0;
}
/* called with rcu_read_lock held */
@@ -5028,21 +5126,13 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
+ .start = nf_tables_dump_obj_start,
.dump = nf_tables_dump_obj,
.done = nf_tables_dump_obj_done,
.module = THIS_MODULE,
+ .data = (void *)nla,
};
- if (nla[NFTA_OBJ_TABLE] ||
- nla[NFTA_OBJ_TYPE]) {
- struct nft_obj_filter *filter;
-
- filter = nft_obj_filter_alloc(nla);
- if (IS_ERR(filter))
- return -ENOMEM;
-
- c.data = filter;
- }
return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
@@ -5321,8 +5411,6 @@ static int nf_tables_flowtable_parse_hook(const struct nft_ctx *ctx,
flowtable->ops[i].priv = &flowtable->data;
flowtable->ops[i].hook = flowtable->data.type->hook;
flowtable->ops[i].dev = dev_array[i];
- flowtable->dev_name[i] = kstrdup(dev_array[i]->name,
- GFP_KERNEL);
}
return err;
@@ -5339,7 +5427,8 @@ static const struct nf_flowtable_type *__nft_flowtable_type_get(u8 family)
return NULL;
}
-static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
+static const struct nf_flowtable_type *
+nft_flowtable_type_get(struct net *net, u8 family)
{
const struct nf_flowtable_type *type;
@@ -5347,11 +5436,10 @@ static const struct nf_flowtable_type *nft_flowtable_type_get(u8 family)
if (type != NULL && try_module_get(type->owner))
return type;
+ lockdep_nfnl_nft_mutex_not_held();
#ifdef CONFIG_MODULES
if (type == NULL) {
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
- request_module("nf-flowtable-%u", family);
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ nft_request_module(net, "nf-flowtable-%u", family);
if (__nft_flowtable_type_get(family))
return ERR_PTR(-EAGAIN);
}
@@ -5431,7 +5519,7 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
goto err1;
}
- type = nft_flowtable_type_get(family);
+ type = nft_flowtable_type_get(net, family);
if (IS_ERR(type)) {
err = PTR_ERR(type);
goto err2;
@@ -5480,10 +5568,8 @@ static int nf_tables_newflowtable(struct net *net, struct sock *nlsk,
err6:
i = flowtable->ops_len;
err5:
- for (k = i - 1; k >= 0; k--) {
- kfree(flowtable->dev_name[k]);
+ for (k = i - 1; k >= 0; k--)
nf_unregister_net_hook(net, &flowtable->ops[k]);
- }
kfree(flowtable->ops);
err4:
@@ -5582,9 +5668,10 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net,
goto nla_put_failure;
for (i = 0; i < flowtable->ops_len; i++) {
- if (flowtable->dev_name[i][0] &&
- nla_put_string(skb, NFTA_DEVICE_NAME,
- flowtable->dev_name[i]))
+ const struct net_device *dev = READ_ONCE(flowtable->ops[i].dev);
+
+ if (dev &&
+ nla_put_string(skb, NFTA_DEVICE_NAME, dev->name))
goto nla_put_failure;
}
nla_nest_end(skb, nest_devs);
@@ -5651,37 +5738,39 @@ done:
return skb->len;
}
-static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
+static int nf_tables_dump_flowtable_start(struct netlink_callback *cb)
{
- struct nft_flowtable_filter *filter = cb->data;
+ const struct nlattr * const *nla = cb->data;
+ struct nft_flowtable_filter *filter = NULL;
- if (!filter)
- return 0;
+ if (nla[NFTA_FLOWTABLE_TABLE]) {
+ filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
+ if (!filter)
+ return -ENOMEM;
- kfree(filter->table);
- kfree(filter);
+ filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
+ GFP_ATOMIC);
+ if (!filter->table) {
+ kfree(filter);
+ return -ENOMEM;
+ }
+ }
+ cb->data = filter;
return 0;
}
-static struct nft_flowtable_filter *
-nft_flowtable_filter_alloc(const struct nlattr * const nla[])
+static int nf_tables_dump_flowtable_done(struct netlink_callback *cb)
{
- struct nft_flowtable_filter *filter;
+ struct nft_flowtable_filter *filter = cb->data;
- filter = kzalloc(sizeof(*filter), GFP_ATOMIC);
if (!filter)
- return ERR_PTR(-ENOMEM);
+ return 0;
- if (nla[NFTA_FLOWTABLE_TABLE]) {
- filter->table = nla_strdup(nla[NFTA_FLOWTABLE_TABLE],
- GFP_ATOMIC);
- if (!filter->table) {
- kfree(filter);
- return ERR_PTR(-ENOMEM);
- }
- }
- return filter;
+ kfree(filter->table);
+ kfree(filter);
+
+ return 0;
}
/* called with rcu_read_lock held */
@@ -5701,20 +5790,13 @@ static int nf_tables_getflowtable(struct net *net, struct sock *nlsk,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
+ .start = nf_tables_dump_flowtable_start,
.dump = nf_tables_dump_flowtable,
.done = nf_tables_dump_flowtable_done,
.module = THIS_MODULE,
+ .data = (void *)nla,
};
- if (nla[NFTA_FLOWTABLE_TABLE]) {
- struct nft_flowtable_filter *filter;
-
- filter = nft_flowtable_filter_alloc(nla);
- if (IS_ERR(filter))
- return -ENOMEM;
-
- c.data = filter;
- }
return nft_netlink_dump_start_rcu(nlsk, skb, nlh, &c);
}
@@ -5784,6 +5866,7 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable)
kfree(flowtable->name);
flowtable->data.type->free(&flowtable->data);
module_put(flowtable->data.type->owner);
+ kfree(flowtable);
}
static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net,
@@ -5826,7 +5909,6 @@ static void nft_flowtable_event(unsigned long event, struct net_device *dev,
continue;
nf_unregister_net_hook(dev_net(dev), &flowtable->ops[i]);
- flowtable->dev_name[i][0] = '\0';
flowtable->ops[i].dev = NULL;
break;
}
@@ -5847,13 +5929,13 @@ static int nf_tables_flowtable_event(struct notifier_block *this,
if (!net)
return 0;
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ mutex_lock(&net->nft.commit_mutex);
list_for_each_entry(table, &net->nft.tables, list) {
list_for_each_entry(flowtable, &table->flowtables, list) {
nft_flowtable_event(event, dev, flowtable);
}
}
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ mutex_unlock(&net->nft.commit_mutex);
put_net(net);
return NOTIFY_DONE;
}
@@ -6087,6 +6169,9 @@ static void nft_commit_release(struct nft_trans *trans)
case NFT_MSG_DELTABLE:
nf_tables_table_destroy(&trans->ctx);
break;
+ case NFT_MSG_NEWCHAIN:
+ kfree(nft_trans_chain_name(trans));
+ break;
case NFT_MSG_DELCHAIN:
nf_tables_chain_destroy(&trans->ctx);
break;
@@ -6202,9 +6287,9 @@ static void nf_tables_commit_chain_active(struct net *net, struct nft_chain *cha
next_genbit = nft_gencursor_next(net);
g0 = rcu_dereference_protected(chain->rules_gen_0,
- lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ lockdep_commit_lock_is_held(net));
g1 = rcu_dereference_protected(chain->rules_gen_1,
- lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES));
+ lockdep_commit_lock_is_held(net));
/* No changes to this chain? */
if (chain->rules_next == NULL) {
@@ -6316,13 +6401,15 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_table_notify(&trans->ctx, NFT_MSG_DELTABLE);
break;
case NFT_MSG_NEWCHAIN:
- if (nft_trans_chain_update(trans))
+ if (nft_trans_chain_update(trans)) {
nft_chain_commit_update(trans);
- else
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ /* trans destroyed after rcu grace period */
+ } else {
nft_clear(net, trans->ctx.chain);
-
- nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
- nft_trans_destroy(trans);
+ nf_tables_chain_notify(&trans->ctx, NFT_MSG_NEWCHAIN);
+ nft_trans_destroy(trans);
+ }
break;
case NFT_MSG_DELCHAIN:
nft_chain_del(trans->ctx.chain);
@@ -6412,6 +6499,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb)
nf_tables_commit_release(net);
nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN);
+ mutex_unlock(&net->nft.commit_mutex);
return 0;
}
@@ -6472,7 +6560,7 @@ static int __nf_tables_abort(struct net *net)
case NFT_MSG_NEWCHAIN:
if (nft_trans_chain_update(trans)) {
free_percpu(nft_trans_chain_stats(trans));
-
+ kfree(nft_trans_chain_name(trans));
nft_trans_destroy(trans);
} else {
trans->ctx.table->use--;
@@ -6563,12 +6651,25 @@ static void nf_tables_cleanup(struct net *net)
static int nf_tables_abort(struct net *net, struct sk_buff *skb)
{
- return __nf_tables_abort(net);
+ int ret = __nf_tables_abort(net);
+
+ mutex_unlock(&net->nft.commit_mutex);
+
+ return ret;
}
static bool nf_tables_valid_genid(struct net *net, u32 genid)
{
- return net->nft.base_seq == genid;
+ bool genid_ok;
+
+ mutex_lock(&net->nft.commit_mutex);
+
+ genid_ok = genid == 0 || net->nft.base_seq == genid;
+ if (!genid_ok)
+ mutex_unlock(&net->nft.commit_mutex);
+
+ /* else, commit mutex has to be released by commit or abort function */
+ return genid_ok;
}
static const struct nfnetlink_subsystem nf_tables_subsys = {
@@ -6580,6 +6681,7 @@ static const struct nfnetlink_subsystem nf_tables_subsys = {
.abort = nf_tables_abort,
.cleanup = nf_tables_cleanup,
.valid_genid = nf_tables_valid_genid,
+ .owner = THIS_MODULE,
};
int nft_chain_validate_dependency(const struct nft_chain *chain,
@@ -6838,13 +6940,6 @@ int nft_validate_register_store(const struct nft_ctx *ctx,
err = nf_tables_check_loops(ctx, data->verdict.chain);
if (err < 0)
return err;
-
- if (ctx->chain->level + 1 >
- data->verdict.chain->level) {
- if (ctx->chain->level + 1 == NFT_JUMP_STACK_SIZE)
- return -EMLINK;
- data->verdict.chain->level = ctx->chain->level + 1;
- }
}
return 0;
@@ -6906,8 +7001,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data,
case NFT_GOTO:
if (!tb[NFTA_VERDICT_CHAIN])
return -EINVAL;
- chain = nft_chain_lookup(ctx->table, tb[NFTA_VERDICT_CHAIN],
- genmask);
+ chain = nft_chain_lookup(ctx->net, ctx->table,
+ tb[NFTA_VERDICT_CHAIN], genmask);
if (IS_ERR(chain))
return PTR_ERR(chain);
if (nft_is_base_chain(chain))
@@ -7152,6 +7247,7 @@ static int __net_init nf_tables_init_net(struct net *net)
{
INIT_LIST_HEAD(&net->nft.tables);
INIT_LIST_HEAD(&net->nft.commit_list);
+ mutex_init(&net->nft.commit_mutex);
net->nft.base_seq = 1;
net->nft.validate_state = NFT_VALIDATE_SKIP;
@@ -7160,11 +7256,11 @@ static int __net_init nf_tables_init_net(struct net *net)
static void __net_exit nf_tables_exit_net(struct net *net)
{
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ mutex_lock(&net->nft.commit_mutex);
if (!list_empty(&net->nft.commit_list))
__nf_tables_abort(net);
__nft_release_tables(net);
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ mutex_unlock(&net->nft.commit_mutex);
WARN_ON_ONCE(!list_empty(&net->nft.tables));
}
@@ -7179,29 +7275,19 @@ static int __init nf_tables_module_init(void)
nft_chain_filter_init();
- info = kmalloc_array(NFT_RULE_MAXEXPRS, sizeof(struct nft_expr_info),
- GFP_KERNEL);
- if (info == NULL) {
- err = -ENOMEM;
- goto err1;
- }
-
err = nf_tables_core_module_init();
if (err < 0)
- goto err2;
+ return err;
err = nfnetlink_subsys_register(&nf_tables_subsys);
if (err < 0)
- goto err3;
+ goto err;
register_netdevice_notifier(&nf_tables_flowtable_notifier);
return register_pernet_subsys(&nf_tables_net_ops);
-err3:
+err:
nf_tables_core_module_exit();
-err2:
- kfree(info);
-err1:
return err;
}
@@ -7213,7 +7299,6 @@ static void __exit nf_tables_module_exit(void)
unregister_pernet_subsys(&nf_tables_net_ops);
rcu_barrier();
nf_tables_core_module_exit();
- kfree(info);
}
module_init(nf_tables_module_init);
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index 8de912ca53d3..ffd5c0f9412b 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -120,6 +120,20 @@ struct nft_jumpstack {
struct nft_rule *const *rules;
};
+static void expr_call_ops_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ struct nft_pktinfo *pkt)
+{
+ unsigned long e = (unsigned long)expr->ops->eval;
+
+ if (e == (unsigned long)nft_meta_get_eval)
+ nft_meta_get_eval(expr, regs, pkt);
+ else if (e == (unsigned long)nft_lookup_eval)
+ nft_lookup_eval(expr, regs, pkt);
+ else
+ expr->ops->eval(expr, regs, pkt);
+}
+
unsigned int
nft_do_chain(struct nft_pktinfo *pkt, void *priv)
{
@@ -153,7 +167,7 @@ next_rule:
nft_cmp_fast_eval(expr, &regs);
else if (expr->ops != &nft_payload_fast_ops ||
!nft_payload_fast_eval(expr, &regs, pkt))
- expr->ops->eval(expr, &regs, pkt);
+ expr_call_ops_eval(expr, &regs, pkt);
if (regs.verdict.code != NFT_CONTINUE)
break;
diff --git a/net/netfilter/nf_tables_set_core.c b/net/netfilter/nf_tables_set_core.c
new file mode 100644
index 000000000000..814789644bd3
--- /dev/null
+++ b/net/netfilter/nf_tables_set_core.c
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <net/netfilter/nf_tables_core.h>
+
+static int __init nf_tables_set_module_init(void)
+{
+ nft_register_set(&nft_set_hash_fast_type);
+ nft_register_set(&nft_set_hash_type);
+ nft_register_set(&nft_set_rhash_type);
+ nft_register_set(&nft_set_bitmap_type);
+ nft_register_set(&nft_set_rbtree_type);
+
+ return 0;
+}
+
+static void __exit nf_tables_set_module_exit(void)
+{
+ nft_unregister_set(&nft_set_rbtree_type);
+ nft_unregister_set(&nft_set_bitmap_type);
+ nft_unregister_set(&nft_set_rhash_type);
+ nft_unregister_set(&nft_set_hash_type);
+ nft_unregister_set(&nft_set_hash_fast_type);
+}
+
+module_init(nf_tables_set_module_init);
+module_exit(nf_tables_set_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e1b6be29848d..916913454624 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -331,18 +331,27 @@ replay:
}
}
- if (!ss->commit || !ss->abort) {
+ if (!ss->valid_genid || !ss->commit || !ss->abort) {
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
return kfree_skb(skb);
}
- if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) {
+ if (!try_module_get(ss->owner)) {
+ nfnl_unlock(subsys_id);
+ netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL);
+ return kfree_skb(skb);
+ }
+
+ if (!ss->valid_genid(net, genid)) {
+ module_put(ss->owner);
nfnl_unlock(subsys_id);
netlink_ack(oskb, nlh, -ERESTART, NULL);
return kfree_skb(skb);
}
+ nfnl_unlock(subsys_id);
+
while (skb->len >= nlmsg_total_size(0)) {
int msglen, type;
@@ -464,14 +473,10 @@ ack:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- const struct nfnetlink_subsystem *ss2;
-
- ss2 = nfnl_dereference_protected(subsys_id);
- if (ss2 == ss)
- ss->abort(net, oskb);
+ ss->abort(net, oskb);
nfnl_err_reset(&err_list);
- nfnl_unlock(subsys_id);
kfree_skb(skb);
+ module_put(ss->owner);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
err = ss->commit(net, oskb);
@@ -489,8 +494,8 @@ done:
ss->cleanup(net);
nfnl_err_deliver(&err_list, oskb);
- nfnl_unlock(subsys_id);
kfree_skb(skb);
+ module_put(ss->owner);
}
static const struct nla_policy nfnl_batch_policy[NFNL_BATCH_MAX + 1] = {
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index 9ee5fa551fa6..4199e5300575 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -26,7 +26,6 @@
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
-#include <net/netfilter/nf_conntrack_l3proto.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_timeout.h>
@@ -47,7 +46,7 @@ static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
};
static int
-ctnl_timeout_parse_policy(void *timeouts,
+ctnl_timeout_parse_policy(void *timeout,
const struct nf_conntrack_l4proto *l4proto,
struct net *net, const struct nlattr *attr)
{
@@ -68,7 +67,7 @@ ctnl_timeout_parse_policy(void *timeouts,
if (ret < 0)
goto err;
- ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts);
+ ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeout);
err:
kfree(tb);
@@ -373,7 +372,6 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
struct netlink_ext_ack *extack)
{
const struct nf_conntrack_l4proto *l4proto;
- unsigned int *timeouts;
__u16 l3num;
__u8 l4num;
int ret;
@@ -393,9 +391,7 @@ static int cttimeout_default_set(struct net *net, struct sock *ctnl,
goto err;
}
- timeouts = l4proto->get_timeouts(net);
-
- ret = ctnl_timeout_parse_policy(timeouts, l4proto, net,
+ ret = ctnl_timeout_parse_policy(NULL, l4proto, net,
cda[CTA_TIMEOUT_DATA]);
if (ret < 0)
goto err;
@@ -432,7 +428,6 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) {
struct nlattr *nest_parms;
- unsigned int *timeouts = l4proto->get_timeouts(net);
int ret;
nest_parms = nla_nest_start(skb,
@@ -440,7 +435,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid,
if (!nest_parms)
goto nla_put_failure;
- ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, timeouts);
+ ret = l4proto->ctnl_timeout.obj_to_nlattr(skb, NULL);
if (ret < 0)
goto nla_put_failure;
@@ -508,7 +503,6 @@ err:
return err;
}
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
static struct ctnl_timeout *
ctnl_timeout_find_get(struct net *net, const char *name)
{
@@ -539,7 +533,6 @@ static void ctnl_timeout_put(struct ctnl_timeout *timeout)
module_put(THIS_MODULE);
}
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
static const struct nfnl_callback cttimeout_cb[IPCTNL_MSG_TIMEOUT_MAX] = {
[IPCTNL_MSG_TIMEOUT_NEW] = { .call = cttimeout_new_timeout,
@@ -610,10 +603,8 @@ static int __init cttimeout_init(void)
"nfnetlink.\n");
goto err_out;
}
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, ctnl_timeout_find_get);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, ctnl_timeout_put);
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
return 0;
err_out:
@@ -626,11 +617,9 @@ static void __exit cttimeout_exit(void)
nfnetlink_subsys_unregister(&cttimeout_subsys);
unregister_pernet_subsys(&cttimeout_ops);
-#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
synchronize_rcu();
-#endif /* CONFIG_NF_CONNTRACK_TIMEOUT */
}
module_init(cttimeout_init);
diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c
new file mode 100644
index 000000000000..f9dba62c450f
--- /dev/null
+++ b/net/netfilter/nfnetlink_osf.c
@@ -0,0 +1,436 @@
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+#include <linux/capability.h>
+#include <linux/if.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/list.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/x_tables.h>
+#include <net/netfilter/nf_log.h>
+#include <linux/netfilter/nfnetlink_osf.h>
+
+/*
+ * Indexed by dont-fragment bit.
+ * It is the only constant value in the fingerprint.
+ */
+struct list_head nf_osf_fingers[2];
+EXPORT_SYMBOL_GPL(nf_osf_fingers);
+
+static inline int nf_osf_ttl(const struct sk_buff *skb,
+ int ttl_check, unsigned char f_ttl)
+{
+ const struct iphdr *ip = ip_hdr(skb);
+
+ if (ttl_check != -1) {
+ if (ttl_check == NF_OSF_TTL_TRUE)
+ return ip->ttl == f_ttl;
+ if (ttl_check == NF_OSF_TTL_NOCHECK)
+ return 1;
+ else if (ip->ttl <= f_ttl)
+ return 1;
+ else {
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+ int ret = 0;
+
+ for_ifa(in_dev) {
+ if (inet_ifa_match(ip->saddr, ifa)) {
+ ret = (ip->ttl == f_ttl);
+ break;
+ }
+ }
+ endfor_ifa(in_dev);
+
+ return ret;
+ }
+ }
+
+ return ip->ttl == f_ttl;
+}
+
+struct nf_osf_hdr_ctx {
+ bool df;
+ u16 window;
+ u16 totlen;
+ const unsigned char *optp;
+ unsigned int optsize;
+};
+
+static bool nf_osf_match_one(const struct sk_buff *skb,
+ const struct nf_osf_user_finger *f,
+ int ttl_check,
+ struct nf_osf_hdr_ctx *ctx)
+{
+ unsigned int check_WSS = 0;
+ int fmatch = FMATCH_WRONG;
+ int foptsize, optnum;
+ u16 mss = 0;
+
+ if (ctx->totlen != f->ss || !nf_osf_ttl(skb, ttl_check, f->ttl))
+ return false;
+
+ /*
+ * Should not happen if userspace parser was written correctly.
+ */
+ if (f->wss.wc >= OSF_WSS_MAX)
+ return false;
+
+ /* Check options */
+
+ foptsize = 0;
+ for (optnum = 0; optnum < f->opt_num; ++optnum)
+ foptsize += f->opt[optnum].length;
+
+ if (foptsize > MAX_IPOPTLEN ||
+ ctx->optsize > MAX_IPOPTLEN ||
+ ctx->optsize != foptsize)
+ return false;
+
+ check_WSS = f->wss.wc;
+
+ for (optnum = 0; optnum < f->opt_num; ++optnum) {
+ if (f->opt[optnum].kind == *ctx->optp) {
+ __u32 len = f->opt[optnum].length;
+ const __u8 *optend = ctx->optp + len;
+
+ fmatch = FMATCH_OK;
+
+ switch (*ctx->optp) {
+ case OSFOPT_MSS:
+ mss = ctx->optp[3];
+ mss <<= 8;
+ mss |= ctx->optp[2];
+
+ mss = ntohs((__force __be16)mss);
+ break;
+ case OSFOPT_TS:
+ break;
+ }
+
+ ctx->optp = optend;
+ } else
+ fmatch = FMATCH_OPT_WRONG;
+
+ if (fmatch != FMATCH_OK)
+ break;
+ }
+
+ if (fmatch != FMATCH_OPT_WRONG) {
+ fmatch = FMATCH_WRONG;
+
+ switch (check_WSS) {
+ case OSF_WSS_PLAIN:
+ if (f->wss.val == 0 || ctx->window == f->wss.val)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MSS:
+ /*
+ * Some smart modems decrease mangle MSS to
+ * SMART_MSS_2, so we check standard, decreased
+ * and the one provided in the fingerprint MSS
+ * values.
+ */
+#define SMART_MSS_1 1460
+#define SMART_MSS_2 1448
+ if (ctx->window == f->wss.val * mss ||
+ ctx->window == f->wss.val * SMART_MSS_1 ||
+ ctx->window == f->wss.val * SMART_MSS_2)
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MTU:
+ if (ctx->window == f->wss.val * (mss + 40) ||
+ ctx->window == f->wss.val * (SMART_MSS_1 + 40) ||
+ ctx->window == f->wss.val * (SMART_MSS_2 + 40))
+ fmatch = FMATCH_OK;
+ break;
+ case OSF_WSS_MODULO:
+ if ((ctx->window % f->wss.val) == 0)
+ fmatch = FMATCH_OK;
+ break;
+ }
+ }
+
+ return fmatch == FMATCH_OK;
+}
+
+static const struct tcphdr *nf_osf_hdr_ctx_init(struct nf_osf_hdr_ctx *ctx,
+ const struct sk_buff *skb,
+ const struct iphdr *ip,
+ unsigned char *opts)
+{
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph);
+ if (!tcp)
+ return NULL;
+
+ if (!tcp->syn)
+ return NULL;
+
+ ctx->totlen = ntohs(ip->tot_len);
+ ctx->df = ntohs(ip->frag_off) & IP_DF;
+ ctx->window = ntohs(tcp->window);
+
+ if (tcp->doff * 4 > sizeof(struct tcphdr)) {
+ ctx->optsize = tcp->doff * 4 - sizeof(struct tcphdr);
+
+ ctx->optp = skb_header_pointer(skb, ip_hdrlen(skb) +
+ sizeof(struct tcphdr), ctx->optsize, opts);
+ }
+
+ return tcp;
+}
+
+bool
+nf_osf_match(const struct sk_buff *skb, u_int8_t family,
+ int hooknum, struct net_device *in, struct net_device *out,
+ const struct nf_osf_info *info, struct net *net,
+ const struct list_head *nf_osf_fingers)
+{
+ const struct iphdr *ip = ip_hdr(skb);
+ const struct nf_osf_user_finger *f;
+ unsigned char opts[MAX_IPOPTLEN];
+ const struct nf_osf_finger *kf;
+ int fcount = 0, ttl_check;
+ int fmatch = FMATCH_WRONG;
+ struct nf_osf_hdr_ctx ctx;
+ const struct tcphdr *tcp;
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ if (!tcp)
+ return false;
+
+ ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1;
+
+ list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
+
+ f = &kf->finger;
+
+ if (!(info->flags & NF_OSF_LOG) && strcmp(info->genre, f->genre))
+ continue;
+
+ if (!nf_osf_match_one(skb, f, ttl_check, &ctx))
+ continue;
+
+ fmatch = FMATCH_OK;
+
+ fcount++;
+
+ if (info->flags & NF_OSF_LOG)
+ nf_log_packet(net, family, hooknum, skb,
+ in, out, NULL,
+ "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n",
+ f->genre, f->version, f->subtype,
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest),
+ f->ttl - ip->ttl);
+
+ if ((info->flags & NF_OSF_LOG) &&
+ info->loglevel == NF_OSF_LOGLEVEL_FIRST)
+ break;
+ }
+
+ if (!fcount && (info->flags & NF_OSF_LOG))
+ nf_log_packet(net, family, hooknum, skb, in, out, NULL,
+ "Remote OS is not known: %pI4:%u -> %pI4:%u\n",
+ &ip->saddr, ntohs(tcp->source),
+ &ip->daddr, ntohs(tcp->dest));
+
+ if (fcount)
+ fmatch = FMATCH_OK;
+
+ return fmatch == FMATCH_OK;
+}
+EXPORT_SYMBOL_GPL(nf_osf_match);
+
+const char *nf_osf_find(const struct sk_buff *skb,
+ const struct list_head *nf_osf_fingers)
+{
+ const struct iphdr *ip = ip_hdr(skb);
+ const struct nf_osf_user_finger *f;
+ unsigned char opts[MAX_IPOPTLEN];
+ const struct nf_osf_finger *kf;
+ struct nf_osf_hdr_ctx ctx;
+ const struct tcphdr *tcp;
+ const char *genre = NULL;
+
+ memset(&ctx, 0, sizeof(ctx));
+
+ tcp = nf_osf_hdr_ctx_init(&ctx, skb, ip, opts);
+ if (!tcp)
+ return false;
+
+ list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) {
+ f = &kf->finger;
+ if (!nf_osf_match_one(skb, f, -1, &ctx))
+ continue;
+
+ genre = f->genre;
+ break;
+ }
+
+ return genre;
+}
+EXPORT_SYMBOL_GPL(nf_osf_find);
+
+static const struct nla_policy nfnl_osf_policy[OSF_ATTR_MAX + 1] = {
+ [OSF_ATTR_FINGER] = { .len = sizeof(struct nf_osf_user_finger) },
+};
+
+static int nfnl_osf_add_callback(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const osf_attrs[],
+ struct netlink_ext_ack *extack)
+{
+ struct nf_osf_user_finger *f;
+ struct nf_osf_finger *kf = NULL, *sf;
+ int err = 0;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!osf_attrs[OSF_ATTR_FINGER])
+ return -EINVAL;
+
+ if (!(nlh->nlmsg_flags & NLM_F_CREATE))
+ return -EINVAL;
+
+ f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+ kf = kmalloc(sizeof(struct nf_osf_finger), GFP_KERNEL);
+ if (!kf)
+ return -ENOMEM;
+
+ memcpy(&kf->finger, f, sizeof(struct nf_osf_user_finger));
+
+ list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
+ if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
+ continue;
+
+ kfree(kf);
+ kf = NULL;
+
+ if (nlh->nlmsg_flags & NLM_F_EXCL)
+ err = -EEXIST;
+ break;
+ }
+
+ /*
+ * We are protected by nfnl mutex.
+ */
+ if (kf)
+ list_add_tail_rcu(&kf->finger_entry, &nf_osf_fingers[!!f->df]);
+
+ return err;
+}
+
+static int nfnl_osf_remove_callback(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const osf_attrs[],
+ struct netlink_ext_ack *extack)
+{
+ struct nf_osf_user_finger *f;
+ struct nf_osf_finger *sf;
+ int err = -ENOENT;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!osf_attrs[OSF_ATTR_FINGER])
+ return -EINVAL;
+
+ f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
+
+ list_for_each_entry(sf, &nf_osf_fingers[!!f->df], finger_entry) {
+ if (memcmp(&sf->finger, f, sizeof(struct nf_osf_user_finger)))
+ continue;
+
+ /*
+ * We are protected by nfnl mutex.
+ */
+ list_del_rcu(&sf->finger_entry);
+ kfree_rcu(sf, rcu_head);
+
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static const struct nfnl_callback nfnl_osf_callbacks[OSF_MSG_MAX] = {
+ [OSF_MSG_ADD] = {
+ .call = nfnl_osf_add_callback,
+ .attr_count = OSF_ATTR_MAX,
+ .policy = nfnl_osf_policy,
+ },
+ [OSF_MSG_REMOVE] = {
+ .call = nfnl_osf_remove_callback,
+ .attr_count = OSF_ATTR_MAX,
+ .policy = nfnl_osf_policy,
+ },
+};
+
+static const struct nfnetlink_subsystem nfnl_osf_subsys = {
+ .name = "osf",
+ .subsys_id = NFNL_SUBSYS_OSF,
+ .cb_count = OSF_MSG_MAX,
+ .cb = nfnl_osf_callbacks,
+};
+
+static int __init nfnl_osf_init(void)
+{
+ int err = -EINVAL;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i)
+ INIT_LIST_HEAD(&nf_osf_fingers[i]);
+
+ err = nfnetlink_subsys_register(&nfnl_osf_subsys);
+ if (err < 0) {
+ pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err);
+ goto err_out_exit;
+ }
+ return 0;
+
+err_out_exit:
+ return err;
+}
+
+static void __exit nfnl_osf_fini(void)
+{
+ struct nf_osf_finger *f;
+ int i;
+
+ nfnetlink_subsys_unregister(&nfnl_osf_subsys);
+
+ rcu_read_lock();
+ for (i = 0; i < ARRAY_SIZE(nf_osf_fingers); ++i) {
+ list_for_each_entry_rcu(f, &nf_osf_fingers[i], finger_entry) {
+ list_del_rcu(&f->finger_entry);
+ kfree_rcu(f, rcu_head);
+ }
+ }
+ rcu_read_unlock();
+
+ rcu_barrier();
+}
+
+module_init(nfnl_osf_init);
+module_exit(nfnl_osf_fini);
+
+MODULE_LICENSE("GPL");
diff --git a/net/netfilter/nft_chain_filter.c b/net/netfilter/nft_chain_filter.c
index d21834bed805..ea5b7c4944f6 100644
--- a/net/netfilter/nft_chain_filter.c
+++ b/net/netfilter/nft_chain_filter.c
@@ -322,7 +322,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
if (!ctx.net)
return NOTIFY_DONE;
- nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ mutex_lock(&ctx.net->nft.commit_mutex);
list_for_each_entry(table, &ctx.net->nft.tables, list) {
if (table->family != NFPROTO_NETDEV)
continue;
@@ -337,7 +337,7 @@ static int nf_tables_netdev_event(struct notifier_block *this,
nft_netdev_event(event, dev, &ctx);
}
}
- nfnl_unlock(NFNL_SUBSYS_NFTABLES);
+ mutex_unlock(&ctx.net->nft.commit_mutex);
put_net(ctx.net);
return NOTIFY_DONE;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 8d1ff654e5af..32535eea51b2 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -832,10 +832,18 @@ nft_target_select_ops(const struct nft_ctx *ctx,
rev = ntohl(nla_get_be32(tb[NFTA_TARGET_REV]));
family = ctx->family;
+ if (strcmp(tg_name, XT_ERROR_TARGET) == 0 ||
+ strcmp(tg_name, XT_STANDARD_TARGET) == 0 ||
+ strcmp(tg_name, "standard") == 0)
+ return ERR_PTR(-EINVAL);
+
/* Re-use the existing target if it's already loaded. */
list_for_each_entry(nft_target, &nft_target_list, head) {
struct xt_target *target = nft_target->ops.data;
+ if (!target->target)
+ continue;
+
if (nft_target_cmp(target, tg_name, rev, family))
return &nft_target->ops;
}
@@ -844,6 +852,11 @@ nft_target_select_ops(const struct nft_ctx *ctx,
if (IS_ERR(target))
return ERR_PTR(-ENOENT);
+ if (!target->target) {
+ err = -EINVAL;
+ goto err;
+ }
+
if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO])) {
err = -EINVAL;
goto err;
diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c
index a832c59f0a9c..b90d96ba4a12 100644
--- a/net/netfilter/nft_connlimit.c
+++ b/net/netfilter/nft_connlimit.c
@@ -14,10 +14,9 @@
#include <net/netfilter/nf_conntrack_zones.h>
struct nft_connlimit {
- spinlock_t lock;
- struct hlist_head hhead;
- u32 limit;
- bool invert;
+ struct nf_conncount_list list;
+ u32 limit;
+ bool invert;
};
static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
@@ -45,21 +44,19 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv,
return;
}
- spin_lock_bh(&priv->lock);
- count = nf_conncount_lookup(nft_net(pkt), &priv->hhead, tuple_ptr, zone,
- &addit);
+ nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone,
+ &addit);
+ count = priv->list.count;
if (!addit)
goto out;
- if (!nf_conncount_add(&priv->hhead, tuple_ptr, zone)) {
+ if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) {
regs->verdict.code = NF_DROP;
- spin_unlock_bh(&priv->lock);
return;
}
count++;
out:
- spin_unlock_bh(&priv->lock);
if ((count > priv->limit) ^ priv->invert) {
regs->verdict.code = NFT_BREAK;
@@ -87,8 +84,7 @@ static int nft_connlimit_do_init(const struct nft_ctx *ctx,
invert = true;
}
- spin_lock_init(&priv->lock);
- INIT_HLIST_HEAD(&priv->hhead);
+ nf_conncount_list_init(&priv->list);
priv->limit = limit;
priv->invert = invert;
@@ -99,7 +95,7 @@ static void nft_connlimit_do_destroy(const struct nft_ctx *ctx,
struct nft_connlimit *priv)
{
nf_ct_netns_put(ctx->net, ctx->family);
- nf_conncount_cache_free(&priv->hhead);
+ nf_conncount_cache_free(&priv->list);
}
static int nft_connlimit_do_dump(struct sk_buff *skb,
@@ -212,8 +208,7 @@ static int nft_connlimit_clone(struct nft_expr *dst, const struct nft_expr *src)
struct nft_connlimit *priv_dst = nft_expr_priv(dst);
struct nft_connlimit *priv_src = nft_expr_priv(src);
- spin_lock_init(&priv_dst->lock);
- INIT_HLIST_HEAD(&priv_dst->hhead);
+ nf_conncount_list_init(&priv_dst->list);
priv_dst->limit = priv_src->limit;
priv_dst->invert = priv_src->invert;
@@ -225,21 +220,14 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx,
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- nf_conncount_cache_free(&priv->hhead);
+ nf_conncount_cache_free(&priv->list);
}
static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr)
{
struct nft_connlimit *priv = nft_expr_priv(expr);
- bool addit, ret;
- spin_lock_bh(&priv->lock);
- nf_conncount_lookup(net, &priv->hhead, NULL, &nf_ct_zone_dflt, &addit);
-
- ret = hlist_empty(&priv->hhead);
- spin_unlock_bh(&priv->lock);
-
- return ret;
+ return nf_conncount_gc_list(net, &priv->list);
}
static struct nft_expr_type nft_connlimit_type;
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 1435ffc5f57e..3bc82ee5464d 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -870,7 +870,7 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj,
if (test_bit(IPS_HELPER_BIT, &ct->status))
return;
- help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC);
+ help = nf_ct_helper_ext_add(ct, GFP_ATOMIC);
if (help) {
rcu_assign_pointer(help->helper, to_assign);
set_bit(IPS_HELPER_BIT, &ct->status);
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 27d7e4598ab6..81184c244d1a 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -118,6 +118,8 @@ static int nft_dynset_init(const struct nft_ctx *ctx,
u64 timeout;
int err;
+ lockdep_assert_held(&ctx->net->nft.commit_mutex);
+
if (tb[NFTA_DYNSET_SET_NAME] == NULL ||
tb[NFTA_DYNSET_OP] == NULL ||
tb[NFTA_DYNSET_SREG_KEY] == NULL)
diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c
index 15adf8ca82c3..0777a93211e2 100644
--- a/net/netfilter/nft_immediate.c
+++ b/net/netfilter/nft_immediate.c
@@ -98,6 +98,7 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
const struct nft_data **d)
{
const struct nft_immediate_expr *priv = nft_expr_priv(expr);
+ struct nft_ctx *pctx = (struct nft_ctx *)ctx;
const struct nft_data *data;
int err;
@@ -109,9 +110,11 @@ static int nft_immediate_validate(const struct nft_ctx *ctx,
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
+ pctx->level++;
err = nft_chain_validate(ctx, data->verdict.chain);
if (err < 0)
return err;
+ pctx->level--;
break;
default:
break;
diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c
index 42e6fadf1417..ad13e8643599 100644
--- a/net/netfilter/nft_lookup.c
+++ b/net/netfilter/nft_lookup.c
@@ -26,9 +26,9 @@ struct nft_lookup {
struct nft_set_binding binding;
};
-static void nft_lookup_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_lookup_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_lookup *priv = nft_expr_priv(expr);
const struct nft_set *set = priv->set;
@@ -155,7 +155,9 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
struct nft_set_elem *elem)
{
const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
+ struct nft_ctx *pctx = (struct nft_ctx *)ctx;
const struct nft_data *data;
+ int err;
if (nft_set_ext_exists(ext, NFT_SET_EXT_FLAGS) &&
*nft_set_ext_flags(ext) & NFT_SET_ELEM_INTERVAL_END)
@@ -165,10 +167,17 @@ static int nft_lookup_validate_setelem(const struct nft_ctx *ctx,
switch (data->verdict.code) {
case NFT_JUMP:
case NFT_GOTO:
- return nft_chain_validate(ctx, data->verdict.chain);
+ pctx->level++;
+ err = nft_chain_validate(ctx, data->verdict.chain);
+ if (err < 0)
+ return err;
+ pctx->level--;
+ break;
default:
- return 0;
+ break;
}
+
+ return 0;
}
static int nft_lookup_validate(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index 2b94dcc43456..297fe7d97c18 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -41,9 +41,9 @@ static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
#include "../bridge/br_private.h"
#endif
-static void nft_meta_get_eval(const struct nft_expr *expr,
- struct nft_regs *regs,
- const struct nft_pktinfo *pkt)
+void nft_meta_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
{
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c
index 1f4d0854cf70..649d1700ec5b 100644
--- a/net/netfilter/nft_numgen.c
+++ b/net/netfilter/nft_numgen.c
@@ -237,10 +237,8 @@ static int nft_ng_random_map_init(const struct nft_ctx *ctx,
priv->map = nft_set_lookup_global(ctx->net, ctx->table,
tb[NFTA_NG_SET_NAME],
tb[NFTA_NG_SET_ID], genmask);
- if (IS_ERR(priv->map))
- return PTR_ERR(priv->map);
- return 0;
+ return PTR_ERR_OR_ZERO(priv->map);
}
static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr)
diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c
new file mode 100644
index 000000000000..9b2f3de7be4f
--- /dev/null
+++ b/net/netfilter/nft_osf.c
@@ -0,0 +1,106 @@
+#include <net/ip.h>
+#include <net/tcp.h>
+
+#include <net/netfilter/nf_tables.h>
+#include <linux/netfilter/nfnetlink_osf.h>
+
+#define OSF_GENRE_SIZE 32
+
+struct nft_osf {
+ enum nft_registers dreg:8;
+};
+
+static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = {
+ [NFTA_OSF_DREG] = { .type = NLA_U32 },
+};
+
+static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_osf *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ struct sk_buff *skb = pkt->skb;
+ const struct tcphdr *tcp;
+ struct tcphdr _tcph;
+ const char *os_name;
+
+ tcp = skb_header_pointer(skb, ip_hdrlen(skb),
+ sizeof(struct tcphdr), &_tcph);
+ if (!tcp) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ if (!tcp->syn) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ os_name = nf_osf_find(skb, nf_osf_fingers);
+ if (!os_name)
+ strncpy((char *)dest, "unknown", IFNAMSIZ);
+ else
+ strncpy((char *)dest, os_name, IFNAMSIZ);
+}
+
+static int nft_osf_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_osf *priv = nft_expr_priv(expr);
+ int err;
+
+ priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]);
+ err = nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFTA_DATA_VALUE, OSF_GENRE_SIZE);
+ if (err < 0)
+ return err;
+
+ return 0;
+}
+
+static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_osf *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_osf_type;
+static const struct nft_expr_ops nft_osf_op = {
+ .eval = nft_osf_eval,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_osf)),
+ .init = nft_osf_init,
+ .dump = nft_osf_dump,
+ .type = &nft_osf_type,
+};
+
+static struct nft_expr_type nft_osf_type __read_mostly = {
+ .ops = &nft_osf_op,
+ .name = "osf",
+ .owner = THIS_MODULE,
+ .policy = nft_osf_policy,
+ .maxattr = NFTA_OSF_MAX,
+};
+
+static int __init nft_osf_module_init(void)
+{
+ return nft_register_expr(&nft_osf_type);
+}
+
+static void __exit nft_osf_module_exit(void)
+{
+ return nft_unregister_expr(&nft_osf_type);
+}
+
+module_init(nft_osf_module_init);
+module_exit(nft_osf_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Fernando Fernandez <ffmancera@riseup.net>");
+MODULE_ALIAS_NFT_EXPR("osf");
diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c
index d6626e01c7ee..128bc16f52dd 100644
--- a/net/netfilter/nft_set_bitmap.c
+++ b/net/netfilter/nft_set_bitmap.c
@@ -296,7 +296,7 @@ static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_bitmap_type __read_mostly = {
+struct nft_set_type nft_set_bitmap_type __read_mostly = {
.owner = THIS_MODULE,
.ops = {
.privsize = nft_bitmap_privsize,
@@ -314,20 +314,3 @@ static struct nft_set_type nft_bitmap_type __read_mostly = {
.get = nft_bitmap_get,
},
};
-
-static int __init nft_bitmap_module_init(void)
-{
- return nft_register_set(&nft_bitmap_type);
-}
-
-static void __exit nft_bitmap_module_exit(void)
-{
- nft_unregister_set(&nft_bitmap_type);
-}
-
-module_init(nft_bitmap_module_init);
-module_exit(nft_bitmap_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 6f9a1365a09f..90c3e7e6cacb 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -387,6 +387,7 @@ static void nft_rhash_destroy(const struct nft_set *set)
struct nft_rhash *priv = nft_set_priv(set);
cancel_delayed_work_sync(&priv->gc_work);
+ rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
(void *)set);
}
@@ -654,7 +655,7 @@ static bool nft_hash_fast_estimate(const struct nft_set_desc *desc, u32 features
return true;
}
-static struct nft_set_type nft_rhash_type __read_mostly = {
+struct nft_set_type nft_set_rhash_type __read_mostly = {
.owner = THIS_MODULE,
.features = NFT_SET_MAP | NFT_SET_OBJECT |
NFT_SET_TIMEOUT | NFT_SET_EVAL,
@@ -677,7 +678,7 @@ static struct nft_set_type nft_rhash_type __read_mostly = {
},
};
-static struct nft_set_type nft_hash_type __read_mostly = {
+struct nft_set_type nft_set_hash_type __read_mostly = {
.owner = THIS_MODULE,
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
@@ -697,7 +698,7 @@ static struct nft_set_type nft_hash_type __read_mostly = {
},
};
-static struct nft_set_type nft_hash_fast_type __read_mostly = {
+struct nft_set_type nft_set_hash_fast_type __read_mostly = {
.owner = THIS_MODULE,
.features = NFT_SET_MAP | NFT_SET_OBJECT,
.ops = {
@@ -716,26 +717,3 @@ static struct nft_set_type nft_hash_fast_type __read_mostly = {
.get = nft_hash_get,
},
};
-
-static int __init nft_hash_module_init(void)
-{
- if (nft_register_set(&nft_hash_fast_type) ||
- nft_register_set(&nft_hash_type) ||
- nft_register_set(&nft_rhash_type))
- return 1;
- return 0;
-}
-
-static void __exit nft_hash_module_exit(void)
-{
- nft_unregister_set(&nft_rhash_type);
- nft_unregister_set(&nft_hash_type);
- nft_unregister_set(&nft_hash_fast_type);
-}
-
-module_init(nft_hash_module_init);
-module_exit(nft_hash_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 7f3a9a211034..9873d734b494 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -381,7 +381,7 @@ static void nft_rbtree_gc(struct work_struct *work)
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
if (!gcb)
- goto out;
+ break;
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, rbe);
@@ -390,10 +390,12 @@ static void nft_rbtree_gc(struct work_struct *work)
rbe = rb_entry(prev, struct nft_rbtree_elem, node);
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, rbe);
+ prev = NULL;
}
node = rb_next(node);
+ if (!node)
+ break;
}
-out:
if (gcb) {
for (i = 0; i < gcb->head.cnt; i++) {
rbe = gcb->elems[i];
@@ -440,6 +442,7 @@ static void nft_rbtree_destroy(const struct nft_set *set)
struct rb_node *node;
cancel_delayed_work_sync(&priv->gc_work);
+ rcu_barrier();
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -462,7 +465,7 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
-static struct nft_set_type nft_rbtree_type __read_mostly = {
+struct nft_set_type nft_set_rbtree_type __read_mostly = {
.owner = THIS_MODULE,
.features = NFT_SET_INTERVAL | NFT_SET_MAP | NFT_SET_OBJECT | NFT_SET_TIMEOUT,
.ops = {
@@ -481,20 +484,3 @@ static struct nft_set_type nft_rbtree_type __read_mostly = {
.get = nft_rbtree_get,
},
};
-
-static int __init nft_rbtree_module_init(void)
-{
- return nft_register_set(&nft_rbtree_type);
-}
-
-static void __exit nft_rbtree_module_exit(void)
-{
- nft_unregister_set(&nft_rbtree_type);
-}
-
-module_init(nft_rbtree_module_init);
-module_exit(nft_rbtree_module_exit);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
-MODULE_ALIAS_NFT_SET();
diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
index 998c2b546f6d..d7f3776dfd71 100644
--- a/net/netfilter/nft_socket.c
+++ b/net/netfilter/nft_socket.c
@@ -31,7 +31,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
case NFPROTO_IPV4:
sk = nf_sk_lookup_slow_v4(nft_net(pkt), skb, nft_in(pkt));
break;
-#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
case NFPROTO_IPV6:
sk = nf_sk_lookup_slow_v6(nft_net(pkt), skb, nft_in(pkt));
break;
@@ -43,7 +43,7 @@ static void nft_socket_eval(const struct nft_expr *expr,
}
if (!sk) {
- nft_reg_store8(dest, 0);
+ regs->verdict.code = NFT_BREAK;
return;
}
@@ -54,6 +54,14 @@ static void nft_socket_eval(const struct nft_expr *expr,
case NFT_SOCKET_TRANSPARENT:
nft_reg_store8(dest, inet_sk_transparent(sk));
break;
+ case NFT_SOCKET_MARK:
+ if (sk_fullsock(sk)) {
+ *dest = sk->sk_mark;
+ } else {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ break;
default:
WARN_ON(1);
regs->verdict.code = NFT_BREAK;
@@ -77,7 +85,7 @@ static int nft_socket_init(const struct nft_ctx *ctx,
switch(ctx->family) {
case NFPROTO_IPV4:
-#if IS_ENABLED(CONFIG_NF_SOCKET_IPV6)
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
case NFPROTO_IPV6:
#endif
case NFPROTO_INET:
@@ -91,6 +99,9 @@ static int nft_socket_init(const struct nft_ctx *ctx,
case NFT_SOCKET_TRANSPARENT:
len = sizeof(u8);
break;
+ case NFT_SOCKET_MARK:
+ len = sizeof(u32);
+ break;
default:
return -EOPNOTSUPP;
}
diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
new file mode 100644
index 000000000000..eff99dffc842
--- /dev/null
+++ b/net/netfilter/nft_tproxy.c
@@ -0,0 +1,316 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/module.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tproxy.h>
+#include <net/inet_sock.h>
+#include <net/tcp.h>
+#include <linux/if_ether.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#endif
+
+struct nft_tproxy {
+ enum nft_registers sreg_addr:8;
+ enum nft_registers sreg_port:8;
+ u8 family;
+};
+
+static void nft_tproxy_eval_v4(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ const struct iphdr *iph = ip_hdr(skb);
+ struct udphdr _hdr, *hp;
+ __be32 taddr = 0;
+ __be16 tport = 0;
+ struct sock *sk;
+
+ hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr);
+ if (!hp) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ /* check if there's an ongoing connection on the packet addresses, this
+ * happens if the redirect already happened and the current packet
+ * belongs to an already established connection
+ */
+ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
+ iph->saddr, iph->daddr,
+ hp->source, hp->dest,
+ skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
+
+ if (priv->sreg_addr)
+ taddr = regs->data[priv->sreg_addr];
+ taddr = nf_tproxy_laddr4(skb, taddr, iph->daddr);
+
+ if (priv->sreg_port)
+ tport = regs->data[priv->sreg_port];
+ if (!tport)
+ tport = hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT) {
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = nf_tproxy_handle_time_wait4(nft_net(pkt), skb, taddr, tport, sk);
+ } else if (!sk) {
+ /* no, there's no established connection, check if
+ * there's a listener on the redirected addr/port
+ */
+ sk = nf_tproxy_get_sock_v4(nft_net(pkt), skb, iph->protocol,
+ iph->saddr, taddr,
+ hp->source, tport,
+ skb->dev, NF_TPROXY_LOOKUP_LISTENER);
+ }
+
+ if (sk && nf_tproxy_sk_is_transparent(sk))
+ nf_tproxy_assign_sock(skb, sk);
+ else
+ regs->verdict.code = NFT_BREAK;
+}
+
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+static void nft_tproxy_eval_v6(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ const struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct in6_addr taddr = {0};
+ int thoff = pkt->xt.thoff;
+ struct udphdr _hdr, *hp;
+ __be16 tport = 0;
+ struct sock *sk;
+ int l4proto;
+
+ if (!pkt->tprot_set) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ l4proto = pkt->tprot;
+
+ hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr);
+ if (hp == NULL) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+
+ /* check if there's an ongoing connection on the packet addresses, this
+ * happens if the redirect already happened and the current packet
+ * belongs to an already established connection
+ */
+ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff, l4proto,
+ &iph->saddr, &iph->daddr,
+ hp->source, hp->dest,
+ nft_in(pkt), NF_TPROXY_LOOKUP_ESTABLISHED);
+
+ if (priv->sreg_addr)
+ memcpy(&taddr, &regs->data[priv->sreg_addr], sizeof(taddr));
+ taddr = *nf_tproxy_laddr6(skb, &taddr, &iph->daddr);
+
+ if (priv->sreg_port)
+ tport = regs->data[priv->sreg_port];
+ if (!tport)
+ tport = hp->dest;
+
+ /* UDP has no TCP_TIME_WAIT state, so we never enter here */
+ if (sk && sk->sk_state == TCP_TIME_WAIT) {
+ /* reopening a TIME_WAIT connection needs special handling */
+ sk = nf_tproxy_handle_time_wait6(skb, l4proto, thoff,
+ nft_net(pkt),
+ &taddr,
+ tport,
+ sk);
+ } else if (!sk) {
+ /* no there's no established connection, check if
+ * there's a listener on the redirected addr/port
+ */
+ sk = nf_tproxy_get_sock_v6(nft_net(pkt), skb, thoff,
+ l4proto, &iph->saddr, &taddr,
+ hp->source, tport,
+ nft_in(pkt), NF_TPROXY_LOOKUP_LISTENER);
+ }
+
+ /* NOTE: assign_sock consumes our sk reference */
+ if (sk && nf_tproxy_sk_is_transparent(sk))
+ nf_tproxy_assign_sock(skb, sk);
+ else
+ regs->verdict.code = NFT_BREAK;
+}
+#endif
+
+static void nft_tproxy_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+ switch (nft_pf(pkt)) {
+ case NFPROTO_IPV4:
+ switch (priv->family) {
+ case NFPROTO_IPV4:
+ case NFPROTO_UNSPEC:
+ nft_tproxy_eval_v4(expr, regs, pkt);
+ return;
+ }
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ switch (priv->family) {
+ case NFPROTO_IPV6:
+ case NFPROTO_UNSPEC:
+ nft_tproxy_eval_v6(expr, regs, pkt);
+ return;
+ }
+#endif
+ }
+ regs->verdict.code = NFT_BREAK;
+}
+
+static const struct nla_policy nft_tproxy_policy[NFTA_TPROXY_MAX + 1] = {
+ [NFTA_TPROXY_FAMILY] = { .type = NLA_U32 },
+ [NFTA_TPROXY_REG_ADDR] = { .type = NLA_U32 },
+ [NFTA_TPROXY_REG_PORT] = { .type = NLA_U32 },
+};
+
+static int nft_tproxy_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_tproxy *priv = nft_expr_priv(expr);
+ unsigned int alen = 0;
+ int err;
+
+ if (!tb[NFTA_TPROXY_FAMILY] ||
+ (!tb[NFTA_TPROXY_REG_ADDR] && !tb[NFTA_TPROXY_REG_PORT]))
+ return -EINVAL;
+
+ priv->family = ntohl(nla_get_be32(tb[NFTA_TPROXY_FAMILY]));
+
+ switch (ctx->family) {
+ case NFPROTO_IPV4:
+ if (priv->family != NFPROTO_IPV4)
+ return -EINVAL;
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ if (priv->family != NFPROTO_IPV6)
+ return -EINVAL;
+ break;
+#endif
+ case NFPROTO_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* Address is specified but the rule family is not set accordingly */
+ if (priv->family == NFPROTO_UNSPEC && tb[NFTA_TPROXY_REG_ADDR])
+ return -EINVAL;
+
+ switch (priv->family) {
+ case NFPROTO_IPV4:
+ alen = FIELD_SIZEOF(union nf_inet_addr, in);
+ err = nf_defrag_ipv4_enable(ctx->net);
+ if (err)
+ return err;
+ break;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ case NFPROTO_IPV6:
+ alen = FIELD_SIZEOF(union nf_inet_addr, in6);
+ err = nf_defrag_ipv6_enable(ctx->net);
+ if (err)
+ return err;
+ break;
+#endif
+ case NFPROTO_UNSPEC:
+ /* No address is specified here */
+ err = nf_defrag_ipv4_enable(ctx->net);
+ if (err)
+ return err;
+#if IS_ENABLED(CONFIG_NF_TABLES_IPV6)
+ err = nf_defrag_ipv6_enable(ctx->net);
+ if (err)
+ return err;
+#endif
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ if (tb[NFTA_TPROXY_REG_ADDR]) {
+ priv->sreg_addr = nft_parse_register(tb[NFTA_TPROXY_REG_ADDR]);
+ err = nft_validate_register_load(priv->sreg_addr, alen);
+ if (err < 0)
+ return err;
+ }
+
+ if (tb[NFTA_TPROXY_REG_PORT]) {
+ priv->sreg_port = nft_parse_register(tb[NFTA_TPROXY_REG_PORT]);
+ err = nft_validate_register_load(priv->sreg_port, sizeof(u16));
+ if (err < 0)
+ return err;
+ }
+
+ return 0;
+}
+
+static int nft_tproxy_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_tproxy *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_TPROXY_FAMILY, htonl(priv->family)))
+ return -1;
+
+ if (priv->sreg_addr &&
+ nft_dump_register(skb, NFTA_TPROXY_REG_ADDR, priv->sreg_addr))
+ return -1;
+
+ if (priv->sreg_port &&
+ nft_dump_register(skb, NFTA_TPROXY_REG_PORT, priv->sreg_port))
+ return -1;
+
+ return 0;
+}
+
+static struct nft_expr_type nft_tproxy_type;
+static const struct nft_expr_ops nft_tproxy_ops = {
+ .type = &nft_tproxy_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_tproxy)),
+ .eval = nft_tproxy_eval,
+ .init = nft_tproxy_init,
+ .dump = nft_tproxy_dump,
+};
+
+static struct nft_expr_type nft_tproxy_type __read_mostly = {
+ .name = "tproxy",
+ .ops = &nft_tproxy_ops,
+ .policy = nft_tproxy_policy,
+ .maxattr = NFTA_TPROXY_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_tproxy_module_init(void)
+{
+ return nft_register_expr(&nft_tproxy_type);
+}
+
+static void __exit nft_tproxy_module_exit(void)
+{
+ nft_unregister_expr(&nft_tproxy_type);
+}
+
+module_init(nft_tproxy_module_init);
+module_exit(nft_tproxy_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Máté Eckl");
+MODULE_DESCRIPTION("nf_tables tproxy support module");
+MODULE_ALIAS_NFT_EXPR("tproxy");
diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c
new file mode 100644
index 000000000000..3a15f219e4e7
--- /dev/null
+++ b/net/netfilter/nft_tunnel.c
@@ -0,0 +1,566 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/seqlock.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/dst_metadata.h>
+#include <net/ip_tunnels.h>
+#include <net/vxlan.h>
+#include <net/erspan.h>
+
+struct nft_tunnel {
+ enum nft_tunnel_keys key:8;
+ enum nft_registers dreg:8;
+};
+
+static void nft_tunnel_get_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+ u32 *dest = &regs->data[priv->dreg];
+ struct ip_tunnel_info *tun_info;
+
+ tun_info = skb_tunnel_info(pkt->skb);
+
+ switch (priv->key) {
+ case NFT_TUNNEL_PATH:
+ nft_reg_store8(dest, !!tun_info);
+ break;
+ case NFT_TUNNEL_ID:
+ if (!tun_info) {
+ regs->verdict.code = NFT_BREAK;
+ return;
+ }
+ *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id));
+ break;
+ default:
+ WARN_ON(1);
+ regs->verdict.code = NFT_BREAK;
+ }
+}
+
+static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = {
+ [NFTA_TUNNEL_KEY] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_DREG] = { .type = NLA_U32 },
+};
+
+static int nft_tunnel_get_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_tunnel *priv = nft_expr_priv(expr);
+ u32 len;
+
+ if (!tb[NFTA_TUNNEL_KEY] &&
+ !tb[NFTA_TUNNEL_DREG])
+ return -EINVAL;
+
+ priv->key = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY]));
+ switch (priv->key) {
+ case NFT_TUNNEL_PATH:
+ len = sizeof(u8);
+ break;
+ case NFT_TUNNEL_ID:
+ len = sizeof(u32);
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]);
+
+ return nft_validate_register_store(ctx, priv->dreg, NULL,
+ NFT_DATA_VALUE, len);
+}
+
+static int nft_tunnel_get_dump(struct sk_buff *skb,
+ const struct nft_expr *expr)
+{
+ const struct nft_tunnel *priv = nft_expr_priv(expr);
+
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY, htonl(priv->key)))
+ goto nla_put_failure;
+ if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_tunnel_type;
+static const struct nft_expr_ops nft_tunnel_get_ops = {
+ .type = &nft_tunnel_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_tunnel)),
+ .eval = nft_tunnel_get_eval,
+ .init = nft_tunnel_get_init,
+ .dump = nft_tunnel_get_dump,
+};
+
+static struct nft_expr_type nft_tunnel_type __read_mostly = {
+ .name = "tunnel",
+ .ops = &nft_tunnel_get_ops,
+ .policy = nft_tunnel_policy,
+ .maxattr = NFTA_TUNNEL_MAX,
+ .owner = THIS_MODULE,
+};
+
+struct nft_tunnel_opts {
+ union {
+ struct vxlan_metadata vxlan;
+ struct erspan_metadata erspan;
+ } u;
+ u32 len;
+ __be16 flags;
+};
+
+struct nft_tunnel_obj {
+ struct metadata_dst *md;
+ struct nft_tunnel_opts opts;
+};
+
+static const struct nla_policy nft_tunnel_ip_policy[NFTA_TUNNEL_KEY_IP_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_IP_SRC] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_KEY_IP_DST] = { .type = NLA_U32 },
+};
+
+static int nft_tunnel_obj_ip_init(const struct nft_ctx *ctx,
+ const struct nlattr *attr,
+ struct ip_tunnel_info *info)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_IP_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP_MAX, attr,
+ nft_tunnel_ip_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_IP_DST])
+ return -EINVAL;
+
+ if (tb[NFTA_TUNNEL_KEY_IP_SRC])
+ info->key.u.ipv4.src = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_SRC]);
+ if (tb[NFTA_TUNNEL_KEY_IP_DST])
+ info->key.u.ipv4.dst = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP_DST]);
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_ip6_policy[NFTA_TUNNEL_KEY_IP6_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_IP6_SRC] = { .len = sizeof(struct in6_addr), },
+ [NFTA_TUNNEL_KEY_IP6_DST] = { .len = sizeof(struct in6_addr), },
+ [NFTA_TUNNEL_KEY_IP6_FLOWLABEL] = { .type = NLA_U32, }
+};
+
+static int nft_tunnel_obj_ip6_init(const struct nft_ctx *ctx,
+ const struct nlattr *attr,
+ struct ip_tunnel_info *info)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_IP6_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_IP6_MAX, attr,
+ nft_tunnel_ip6_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_IP6_DST])
+ return -EINVAL;
+
+ if (tb[NFTA_TUNNEL_KEY_IP6_SRC]) {
+ memcpy(&info->key.u.ipv6.src,
+ nla_data(tb[NFTA_TUNNEL_KEY_IP6_SRC]),
+ sizeof(struct in6_addr));
+ }
+ if (tb[NFTA_TUNNEL_KEY_IP6_DST]) {
+ memcpy(&info->key.u.ipv6.dst,
+ nla_data(tb[NFTA_TUNNEL_KEY_IP6_DST]),
+ sizeof(struct in6_addr));
+ }
+ if (tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL])
+ info->key.label = nla_get_be32(tb[NFTA_TUNNEL_KEY_IP6_FLOWLABEL]);
+
+ info->mode |= IP_TUNNEL_INFO_IPV6;
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_vxlan_policy[NFTA_TUNNEL_KEY_VXLAN_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_VXLAN_GBP] = { .type = NLA_U32 },
+};
+
+static int nft_tunnel_obj_vxlan_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_VXLAN_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_VXLAN_MAX, attr,
+ nft_tunnel_opts_vxlan_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (!tb[NFTA_TUNNEL_KEY_VXLAN_GBP])
+ return -EINVAL;
+
+ opts->u.vxlan.gbp = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_VXLAN_GBP]));
+
+ opts->len = sizeof(struct vxlan_metadata);
+ opts->flags = TUNNEL_VXLAN_OPT;
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_erspan_policy[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX] = { .type = NLA_U32 },
+ [NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] = { .type = NLA_U8 },
+ [NFTA_TUNNEL_KEY_ERSPAN_V2_HWID] = { .type = NLA_U8 },
+};
+
+static int nft_tunnel_obj_erspan_init(const struct nlattr *attr,
+ struct nft_tunnel_opts *opts)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_ERSPAN_MAX + 1];
+ uint8_t hwid, dir;
+ int err, version;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_ERSPAN_MAX, attr,
+ nft_tunnel_opts_erspan_policy, NULL);
+ if (err < 0)
+ return err;
+
+ version = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_VERSION]));
+ switch (version) {
+ case ERSPAN_VERSION:
+ if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX])
+ return -EINVAL;
+
+ opts->u.erspan.u.index =
+ nla_get_be32(tb[NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX]);
+ break;
+ case ERSPAN_VERSION2:
+ if (!tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR] ||
+ !tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID])
+ return -EINVAL;
+
+ hwid = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_HWID]);
+ dir = nla_get_u8(tb[NFTA_TUNNEL_KEY_ERSPAN_V2_DIR]);
+
+ set_hwid(&opts->u.erspan.u.md2, hwid);
+ opts->u.erspan.u.md2.dir = dir;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+ opts->u.erspan.version = version;
+
+ opts->len = sizeof(struct erspan_metadata);
+ opts->flags = TUNNEL_ERSPAN_OPT;
+
+ return 0;
+}
+
+static const struct nla_policy nft_tunnel_opts_policy[NFTA_TUNNEL_KEY_OPTS_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_OPTS_VXLAN] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_OPTS_ERSPAN] = { .type = NLA_NESTED, },
+};
+
+static int nft_tunnel_obj_opts_init(const struct nft_ctx *ctx,
+ const struct nlattr *attr,
+ struct ip_tunnel_info *info,
+ struct nft_tunnel_opts *opts)
+{
+ struct nlattr *tb[NFTA_TUNNEL_KEY_OPTS_MAX + 1];
+ int err;
+
+ err = nla_parse_nested(tb, NFTA_TUNNEL_KEY_OPTS_MAX, attr,
+ nft_tunnel_opts_policy, NULL);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_TUNNEL_KEY_OPTS_VXLAN]) {
+ err = nft_tunnel_obj_vxlan_init(tb[NFTA_TUNNEL_KEY_OPTS_VXLAN],
+ opts);
+ } else if (tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN]) {
+ err = nft_tunnel_obj_erspan_init(tb[NFTA_TUNNEL_KEY_OPTS_ERSPAN],
+ opts);
+ } else {
+ return -EOPNOTSUPP;
+ }
+
+ return err;
+}
+
+static const struct nla_policy nft_tunnel_key_policy[NFTA_TUNNEL_KEY_MAX + 1] = {
+ [NFTA_TUNNEL_KEY_IP] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_IP6] = { .type = NLA_NESTED, },
+ [NFTA_TUNNEL_KEY_ID] = { .type = NLA_U32, },
+ [NFTA_TUNNEL_KEY_FLAGS] = { .type = NLA_U32, },
+ [NFTA_TUNNEL_KEY_TOS] = { .type = NLA_U8, },
+ [NFTA_TUNNEL_KEY_TTL] = { .type = NLA_U8, },
+ [NFTA_TUNNEL_KEY_OPTS] = { .type = NLA_NESTED, },
+};
+
+static int nft_tunnel_obj_init(const struct nft_ctx *ctx,
+ const struct nlattr * const tb[],
+ struct nft_object *obj)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+ struct ip_tunnel_info info;
+ struct metadata_dst *md;
+ int err;
+
+ if (!tb[NFTA_TUNNEL_KEY_ID])
+ return -EINVAL;
+
+ memset(&info, 0, sizeof(info));
+ info.mode = IP_TUNNEL_INFO_TX;
+ info.key.tun_id = key32_to_tunnel_id(nla_get_be32(tb[NFTA_TUNNEL_KEY_ID]));
+ info.key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+
+ if (tb[NFTA_TUNNEL_KEY_IP]) {
+ err = nft_tunnel_obj_ip_init(ctx, tb[NFTA_TUNNEL_KEY_IP], &info);
+ if (err < 0)
+ return err;
+ } else if (tb[NFTA_TUNNEL_KEY_IP6]) {
+ err = nft_tunnel_obj_ip6_init(ctx, tb[NFTA_TUNNEL_KEY_IP6], &info);
+ if (err < 0)
+ return err;
+ } else {
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_TUNNEL_KEY_SPORT]) {
+ info.key.tp_src = nla_get_be16(tb[NFTA_TUNNEL_KEY_SPORT]);
+ }
+ if (tb[NFTA_TUNNEL_KEY_DPORT]) {
+ info.key.tp_dst = nla_get_be16(tb[NFTA_TUNNEL_KEY_DPORT]);
+ }
+
+ if (tb[NFTA_TUNNEL_KEY_FLAGS]) {
+ u32 tun_flags;
+
+ tun_flags = ntohl(nla_get_be32(tb[NFTA_TUNNEL_KEY_FLAGS]));
+ if (tun_flags & ~NFT_TUNNEL_F_MASK)
+ return -EOPNOTSUPP;
+
+ if (tun_flags & NFT_TUNNEL_F_ZERO_CSUM_TX)
+ info.key.tun_flags &= ~TUNNEL_CSUM;
+ if (tun_flags & NFT_TUNNEL_F_DONT_FRAGMENT)
+ info.key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+ if (tun_flags & NFT_TUNNEL_F_SEQ_NUMBER)
+ info.key.tun_flags |= TUNNEL_SEQ;
+ }
+ if (tb[NFTA_TUNNEL_KEY_TOS])
+ info.key.tos = nla_get_u8(tb[NFTA_TUNNEL_KEY_TOS]);
+ if (tb[NFTA_TUNNEL_KEY_TTL])
+ info.key.ttl = nla_get_u8(tb[NFTA_TUNNEL_KEY_TTL]);
+ else
+ info.key.ttl = U8_MAX;
+
+ if (tb[NFTA_TUNNEL_KEY_OPTS]) {
+ err = nft_tunnel_obj_opts_init(ctx, tb[NFTA_TUNNEL_KEY_OPTS],
+ &info, &priv->opts);
+ if (err < 0)
+ return err;
+ }
+
+ md = metadata_dst_alloc(priv->opts.len, METADATA_IP_TUNNEL, GFP_KERNEL);
+ if (!md)
+ return -ENOMEM;
+
+ memcpy(&md->u.tun_info, &info, sizeof(info));
+ ip_tunnel_info_opts_set(&md->u.tun_info, &priv->opts.u, priv->opts.len,
+ priv->opts.flags);
+ priv->md = md;
+
+ return 0;
+}
+
+static inline void nft_tunnel_obj_eval(struct nft_object *obj,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+ struct sk_buff *skb = pkt->skb;
+
+ skb_dst_drop(skb);
+ dst_hold((struct dst_entry *) priv->md);
+ skb_dst_set(skb, (struct dst_entry *) priv->md);
+}
+
+static int nft_tunnel_ip_dump(struct sk_buff *skb, struct ip_tunnel_info *info)
+{
+ struct nlattr *nest;
+
+ if (info->mode & IP_TUNNEL_INFO_IPV6) {
+ nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP6);
+ if (!nest)
+ return -1;
+
+ if (nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_SRC, &info->key.u.ipv6.src) < 0 ||
+ nla_put_in6_addr(skb, NFTA_TUNNEL_KEY_IP6_DST, &info->key.u.ipv6.dst) < 0 ||
+ nla_put_be32(skb, NFTA_TUNNEL_KEY_IP6_FLOWLABEL, info->key.label))
+ return -1;
+
+ nla_nest_end(skb, nest);
+ } else {
+ nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_IP);
+ if (!nest)
+ return -1;
+
+ if (nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_SRC, info->key.u.ipv4.src) < 0 ||
+ nla_put_in_addr(skb, NFTA_TUNNEL_KEY_IP_DST, info->key.u.ipv4.dst) < 0)
+ return -1;
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+}
+
+static int nft_tunnel_opts_dump(struct sk_buff *skb,
+ struct nft_tunnel_obj *priv)
+{
+ struct nft_tunnel_opts *opts = &priv->opts;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, NFTA_TUNNEL_KEY_OPTS);
+ if (!nest)
+ return -1;
+
+ if (opts->flags & TUNNEL_VXLAN_OPT) {
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_VXLAN_GBP,
+ htonl(opts->u.vxlan.gbp)))
+ return -1;
+ } else if (opts->flags & TUNNEL_ERSPAN_OPT) {
+ switch (opts->u.erspan.version) {
+ case ERSPAN_VERSION:
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ERSPAN_V1_INDEX,
+ opts->u.erspan.u.index))
+ return -1;
+ break;
+ case ERSPAN_VERSION2:
+ if (nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_HWID,
+ get_hwid(&opts->u.erspan.u.md2)) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_ERSPAN_V2_DIR,
+ opts->u.erspan.u.md2.dir))
+ return -1;
+ break;
+ }
+ }
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+static int nft_tunnel_ports_dump(struct sk_buff *skb,
+ struct ip_tunnel_info *info)
+{
+ if (nla_put_be16(skb, NFTA_TUNNEL_KEY_SPORT, htons(info->key.tp_src)) < 0 ||
+ nla_put_be16(skb, NFTA_TUNNEL_KEY_DPORT, htons(info->key.tp_dst)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_tunnel_flags_dump(struct sk_buff *skb,
+ struct ip_tunnel_info *info)
+{
+ u32 flags = 0;
+
+ if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
+ flags |= NFT_TUNNEL_F_DONT_FRAGMENT;
+ if (!(info->key.tun_flags & TUNNEL_CSUM))
+ flags |= NFT_TUNNEL_F_ZERO_CSUM_TX;
+ if (info->key.tun_flags & TUNNEL_SEQ)
+ flags |= NFT_TUNNEL_F_SEQ_NUMBER;
+
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_FLAGS, htonl(flags)) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int nft_tunnel_obj_dump(struct sk_buff *skb,
+ struct nft_object *obj, bool reset)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+ struct ip_tunnel_info *info = &priv->md->u.tun_info;
+
+ if (nla_put_be32(skb, NFTA_TUNNEL_KEY_ID,
+ tunnel_id_to_key32(info->key.tun_id)) ||
+ nft_tunnel_ip_dump(skb, info) < 0 ||
+ nft_tunnel_ports_dump(skb, info) < 0 ||
+ nft_tunnel_flags_dump(skb, info) < 0 ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_TOS, info->key.tos) ||
+ nla_put_u8(skb, NFTA_TUNNEL_KEY_TTL, info->key.ttl) ||
+ nft_tunnel_opts_dump(skb, priv) < 0)
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static void nft_tunnel_obj_destroy(const struct nft_ctx *ctx,
+ struct nft_object *obj)
+{
+ struct nft_tunnel_obj *priv = nft_obj_data(obj);
+
+ metadata_dst_free(priv->md);
+}
+
+static struct nft_object_type nft_tunnel_obj_type;
+static const struct nft_object_ops nft_tunnel_obj_ops = {
+ .type = &nft_tunnel_obj_type,
+ .size = sizeof(struct nft_tunnel_obj),
+ .eval = nft_tunnel_obj_eval,
+ .init = nft_tunnel_obj_init,
+ .destroy = nft_tunnel_obj_destroy,
+ .dump = nft_tunnel_obj_dump,
+};
+
+static struct nft_object_type nft_tunnel_obj_type __read_mostly = {
+ .type = NFT_OBJECT_TUNNEL,
+ .ops = &nft_tunnel_obj_ops,
+ .maxattr = NFTA_TUNNEL_KEY_MAX,
+ .policy = nft_tunnel_key_policy,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_tunnel_module_init(void)
+{
+ int err;
+
+ err = nft_register_expr(&nft_tunnel_type);
+ if (err < 0)
+ return err;
+
+ err = nft_register_obj(&nft_tunnel_obj_type);
+ if (err < 0)
+ nft_unregister_expr(&nft_tunnel_type);
+
+ return err;
+}
+
+static void __exit nft_tunnel_module_exit(void)
+{
+ nft_unregister_obj(&nft_tunnel_obj_type);
+ nft_unregister_expr(&nft_tunnel_type);
+}
+
+module_init(nft_tunnel_module_init);
+module_exit(nft_tunnel_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_EXPR("tunnel");
+MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_TUNNEL);
diff --git a/net/netfilter/utils.c b/net/netfilter/utils.c
index 0b660c568156..e8da9a9bba73 100644
--- a/net/netfilter/utils.c
+++ b/net/netfilter/utils.c
@@ -1,14 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_queue.h>
+#include <net/ip6_checksum.h>
+
+#ifdef CONFIG_INET
+__sum16 nf_ip_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u8 protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+ break;
+ if ((protocol == 0 && !csum_fold(skb->csum)) ||
+ !csum_tcpudp_magic(iph->saddr, iph->daddr,
+ skb->len - dataoff, protocol,
+ skb->csum)) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ /* fall through */
+ case CHECKSUM_NONE:
+ if (protocol == 0)
+ skb->csum = 0;
+ else
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ skb->len - dataoff,
+ protocol, 0);
+ csum = __skb_checksum_complete(skb);
+ }
+ return csum;
+}
+EXPORT_SYMBOL(nf_ip_checksum);
+#endif
+
+static __sum16 nf_ip_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u8 protocol)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (len == skb->len - dataoff)
+ return nf_ip_checksum(skb, hook, dataoff, protocol);
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr, protocol,
+ skb->len - dataoff, 0);
+ skb->ip_summed = CHECKSUM_NONE;
+ return __skb_checksum_complete_head(skb, dataoff + len);
+ }
+ return csum;
+}
+
+__sum16 nf_ip6_checksum(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, u8 protocol)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (hook != NF_INET_PRE_ROUTING && hook != NF_INET_LOCAL_IN)
+ break;
+ if (!csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ skb->len - dataoff, protocol,
+ csum_sub(skb->csum,
+ skb_checksum(skb, 0,
+ dataoff, 0)))) {
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ break;
+ }
+ /* fall through */
+ case CHECKSUM_NONE:
+ skb->csum = ~csum_unfold(
+ csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+ skb->len - dataoff,
+ protocol,
+ csum_sub(0,
+ skb_checksum(skb, 0,
+ dataoff, 0))));
+ csum = __skb_checksum_complete(skb);
+ }
+ return csum;
+}
+EXPORT_SYMBOL(nf_ip6_checksum);
+
+static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
+ unsigned int dataoff, unsigned int len,
+ u8 protocol)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ __wsum hsum;
+ __sum16 csum = 0;
+
+ switch (skb->ip_summed) {
+ case CHECKSUM_COMPLETE:
+ if (len == skb->len - dataoff)
+ return nf_ip6_checksum(skb, hook, dataoff, protocol);
+ /* fall through */
+ case CHECKSUM_NONE:
+ hsum = skb_checksum(skb, 0, dataoff, 0);
+ skb->csum = ~csum_unfold(csum_ipv6_magic(&ip6h->saddr,
+ &ip6h->daddr,
+ skb->len - dataoff,
+ protocol,
+ csum_sub(0, hsum)));
+ skb->ip_summed = CHECKSUM_NONE;
+ return __skb_checksum_complete_head(skb, dataoff + len);
+ }
+ return csum;
+};
__sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
- unsigned int dataoff, u_int8_t protocol,
+ unsigned int dataoff, u8 protocol,
unsigned short family)
{
- const struct nf_ipv6_ops *v6ops;
__sum16 csum = 0;
switch (family) {
@@ -16,9 +130,7 @@ __sum16 nf_checksum(struct sk_buff *skb, unsigned int hook,
csum = nf_ip_checksum(skb, hook, dataoff, protocol);
break;
case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- csum = v6ops->checksum(skb, hook, dataoff, protocol);
+ csum = nf_ip6_checksum(skb, hook, dataoff, protocol);
break;
}
@@ -28,9 +140,8 @@ EXPORT_SYMBOL_GPL(nf_checksum);
__sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
unsigned int dataoff, unsigned int len,
- u_int8_t protocol, unsigned short family)
+ u8 protocol, unsigned short family)
{
- const struct nf_ipv6_ops *v6ops;
__sum16 csum = 0;
switch (family) {
@@ -39,10 +150,8 @@ __sum16 nf_checksum_partial(struct sk_buff *skb, unsigned int hook,
protocol);
break;
case AF_INET6:
- v6ops = rcu_dereference(nf_ipv6_ops);
- if (v6ops)
- csum = v6ops->checksum_partial(skb, hook, dataoff, len,
- protocol);
+ csum = nf_ip6_checksum_partial(skb, hook, dataoff, len,
+ protocol);
break;
}
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 03b9a50ec93b..7ba454e9e3fa 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -93,7 +93,7 @@ xt_ct_set_helper(struct nf_conn *ct, const char *helper_name,
return -ENOENT;
}
- help = nf_ct_helper_ext_add(ct, helper, GFP_KERNEL);
+ help = nf_ct_helper_ext_add(ct, GFP_KERNEL);
if (help == NULL) {
nf_conntrack_helper_put(helper);
return -ENOMEM;
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 475957cfcf50..0d0d68c989df 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -38,7 +38,7 @@ tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
return XT_CONTINUE;
}
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
@@ -141,7 +141,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_IPV6)
+#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
{
.name = "TEE",
.revision = 1,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 58fce4e749a9..ad7420cdc439 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -36,15 +36,6 @@
#include <net/netfilter/nf_tproxy.h>
#include <linux/netfilter/xt_TPROXY.h>
-/* assign a socket to the skb -- consumes sk */
-static void
-nf_tproxy_assign_sock(struct sk_buff *skb, struct sock *sk)
-{
- skb_orphan(skb);
- skb->sk = sk;
- skb->destructor = sock_edemux;
-}
-
static unsigned int
tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
u_int32_t mark_mask, u_int32_t mark_value)
@@ -61,7 +52,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
skb->dev, NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -77,7 +68,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
else if (!sk)
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, skb, iph->protocol,
iph->saddr, laddr,
hp->source, lport,
skb->dev, NF_TPROXY_LOOKUP_LISTENER);
@@ -150,7 +141,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp, tproto,
+ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
xt_in(par), NF_TPROXY_LOOKUP_ESTABLISHED);
@@ -171,7 +162,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff, hp,
+ sk = nf_tproxy_get_sock_v6(xt_net(par), skb, thoff,
tproto, &iph->saddr, laddr,
hp->source, lport,
xt_in(par), NF_TPROXY_LOOKUP_LISTENER);
diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index 6275106ccf50..bc6c8ab0fa62 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -93,10 +93,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
/* init private data */
info->data = nf_conncount_init(par->net, par->family, keylen);
- if (IS_ERR(info->data))
- return PTR_ERR(info->data);
- return 0;
+ return PTR_ERR_OR_ZERO(info->data);
}
static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index 9cfef73b4107..bf7bba80e24c 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -37,118 +37,6 @@
#include <net/netfilter/nf_log.h>
#include <linux/netfilter/xt_osf.h>
-/*
- * Indexed by dont-fragment bit.
- * It is the only constant value in the fingerprint.
- */
-static struct list_head xt_osf_fingers[2];
-
-static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {
- [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) },
-};
-
-static int xt_osf_add_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb, const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
-{
- struct xt_osf_user_finger *f;
- struct xt_osf_finger *kf = NULL, *sf;
- int err = 0;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!osf_attrs[OSF_ATTR_FINGER])
- return -EINVAL;
-
- if (!(nlh->nlmsg_flags & NLM_F_CREATE))
- return -EINVAL;
-
- f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
-
- kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL);
- if (!kf)
- return -ENOMEM;
-
- memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger));
-
- list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
- if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
- continue;
-
- kfree(kf);
- kf = NULL;
-
- if (nlh->nlmsg_flags & NLM_F_EXCL)
- err = -EEXIST;
- break;
- }
-
- /*
- * We are protected by nfnl mutex.
- */
- if (kf)
- list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]);
-
- return err;
-}
-
-static int xt_osf_remove_callback(struct net *net, struct sock *ctnl,
- struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const osf_attrs[],
- struct netlink_ext_ack *extack)
-{
- struct xt_osf_user_finger *f;
- struct xt_osf_finger *sf;
- int err = -ENOENT;
-
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
-
- if (!osf_attrs[OSF_ATTR_FINGER])
- return -EINVAL;
-
- f = nla_data(osf_attrs[OSF_ATTR_FINGER]);
-
- list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) {
- if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger)))
- continue;
-
- /*
- * We are protected by nfnl mutex.
- */
- list_del_rcu(&sf->finger_entry);
- kfree_rcu(sf, rcu_head);
-
- err = 0;
- break;
- }
-
- return err;
-}
-
-static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = {
- [OSF_MSG_ADD] = {
- .call = xt_osf_add_callback,
- .attr_count = OSF_ATTR_MAX,
- .policy = xt_osf_policy,
- },
- [OSF_MSG_REMOVE] = {
- .call = xt_osf_remove_callback,
- .attr_count = OSF_ATTR_MAX,
- .policy = xt_osf_policy,
- },
-};
-
-static const struct nfnetlink_subsystem xt_osf_nfnetlink = {
- .name = "osf",
- .subsys_id = NFNL_SUBSYS_OSF,
- .cb_count = OSF_MSG_MAX,
- .cb = xt_osf_nfnetlink_callbacks,
-};
-
static bool
xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
{
@@ -159,7 +47,7 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
return false;
return nf_osf_match(skb, xt_family(p), xt_hooknum(p), xt_in(p),
- xt_out(p), info, net, xt_osf_fingers);
+ xt_out(p), info, net, nf_osf_fingers);
}
static struct xt_match xt_osf_match = {
@@ -177,52 +65,21 @@ static struct xt_match xt_osf_match = {
static int __init xt_osf_init(void)
{
- int err = -EINVAL;
- int i;
-
- for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i)
- INIT_LIST_HEAD(&xt_osf_fingers[i]);
-
- err = nfnetlink_subsys_register(&xt_osf_nfnetlink);
- if (err < 0) {
- pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err);
- goto err_out_exit;
- }
+ int err;
err = xt_register_match(&xt_osf_match);
if (err) {
pr_err("Failed to register OS fingerprint "
"matching module (%d)\n", err);
- goto err_out_remove;
+ return err;
}
return 0;
-
-err_out_remove:
- nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
-err_out_exit:
- return err;
}
static void __exit xt_osf_fini(void)
{
- struct xt_osf_finger *f;
- int i;
-
- nfnetlink_subsys_unregister(&xt_osf_nfnetlink);
xt_unregister_match(&xt_osf_match);
-
- rcu_read_lock();
- for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) {
-
- list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) {
- list_del_rcu(&f->finger_entry);
- kfree_rcu(f, rcu_head);
- }
- }
- rcu_read_unlock();
-
- rcu_barrier();
}
module_init(xt_osf_init);
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 393573a99a5a..930d17fa906c 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -63,6 +63,7 @@
#include <linux/hash.h>
#include <linux/genetlink.h>
#include <linux/net_namespace.h>
+#include <linux/nospec.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
@@ -679,6 +680,7 @@ static int netlink_create(struct net *net, struct socket *sock, int protocol,
if (protocol < 0 || protocol >= MAX_LINKS)
return -EPROTONOSUPPORT;
+ protocol = array_index_nospec(protocol, MAX_LINKS);
netlink_lock_table();
#ifdef CONFIG_MODULES
@@ -1009,6 +1011,11 @@ static int netlink_bind(struct socket *sock, struct sockaddr *addr,
return err;
}
+ if (nlk->ngroups == 0)
+ groups = 0;
+ else if (nlk->ngroups < 8*sizeof(groups))
+ groups &= (1UL << nlk->ngroups) - 1;
+
bound = nlk->bound;
if (bound) {
/* Ensure nlk->portid is up-to-date. */
@@ -2300,7 +2307,6 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
cb = &nlk->cb;
memset(cb, 0, sizeof(*cb));
- cb->start = control->start;
cb->dump = control->dump;
cb->done = control->done;
cb->nlh = nlh;
@@ -2309,8 +2315,8 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
cb->min_dump_alloc = control->min_dump_alloc;
cb->skb = skb;
- if (cb->start) {
- ret = cb->start(cb);
+ if (control->start) {
+ ret = control->start(cb);
if (ret)
goto error_put;
}
diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index 2ceefa183cee..6a196e438b6c 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -752,11 +752,14 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
pr_debug("Fragment %zd bytes remaining %zd",
frag_len, remaining_len);
- pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, MSG_DONTWAIT,
+ pdu = nfc_alloc_send_skb(sock->dev, &sock->sk, 0,
frag_len + LLCP_HEADER_SIZE, &err);
if (pdu == NULL) {
- pr_err("Could not allocate PDU\n");
- continue;
+ pr_err("Could not allocate PDU (error=%d)\n", err);
+ len -= remaining_len;
+ if (len == 0)
+ len = err;
+ break;
}
pdu = llcp_add_header(pdu, dsap, ssap, LLCP_PDU_UI);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index ea0c0c6f1874..dd4adf8b1167 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -556,7 +556,7 @@ static __poll_t llcp_sock_poll(struct file *file, struct socket *sock,
pr_debug("%p\n", sk);
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
if (sk->sk_state == LLCP_LISTEN)
return llcp_accept_poll(sk);
diff --git a/net/nsh/nsh.c b/net/nsh/nsh.c
index 9696ef96b719..1a30e165eeb4 100644
--- a/net/nsh/nsh.c
+++ b/net/nsh/nsh.c
@@ -104,7 +104,7 @@ static struct sk_buff *nsh_gso_segment(struct sk_buff *skb,
__skb_pull(skb, nsh_len);
skb_reset_mac_header(skb);
- skb_reset_mac_len(skb);
+ skb->mac_len = proto == htons(ETH_P_TEB) ? ETH_HLEN : 0;
skb->protocol = proto;
features &= NETIF_F_SG;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 284aca2a252d..86a75105af1a 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -26,6 +26,7 @@
#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#include <net/ipv6_frag.h>
#ifdef CONFIG_NF_NAT_NEEDED
#include <linux/netfilter/nf_nat.h>
@@ -607,23 +608,12 @@ static struct nf_conn *
ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
u8 l3num, struct sk_buff *skb, bool natted)
{
- const struct nf_conntrack_l3proto *l3proto;
- const struct nf_conntrack_l4proto *l4proto;
struct nf_conntrack_tuple tuple;
struct nf_conntrack_tuple_hash *h;
struct nf_conn *ct;
- unsigned int dataoff;
- u8 protonum;
- l3proto = __nf_ct_l3proto_find(l3num);
- if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
- &protonum) <= 0) {
- pr_debug("ovs_ct_find_existing: Can't get protonum\n");
- return NULL;
- }
- l4proto = __nf_ct_l4proto_find(l3num, protonum);
- if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
- protonum, net, &tuple, l3proto, l4proto)) {
+ if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num,
+ net, &tuple)) {
pr_debug("ovs_ct_find_existing: Can't get tuple\n");
return NULL;
}
@@ -632,7 +622,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
if (natted) {
struct nf_conntrack_tuple inverse;
- if (!nf_ct_invert_tuple(&inverse, &tuple, l3proto, l4proto)) {
+ if (!nf_ct_invert_tuplepr(&inverse, &tuple)) {
pr_debug("ovs_ct_find_existing: Inversion failed!\n");
return NULL;
}
@@ -1314,7 +1304,7 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
return -EINVAL;
}
- help = nf_ct_helper_ext_add(info->ct, helper, GFP_KERNEL);
+ help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
if (!help) {
nf_conntrack_helper_put(helper);
return -ENOMEM;
diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c
index b891a91577f8..c038e021a591 100644
--- a/net/openvswitch/meter.c
+++ b/net/openvswitch/meter.c
@@ -211,6 +211,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a)
if (!meter)
return ERR_PTR(-ENOMEM);
+ meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]);
meter->used = div_u64(ktime_get_ns(), 1000 * 1000);
meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0;
meter->keep_stats = !a[OVS_METER_ATTR_CLEAR];
@@ -280,6 +281,10 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
u32 meter_id;
bool failed;
+ if (!a[OVS_METER_ATTR_ID]) {
+ return -ENODEV;
+ }
+
meter = dp_meter_create(a);
if (IS_ERR_OR_NULL(meter))
return PTR_ERR(meter);
@@ -298,11 +303,6 @@ static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info)
goto exit_unlock;
}
- if (!a[OVS_METER_ATTR_ID]) {
- err = -ENODEV;
- goto exit_unlock;
- }
-
meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]);
/* Cannot fail after this. */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 00189a3b07f2..345e38058ae5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1582,7 +1582,7 @@ static int fanout_set_data(struct packet_sock *po, char __user *data,
return fanout_set_data_ebpf(po, data, len);
default:
return -EINVAL;
- };
+ }
}
static void fanout_release_data(struct packet_fanout *f)
@@ -1591,7 +1591,7 @@ static void fanout_release_data(struct packet_fanout *f)
case PACKET_FANOUT_CBPF:
case PACKET_FANOUT_EBPF:
__fanout_set_data_bpf(f, NULL);
- };
+ }
}
static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
@@ -2881,6 +2881,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
goto out_free;
} else if (reserve) {
skb_reserve(skb, -reserve);
+ if (len < reserve)
+ skb_reset_network_header(skb);
}
/* Returns -EFAULT on error */
diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c
index 2aa07b547b16..86e1e37eb4e8 100644
--- a/net/qrtr/qrtr.c
+++ b/net/qrtr/qrtr.c
@@ -191,8 +191,13 @@ static int qrtr_node_enqueue(struct qrtr_node *node, struct sk_buff *skb,
hdr->type = cpu_to_le32(type);
hdr->src_node_id = cpu_to_le32(from->sq_node);
hdr->src_port_id = cpu_to_le32(from->sq_port);
- hdr->dst_node_id = cpu_to_le32(to->sq_node);
- hdr->dst_port_id = cpu_to_le32(to->sq_port);
+ if (to->sq_port == QRTR_PORT_CTRL) {
+ hdr->dst_node_id = cpu_to_le32(node->nid);
+ hdr->dst_port_id = cpu_to_le32(QRTR_NODE_BCAST);
+ } else {
+ hdr->dst_node_id = cpu_to_le32(to->sq_node);
+ hdr->dst_port_id = cpu_to_le32(to->sq_port);
+ }
hdr->size = cpu_to_le32(len);
hdr->confirm_rx = 0;
@@ -764,6 +769,10 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
node = NULL;
if (addr->sq_node == QRTR_NODE_BCAST) {
enqueue_fn = qrtr_bcast_enqueue;
+ if (addr->sq_port != QRTR_PORT_CTRL) {
+ release_sock(sk);
+ return -ENOTCONN;
+ }
} else if (addr->sq_node == ipc->us.sq_node) {
enqueue_fn = qrtr_local_enqueue;
} else {
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index bffde4b46c5d..41f75563b54b 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -24,4 +24,3 @@ config RDS_DEBUG
bool "RDS debugging messages"
depends on RDS
default n
-
diff --git a/net/rds/Makefile b/net/rds/Makefile
index b5d568bd479c..e647f9de104a 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -15,4 +15,3 @@ rds_tcp-y := tcp.o tcp_connect.o tcp_listen.o tcp_recv.o \
tcp_send.o tcp_stats.o
ccflags-$(CONFIG_RDS_DEBUG) := -DRDS_DEBUG
-
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index ab751a150f70..65387e1e6964 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
#include <linux/kernel.h>
#include <linux/gfp.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include <linux/poll.h>
#include <net/sock.h>
@@ -113,26 +114,82 @@ void rds_wake_sk_sleep(struct rds_sock *rs)
static int rds_getname(struct socket *sock, struct sockaddr *uaddr,
int peer)
{
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sock->sk);
-
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ int uaddr_len;
/* racey, don't care */
if (peer) {
- if (!rs->rs_conn_addr)
+ if (ipv6_addr_any(&rs->rs_conn_addr))
return -ENOTCONN;
- sin->sin_port = rs->rs_conn_port;
- sin->sin_addr.s_addr = rs->rs_conn_addr;
+ if (ipv6_addr_v4mapped(&rs->rs_conn_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_conn_port;
+ sin->sin_addr.s_addr = rs->rs_conn_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_conn_port;
+ sin6->sin6_addr = rs->rs_conn_addr;
+ sin6->sin6_flowinfo = 0;
+ /* scope_id is the same as in the bound address. */
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
} else {
- sin->sin_port = rs->rs_bound_port;
- sin->sin_addr.s_addr = rs->rs_bound_addr;
+ /* If socket is not yet bound and the socket is connected,
+ * set the return address family to be the same as the
+ * connected address, but with 0 address value. If it is not
+ * connected, set the family to be AF_UNSPEC (value 0) and
+ * the address size to be that of an IPv4 address.
+ */
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
+ if (ipv6_addr_any(&rs->rs_conn_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_UNSPEC;
+ return sizeof(*sin);
+ }
+
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!(ipv6_addr_type(&rs->rs_conn_addr) &
+ IPV6_ADDR_MAPPED)) {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ memset(sin6, 0, sizeof(*sin6));
+ sin6->sin6_family = AF_INET6;
+ return sizeof(*sin6);
+ }
+#endif
+
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin, 0, sizeof(*sin));
+ sin->sin_family = AF_INET;
+ return sizeof(*sin);
+ }
+ if (ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+ sin = (struct sockaddr_in *)uaddr;
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ sin->sin_family = AF_INET;
+ sin->sin_port = rs->rs_bound_port;
+ sin->sin_addr.s_addr = rs->rs_bound_addr_v4;
+ uaddr_len = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = rs->rs_bound_port;
+ sin6->sin6_addr = rs->rs_bound_addr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ uaddr_len = sizeof(*sin6);
+ }
}
- sin->sin_family = AF_INET;
-
- return sizeof(*sin);
+ return uaddr_len;
}
/*
@@ -203,11 +260,12 @@ static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
int len)
{
+ struct sockaddr_in6 sin6;
struct sockaddr_in sin;
int ret = 0;
/* racing with another thread binding seems ok here */
- if (rs->rs_bound_addr == 0) {
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
@@ -215,14 +273,23 @@ static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval,
if (len < sizeof(struct sockaddr_in)) {
ret = -EINVAL;
goto out;
+ } else if (len < sizeof(struct sockaddr_in6)) {
+ /* Assume IPv4 */
+ if (copy_from_user(&sin, optval, sizeof(struct sockaddr_in))) {
+ ret = -EFAULT;
+ goto out;
+ }
+ ipv6_addr_set_v4mapped(sin.sin_addr.s_addr, &sin6.sin6_addr);
+ sin6.sin6_port = sin.sin_port;
+ } else {
+ if (copy_from_user(&sin6, optval,
+ sizeof(struct sockaddr_in6))) {
+ ret = -EFAULT;
+ goto out;
+ }
}
- if (copy_from_user(&sin, optval, sizeof(sin))) {
- ret = -EFAULT;
- goto out;
- }
-
- rds_send_drop_to(rs, &sin);
+ rds_send_drop_to(rs, &sin6);
out:
return ret;
}
@@ -435,31 +502,91 @@ static int rds_connect(struct socket *sock, struct sockaddr *uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ struct sockaddr_in *sin;
struct rds_sock *rs = rds_sk_to_rs(sk);
int ret = 0;
lock_sock(sk);
- if (addr_len != sizeof(struct sockaddr_in)) {
- ret = -EINVAL;
- goto out;
- }
+ switch (uaddr->sa_family) {
+ case AF_INET:
+ sin = (struct sockaddr_in *)uaddr;
+ if (addr_len < sizeof(struct sockaddr_in)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ ret = -EDESTADDRREQ;
+ break;
+ }
+ if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) ||
+ sin->sin_addr.s_addr == htonl(INADDR_BROADCAST)) {
+ ret = -EINVAL;
+ break;
+ }
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &rs->rs_conn_addr);
+ rs->rs_conn_port = sin->sin_port;
+ break;
- if (sin->sin_family != AF_INET) {
- ret = -EAFNOSUPPORT;
- goto out;
- }
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ struct sockaddr_in6 *sin6;
+ int addr_type;
- if (sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
- ret = -EDESTADDRREQ;
- goto out;
+ sin6 = (struct sockaddr_in6 *)uaddr;
+ if (addr_len < sizeof(struct sockaddr_in6)) {
+ ret = -EINVAL;
+ break;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+
+ if (!(addr_type & IPV6_ADDR_MAPPED)) {
+ ret = -EPROTOTYPE;
+ break;
+ }
+
+ /* It is a mapped address. Need to do some sanity
+ * checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(addr4))) {
+ ret = -EPROTOTYPE;
+ break;
+ }
+ }
+
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ /* If socket is arleady bound to a link local address,
+ * the peer address must be on the same link.
+ */
+ if (sin6->sin6_scope_id == 0 ||
+ (!ipv6_addr_any(&rs->rs_bound_addr) &&
+ rs->rs_bound_scope_id &&
+ sin6->sin6_scope_id != rs->rs_bound_scope_id)) {
+ ret = -EINVAL;
+ break;
+ }
+ /* Remember the connected address scope ID. It will
+ * be checked against the binding local address when
+ * the socket is bound.
+ */
+ rs->rs_bound_scope_id = sin6->sin6_scope_id;
+ }
+ rs->rs_conn_addr = sin6->sin6_addr;
+ rs->rs_conn_port = sin6->sin6_port;
+ break;
}
+#endif
- rs->rs_conn_addr = sin->sin_addr.s_addr;
- rs->rs_conn_port = sin->sin_port;
+ default:
+ ret = -EAFNOSUPPORT;
+ break;
+ }
-out:
release_sock(sk);
return ret;
}
@@ -578,8 +705,10 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len,
list_for_each_entry(inc, &rs->rs_recv_queue, i_item) {
total++;
if (total <= len)
- rds_inc_info_copy(inc, iter, inc->i_saddr,
- rs->rs_bound_addr, 1);
+ rds_inc_info_copy(inc, iter,
+ inc->i_saddr.s6_addr32[3],
+ rs->rs_bound_addr_v4,
+ 1);
}
read_unlock(&rs->rs_recv_lock);
@@ -608,8 +737,8 @@ static void rds_sock_info(struct socket *sock, unsigned int len,
list_for_each_entry(rs, &rds_sock_list, rs_item) {
sinfo.sndbuf = rds_sk_sndbuf(rs);
sinfo.rcvbuf = rds_sk_rcvbuf(rs);
- sinfo.bound_addr = rs->rs_bound_addr;
- sinfo.connected_addr = rs->rs_conn_addr;
+ sinfo.bound_addr = rs->rs_bound_addr_v4;
+ sinfo.connected_addr = rs->rs_conn_addr_v4;
sinfo.bound_port = rs->rs_bound_port;
sinfo.connected_port = rs->rs_conn_port;
sinfo.inum = sock_i_ino(rds_rs_to_sk(rs));
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 5aa3a64aa4f0..3ab55784b637 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
#include <linux/kernel.h>
#include <net/sock.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include <linux/if_arp.h>
#include <linux/jhash.h>
#include <linux/ratelimit.h>
@@ -42,42 +43,58 @@ static struct rhashtable bind_hash_table;
static const struct rhashtable_params ht_parms = {
.nelem_hint = 768,
- .key_len = sizeof(u64),
+ .key_len = RDS_BOUND_KEY_LEN,
.key_offset = offsetof(struct rds_sock, rs_bound_key),
.head_offset = offsetof(struct rds_sock, rs_bound_node),
.max_size = 16384,
.min_size = 1024,
};
+/* Create a key for the bind hash table manipulation. Port is in network byte
+ * order.
+ */
+static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr,
+ __be16 port, __u32 scope_id)
+{
+ memcpy(key, addr, sizeof(*addr));
+ key += sizeof(*addr);
+ memcpy(key, &port, sizeof(port));
+ key += sizeof(port);
+ memcpy(key, &scope_id, sizeof(scope_id));
+}
+
/*
* Return the rds_sock bound at the given local address.
*
* The rx path can race with rds_release. We notice if rds_release() has
* marked this socket and don't return a rs ref to the rx path.
*/
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port)
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+ __u32 scope_id)
{
- u64 key = ((u64)addr << 32) | port;
+ u8 key[RDS_BOUND_KEY_LEN];
struct rds_sock *rs;
- rs = rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms);
+ __rds_create_bind_key(key, addr, port, scope_id);
+ rs = rhashtable_lookup_fast(&bind_hash_table, key, ht_parms);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
- rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr,
- ntohs(port));
+ rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
+ ntohs(port));
return rs;
}
/* returns -ve errno or +ve port */
-static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
+static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr,
+ __be16 *port, __u32 scope_id)
{
int ret = -EADDRINUSE;
u16 rover, last;
- u64 key;
+ u8 key[RDS_BOUND_KEY_LEN];
if (*port != 0) {
rover = be16_to_cpu(*port);
@@ -95,12 +112,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
if (rover == RDS_FLAG_PROBE_PORT)
continue;
- key = ((u64)addr << 32) | cpu_to_be16(rover);
- if (rhashtable_lookup_fast(&bind_hash_table, &key, ht_parms))
+ __rds_create_bind_key(key, addr, cpu_to_be16(rover),
+ scope_id);
+ if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms))
continue;
- rs->rs_bound_key = key;
- rs->rs_bound_addr = addr;
+ memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key));
+ rs->rs_bound_addr = *addr;
net_get_random_once(&rs->rs_hash_initval,
sizeof(rs->rs_hash_initval));
rs->rs_bound_port = cpu_to_be16(rover);
@@ -109,12 +127,13 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
if (!rhashtable_insert_fast(&bind_hash_table,
&rs->rs_bound_node, ht_parms)) {
*port = rs->rs_bound_port;
+ rs->rs_bound_scope_id = scope_id;
ret = 0;
- rdsdebug("rs %p binding to %pI4:%d\n",
- rs, &addr, (int)ntohs(*port));
+ rdsdebug("rs %p binding to %pI6c:%d\n",
+ rs, addr, (int)ntohs(*port));
break;
} else {
- rs->rs_bound_addr = 0;
+ rs->rs_bound_addr = in6addr_any;
rds_sock_put(rs);
ret = -ENOMEM;
break;
@@ -127,44 +146,103 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port)
void rds_remove_bound(struct rds_sock *rs)
{
- if (!rs->rs_bound_addr)
+ if (ipv6_addr_any(&rs->rs_bound_addr))
return;
- rdsdebug("rs %p unbinding from %pI4:%d\n",
+ rdsdebug("rs %p unbinding from %pI6c:%d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port));
rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms);
rds_sock_put(rs);
- rs->rs_bound_addr = 0;
+ rs->rs_bound_addr = in6addr_any;
}
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sock *sk = sock->sk;
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
struct rds_sock *rs = rds_sk_to_rs(sk);
+ struct in6_addr v6addr, *binding_addr;
struct rds_transport *trans;
+ __u32 scope_id = 0;
int ret = 0;
+ __be16 port;
+
+ /* We allow an RDS socket to be bound to either IPv4 or IPv6
+ * address.
+ */
+ if (uaddr->sa_family == AF_INET) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+
+ if (addr_len < sizeof(struct sockaddr_in) ||
+ sin->sin_addr.s_addr == htonl(INADDR_ANY) ||
+ sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
+ return -EINVAL;
+ ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr);
+ binding_addr = &v6addr;
+ port = sin->sin_port;
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (uaddr->sa_family == AF_INET6) {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr;
+ int addr_type;
+
+ if (addr_len < sizeof(struct sockaddr_in6))
+ return -EINVAL;
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+ if (!(addr_type & IPV6_ADDR_MAPPED))
+ return -EINVAL;
+
+ /* It is a mapped address. Need to do some sanity
+ * checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(addr4)))
+ return -EINVAL;
+ }
+ /* The scope ID must be specified for link local address. */
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0)
+ return -EINVAL;
+ scope_id = sin6->sin6_scope_id;
+ }
+ binding_addr = &sin6->sin6_addr;
+ port = sin6->sin6_port;
+#endif
+ } else {
+ return -EINVAL;
+ }
lock_sock(sk);
- if (addr_len != sizeof(struct sockaddr_in) ||
- sin->sin_family != AF_INET ||
- rs->rs_bound_addr ||
- sin->sin_addr.s_addr == htonl(INADDR_ANY)) {
+ /* RDS socket does not allow re-binding. */
+ if (!ipv6_addr_any(&rs->rs_bound_addr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ /* Socket is connected. The binding address should have the same
+ * scope ID as the connected address, except the case when one is
+ * non-link local address (scope_id is 0).
+ */
+ if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id &&
+ rs->rs_bound_scope_id &&
+ scope_id != rs->rs_bound_scope_id) {
ret = -EINVAL;
goto out;
}
- ret = rds_add_bound(rs, sin->sin_addr.s_addr, &sin->sin_port);
+ ret = rds_add_bound(rs, binding_addr, &port, scope_id);
if (ret)
goto out;
if (rs->rs_transport) { /* previously bound */
trans = rs->rs_transport;
if (trans->laddr_check(sock_net(sock->sk),
- sin->sin_addr.s_addr) != 0) {
+ binding_addr, scope_id) != 0) {
ret = -ENOPROTOOPT;
rds_remove_bound(rs);
} else {
@@ -172,13 +250,13 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
}
goto out;
}
- trans = rds_trans_get_preferred(sock_net(sock->sk),
- sin->sin_addr.s_addr);
+ trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr,
+ scope_id);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
- pr_info_ratelimited("RDS: %s could not find a transport for %pI4, load rds_tcp or rds_rdma?\n",
- __func__, &sin->sin_addr.s_addr);
+ pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n",
+ __func__, binding_addr);
goto out;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 63da9d2f142d..ccdff09a79c8 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007 Oracle. All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -101,7 +101,7 @@ static DEFINE_RWLOCK(rds_cong_monitor_lock);
static DEFINE_SPINLOCK(rds_cong_lock);
static struct rb_root rds_cong_tree = RB_ROOT;
-static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
+static struct rds_cong_map *rds_cong_tree_walk(const struct in6_addr *addr,
struct rds_cong_map *insert)
{
struct rb_node **p = &rds_cong_tree.rb_node;
@@ -109,12 +109,15 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
struct rds_cong_map *map;
while (*p) {
+ int diff;
+
parent = *p;
map = rb_entry(parent, struct rds_cong_map, m_rb_node);
- if (addr < map->m_addr)
+ diff = rds_addr_cmp(addr, &map->m_addr);
+ if (diff < 0)
p = &(*p)->rb_left;
- else if (addr > map->m_addr)
+ else if (diff > 0)
p = &(*p)->rb_right;
else
return map;
@@ -132,7 +135,7 @@ static struct rds_cong_map *rds_cong_tree_walk(__be32 addr,
* these bitmaps in the process getting pointers to them. The bitmaps are only
* ever freed as the module is removed after all connections have been freed.
*/
-static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
+static struct rds_cong_map *rds_cong_from_addr(const struct in6_addr *addr)
{
struct rds_cong_map *map;
struct rds_cong_map *ret = NULL;
@@ -144,7 +147,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr)
if (!map)
return NULL;
- map->m_addr = addr;
+ map->m_addr = *addr;
init_waitqueue_head(&map->m_waitq);
INIT_LIST_HEAD(&map->m_conn_list);
@@ -171,7 +174,7 @@ out:
kfree(map);
}
- rdsdebug("map %p for addr %x\n", ret, be32_to_cpu(addr));
+ rdsdebug("map %p for addr %pI6c\n", ret, addr);
return ret;
}
@@ -202,8 +205,8 @@ void rds_cong_remove_conn(struct rds_connection *conn)
int rds_cong_get_maps(struct rds_connection *conn)
{
- conn->c_lcong = rds_cong_from_addr(conn->c_laddr);
- conn->c_fcong = rds_cong_from_addr(conn->c_faddr);
+ conn->c_lcong = rds_cong_from_addr(&conn->c_laddr);
+ conn->c_fcong = rds_cong_from_addr(&conn->c_faddr);
if (!(conn->c_lcong && conn->c_fcong))
return -ENOMEM;
@@ -353,7 +356,7 @@ void rds_cong_remove_socket(struct rds_sock *rs)
/* update congestion map for now-closed port */
spin_lock_irqsave(&rds_cong_lock, flags);
- map = rds_cong_tree_walk(rs->rs_bound_addr, NULL);
+ map = rds_cong_tree_walk(&rs->rs_bound_addr, NULL);
spin_unlock_irqrestore(&rds_cong_lock, flags);
if (map && rds_cong_test_bit(map, rs->rs_bound_port)) {
diff --git a/net/rds/connection.c b/net/rds/connection.c
index cfb05953b0e5..3bd2f4a5a30d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -34,7 +34,9 @@
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/export.h>
-#include <net/inet_hashtables.h>
+#include <net/ipv6.h>
+#include <net/inet6_hashtables.h>
+#include <net/addrconf.h>
#include "rds.h"
#include "loop.h"
@@ -49,18 +51,25 @@ static unsigned long rds_conn_count;
static struct hlist_head rds_conn_hash[RDS_CONNECTION_HASH_ENTRIES];
static struct kmem_cache *rds_conn_slab;
-static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
+static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
+ const struct in6_addr *faddr)
{
+ static u32 rds6_hash_secret __read_mostly;
static u32 rds_hash_secret __read_mostly;
- unsigned long hash;
+ u32 lhash, fhash, hash;
net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
+ net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
+
+ lhash = (__force u32)laddr->s6_addr32[3];
+#if IS_ENABLED(CONFIG_IPV6)
+ fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+#else
+ fhash = (__force u32)faddr->s6_addr32[3];
+#endif
+ hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
- /* Pass NULL, don't need struct net for hash */
- hash = __inet_ehashfn(be32_to_cpu(laddr), 0,
- be32_to_cpu(faddr), 0,
- rds_hash_secret);
return &rds_conn_hash[hash & RDS_CONNECTION_HASH_MASK];
}
@@ -72,20 +81,25 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr)
/* rcu read lock must be held or the connection spinlock */
static struct rds_connection *rds_conn_lookup(struct net *net,
struct hlist_head *head,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ int dev_if)
{
struct rds_connection *conn, *ret = NULL;
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
- if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
- conn->c_trans == trans && net == rds_conn_net(conn)) {
+ if (ipv6_addr_equal(&conn->c_faddr, faddr) &&
+ ipv6_addr_equal(&conn->c_laddr, laddr) &&
+ conn->c_trans == trans &&
+ net == rds_conn_net(conn) &&
+ conn->c_dev_if == dev_if) {
ret = conn;
break;
}
}
- rdsdebug("returning conn %p for %pI4 -> %pI4\n", ret,
- &laddr, &faddr);
+ rdsdebug("returning conn %p for %pI6c -> %pI6c\n", ret,
+ laddr, faddr);
return ret;
}
@@ -99,8 +113,8 @@ static void rds_conn_path_reset(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
- rdsdebug("connection %pI4 to %pI4 reset\n",
- &conn->c_laddr, &conn->c_faddr);
+ rdsdebug("connection %pI6c to %pI6c reset\n",
+ &conn->c_laddr, &conn->c_faddr);
rds_stats_inc(s_conn_reset);
rds_send_path_reset(cp);
@@ -142,9 +156,12 @@ static void __rds_conn_path_init(struct rds_connection *conn,
* are torn down as the module is removed, if ever.
*/
static struct rds_connection *__rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp,
- int is_outgoing)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp,
+ int is_outgoing,
+ int dev_if)
{
struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
@@ -154,9 +171,12 @@ static struct rds_connection *__rds_conn_create(struct net *net,
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
rcu_read_lock();
- conn = rds_conn_lookup(net, head, laddr, faddr, trans);
- if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport &&
- laddr == faddr && !is_outgoing) {
+ conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if);
+ if (conn &&
+ conn->c_loopback &&
+ conn->c_trans != &rds_loop_transport &&
+ ipv6_addr_equal(laddr, faddr) &&
+ !is_outgoing) {
/* This is a looped back IB connection, and we're
* called by the code handling the incoming connect.
* We need a second connection object into which we
@@ -181,8 +201,22 @@ static struct rds_connection *__rds_conn_create(struct net *net,
}
INIT_HLIST_NODE(&conn->c_hash_node);
- conn->c_laddr = laddr;
- conn->c_faddr = faddr;
+ conn->c_laddr = *laddr;
+ conn->c_isv6 = !ipv6_addr_v4mapped(laddr);
+ conn->c_faddr = *faddr;
+ conn->c_dev_if = dev_if;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* If the local address is link local, set c_bound_if to be the
+ * index used for this connection. Otherwise, set it to 0 as
+ * the socket is not bound to an interface. c_bound_if is used
+ * to look up a socket when a packet is received
+ */
+ if (ipv6_addr_type(laddr) & IPV6_ADDR_LINKLOCAL)
+ conn->c_bound_if = dev_if;
+ else
+#endif
+ conn->c_bound_if = 0;
rds_conn_net_set(conn, net);
@@ -199,7 +233,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
* can bind to the destination address then we'd rather the messages
* flow through loopback rather than either transport.
*/
- loop_trans = rds_trans_get_preferred(net, faddr);
+ loop_trans = rds_trans_get_preferred(net, faddr, conn->c_dev_if);
if (loop_trans) {
rds_trans_put(loop_trans);
conn->c_loopback = 1;
@@ -233,10 +267,10 @@ static struct rds_connection *__rds_conn_create(struct net *net,
goto out;
}
- rdsdebug("allocated conn %p for %pI4 -> %pI4 over %s %s\n",
- conn, &laddr, &faddr,
- strnlen(trans->t_name, sizeof(trans->t_name)) ? trans->t_name :
- "[unknown]", is_outgoing ? "(outgoing)" : "");
+ rdsdebug("allocated conn %p for %pI6c -> %pI6c over %s %s\n",
+ conn, laddr, faddr,
+ strnlen(trans->t_name, sizeof(trans->t_name)) ?
+ trans->t_name : "[unknown]", is_outgoing ? "(outgoing)" : "");
/*
* Since we ran without holding the conn lock, someone could
@@ -262,7 +296,8 @@ static struct rds_connection *__rds_conn_create(struct net *net,
/* Creating normal conn */
struct rds_connection *found;
- found = rds_conn_lookup(net, head, laddr, faddr, trans);
+ found = rds_conn_lookup(net, head, laddr, faddr, trans,
+ dev_if);
if (found) {
struct rds_conn_path *cp;
int i;
@@ -295,18 +330,22 @@ out:
}
struct rds_connection *rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create);
struct rds_connection *rds_conn_create_outgoing(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp)
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp, int dev_if)
{
- return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
+ return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if);
}
EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
@@ -464,10 +503,23 @@ void rds_conn_destroy(struct rds_connection *conn)
}
EXPORT_SYMBOL_GPL(rds_conn_destroy);
-static void rds_conn_message_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens,
- int want_send)
+static void __rds_inc_msg_cp(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ void *saddr, void *daddr, int flip, bool isv6)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ if (isv6)
+ rds6_inc_info_copy(inc, iter, saddr, daddr, flip);
+ else
+#endif
+ rds_inc_info_copy(inc, iter, *(__be32 *)saddr,
+ *(__be32 *)daddr, flip);
+}
+
+static void rds_conn_message_info_cmn(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send, bool isv6)
{
struct hlist_head *head;
struct list_head *list;
@@ -478,7 +530,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
size_t i;
int j;
- len /= sizeof(struct rds_info_message);
+ if (isv6)
+ len /= sizeof(struct rds6_info_message);
+ else
+ len /= sizeof(struct rds_info_message);
rcu_read_lock();
@@ -488,6 +543,9 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
struct rds_conn_path *cp;
int npaths;
+ if (!isv6 && conn->c_isv6)
+ continue;
+
npaths = (conn->c_trans->t_mp_capable ?
RDS_MPATH_WORKERS : 1);
@@ -504,11 +562,11 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
list_for_each_entry(rm, list, m_conn_item) {
total++;
if (total <= len)
- rds_inc_info_copy(&rm->m_inc,
- iter,
- conn->c_laddr,
- conn->c_faddr,
- 0);
+ __rds_inc_msg_cp(&rm->m_inc,
+ iter,
+ &conn->c_laddr,
+ &conn->c_faddr,
+ 0, isv6);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
@@ -518,9 +576,30 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len,
rcu_read_unlock();
lens->nr = total;
- lens->each = sizeof(struct rds_info_message);
+ if (isv6)
+ lens->each = sizeof(struct rds6_info_message);
+ else
+ lens->each = sizeof(struct rds_info_message);
}
+static void rds_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ rds_conn_message_info_cmn(sock, len, iter, lens, want_send, false);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_message_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens,
+ int want_send)
+{
+ rds_conn_message_info_cmn(sock, len, iter, lens, want_send, true);
+}
+#endif
+
static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -528,6 +607,15 @@ static void rds_conn_message_info_send(struct socket *sock, unsigned int len,
rds_conn_message_info(sock, len, iter, lens, 1);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_message_info_send(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds6_conn_message_info(sock, len, iter, lens, 1);
+}
+#endif
+
static void rds_conn_message_info_retrans(struct socket *sock,
unsigned int len,
struct rds_info_iterator *iter,
@@ -536,6 +624,16 @@ static void rds_conn_message_info_retrans(struct socket *sock,
rds_conn_message_info(sock, len, iter, lens, 0);
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_message_info_retrans(struct socket *sock,
+ unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ rds6_conn_message_info(sock, len, iter, lens, 0);
+}
+#endif
+
void rds_for_each_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens,
@@ -584,7 +682,6 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
struct hlist_head *head;
struct rds_connection *conn;
size_t i;
- int j;
rcu_read_lock();
@@ -595,17 +692,20 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
i++, head++) {
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
struct rds_conn_path *cp;
- int npaths;
- npaths = (conn->c_trans->t_mp_capable ?
- RDS_MPATH_WORKERS : 1);
- for (j = 0; j < npaths; j++) {
- cp = &conn->c_path[j];
+ /* XXX We only copy the information from the first
+ * path for now. The problem is that if there are
+ * more than one underlying paths, we cannot report
+ * information of all of them using the existing
+ * API. For example, there is only one next_tx_seq,
+ * which path's next_tx_seq should we report? It is
+ * a bug in the design of MPRDS.
+ */
+ cp = conn->c_path;
- /* XXX no cp_lock usage.. */
- if (!visitor(cp, buffer))
- continue;
- }
+ /* XXX no cp_lock usage.. */
+ if (!visitor(cp, buffer))
+ continue;
/* We copy as much as we can fit in the buffer,
* but we count all items so that the caller
@@ -624,12 +724,16 @@ static void rds_walk_conn_path_info(struct socket *sock, unsigned int len,
static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
{
struct rds_info_connection *cinfo = buffer;
+ struct rds_connection *conn = cp->cp_conn;
+
+ if (conn->c_isv6)
+ return 0;
cinfo->next_tx_seq = cp->cp_next_tx_seq;
cinfo->next_rx_seq = cp->cp_next_rx_seq;
- cinfo->laddr = cp->cp_conn->c_laddr;
- cinfo->faddr = cp->cp_conn->c_faddr;
- strncpy(cinfo->transport, cp->cp_conn->c_trans->t_name,
+ cinfo->laddr = conn->c_laddr.s6_addr32[3];
+ cinfo->faddr = conn->c_faddr.s6_addr32[3];
+ strncpy(cinfo->transport, conn->c_trans->t_name,
sizeof(cinfo->transport));
cinfo->flags = 0;
@@ -645,6 +749,36 @@ static int rds_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
return 1;
}
+#if IS_ENABLED(CONFIG_IPV6)
+static int rds6_conn_info_visitor(struct rds_conn_path *cp, void *buffer)
+{
+ struct rds6_info_connection *cinfo6 = buffer;
+ struct rds_connection *conn = cp->cp_conn;
+
+ cinfo6->next_tx_seq = cp->cp_next_tx_seq;
+ cinfo6->next_rx_seq = cp->cp_next_rx_seq;
+ cinfo6->laddr = conn->c_laddr;
+ cinfo6->faddr = conn->c_faddr;
+ strncpy(cinfo6->transport, conn->c_trans->t_name,
+ sizeof(cinfo6->transport));
+ cinfo6->flags = 0;
+
+ rds_conn_info_set(cinfo6->flags, test_bit(RDS_IN_XMIT, &cp->cp_flags),
+ SENDING);
+ /* XXX Future: return the state rather than these funky bits */
+ rds_conn_info_set(cinfo6->flags,
+ atomic_read(&cp->cp_state) == RDS_CONN_CONNECTING,
+ CONNECTING);
+ rds_conn_info_set(cinfo6->flags,
+ atomic_read(&cp->cp_state) == RDS_CONN_UP,
+ CONNECTED);
+ /* Just return 1 as there is no error case. This is a helper function
+ * for rds_walk_conn_path_info() and it wants a return value.
+ */
+ return 1;
+}
+#endif
+
static void rds_conn_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -657,6 +791,20 @@ static void rds_conn_info(struct socket *sock, unsigned int len,
sizeof(struct rds_info_connection));
}
+#if IS_ENABLED(CONFIG_IPV6)
+static void rds6_conn_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ u64 buffer[(sizeof(struct rds6_info_connection) + 7) / 8];
+
+ rds_walk_conn_path_info(sock, len, iter, lens,
+ rds6_conn_info_visitor,
+ buffer,
+ sizeof(struct rds6_info_connection));
+}
+#endif
+
int rds_conn_init(void)
{
int ret;
@@ -678,7 +826,13 @@ int rds_conn_init(void)
rds_conn_message_info_send);
rds_info_register_func(RDS_INFO_RETRANS_MESSAGES,
rds_conn_message_info_retrans);
-
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+ rds_info_register_func(RDS6_INFO_SEND_MESSAGES,
+ rds6_conn_message_info_send);
+ rds_info_register_func(RDS6_INFO_RETRANS_MESSAGES,
+ rds6_conn_message_info_retrans);
+#endif
return 0;
}
@@ -696,6 +850,13 @@ void rds_conn_exit(void)
rds_conn_message_info_send);
rds_info_deregister_func(RDS_INFO_RETRANS_MESSAGES,
rds_conn_message_info_retrans);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_CONNECTIONS, rds6_conn_info);
+ rds_info_deregister_func(RDS6_INFO_SEND_MESSAGES,
+ rds6_conn_message_info_send);
+ rds_info_deregister_func(RDS6_INFO_RETRANS_MESSAGES,
+ rds6_conn_message_info_retrans);
+#endif
}
/*
diff --git a/net/rds/ib.c b/net/rds/ib.c
index b6ad38e48f62..89c6333ecd39 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -39,6 +39,7 @@
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <net/addrconf.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -295,9 +296,11 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
/* We will only ever look at IB transports */
if (conn->c_trans != &rds_ib_transport)
return 0;
+ if (conn->c_isv6)
+ return 0;
- iinfo->src_addr = conn->c_laddr;
- iinfo->dst_addr = conn->c_faddr;
+ iinfo->src_addr = conn->c_laddr.s6_addr32[3];
+ iinfo->dst_addr = conn->c_faddr.s6_addr32[3];
memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
@@ -318,6 +321,45 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn,
return 1;
}
+#if IS_ENABLED(CONFIG_IPV6)
+/* IPv6 version of rds_ib_conn_info_visitor(). */
+static int rds6_ib_conn_info_visitor(struct rds_connection *conn,
+ void *buffer)
+{
+ struct rds6_info_rdma_connection *iinfo6 = buffer;
+ struct rds_ib_connection *ic;
+
+ /* We will only ever look at IB transports */
+ if (conn->c_trans != &rds_ib_transport)
+ return 0;
+
+ iinfo6->src_addr = conn->c_laddr;
+ iinfo6->dst_addr = conn->c_faddr;
+
+ memset(&iinfo6->src_gid, 0, sizeof(iinfo6->src_gid));
+ memset(&iinfo6->dst_gid, 0, sizeof(iinfo6->dst_gid));
+
+ if (rds_conn_state(conn) == RDS_CONN_UP) {
+ struct rds_ib_device *rds_ibdev;
+ struct rdma_dev_addr *dev_addr;
+
+ ic = conn->c_transport_data;
+ dev_addr = &ic->i_cm_id->route.addr.dev_addr;
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *)&iinfo6->src_gid);
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *)&iinfo6->dst_gid);
+
+ rds_ibdev = ic->rds_ibdev;
+ iinfo6->max_send_wr = ic->i_send_ring.w_nr;
+ iinfo6->max_recv_wr = ic->i_recv_ring.w_nr;
+ iinfo6->max_send_sge = rds_ibdev->max_sge;
+ rds6_ib_get_mr_info(rds_ibdev, iinfo6);
+ }
+ return 1;
+}
+#endif
+
static void rds_ib_ic_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -330,6 +372,20 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
sizeof(struct rds_info_rdma_connection));
}
+#if IS_ENABLED(CONFIG_IPV6)
+/* IPv6 version of rds_ib_ic_info(). */
+static void rds6_ib_ic_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
+{
+ u64 buffer[(sizeof(struct rds6_info_rdma_connection) + 7) / 8];
+
+ rds_for_each_conn_info(sock, len, iter, lens,
+ rds6_ib_conn_info_visitor,
+ buffer,
+ sizeof(struct rds6_info_rdma_connection));
+}
+#endif
/*
* Early RDS/IB was built to only bind to an address if there is an IPoIB
@@ -341,12 +397,19 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len,
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
-static int rds_ib_laddr_check(struct net *net, __be32 addr)
+static int rds_ib_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id)
{
int ret;
struct rdma_cm_id *cm_id;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 sin6;
+#endif
struct sockaddr_in sin;
+ struct sockaddr *sa;
+ bool isv4;
+ isv4 = ipv6_addr_v4mapped(addr);
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
@@ -355,22 +418,66 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = addr;
+ if (isv4) {
+ memset(&sin, 0, sizeof(sin));
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = addr->s6_addr32[3];
+ sa = (struct sockaddr *)&sin;
+ } else {
+#if IS_ENABLED(CONFIG_IPV6)
+ memset(&sin6, 0, sizeof(sin6));
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = *addr;
+ sin6.sin6_scope_id = scope_id;
+ sa = (struct sockaddr *)&sin6;
+
+ /* XXX Do a special IPv6 link local address check here. The
+ * reason is that rdma_bind_addr() always succeeds with IPv6
+ * link local address regardless it is indeed configured in a
+ * system.
+ */
+ if (ipv6_addr_type(addr) & IPV6_ADDR_LINKLOCAL) {
+ struct net_device *dev;
+
+ if (scope_id == 0) {
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ /* Use init_net for now as RDS is not network
+ * name space aware.
+ */
+ dev = dev_get_by_index(&init_net, scope_id);
+ if (!dev) {
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+ if (!ipv6_chk_addr(&init_net, addr, dev, 1)) {
+ dev_put(dev);
+ ret = -EADDRNOTAVAIL;
+ goto out;
+ }
+ dev_put(dev);
+ }
+#else
+ ret = -EADDRNOTAVAIL;
+ goto out;
+#endif
+ }
/* rdma_bind_addr will only succeed for IB & iWARP devices */
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ ret = rdma_bind_addr(cm_id, sa);
/* due to this, we will claim to support iWARP devices unless we
check node_type. */
if (ret || !cm_id->device ||
cm_id->device->node_type != RDMA_NODE_IB_CA)
ret = -EADDRNOTAVAIL;
- rdsdebug("addr %pI4 ret %d node type %d\n",
- &addr, ret,
- cm_id->device ? cm_id->device->node_type : -1);
+ rdsdebug("addr %pI6c%%%u ret %d node type %d\n",
+ addr, scope_id, ret,
+ cm_id->device ? cm_id->device->node_type : -1);
+out:
rdma_destroy_id(cm_id);
return ret;
@@ -401,6 +508,9 @@ void rds_ib_exit(void)
rds_ib_set_unloading();
synchronize_rcu();
rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
+#endif
rds_ib_unregister_client();
rds_ib_destroy_nodev_conns();
rds_ib_sysctl_exit();
@@ -462,6 +572,9 @@ int rds_ib_init(void)
rds_trans_register(&rds_ib_transport);
rds_info_register_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_IB_CONNECTIONS, rds6_ib_ic_info);
+#endif
goto out;
@@ -476,4 +589,3 @@ out:
}
MODULE_LICENSE("GPL");
-
diff --git a/net/rds/ib.h b/net/rds/ib.h
index a6f4d7d68e95..73427ff439f9 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -57,16 +57,44 @@ struct rds_ib_refill_cache {
struct list_head *ready;
};
+/* This is the common structure for the IB private data exchange in setting up
+ * an RDS connection. The exchange is different for IPv4 and IPv6 connections.
+ * The reason is that the address size is different and the addresses
+ * exchanged are in the beginning of the structure. Hence it is not possible
+ * for interoperability if same structure is used.
+ */
+struct rds_ib_conn_priv_cmn {
+ u8 ricpc_protocol_major;
+ u8 ricpc_protocol_minor;
+ __be16 ricpc_protocol_minor_mask; /* bitmask */
+ __be32 ricpc_reserved1;
+ __be64 ricpc_ack_seq;
+ __be32 ricpc_credit; /* non-zero enables flow ctl */
+};
+
struct rds_ib_connect_private {
/* Add new fields at the end, and don't permute existing fields. */
- __be32 dp_saddr;
- __be32 dp_daddr;
- u8 dp_protocol_major;
- u8 dp_protocol_minor;
- __be16 dp_protocol_minor_mask; /* bitmask */
- __be32 dp_reserved1;
- __be64 dp_ack_seq;
- __be32 dp_credit; /* non-zero enables flow ctl */
+ __be32 dp_saddr;
+ __be32 dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+struct rds6_ib_connect_private {
+ /* Add new fields at the end, and don't permute existing fields. */
+ struct in6_addr dp_saddr;
+ struct in6_addr dp_daddr;
+ struct rds_ib_conn_priv_cmn dp_cmn;
+};
+
+#define dp_protocol_major dp_cmn.ricpc_protocol_major
+#define dp_protocol_minor dp_cmn.ricpc_protocol_minor
+#define dp_protocol_minor_mask dp_cmn.ricpc_protocol_minor_mask
+#define dp_ack_seq dp_cmn.ricpc_ack_seq
+#define dp_credit dp_cmn.ricpc_credit
+
+union rds_ib_conn_priv {
+ struct rds_ib_connect_private ricp_v4;
+ struct rds6_ib_connect_private ricp_v6;
};
struct rds_ib_send_work {
@@ -351,8 +379,8 @@ void rds_ib_listen_stop(void);
__printf(2, 3)
void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event);
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id);
+ struct rdma_cm_event *event, bool isv6);
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6);
void rds_ib_cm_connect_complete(struct rds_connection *conn,
struct rdma_cm_event *event);
@@ -361,7 +389,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
__rds_ib_conn_error(conn, KERN_WARNING "RDS/IB: " fmt)
/* ib_rdma.c */
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+ struct in6_addr *ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
@@ -371,7 +400,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
int rds_ib_recv_init(void);
void rds_ib_recv_exit(void);
int rds_ib_recv_path(struct rds_conn_path *conn);
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp);
void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp);
void rds_ib_inc_free(struct rds_incoming *inc);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f1684ae6abfd..bfbb31f0c7fd 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/ratelimit.h>
+#include <net/addrconf.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -95,25 +96,45 @@ rds_ib_tune_rnr(struct rds_ib_connection *ic, struct ib_qp_attr *attr)
*/
void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
{
- const struct rds_ib_connect_private *dp = NULL;
struct rds_ib_connection *ic = conn->c_transport_data;
+ const union rds_ib_conn_priv *dp = NULL;
struct ib_qp_attr qp_attr;
+ __be64 ack_seq = 0;
+ __be32 credit = 0;
+ u8 major = 0;
+ u8 minor = 0;
int err;
- if (event->param.conn.private_data_len >= sizeof(*dp)) {
- dp = event->param.conn.private_data;
-
- /* make sure it isn't empty data */
- if (dp->dp_protocol_major) {
- rds_ib_set_protocol(conn,
- RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ dp = event->param.conn.private_data;
+ if (conn->c_isv6) {
+ if (event->param.conn.private_data_len >=
+ sizeof(struct rds6_ib_connect_private)) {
+ major = dp->ricp_v6.dp_protocol_major;
+ minor = dp->ricp_v6.dp_protocol_minor;
+ credit = dp->ricp_v6.dp_credit;
+ /* dp structure start is not guaranteed to be 8 bytes
+ * aligned. Since dp_ack_seq is 64-bit extended load
+ * operations can be used so go through get_unaligned
+ * to avoid unaligned errors.
+ */
+ ack_seq = get_unaligned(&dp->ricp_v6.dp_ack_seq);
}
+ } else if (event->param.conn.private_data_len >=
+ sizeof(struct rds_ib_connect_private)) {
+ major = dp->ricp_v4.dp_protocol_major;
+ minor = dp->ricp_v4.dp_protocol_minor;
+ credit = dp->ricp_v4.dp_credit;
+ ack_seq = get_unaligned(&dp->ricp_v4.dp_ack_seq);
+ }
+
+ /* make sure it isn't empty data */
+ if (major) {
+ rds_ib_set_protocol(conn, RDS_PROTOCOL(major, minor));
+ rds_ib_set_flow_control(conn, be32_to_cpu(credit));
}
if (conn->c_version < RDS_PROTOCOL(3, 1)) {
- pr_notice("RDS/IB: Connection <%pI4,%pI4> version %u.%u no longer supported\n",
+ pr_notice("RDS/IB: Connection <%pI6c,%pI6c> version %u.%u no longer supported\n",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
RDS_PROTOCOL_MINOR(conn->c_version));
@@ -121,7 +142,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
rds_conn_destroy(conn);
return;
} else {
- pr_notice("RDS/IB: %s conn connected <%pI4,%pI4> version %u.%u%s\n",
+ pr_notice("RDS/IB: %s conn connected <%pI6c,%pI6c> version %u.%u%s\n",
ic->i_active_side ? "Active" : "Passive",
&conn->c_laddr, &conn->c_faddr,
RDS_PROTOCOL_MAJOR(conn->c_version),
@@ -150,7 +171,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
/* update ib_device with this local ipaddr */
- err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
+ err = rds_ib_update_ipaddr(ic->rds_ibdev, &conn->c_laddr);
if (err)
printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
err);
@@ -158,14 +179,8 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
if (dp) {
- /* dp structure start is not guaranteed to be 8 bytes aligned.
- * Since dp_ack_seq is 64-bit extended load operations can be
- * used so go through get_unaligned to avoid unaligned errors.
- */
- __be64 dp_ack_seq = get_unaligned(&dp->dp_ack_seq);
-
- if (dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp_ack_seq),
+ if (ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(ack_seq),
NULL);
}
@@ -173,11 +188,12 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
}
static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
- struct rdma_conn_param *conn_param,
- struct rds_ib_connect_private *dp,
- u32 protocol_version,
- u32 max_responder_resources,
- u32 max_initiator_depth)
+ struct rdma_conn_param *conn_param,
+ union rds_ib_conn_priv *dp,
+ u32 protocol_version,
+ u32 max_responder_resources,
+ u32 max_initiator_depth,
+ bool isv6)
{
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
@@ -193,24 +209,49 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
if (dp) {
memset(dp, 0, sizeof(*dp));
- dp->dp_saddr = conn->c_laddr;
- dp->dp_daddr = conn->c_faddr;
- dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
- dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
- dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
- dp->dp_ack_seq = cpu_to_be64(rds_ib_piggyb_ack(ic));
+ if (isv6) {
+ dp->ricp_v6.dp_saddr = conn->c_laddr;
+ dp->ricp_v6.dp_daddr = conn->c_faddr;
+ dp->ricp_v6.dp_protocol_major =
+ RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->ricp_v6.dp_protocol_minor =
+ RDS_PROTOCOL_MINOR(protocol_version);
+ dp->ricp_v6.dp_protocol_minor_mask =
+ cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->ricp_v6.dp_ack_seq =
+ cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+ conn_param->private_data = &dp->ricp_v6;
+ conn_param->private_data_len = sizeof(dp->ricp_v6);
+ } else {
+ dp->ricp_v4.dp_saddr = conn->c_laddr.s6_addr32[3];
+ dp->ricp_v4.dp_daddr = conn->c_faddr.s6_addr32[3];
+ dp->ricp_v4.dp_protocol_major =
+ RDS_PROTOCOL_MAJOR(protocol_version);
+ dp->ricp_v4.dp_protocol_minor =
+ RDS_PROTOCOL_MINOR(protocol_version);
+ dp->ricp_v4.dp_protocol_minor_mask =
+ cpu_to_be16(RDS_IB_SUPPORTED_PROTOCOLS);
+ dp->ricp_v4.dp_ack_seq =
+ cpu_to_be64(rds_ib_piggyb_ack(ic));
+
+ conn_param->private_data = &dp->ricp_v4;
+ conn_param->private_data_len = sizeof(dp->ricp_v4);
+ }
/* Advertise flow control */
if (ic->i_flowctl) {
unsigned int credits;
- credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
- dp->dp_credit = cpu_to_be32(credits);
- atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
+ credits = IB_GET_POST_CREDITS
+ (atomic_read(&ic->i_credits));
+ if (isv6)
+ dp->ricp_v6.dp_credit = cpu_to_be32(credits);
+ else
+ dp->ricp_v4.dp_credit = cpu_to_be32(credits);
+ atomic_sub(IB_SET_POST_CREDITS(credits),
+ &ic->i_credits);
}
-
- conn_param->private_data = dp;
- conn_param->private_data_len = sizeof(*dp);
}
}
@@ -349,7 +390,7 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
break;
default:
rdsdebug("Fatal QP Event %u (%s) "
- "- connection %pI4->%pI4, reconnecting\n",
+ "- connection %pI6c->%pI6c, reconnecting\n",
event->event, ib_event_msg(event->event),
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
@@ -580,11 +621,13 @@ out:
return ret;
}
-static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
+static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event, bool isv6)
{
- const struct rds_ib_connect_private *dp = event->param.conn.private_data;
- u16 common;
+ const union rds_ib_conn_priv *dp = event->param.conn.private_data;
+ u8 data_len, major, minor;
u32 version = 0;
+ __be16 mask;
+ u16 common;
/*
* rdma_cm private data is odd - when there is any private data in the
@@ -603,51 +646,140 @@ static u32 rds_ib_protocol_compatible(struct rdma_cm_event *event)
return 0;
}
+ if (isv6) {
+ data_len = sizeof(struct rds6_ib_connect_private);
+ major = dp->ricp_v6.dp_protocol_major;
+ minor = dp->ricp_v6.dp_protocol_minor;
+ mask = dp->ricp_v6.dp_protocol_minor_mask;
+ } else {
+ data_len = sizeof(struct rds_ib_connect_private);
+ major = dp->ricp_v4.dp_protocol_major;
+ minor = dp->ricp_v4.dp_protocol_minor;
+ mask = dp->ricp_v4.dp_protocol_minor_mask;
+ }
+
/* Even if len is crap *now* I still want to check it. -ASG */
- if (event->param.conn.private_data_len < sizeof (*dp) ||
- dp->dp_protocol_major == 0)
+ if (event->param.conn.private_data_len < data_len || major == 0)
return RDS_PROTOCOL_3_0;
- common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IB_SUPPORTED_PROTOCOLS;
- if (dp->dp_protocol_major == 3 && common) {
+ common = be16_to_cpu(mask) & RDS_IB_SUPPORTED_PROTOCOLS;
+ if (major == 3 && common) {
version = RDS_PROTOCOL_3_0;
while ((common >>= 1) != 0)
version++;
- } else
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
- &dp->dp_saddr,
- dp->dp_protocol_major,
- dp->dp_protocol_minor);
+ } else {
+ if (isv6)
+ printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI6c using incompatible protocol version %u.%u\n",
+ &dp->ricp_v6.dp_saddr, major, minor);
+ else
+ printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using incompatible protocol version %u.%u\n",
+ &dp->ricp_v4.dp_saddr, major, minor);
+ }
return version;
}
+#if IS_ENABLED(CONFIG_IPV6)
+/* Given an IPv6 address, find the net_device which hosts that address and
+ * return its index. This is used by the rds_ib_cm_handle_connect() code to
+ * find the interface index of where an incoming request comes from when
+ * the request is using a link local address.
+ *
+ * Note one problem in this search. It is possible that two interfaces have
+ * the same link local address. Unfortunately, this cannot be solved unless
+ * the underlying layer gives us the interface which an incoming RDMA connect
+ * request comes from.
+ */
+static u32 __rds_find_ifindex(struct net *net, const struct in6_addr *addr)
+{
+ struct net_device *dev;
+ int idx = 0;
+
+ rcu_read_lock();
+ for_each_netdev_rcu(net, dev) {
+ if (ipv6_chk_addr(net, addr, dev, 1)) {
+ idx = dev->ifindex;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return idx;
+}
+#endif
+
int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
+ struct rdma_cm_event *event, bool isv6)
{
__be64 lguid = cm_id->route.path_rec->sgid.global.interface_id;
__be64 fguid = cm_id->route.path_rec->dgid.global.interface_id;
- const struct rds_ib_connect_private *dp = event->param.conn.private_data;
- struct rds_ib_connect_private dp_rep;
+ const struct rds_ib_conn_priv_cmn *dp_cmn;
struct rds_connection *conn = NULL;
struct rds_ib_connection *ic = NULL;
struct rdma_conn_param conn_param;
+ const union rds_ib_conn_priv *dp;
+ union rds_ib_conn_priv dp_rep;
+ struct in6_addr s_mapped_addr;
+ struct in6_addr d_mapped_addr;
+ const struct in6_addr *saddr6;
+ const struct in6_addr *daddr6;
+ int destroy = 1;
+ u32 ifindex = 0;
u32 version;
- int err = 1, destroy = 1;
+ int err = 1;
/* Check whether the remote protocol version matches ours. */
- version = rds_ib_protocol_compatible(event);
+ version = rds_ib_protocol_compatible(event, isv6);
if (!version)
goto out;
- rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u lguid 0x%llx fguid "
- "0x%llx\n", &dp->dp_saddr, &dp->dp_daddr,
+ dp = event->param.conn.private_data;
+ if (isv6) {
+#if IS_ENABLED(CONFIG_IPV6)
+ dp_cmn = &dp->ricp_v6.dp_cmn;
+ saddr6 = &dp->ricp_v6.dp_saddr;
+ daddr6 = &dp->ricp_v6.dp_daddr;
+ /* If either address is link local, need to find the
+ * interface index in order to create a proper RDS
+ * connection.
+ */
+ if (ipv6_addr_type(daddr6) & IPV6_ADDR_LINKLOCAL) {
+ /* Using init_net for now .. */
+ ifindex = __rds_find_ifindex(&init_net, daddr6);
+ /* No index found... Need to bail out. */
+ if (ifindex == 0) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ } else if (ipv6_addr_type(saddr6) & IPV6_ADDR_LINKLOCAL) {
+ /* Use our address to find the correct index. */
+ ifindex = __rds_find_ifindex(&init_net, daddr6);
+ /* No index found... Need to bail out. */
+ if (ifindex == 0) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ }
+#else
+ err = -EOPNOTSUPP;
+ goto out;
+#endif
+ } else {
+ dp_cmn = &dp->ricp_v4.dp_cmn;
+ ipv6_addr_set_v4mapped(dp->ricp_v4.dp_saddr, &s_mapped_addr);
+ ipv6_addr_set_v4mapped(dp->ricp_v4.dp_daddr, &d_mapped_addr);
+ saddr6 = &s_mapped_addr;
+ daddr6 = &d_mapped_addr;
+ }
+
+ rdsdebug("saddr %pI6c daddr %pI6c RDSv%u.%u lguid 0x%llx fguid "
+ "0x%llx\n", saddr6, daddr6,
RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version),
(unsigned long long)be64_to_cpu(lguid),
(unsigned long long)be64_to_cpu(fguid));
/* RDS/IB is not currently netns aware, thus init_net */
- conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
- &rds_ib_transport, GFP_KERNEL);
+ conn = rds_conn_create(&init_net, daddr6, saddr6,
+ &rds_ib_transport, GFP_KERNEL, ifindex);
if (IS_ERR(conn)) {
rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
conn = NULL;
@@ -678,12 +810,13 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
ic = conn->c_transport_data;
rds_ib_set_protocol(conn, version);
- rds_ib_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
+ rds_ib_set_flow_control(conn, be32_to_cpu(dp_cmn->ricpc_credit));
/* If the peer gave us the last packet it saw, process this as if
* we had received a regular ACK. */
- if (dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
+ if (dp_cmn->ricpc_ack_seq)
+ rds_send_drop_acked(conn, be64_to_cpu(dp_cmn->ricpc_ack_seq),
+ NULL);
BUG_ON(cm_id->context);
BUG_ON(ic->i_cm_id);
@@ -702,8 +835,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
}
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
- event->param.conn.responder_resources,
- event->param.conn.initiator_depth);
+ event->param.conn.responder_resources,
+ event->param.conn.initiator_depth, isv6);
/* rdma_accept() calls rdma_reject() internally if it fails */
if (rdma_accept(cm_id, &conn_param))
@@ -718,12 +851,12 @@ out:
}
-int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
+int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id, bool isv6)
{
struct rds_connection *conn = cm_id->context;
struct rds_ib_connection *ic = conn->c_transport_data;
struct rdma_conn_param conn_param;
- struct rds_ib_connect_private dp;
+ union rds_ib_conn_priv dp;
int ret;
/* If the peer doesn't do protocol negotiation, we must
@@ -738,7 +871,7 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
}
rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
- UINT_MAX, UINT_MAX);
+ UINT_MAX, UINT_MAX, isv6);
ret = rdma_connect(cm_id, &conn_param);
if (ret)
rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -758,13 +891,22 @@ out:
int rds_ib_conn_path_connect(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
- struct rds_ib_connection *ic = conn->c_transport_data;
- struct sockaddr_in src, dest;
+ struct sockaddr_storage src, dest;
+ rdma_cm_event_handler handler;
+ struct rds_ib_connection *ic;
int ret;
+ ic = conn->c_transport_data;
+
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
+#if IS_ENABLED(CONFIG_IPV6)
+ if (conn->c_isv6)
+ handler = rds6_rdma_cm_event_handler;
+ else
+#endif
+ handler = rds_rdma_cm_event_handler;
+ ic->i_cm_id = rdma_create_id(&init_net, handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
@@ -775,13 +917,33 @@ int rds_ib_conn_path_connect(struct rds_conn_path *cp)
rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
+ if (ipv6_addr_v4mapped(&conn->c_faddr)) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&src;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+ sin->sin_port = 0;
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_PORT);
+ sin = (struct sockaddr_in *)&dest;
+ sin->sin_family = AF_INET;
+ sin->sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+ sin->sin_port = htons(RDS_PORT);
+ } else {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&src;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = conn->c_laddr;
+ sin6->sin6_port = 0;
+ sin6->sin6_scope_id = conn->c_dev_if;
+
+ sin6 = (struct sockaddr_in6 *)&dest;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_addr = conn->c_faddr;
+ sin6->sin6_port = htons(RDS_CM_PORT);
+ sin6->sin6_scope_id = conn->c_dev_if;
+ }
ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
(struct sockaddr *)&dest,
@@ -949,7 +1111,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
if (!ic)
return -ENOMEM;
- ret = rds_ib_recv_alloc_caches(ic);
+ ret = rds_ib_recv_alloc_caches(ic, gfp);
if (ret) {
kfree(ic);
return ret;
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
index 48332a6ed738..d152e48ea371 100644
--- a/net/rds/ib_frmr.c
+++ b/net/rds/ib_frmr.c
@@ -344,6 +344,11 @@ struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
struct rds_ib_frmr *frmr;
int ret;
+ if (!ic) {
+ /* TODO: Add FRWR support for RDS_GET_MR using proxy qp*/
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
do {
if (ibmr)
rds_ib_free_frmr(ibmr, true);
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index 0ea4ab017a8c..5da12c248431 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -113,9 +113,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
int npages);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
struct rds_info_rdma_connection *iinfo);
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+ struct rds6_info_rdma_connection *iinfo6);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret);
+ struct rds_sock *rs, u32 *key_ret,
+ struct rds_connection *conn);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index e678699268a2..63c8d107adcf 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -100,18 +100,19 @@ static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
kfree_rcu(to_free, rcu);
}
-int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
+int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev,
+ struct in6_addr *ipaddr)
{
struct rds_ib_device *rds_ibdev_old;
- rds_ibdev_old = rds_ib_get_device(ipaddr);
+ rds_ibdev_old = rds_ib_get_device(ipaddr->s6_addr32[3]);
if (!rds_ibdev_old)
- return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
if (rds_ibdev_old != rds_ibdev) {
- rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+ rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr->s6_addr32[3]);
rds_ib_dev_put(rds_ibdev_old);
- return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
+ return rds_ib_add_ipaddr(rds_ibdev, ipaddr->s6_addr32[3]);
}
rds_ib_dev_put(rds_ibdev_old);
@@ -179,6 +180,17 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
}
+#if IS_ENABLED(CONFIG_IPV6)
+void rds6_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+ struct rds6_info_rdma_connection *iinfo6)
+{
+ struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
+
+ iinfo6->rdma_mr_max = pool_1m->max_items;
+ iinfo6->rdma_mr_size = pool_1m->fmr_attr.max_pages;
+}
+#endif
+
struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
@@ -537,19 +549,23 @@ void rds_ib_flush_mrs(void)
}
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret)
+ struct rds_sock *rs, u32 *key_ret,
+ struct rds_connection *conn)
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_mr *ibmr = NULL;
- struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
+ struct rds_ib_connection *ic = NULL;
int ret;
- rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
+ rds_ibdev = rds_ib_get_device(rs->rs_bound_addr.s6_addr32[3]);
if (!rds_ibdev) {
ret = -ENODEV;
goto out;
}
+ if (conn)
+ ic = conn->c_transport_data;
+
if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) {
ret = -ENODEV;
goto out;
@@ -559,17 +575,18 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
else
ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
- if (ibmr)
- rds_ibdev = NULL;
-
- out:
- if (!ibmr)
+ if (IS_ERR(ibmr)) {
+ ret = PTR_ERR(ibmr);
pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
+ } else {
+ return ibmr;
+ }
+ out:
if (rds_ibdev)
rds_ib_dev_put(rds_ibdev);
- return ibmr;
+ return ERR_PTR(ret);
}
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index b4e421aa9727..d300186b8dc0 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -98,12 +98,12 @@ static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
}
}
-static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache, gfp_t gfp)
{
struct rds_ib_cache_head *head;
int cpu;
- cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+ cache->percpu = alloc_percpu_gfp(struct rds_ib_cache_head, gfp);
if (!cache->percpu)
return -ENOMEM;
@@ -118,13 +118,13 @@ static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
return 0;
}
-int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic, gfp_t gfp)
{
int ret;
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs, gfp);
if (!ret) {
- ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+ ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags, gfp);
if (ret)
free_percpu(ic->i_cache_incs.percpu);
}
@@ -266,7 +266,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i
rds_ib_stats_inc(s_ib_rx_total_incs);
}
INIT_LIST_HEAD(&ibinc->ii_frags);
- rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
+ rds_inc_init(&ibinc->ii_inc, ic->conn, &ic->conn->c_faddr);
return ibinc;
}
@@ -376,8 +376,6 @@ static void release_refill(struct rds_connection *conn)
* This tries to allocate and post unused work requests after making sure that
* they have all the allocations they need to queue received fragments into
* sockets.
- *
- * -1 is returned if posting fails due to temporary resource exhaustion.
*/
void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
{
@@ -420,7 +418,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
if (ret) {
rds_ib_conn_error(conn, "recv post on "
- "%pI4 returned %d, disconnecting and "
+ "%pI6c returned %d, disconnecting and "
"reconnecting\n", &conn->c_faddr,
ret);
break;
@@ -850,7 +848,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
if (data_len < sizeof(struct rds_header)) {
rds_ib_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
+ "from %pI6c didn't include a "
"header, disconnecting and "
"reconnecting\n",
&conn->c_faddr);
@@ -863,7 +861,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
/* Validate the checksum. */
if (!rds_message_verify_checksum(ihdr)) {
rds_ib_conn_error(conn, "incoming message "
- "from %pI4 has corrupted header - "
+ "from %pI6c has corrupted header - "
"forcing a reconnect\n",
&conn->c_faddr);
rds_stats_inc(s_recv_drop_bad_checksum);
@@ -943,10 +941,10 @@ static void rds_ib_process_recv(struct rds_connection *conn,
ic->i_recv_data_rem = 0;
ic->i_ibinc = NULL;
- if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
+ if (ibinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP) {
rds_ib_cong_recv(conn, ibinc);
- else {
- rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
+ } else {
+ rds_recv_incoming(conn, &conn->c_faddr, &conn->c_laddr,
&ibinc->ii_inc, GFP_ATOMIC);
state->ack_next = be64_to_cpu(hdr->h_sequence);
state->ack_next_valid = 1;
@@ -990,7 +988,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
} else {
/* We expect errors as the qp is drained during shutdown */
if (rds_conn_up(conn) || rds_conn_connecting(conn))
- rds_ib_conn_error(conn, "recv completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "recv completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr,
wc->status,
ib_wc_status_msg(wc->status));
@@ -1025,7 +1023,6 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
{
struct rds_connection *conn = cp->cp_conn;
struct rds_ib_connection *ic = conn->c_transport_data;
- int ret = 0;
rdsdebug("conn %p\n", conn);
if (rds_conn_up(conn)) {
@@ -1034,7 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp)
rds_ib_stats_inc(s_ib_rx_refill_from_thread);
}
- return ret;
+ return 0;
}
int rds_ib_recv_init(void)
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 8557a1cae041..c8dd3125d398 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -305,7 +305,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_ib_conn_error(conn, "send completion on <%pI4,%pI4> had status %u (%s), disconnecting and reconnecting\n",
+ rds_ib_conn_error(conn, "send completion on <%pI6c,%pI6c> had status %u (%s), disconnecting and reconnecting\n",
&conn->c_laddr, &conn->c_faddr, wc->status,
ib_wc_status_msg(wc->status));
}
@@ -730,7 +730,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
first, &first->s_wr, ret, failed_wr);
BUG_ON(failed_wr != &first->s_wr);
if (ret) {
- printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 "
+ printk(KERN_WARNING "RDS/IB: ib_post_send to %pI6c "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
@@ -759,14 +759,11 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
struct rds_ib_connection *ic = conn->c_transport_data;
struct rds_ib_send_work *send = NULL;
struct ib_send_wr *failed_wr;
- struct rds_ib_device *rds_ibdev;
u32 pos;
u32 work_alloc;
int ret;
int nr_sig = 0;
- rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-
work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos);
if (work_alloc != 1) {
rds_ib_stats_inc(s_ib_tx_ring_full);
@@ -827,7 +824,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
send, &send->s_atomic_wr, ret, failed_wr);
BUG_ON(failed_wr != &send->s_atomic_wr.wr);
if (ret) {
- printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
+ printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI6c "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
@@ -967,7 +964,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
first, &first->s_rdma_wr.wr, ret, failed_wr);
BUG_ON(failed_wr != &first->s_rdma_wr.wr);
if (ret) {
- printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
+ printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI6c "
"returned %d\n", &conn->c_faddr, ret);
rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc);
rds_ib_sub_signaled(ic, nr_sig);
diff --git a/net/rds/loop.c b/net/rds/loop.c
index feea1f96ee2a..1d73ad79c847 100644
--- a/net/rds/loop.c
+++ b/net/rds/loop.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -35,6 +35,7 @@
#include <linux/in.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <linux/ipv6.h>
#include "rds_single_path.h"
#include "rds.h"
@@ -88,11 +89,11 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm,
BUG_ON(hdr_off || sg || off);
- rds_inc_init(&rm->m_inc, conn, conn->c_laddr);
+ rds_inc_init(&rm->m_inc, conn, &conn->c_laddr);
/* For the embedded inc. Matching put is in loop_inc_free() */
rds_message_addref(rm);
- rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc,
+ rds_recv_incoming(conn, &conn->c_laddr, &conn->c_faddr, &rm->m_inc,
GFP_KERNEL);
rds_send_drop_acked(conn, be64_to_cpu(rm->m_inc.i_hdr.h_sequence),
diff --git a/net/rds/message.c b/net/rds/message.c
index a35f76971984..4b00b1152a5f 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -514,4 +514,3 @@ void rds_message_unmapped(struct rds_message *rm)
wake_up_interruptible(&rm->m_flush_wait);
}
EXPORT_SYMBOL_GPL(rds_message_unmapped);
-
diff --git a/net/rds/rdma.c b/net/rds/rdma.c
index 634cfcb7bba6..98237feb607a 100644
--- a/net/rds/rdma.c
+++ b/net/rds/rdma.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2007 Oracle. All rights reserved.
+ * Copyright (c) 2007, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -170,7 +170,8 @@ static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
}
static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
- u64 *cookie_ret, struct rds_mr **mr_ret)
+ u64 *cookie_ret, struct rds_mr **mr_ret,
+ struct rds_conn_path *cp)
{
struct rds_mr *mr = NULL, *found;
unsigned int nr_pages;
@@ -183,7 +184,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
long i;
int ret;
- if (rs->rs_bound_addr == 0 || !rs->rs_transport) {
+ if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
@@ -269,7 +270,8 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
* Note that dma_map() implies that pending writes are
* flushed to RAM, so no dma_sync is needed here. */
trans_private = rs->rs_transport->get_mr(sg, nents, rs,
- &mr->r_key);
+ &mr->r_key,
+ cp ? cp->cp_conn : NULL);
if (IS_ERR(trans_private)) {
for (i = 0 ; i < nents; i++)
@@ -330,7 +332,7 @@ int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen)
sizeof(struct rds_get_mr_args)))
return -EFAULT;
- return __rds_rdma_map(rs, &args, NULL, NULL);
+ return __rds_rdma_map(rs, &args, NULL, NULL, NULL);
}
int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
@@ -354,7 +356,7 @@ int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen)
new_args.cookie_addr = args.cookie_addr;
new_args.flags = args.flags;
- return __rds_rdma_map(rs, &new_args, NULL, NULL);
+ return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL);
}
/*
@@ -574,7 +576,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
args = CMSG_DATA(cmsg);
- if (rs->rs_bound_addr == 0) {
+ if (ipv6_addr_any(&rs->rs_bound_addr)) {
ret = -ENOTCONN; /* XXX not a great errno */
goto out_ret;
}
@@ -782,7 +784,8 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
rm->m_rdma_cookie != 0)
return -EINVAL;
- return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr);
+ return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie,
+ &rm->rdma.op_rdma_mr, rm->m_conn_path);
}
/*
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index fc59821f0a27..6b0f57c83a2a 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2009 Oracle. All rights reserved.
+ * Copyright (c) 2009, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -37,10 +37,15 @@
#include "rdma_transport.h"
#include "ib.h"
+/* Global IPv4 and IPv6 RDS RDMA listener cm_id */
static struct rdma_cm_id *rds_rdma_listen_id;
+#if IS_ENABLED(CONFIG_IPV6)
+static struct rdma_cm_id *rds6_rdma_listen_id;
+#endif
-int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
+static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event,
+ bool isv6)
{
/* this can be null in the listening path */
struct rds_connection *conn = cm_id->context;
@@ -72,7 +77,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
switch (event->event) {
case RDMA_CM_EVENT_CONNECT_REQUEST:
- ret = trans->cm_handle_connect(cm_id, event);
+ ret = trans->cm_handle_connect(cm_id, event, isv6);
break;
case RDMA_CM_EVENT_ADDR_RESOLVED:
@@ -90,7 +95,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
ibic = conn->c_transport_data;
if (ibic && ibic->i_cm_id == cm_id)
- ret = trans->cm_initiate_connect(cm_id);
+ ret = trans->cm_initiate_connect(cm_id, isv6);
else
rds_conn_drop(conn);
}
@@ -116,14 +121,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
case RDMA_CM_EVENT_DISCONNECTED:
rdsdebug("DISCONNECT event - dropping connection "
- "%pI4->%pI4\n", &conn->c_laddr,
+ "%pI6c->%pI6c\n", &conn->c_laddr,
&conn->c_faddr);
rds_conn_drop(conn);
break;
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
if (conn) {
- pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
+ pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI6c->%pI6c\n",
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
}
@@ -146,13 +151,28 @@ out:
return ret;
}
-static int rds_rdma_listen_init(void)
+int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ return rds_rdma_cm_event_handler_cmn(cm_id, event, false);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ return rds_rdma_cm_event_handler_cmn(cm_id, event, true);
+}
+#endif
+
+static int rds_rdma_listen_init_common(rdma_cm_event_handler handler,
+ struct sockaddr *sa,
+ struct rdma_cm_id **ret_cm_id)
{
- struct sockaddr_in sin;
struct rdma_cm_id *cm_id;
int ret;
- cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL,
+ cm_id = rdma_create_id(&init_net, handler, NULL,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
@@ -161,15 +181,11 @@ static int rds_rdma_listen_init(void)
return ret;
}
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
- sin.sin_port = (__force u16)htons(RDS_PORT);
-
/*
* XXX I bet this binds the cm_id to a device. If we want to support
* fail-over we'll have to take this into consideration.
*/
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
+ ret = rdma_bind_addr(cm_id, sa);
if (ret) {
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
"rdma_bind_addr() returned %d\n", ret);
@@ -185,7 +201,7 @@ static int rds_rdma_listen_init(void)
rdsdebug("cm %p listening on port %u\n", cm_id, RDS_PORT);
- rds_rdma_listen_id = cm_id;
+ *ret_cm_id = cm_id;
cm_id = NULL;
out:
if (cm_id)
@@ -193,6 +209,45 @@ out:
return ret;
}
+/* Initialize the RDS RDMA listeners. We create two listeners for
+ * compatibility reason. The one on RDS_PORT is used for IPv4
+ * requests only. The one on RDS_CM_PORT is used for IPv6 requests
+ * only. So only IPv6 enabled RDS module will communicate using this
+ * port.
+ */
+static int rds_rdma_listen_init(void)
+{
+ int ret;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 sin6;
+#endif
+ struct sockaddr_in sin;
+
+ sin.sin_family = PF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(RDS_PORT);
+ ret = rds_rdma_listen_init_common(rds_rdma_cm_event_handler,
+ (struct sockaddr *)&sin,
+ &rds_rdma_listen_id);
+ if (ret != 0)
+ return ret;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ sin6.sin6_family = PF_INET6;
+ sin6.sin6_addr = in6addr_any;
+ sin6.sin6_port = htons(RDS_CM_PORT);
+ sin6.sin6_scope_id = 0;
+ sin6.sin6_flowinfo = 0;
+ ret = rds_rdma_listen_init_common(rds6_rdma_cm_event_handler,
+ (struct sockaddr *)&sin6,
+ &rds6_rdma_listen_id);
+ /* Keep going even when IPv6 is not enabled in the system. */
+ if (ret != 0)
+ rdsdebug("Cannot set up IPv6 RDMA listener\n");
+#endif
+ return 0;
+}
+
static void rds_rdma_listen_stop(void)
{
if (rds_rdma_listen_id) {
@@ -200,6 +255,13 @@ static void rds_rdma_listen_stop(void)
rdma_destroy_id(rds_rdma_listen_id);
rds_rdma_listen_id = NULL;
}
+#if IS_ENABLED(CONFIG_IPV6)
+ if (rds6_rdma_listen_id) {
+ rdsdebug("cm %p\n", rds6_rdma_listen_id);
+ rdma_destroy_id(rds6_rdma_listen_id);
+ rds6_rdma_listen_id = NULL;
+ }
+#endif
}
static int rds_rdma_init(void)
@@ -229,4 +291,3 @@ module_exit(rds_rdma_exit);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: IB transport");
MODULE_LICENSE("Dual BSD/GPL");
-
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index d309c4430124..200d3134aaae 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -6,11 +6,16 @@
#include <rdma/rdma_cm.h>
#include "rds.h"
+/* RDMA_CM also uses 16385 as the listener port. */
+#define RDS_CM_PORT 16385
+
#define RDS_RDMA_RESOLVE_TIMEOUT_MS 5000
int rds_rdma_conn_connect(struct rds_connection *conn);
int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
struct rdma_cm_event *event);
+int rds6_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event);
/* from ib.c */
extern struct rds_transport rds_ib_transport;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index f2272fb8cd45..c4dcf654d8fe 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -10,6 +10,7 @@
#include <linux/rds.h>
#include <linux/rhashtable.h>
#include <linux/refcount.h>
+#include <linux/in6.h>
#include "info.h"
@@ -23,11 +24,13 @@
#define RDS_PROTOCOL_MINOR(v) ((v) & 255)
#define RDS_PROTOCOL(maj, min) (((maj) << 8) | min)
-/*
- * XXX randomly chosen, but at least seems to be unused:
- * # 18464-18768 Unassigned
- * We should do better. We want a reserved port to discourage unpriv'ed
- * userspace from listening.
+/* The following ports, 16385, 18634, 18635, are registered with IANA as
+ * the ports to be used for RDS over TCP and UDP. Currently, only RDS over
+ * TCP and RDS over IB/RDMA are implemented. 18634 is the historical value
+ * used for the RDMA_CM listener port. RDS/TCP uses port 16385. After
+ * IPv6 work, RDMA_CM also uses 16385 as the listener port. 18634 is kept
+ * to ensure compatibility with older RDS modules. Those ports are defined
+ * in each transport's header file.
*/
#define RDS_PORT 18634
@@ -61,7 +64,7 @@ void rdsdebug(char *fmt, ...)
struct rds_cong_map {
struct rb_node m_rb_node;
- __be32 m_addr;
+ struct in6_addr m_addr;
wait_queue_head_t m_waitq;
struct list_head m_conn_list;
unsigned long m_page_addrs[RDS_CONG_MAP_PAGES];
@@ -136,11 +139,14 @@ struct rds_conn_path {
/* One rds_connection per RDS address pair */
struct rds_connection {
struct hlist_node c_hash_node;
- __be32 c_laddr;
- __be32 c_faddr;
+ struct in6_addr c_laddr;
+ struct in6_addr c_faddr;
+ int c_dev_if; /* ifindex used for this conn */
+ int c_bound_if; /* ifindex of c_laddr */
unsigned int c_loopback:1,
+ c_isv6:1,
c_ping_triggered:1,
- c_pad_to_32:30;
+ c_pad_to_32:29;
int c_npaths;
struct rds_connection *c_passive;
struct rds_transport *c_trans;
@@ -269,7 +275,7 @@ struct rds_incoming {
struct rds_conn_path *i_conn_path;
struct rds_header i_hdr;
unsigned long i_rx_jiffies;
- __be32 i_saddr;
+ struct in6_addr i_saddr;
rds_rdma_cookie_t i_rdma_cookie;
struct timeval i_rx_tstamp;
@@ -386,7 +392,7 @@ struct rds_message {
struct list_head m_conn_item;
struct rds_incoming m_inc;
u64 m_ack_seq;
- __be32 m_daddr;
+ struct in6_addr m_daddr;
unsigned long m_flags;
/* Never access m_rs without holding m_rs_lock.
@@ -464,6 +470,8 @@ struct rds_message {
struct scatterlist *op_sg;
} data;
};
+
+ struct rds_conn_path *m_conn_path;
};
/*
@@ -519,7 +527,8 @@ struct rds_transport {
t_mp_capable:1;
unsigned int t_type;
- int (*laddr_check)(struct net *net, __be32 addr);
+ int (*laddr_check)(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
int (*conn_path_connect)(struct rds_conn_path *cp);
@@ -535,8 +544,8 @@ struct rds_transport {
void (*inc_free)(struct rds_incoming *inc);
int (*cm_handle_connect)(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event);
- int (*cm_initiate_connect)(struct rdma_cm_id *cm_id);
+ struct rdma_cm_event *event, bool isv6);
+ int (*cm_initiate_connect)(struct rdma_cm_id *cm_id, bool isv6);
void (*cm_connect_complete)(struct rds_connection *conn,
struct rdma_cm_event *event);
@@ -544,13 +553,20 @@ struct rds_transport {
unsigned int avail);
void (*exit)(void);
void *(*get_mr)(struct scatterlist *sg, unsigned long nr_sg,
- struct rds_sock *rs, u32 *key_ret);
+ struct rds_sock *rs, u32 *key_ret,
+ struct rds_connection *conn);
void (*sync_mr)(void *trans_private, int direction);
void (*free_mr)(void *trans_private, int invalidate);
void (*flush_mrs)(void);
bool (*t_unloading)(struct rds_connection *conn);
};
+/* Bind hash table key length. It is the sum of the size of a struct
+ * in6_addr, a scope_id and a port.
+ */
+#define RDS_BOUND_KEY_LEN \
+ (sizeof(struct in6_addr) + sizeof(__u32) + sizeof(__be16))
+
struct rds_sock {
struct sock rs_sk;
@@ -562,10 +578,14 @@ struct rds_sock {
* support.
*/
struct rhash_head rs_bound_node;
- u64 rs_bound_key;
- __be32 rs_bound_addr;
- __be32 rs_conn_addr;
- __be16 rs_bound_port;
+ u8 rs_bound_key[RDS_BOUND_KEY_LEN];
+ struct sockaddr_in6 rs_bound_sin6;
+#define rs_bound_addr rs_bound_sin6.sin6_addr
+#define rs_bound_addr_v4 rs_bound_sin6.sin6_addr.s6_addr32[3]
+#define rs_bound_port rs_bound_sin6.sin6_port
+#define rs_bound_scope_id rs_bound_sin6.sin6_scope_id
+ struct in6_addr rs_conn_addr;
+#define rs_conn_addr_v4 rs_conn_addr.s6_addr32[3]
__be16 rs_conn_port;
struct rds_transport *rs_transport;
@@ -701,7 +721,8 @@ extern wait_queue_head_t rds_poll_waitq;
/* bind.c */
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
void rds_remove_bound(struct rds_sock *rs);
-struct rds_sock *rds_find_bound(__be32 addr, __be16 port);
+struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port,
+ __u32 scope_id);
int rds_bind_lock_init(void);
void rds_bind_lock_destroy(void);
@@ -720,16 +741,20 @@ void rds_cong_remove_socket(struct rds_sock *);
void rds_cong_exit(void);
struct rds_message *rds_cong_update_alloc(struct rds_connection *conn);
-/* conn.c */
+/* connection.c */
extern u32 rds_gen_num;
int rds_conn_init(void);
void rds_conn_exit(void);
struct rds_connection *rds_conn_create(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp);
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans, gfp_t gfp,
+ int dev_if);
struct rds_connection *rds_conn_create_outgoing(struct net *net,
- __be32 laddr, __be32 faddr,
- struct rds_transport *trans, gfp_t gfp);
+ const struct in6_addr *laddr,
+ const struct in6_addr *faddr,
+ struct rds_transport *trans,
+ gfp_t gfp, int dev_if);
void rds_conn_shutdown(struct rds_conn_path *cpath);
void rds_conn_destroy(struct rds_connection *conn);
void rds_conn_drop(struct rds_connection *conn);
@@ -840,11 +865,12 @@ void rds_page_exit(void);
/* recv.c */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
- __be32 saddr);
+ struct in6_addr *saddr);
void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *conn,
- __be32 saddr);
+ struct in6_addr *saddr);
void rds_inc_put(struct rds_incoming *inc);
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+ struct in6_addr *daddr,
struct rds_incoming *inc, gfp_t gfp);
int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
int msg_flags);
@@ -853,13 +879,17 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
void rds_inc_info_copy(struct rds_incoming *inc,
struct rds_info_iterator *iter,
__be32 saddr, __be32 daddr, int flip);
+void rds6_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ struct in6_addr *saddr, struct in6_addr *daddr,
+ int flip);
/* send.c */
int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
void rds_send_path_reset(struct rds_conn_path *conn);
int rds_send_xmit(struct rds_conn_path *cp);
struct sockaddr_in;
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest);
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
is_acked_func is_acked);
@@ -946,11 +976,14 @@ void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
void rds_connect_path_complete(struct rds_conn_path *conn, int curr);
void rds_connect_complete(struct rds_connection *conn);
+int rds_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2);
/* transport.c */
void rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr);
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+ const struct in6_addr *addr,
+ __u32 scope_id);
void rds_trans_put(struct rds_transport *trans);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
unsigned int avail);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 192ac6f78ded..504cd6bcc54c 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -41,14 +41,14 @@
#include "rds.h"
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
- __be32 saddr)
+ struct in6_addr *saddr)
{
int i;
refcount_set(&inc->i_refcount, 1);
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = conn;
- inc->i_saddr = saddr;
+ inc->i_saddr = *saddr;
inc->i_rdma_cookie = 0;
inc->i_rx_tstamp.tv_sec = 0;
inc->i_rx_tstamp.tv_usec = 0;
@@ -59,13 +59,13 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
EXPORT_SYMBOL_GPL(rds_inc_init);
void rds_inc_path_init(struct rds_incoming *inc, struct rds_conn_path *cp,
- __be32 saddr)
+ struct in6_addr *saddr)
{
refcount_set(&inc->i_refcount, 1);
INIT_LIST_HEAD(&inc->i_item);
inc->i_conn = cp->cp_conn;
inc->i_conn_path = cp;
- inc->i_saddr = saddr;
+ inc->i_saddr = *saddr;
inc->i_rdma_cookie = 0;
inc->i_rx_tstamp.tv_sec = 0;
inc->i_rx_tstamp.tv_usec = 0;
@@ -110,7 +110,7 @@ static void rds_recv_rcvbuf_delta(struct rds_sock *rs, struct sock *sk,
now_congested = rs->rs_rcv_bytes > rds_sk_rcvbuf(rs);
- rdsdebug("rs %p (%pI4:%u) recv bytes %d buf %d "
+ rdsdebug("rs %p (%pI6c:%u) recv bytes %d buf %d "
"now_cong %d delta %d\n",
rs, &rs->rs_bound_addr,
ntohs(rs->rs_bound_port), rs->rs_rcv_bytes,
@@ -260,7 +260,7 @@ static void rds_start_mprds(struct rds_connection *conn)
struct rds_conn_path *cp;
if (conn->c_npaths > 1 &&
- IS_CANONICAL(conn->c_laddr, conn->c_faddr)) {
+ rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) < 0) {
for (i = 0; i < conn->c_npaths; i++) {
cp = &conn->c_path[i];
rds_conn_path_connect_if_down(cp);
@@ -284,7 +284,8 @@ static void rds_start_mprds(struct rds_connection *conn)
* conn. This lets loopback, who only has one conn for both directions,
* tell us which roles the addrs in the conn are playing for this message.
*/
-void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
+void rds_recv_incoming(struct rds_connection *conn, struct in6_addr *saddr,
+ struct in6_addr *daddr,
struct rds_incoming *inc, gfp_t gfp)
{
struct rds_sock *rs = NULL;
@@ -339,7 +340,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
if (rds_sysctl_ping_enable && inc->i_hdr.h_dport == 0) {
if (inc->i_hdr.h_sport == 0) {
- rdsdebug("ignore ping with 0 sport from 0x%x\n", saddr);
+ rdsdebug("ignore ping with 0 sport from %pI6c\n",
+ saddr);
goto out;
}
rds_stats_inc(s_recv_ping);
@@ -362,7 +364,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
goto out;
}
- rs = rds_find_bound(daddr, inc->i_hdr.h_dport);
+ rs = rds_find_bound(daddr, inc->i_hdr.h_dport, conn->c_bound_if);
if (!rs) {
rds_stats_inc(s_recv_drop_no_sock);
goto out;
@@ -625,6 +627,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
struct rds_sock *rs = rds_sk_to_rs(sk);
long timeo;
int ret = 0, nonblock = msg_flags & MSG_DONTWAIT;
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name);
struct rds_incoming *inc = NULL;
@@ -673,7 +676,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
break;
}
- rdsdebug("copying inc %p from %pI4:%u to user\n", inc,
+ rdsdebug("copying inc %p from %pI6c:%u to user\n", inc,
&inc->i_conn->c_faddr,
ntohs(inc->i_hdr.h_sport));
ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter);
@@ -707,12 +710,26 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
rds_stats_inc(s_recv_delivered);
- if (sin) {
- sin->sin_family = AF_INET;
- sin->sin_port = inc->i_hdr.h_sport;
- sin->sin_addr.s_addr = inc->i_saddr;
- memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
- msg->msg_namelen = sizeof(*sin);
+ if (msg->msg_name) {
+ if (ipv6_addr_v4mapped(&inc->i_saddr)) {
+ sin = (struct sockaddr_in *)msg->msg_name;
+
+ sin->sin_family = AF_INET;
+ sin->sin_port = inc->i_hdr.h_sport;
+ sin->sin_addr.s_addr =
+ inc->i_saddr.s6_addr32[3];
+ memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
+ msg->msg_namelen = sizeof(*sin);
+ } else {
+ sin6 = (struct sockaddr_in6 *)msg->msg_name;
+
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = inc->i_hdr.h_sport;
+ sin6->sin6_addr = inc->i_saddr;
+ sin6->sin6_flowinfo = 0;
+ sin6->sin6_scope_id = rs->rs_bound_scope_id;
+ msg->msg_namelen = sizeof(*sin6);
+ }
}
break;
}
@@ -775,3 +792,30 @@ void rds_inc_info_copy(struct rds_incoming *inc,
rds_info_copy(iter, &minfo, sizeof(minfo));
}
+
+#if IS_ENABLED(CONFIG_IPV6)
+void rds6_inc_info_copy(struct rds_incoming *inc,
+ struct rds_info_iterator *iter,
+ struct in6_addr *saddr, struct in6_addr *daddr,
+ int flip)
+{
+ struct rds6_info_message minfo6;
+
+ minfo6.seq = be64_to_cpu(inc->i_hdr.h_sequence);
+ minfo6.len = be32_to_cpu(inc->i_hdr.h_len);
+
+ if (flip) {
+ minfo6.laddr = *daddr;
+ minfo6.faddr = *saddr;
+ minfo6.lport = inc->i_hdr.h_dport;
+ minfo6.fport = inc->i_hdr.h_sport;
+ } else {
+ minfo6.laddr = *saddr;
+ minfo6.faddr = *daddr;
+ minfo6.lport = inc->i_hdr.h_sport;
+ minfo6.fport = inc->i_hdr.h_dport;
+ }
+
+ rds_info_copy(iter, &minfo6, sizeof(minfo6));
+}
+#endif
diff --git a/net/rds/send.c b/net/rds/send.c
index 94c7f74909be..57b3d5a8b2db 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -709,7 +709,7 @@ void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
}
EXPORT_SYMBOL_GPL(rds_send_drop_acked);
-void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
+void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in6 *dest)
{
struct rds_message *rm, *tmp;
struct rds_connection *conn;
@@ -721,8 +721,9 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest)
spin_lock_irqsave(&rs->rs_lock, flags);
list_for_each_entry_safe(rm, tmp, &rs->rs_send_queue, m_sock_item) {
- if (dest && (dest->sin_addr.s_addr != rm->m_daddr ||
- dest->sin_port != rm->m_inc.i_hdr.h_dport))
+ if (dest &&
+ (!ipv6_addr_equal(&dest->sin6_addr, &rm->m_daddr) ||
+ dest->sin6_port != rm->m_inc.i_hdr.h_dport))
continue;
list_move(&rm->m_sock_item, &list);
@@ -1059,8 +1060,8 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
{
struct sock *sk = sock->sk;
struct rds_sock *rs = rds_sk_to_rs(sk);
+ DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
- __be32 daddr;
__be16 dport;
struct rds_message *rm = NULL;
struct rds_connection *conn;
@@ -1069,10 +1070,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
int nonblock = msg->msg_flags & MSG_DONTWAIT;
long timeo = sock_sndtimeo(sk, nonblock);
struct rds_conn_path *cpath;
+ struct in6_addr daddr;
+ __u32 scope_id = 0;
size_t total_payload_len = payload_len, rdma_payload_len = 0;
bool zcopy = ((msg->msg_flags & MSG_ZEROCOPY) &&
sock_flag(rds_rs_to_sk(rs), SOCK_ZEROCOPY));
int num_sgs = ceil(payload_len, PAGE_SIZE);
+ int namelen;
/* Mirror Linux UDP mirror of BSD error message compatibility */
/* XXX: Perhaps MSG_MORE someday */
@@ -1081,27 +1085,108 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
- if (msg->msg_namelen) {
- /* XXX fail non-unicast destination IPs? */
- if (msg->msg_namelen < sizeof(*usin) || usin->sin_family != AF_INET) {
+ namelen = msg->msg_namelen;
+ if (namelen != 0) {
+ if (namelen < sizeof(*usin)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ switch (usin->sin_family) {
+ case AF_INET:
+ if (usin->sin_addr.s_addr == htonl(INADDR_ANY) ||
+ usin->sin_addr.s_addr == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(usin->sin_addr.s_addr))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ ipv6_addr_set_v4mapped(usin->sin_addr.s_addr, &daddr);
+ dport = usin->sin_port;
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6: {
+ int addr_type;
+
+ if (namelen < sizeof(*sin6)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ addr_type = ipv6_addr_type(&sin6->sin6_addr);
+ if (!(addr_type & IPV6_ADDR_UNICAST)) {
+ __be32 addr4;
+
+ if (!(addr_type & IPV6_ADDR_MAPPED)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* It is a mapped address. Need to do some
+ * sanity checks.
+ */
+ addr4 = sin6->sin6_addr.s6_addr32[3];
+ if (addr4 == htonl(INADDR_ANY) ||
+ addr4 == htonl(INADDR_BROADCAST) ||
+ IN_MULTICAST(ntohl(addr4))) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+ if (addr_type & IPV6_ADDR_LINKLOCAL) {
+ if (sin6->sin6_scope_id == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ scope_id = sin6->sin6_scope_id;
+ }
+
+ daddr = sin6->sin6_addr;
+ dport = sin6->sin6_port;
+ break;
+ }
+#endif
+
+ default:
ret = -EINVAL;
goto out;
}
- daddr = usin->sin_addr.s_addr;
- dport = usin->sin_port;
} else {
/* We only care about consistency with ->connect() */
lock_sock(sk);
daddr = rs->rs_conn_addr;
dport = rs->rs_conn_port;
+ scope_id = rs->rs_bound_scope_id;
release_sock(sk);
}
lock_sock(sk);
- if (daddr == 0 || rs->rs_bound_addr == 0) {
+ if (ipv6_addr_any(&rs->rs_bound_addr) || ipv6_addr_any(&daddr)) {
release_sock(sk);
- ret = -ENOTCONN; /* XXX not a great errno */
+ ret = -ENOTCONN;
goto out;
+ } else if (namelen != 0) {
+ /* Cannot send to an IPv4 address using an IPv6 source
+ * address and cannot send to an IPv6 address using an
+ * IPv4 source address.
+ */
+ if (ipv6_addr_v4mapped(&daddr) ^
+ ipv6_addr_v4mapped(&rs->rs_bound_addr)) {
+ release_sock(sk);
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ /* If the socket is already bound to a link local address,
+ * it can only send to peers on the same link. But allow
+ * communicating beween link local and non-link local address.
+ */
+ if (scope_id != rs->rs_bound_scope_id) {
+ if (!scope_id) {
+ scope_id = rs->rs_bound_scope_id;
+ } else if (rs->rs_bound_scope_id) {
+ release_sock(sk);
+ ret = -EINVAL;
+ goto out;
+ }
+ }
}
release_sock(sk);
@@ -1155,13 +1240,14 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
/* rds_conn_create has a spinlock that runs with IRQ off.
* Caching the conn in the socket helps a lot. */
- if (rs->rs_conn && rs->rs_conn->c_faddr == daddr)
+ if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr))
conn = rs->rs_conn;
else {
conn = rds_conn_create_outgoing(sock_net(sock->sk),
- rs->rs_bound_addr, daddr,
- rs->rs_transport,
- sock->sk->sk_allocation);
+ &rs->rs_bound_addr, &daddr,
+ rs->rs_transport,
+ sock->sk->sk_allocation,
+ scope_id);
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
goto out;
@@ -1169,6 +1255,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
rs->rs_conn = conn;
}
+ if (conn->c_trans->t_mp_capable)
+ cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
+ else
+ cpath = &conn->c_path[0];
+
+ rm->m_conn_path = cpath;
+
/* Parse any control messages the user may have included. */
ret = rds_cmsg_send(rs, rm, msg, &allocated_mr);
if (ret) {
@@ -1192,11 +1285,6 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
goto out;
}
- if (conn->c_trans->t_mp_capable)
- cpath = &conn->c_path[rds_send_mprds_hash(rs, conn)];
- else
- cpath = &conn->c_path[0];
-
if (rds_destroy_pending(conn)) {
ret = -EAGAIN;
goto out;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 351a28474667..2c7b7c352d3e 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -37,6 +37,8 @@
#include <net/tcp.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/tcp.h>
+#include <net/addrconf.h>
#include "rds.h"
#include "tcp.h"
@@ -44,7 +46,14 @@
/* only for info exporting */
static DEFINE_SPINLOCK(rds_tcp_tc_list_lock);
static LIST_HEAD(rds_tcp_tc_list);
+
+/* rds_tcp_tc_count counts only IPv4 connections.
+ * rds6_tcp_tc_count counts both IPv4 and IPv6 connections.
+ */
static unsigned int rds_tcp_tc_count;
+#if IS_ENABLED(CONFIG_IPV6)
+static unsigned int rds6_tcp_tc_count;
+#endif
/* Track rds_tcp_connection structs so they can be cleaned up */
static DEFINE_SPINLOCK(rds_tcp_conn_lock);
@@ -111,7 +120,11 @@ void rds_tcp_restore_callbacks(struct socket *sock,
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_del_init(&tc->t_list_item);
- rds_tcp_tc_count--;
+#if IS_ENABLED(CONFIG_IPV6)
+ rds6_tcp_tc_count--;
+#endif
+ if (!tc->t_cpath->cp_conn->c_isv6)
+ rds_tcp_tc_count--;
spin_unlock(&rds_tcp_tc_list_lock);
tc->t_sock = NULL;
@@ -198,7 +211,11 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
/* done under the callback_lock to serialize with write_space */
spin_lock(&rds_tcp_tc_list_lock);
list_add_tail(&tc->t_list_item, &rds_tcp_tc_list);
- rds_tcp_tc_count++;
+#if IS_ENABLED(CONFIG_IPV6)
+ rds6_tcp_tc_count++;
+#endif
+ if (!tc->t_cpath->cp_conn->c_isv6)
+ rds_tcp_tc_count++;
spin_unlock(&rds_tcp_tc_list_lock);
/* accepted sockets need our listen data ready undone */
@@ -219,6 +236,9 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
write_unlock_bh(&sock->sk->sk_callback_lock);
}
+/* Handle RDS_INFO_TCP_SOCKETS socket option. It only returns IPv4
+ * connections for backward compatibility.
+ */
static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
@@ -226,8 +246,6 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
struct rds_info_tcp_socket tsinfo;
struct rds_tcp_connection *tc;
unsigned long flags;
- struct sockaddr_in sin;
- struct socket *sock;
spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
@@ -235,16 +253,15 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len,
goto out;
list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+ struct inet_sock *inet = inet_sk(tc->t_sock->sk);
- sock = tc->t_sock;
- if (sock) {
- sock->ops->getname(sock, (struct sockaddr *)&sin, 0);
- tsinfo.local_addr = sin.sin_addr.s_addr;
- tsinfo.local_port = sin.sin_port;
- sock->ops->getname(sock, (struct sockaddr *)&sin, 1);
- tsinfo.peer_addr = sin.sin_addr.s_addr;
- tsinfo.peer_port = sin.sin_port;
- }
+ if (tc->t_cpath->cp_conn->c_isv6)
+ continue;
+
+ tsinfo.local_addr = inet->inet_saddr;
+ tsinfo.local_port = inet->inet_sport;
+ tsinfo.peer_addr = inet->inet_daddr;
+ tsinfo.peer_port = inet->inet_dport;
tsinfo.hdr_rem = tc->t_tinc_hdr_rem;
tsinfo.data_rem = tc->t_tinc_data_rem;
@@ -262,10 +279,82 @@ out:
spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
}
-static int rds_tcp_laddr_check(struct net *net, __be32 addr)
+#if IS_ENABLED(CONFIG_IPV6)
+/* Handle RDS6_INFO_TCP_SOCKETS socket option. It returns both IPv4 and
+ * IPv6 connections. IPv4 connection address is returned in an IPv4 mapped
+ * address.
+ */
+static void rds6_tcp_tc_info(struct socket *sock, unsigned int len,
+ struct rds_info_iterator *iter,
+ struct rds_info_lengths *lens)
{
- if (inet_addr_type(net, addr) == RTN_LOCAL)
+ struct rds6_info_tcp_socket tsinfo6;
+ struct rds_tcp_connection *tc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rds_tcp_tc_list_lock, flags);
+
+ if (len / sizeof(tsinfo6) < rds6_tcp_tc_count)
+ goto out;
+
+ list_for_each_entry(tc, &rds_tcp_tc_list, t_list_item) {
+ struct sock *sk = tc->t_sock->sk;
+ struct inet_sock *inet = inet_sk(sk);
+
+ tsinfo6.local_addr = sk->sk_v6_rcv_saddr;
+ tsinfo6.local_port = inet->inet_sport;
+ tsinfo6.peer_addr = sk->sk_v6_daddr;
+ tsinfo6.peer_port = inet->inet_dport;
+
+ tsinfo6.hdr_rem = tc->t_tinc_hdr_rem;
+ tsinfo6.data_rem = tc->t_tinc_data_rem;
+ tsinfo6.last_sent_nxt = tc->t_last_sent_nxt;
+ tsinfo6.last_expected_una = tc->t_last_expected_una;
+ tsinfo6.last_seen_una = tc->t_last_seen_una;
+
+ rds_info_copy(iter, &tsinfo6, sizeof(tsinfo6));
+ }
+
+out:
+ lens->nr = rds6_tcp_tc_count;
+ lens->each = sizeof(tsinfo6);
+
+ spin_unlock_irqrestore(&rds_tcp_tc_list_lock, flags);
+}
+#endif
+
+static int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
+ __u32 scope_id)
+{
+ struct net_device *dev = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+ int ret;
+#endif
+
+ if (ipv6_addr_v4mapped(addr)) {
+ if (inet_addr_type(net, addr->s6_addr32[3]) == RTN_LOCAL)
+ return 0;
+ return -EADDRNOTAVAIL;
+ }
+
+ /* If the scope_id is specified, check only those addresses
+ * hosted on the specified interface.
+ */
+ if (scope_id != 0) {
+ rcu_read_lock();
+ dev = dev_get_by_index_rcu(net, scope_id);
+ /* scope_id is not valid... */
+ if (!dev) {
+ rcu_read_unlock();
+ return -EADDRNOTAVAIL;
+ }
+ rcu_read_unlock();
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ ret = ipv6_chk_addr(net, addr, dev, 0);
+ if (ret)
return 0;
+#endif
return -EADDRNOTAVAIL;
}
@@ -468,13 +557,27 @@ static __net_init int rds_tcp_init_net(struct net *net)
err = -ENOMEM;
goto fail;
}
- rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
+
+#if IS_ENABLED(CONFIG_IPV6)
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, true);
+#else
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+#endif
if (!rtn->rds_tcp_listen_sock) {
- pr_warn("could not set up listen sock\n");
- unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
- rtn->rds_tcp_sysctl = NULL;
- err = -EAFNOSUPPORT;
- goto fail;
+ pr_warn("could not set up IPv6 listen sock\n");
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* Try IPv4 as some systems disable IPv6 */
+ rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net, false);
+ if (!rtn->rds_tcp_listen_sock) {
+#endif
+ unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+ rtn->rds_tcp_sysctl = NULL;
+ err = -EAFNOSUPPORT;
+ goto fail;
+#if IS_ENABLED(CONFIG_IPV6)
+ }
+#endif
}
INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
return 0;
@@ -588,6 +691,9 @@ static void rds_tcp_exit(void)
rds_tcp_set_unloading();
synchronize_rcu();
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_deregister_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
+#endif
unregister_pernet_device(&rds_tcp_net_ops);
rds_tcp_destroy_conns();
rds_trans_unregister(&rds_tcp_transport);
@@ -619,6 +725,9 @@ static int rds_tcp_init(void)
rds_trans_register(&rds_tcp_transport);
rds_info_register_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+#if IS_ENABLED(CONFIG_IPV6)
+ rds_info_register_func(RDS6_INFO_TCP_SOCKETS, rds6_tcp_tc_info);
+#endif
goto out;
out_recv:
@@ -633,4 +742,3 @@ module_init(rds_tcp_init);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: TCP transport");
MODULE_LICENSE("Dual BSD/GPL");
-
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index c6fa080e9b6d..3c69361d21c7 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -67,7 +67,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *conn);
void rds_tcp_state_change(struct sock *sk);
/* tcp_listen.c */
-struct socket *rds_tcp_listen_init(struct net *);
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
int rds_tcp_accept_one(struct socket *sock);
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index d999e7075645..008f50fb25dd 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -66,7 +66,8 @@ void rds_tcp_state_change(struct sock *sk)
* RDS connection as RDS_CONN_UP until the reconnect,
* to avoid RDS datagram loss.
*/
- if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) &&
+ if (rds_addr_cmp(&cp->cp_conn->c_laddr,
+ &cp->cp_conn->c_faddr) >= 0 &&
rds_conn_path_transition(cp, RDS_CONN_CONNECTING,
RDS_CONN_ERROR)) {
rds_conn_path_drop(cp, false);
@@ -88,7 +89,11 @@ out:
int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
{
struct socket *sock = NULL;
- struct sockaddr_in src, dest;
+ struct sockaddr_in6 sin6;
+ struct sockaddr_in sin;
+ struct sockaddr *addr;
+ int addrlen;
+ bool isv6;
int ret;
struct rds_connection *conn = cp->cp_conn;
struct rds_tcp_connection *tc = cp->cp_transport_data;
@@ -105,37 +110,68 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
mutex_unlock(&tc->t_conn_path_lock);
return 0;
}
- ret = sock_create_kern(rds_conn_net(conn), PF_INET,
- SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ipv6_addr_v4mapped(&conn->c_laddr)) {
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ isv6 = false;
+ } else {
+ ret = sock_create_kern(rds_conn_net(conn), PF_INET6,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
+ isv6 = true;
+ }
+
if (ret < 0)
goto out;
rds_tcp_tune(sock);
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
+ if (isv6) {
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = conn->c_laddr;
+ sin6.sin6_port = 0;
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id = conn->c_dev_if;
+ addr = (struct sockaddr *)&sin6;
+ addrlen = sizeof(sin6);
+ } else {
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = conn->c_laddr.s6_addr32[3];
+ sin.sin_port = 0;
+ addr = (struct sockaddr *)&sin;
+ addrlen = sizeof(sin);
+ }
- ret = sock->ops->bind(sock, (struct sockaddr *)&src, sizeof(src));
+ ret = sock->ops->bind(sock, addr, addrlen);
if (ret) {
- rdsdebug("bind failed with %d at address %pI4\n",
+ rdsdebug("bind failed with %d at address %pI6c\n",
ret, &conn->c_laddr);
goto out;
}
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_TCP_PORT);
+ if (isv6) {
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = conn->c_faddr;
+ sin6.sin6_port = htons(RDS_TCP_PORT);
+ sin6.sin6_flowinfo = 0;
+ sin6.sin6_scope_id = conn->c_dev_if;
+ addr = (struct sockaddr *)&sin6;
+ addrlen = sizeof(sin6);
+ } else {
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = conn->c_faddr.s6_addr32[3];
+ sin.sin_port = htons(RDS_TCP_PORT);
+ addr = (struct sockaddr *)&sin;
+ addrlen = sizeof(sin);
+ }
/*
* once we call connect() we can start getting callbacks and they
* own the socket
*/
rds_tcp_set_callbacks(sock, cp);
- ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest),
- O_NONBLOCK);
+ ret = sock->ops->connect(sock, addr, addrlen, O_NONBLOCK);
- rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret);
+ rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret);
if (ret == -EINPROGRESS)
ret = 0;
if (ret == 0) {
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 22571189f21e..c12203f646da 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006, 2018 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -83,13 +83,12 @@ static
struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
{
int i;
- bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr);
int npaths = max_t(int, 1, conn->c_npaths);
/* for mprds, all paths MUST be initiated by the peer
* with the smaller address.
*/
- if (!peer_is_smaller) {
+ if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
/* Make sure we initiate at least one path if this
* has not already been done; rds_start_mprds() will
* take care of additional paths, if necessary.
@@ -132,6 +131,11 @@ int rds_tcp_accept_one(struct socket *sock)
struct rds_tcp_connection *rs_tcp = NULL;
int conn_state;
struct rds_conn_path *cp;
+ struct in6_addr *my_addr, *peer_addr;
+#if !IS_ENABLED(CONFIG_IPV6)
+ struct in6_addr saddr, daddr;
+#endif
+ int dev_if = 0;
if (!sock) /* module unload or netns delete in progress */
return -ENETUNREACH;
@@ -164,13 +168,40 @@ int rds_tcp_accept_one(struct socket *sock)
inet = inet_sk(new_sock->sk);
- rdsdebug("accepted tcp %pI4:%u -> %pI4:%u\n",
- &inet->inet_saddr, ntohs(inet->inet_sport),
- &inet->inet_daddr, ntohs(inet->inet_dport));
+#if IS_ENABLED(CONFIG_IPV6)
+ my_addr = &new_sock->sk->sk_v6_rcv_saddr;
+ peer_addr = &new_sock->sk->sk_v6_daddr;
+#else
+ ipv6_addr_set_v4mapped(inet->inet_saddr, &saddr);
+ ipv6_addr_set_v4mapped(inet->inet_daddr, &daddr);
+ my_addr = &saddr;
+ peer_addr = &daddr;
+#endif
+ rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
+ sock->sk->sk_family,
+ my_addr, ntohs(inet->inet_sport),
+ peer_addr, ntohs(inet->inet_dport));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ /* sk_bound_dev_if is not set if the peer address is not link local
+ * address. In this case, it happens that mcast_oif is set. So
+ * just use it.
+ */
+ if ((ipv6_addr_type(my_addr) & IPV6_ADDR_LINKLOCAL) &&
+ !(ipv6_addr_type(peer_addr) & IPV6_ADDR_LINKLOCAL)) {
+ struct ipv6_pinfo *inet6;
+
+ inet6 = inet6_sk(new_sock->sk);
+ dev_if = inet6->mcast_oif;
+ } else {
+ dev_if = new_sock->sk->sk_bound_dev_if;
+ }
+#endif
conn = rds_conn_create(sock_net(sock->sk),
- inet->inet_saddr, inet->inet_daddr,
- &rds_tcp_transport, GFP_KERNEL);
+ my_addr, peer_addr,
+ &rds_tcp_transport, GFP_KERNEL, dev_if);
+
if (IS_ERR(conn)) {
ret = PTR_ERR(conn);
goto out;
@@ -254,15 +285,22 @@ out:
ready(sk);
}
-struct socket *rds_tcp_listen_init(struct net *net)
+struct socket *rds_tcp_listen_init(struct net *net, bool isv6)
{
- struct sockaddr_in sin;
struct socket *sock = NULL;
+ struct sockaddr_storage ss;
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_in *sin;
+ int addr_len;
int ret;
- ret = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
- if (ret < 0)
+ ret = sock_create_kern(net, isv6 ? PF_INET6 : PF_INET, SOCK_STREAM,
+ IPPROTO_TCP, &sock);
+ if (ret < 0) {
+ rdsdebug("could not create %s listener socket: %d\n",
+ isv6 ? "IPv6" : "IPv4", ret);
goto out;
+ }
sock->sk->sk_reuse = SK_CAN_REUSE;
rds_tcp_nonagle(sock);
@@ -272,13 +310,28 @@ struct socket *rds_tcp_listen_init(struct net *net)
sock->sk->sk_data_ready = rds_tcp_listen_data_ready;
write_unlock_bh(&sock->sk->sk_callback_lock);
- sin.sin_family = PF_INET;
- sin.sin_addr.s_addr = (__force u32)htonl(INADDR_ANY);
- sin.sin_port = (__force u16)htons(RDS_TCP_PORT);
+ if (isv6) {
+ sin6 = (struct sockaddr_in6 *)&ss;
+ sin6->sin6_family = PF_INET6;
+ sin6->sin6_addr = in6addr_any;
+ sin6->sin6_port = (__force u16)htons(RDS_TCP_PORT);
+ sin6->sin6_scope_id = 0;
+ sin6->sin6_flowinfo = 0;
+ addr_len = sizeof(*sin6);
+ } else {
+ sin = (struct sockaddr_in *)&ss;
+ sin->sin_family = PF_INET;
+ sin->sin_addr.s_addr = INADDR_ANY;
+ sin->sin_port = (__force u16)htons(RDS_TCP_PORT);
+ addr_len = sizeof(*sin);
+ }
- ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
- if (ret < 0)
+ ret = sock->ops->bind(sock, (struct sockaddr *)&ss, addr_len);
+ if (ret < 0) {
+ rdsdebug("could not bind %s listener socket: %d\n",
+ isv6 ? "IPv6" : "IPv4", ret);
goto out;
+ }
ret = sock->ops->listen(sock, 64);
if (ret < 0)
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index b9fbd2ee74ef..42c5ff1eda95 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -179,7 +179,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
tc->t_tinc = tinc;
rdsdebug("alloced tinc %p\n", tinc);
rds_inc_path_init(&tinc->ti_inc, cp,
- cp->cp_conn->c_faddr);
+ &cp->cp_conn->c_faddr);
tinc->ti_inc.i_rx_lat_trace[RDS_MSG_RX_HDR] =
local_clock();
@@ -239,8 +239,9 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb,
if (tinc->ti_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
rds_tcp_cong_recv(conn, tinc);
else
- rds_recv_incoming(conn, conn->c_faddr,
- conn->c_laddr, &tinc->ti_inc,
+ rds_recv_incoming(conn, &conn->c_faddr,
+ &conn->c_laddr,
+ &tinc->ti_inc,
arg->gfp);
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7df869d37afd..78a2554a4497 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -153,7 +153,7 @@ out:
* an incoming RST.
*/
if (rds_conn_path_up(cp)) {
- pr_warn("RDS/tcp: send to %pI4 on cp [%d]"
+ pr_warn("RDS/tcp: send to %pI6c on cp [%d]"
"returned %d, "
"disconnecting and reconnecting\n",
&conn->c_faddr, cp->cp_index, ret);
diff --git a/net/rds/threads.c b/net/rds/threads.c
index c52861d77a59..e64f9e4c3cda 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2018 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -82,8 +82,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
return;
}
- rdsdebug("conn %p for %pI4 to %pI4 complete\n",
- cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
+ rdsdebug("conn %p for %pI6c to %pI6c complete\n",
+ cp->cp_conn, &cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr);
cp->cp_reconnect_jiffies = 0;
set_bit(0, &cp->cp_conn->c_map_queued);
@@ -125,13 +125,13 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
unsigned long rand;
struct rds_connection *conn = cp->cp_conn;
- rdsdebug("conn %p for %pI4 to %pI4 reconnect jiffies %lu\n",
- conn, &conn->c_laddr, &conn->c_faddr,
- cp->cp_reconnect_jiffies);
+ rdsdebug("conn %p for %pI6c to %pI6c reconnect jiffies %lu\n",
+ conn, &conn->c_laddr, &conn->c_faddr,
+ cp->cp_reconnect_jiffies);
/* let peer with smaller addr initiate reconnect, to avoid duels */
if (conn->c_trans->t_type == RDS_TRANS_TCP &&
- !IS_CANONICAL(conn->c_laddr, conn->c_faddr))
+ rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) >= 0)
return;
set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
@@ -145,7 +145,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
}
get_random_bytes(&rand, sizeof(rand));
- rdsdebug("%lu delay %lu ceil conn %p for %pI4 -> %pI4\n",
+ rdsdebug("%lu delay %lu ceil conn %p for %pI6c -> %pI6c\n",
rand % cp->cp_reconnect_jiffies, cp->cp_reconnect_jiffies,
conn, &conn->c_laddr, &conn->c_faddr);
rcu_read_lock();
@@ -167,14 +167,14 @@ void rds_connect_worker(struct work_struct *work)
int ret;
if (cp->cp_index > 0 &&
- !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr))
+ rds_addr_cmp(&cp->cp_conn->c_laddr, &cp->cp_conn->c_faddr) >= 0)
return;
clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
if (ret) {
ret = conn->c_trans->conn_path_connect(cp);
- rdsdebug("conn %p for %pI4 to %pI4 dispatched, ret %d\n",
- conn, &conn->c_laddr, &conn->c_faddr, ret);
+ rdsdebug("conn %p for %pI6c to %pI6c dispatched, ret %d\n",
+ conn, &conn->c_laddr, &conn->c_faddr, ret);
if (ret) {
if (rds_conn_path_transition(cp,
@@ -259,3 +259,50 @@ int rds_threads_init(void)
return 0;
}
+
+/* Compare two IPv6 addresses. Return 0 if the two addresses are equal.
+ * Return 1 if the first is greater. Return -1 if the second is greater.
+ */
+int rds_addr_cmp(const struct in6_addr *addr1,
+ const struct in6_addr *addr2)
+{
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64
+ const __be64 *a1, *a2;
+ u64 x, y;
+
+ a1 = (__be64 *)addr1;
+ a2 = (__be64 *)addr2;
+
+ if (*a1 != *a2) {
+ if (be64_to_cpu(*a1) < be64_to_cpu(*a2))
+ return -1;
+ else
+ return 1;
+ } else {
+ x = be64_to_cpu(*++a1);
+ y = be64_to_cpu(*++a2);
+ if (x < y)
+ return -1;
+ else if (x > y)
+ return 1;
+ else
+ return 0;
+ }
+#else
+ u32 a, b;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ if (addr1->s6_addr32[i] != addr2->s6_addr32[i]) {
+ a = ntohl(addr1->s6_addr32[i]);
+ b = ntohl(addr2->s6_addr32[i]);
+ if (a < b)
+ return -1;
+ else if (a > b)
+ return 1;
+ }
+ }
+ return 0;
+#endif
+}
+EXPORT_SYMBOL_GPL(rds_addr_cmp);
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 0b188dd0a344..46f709a4b577 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2006 Oracle. All rights reserved.
+ * Copyright (c) 2006, 2017 Oracle and/or its affiliates. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
@@ -33,6 +33,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/in.h>
+#include <linux/ipv6.h>
#include "rds.h"
#include "loop.h"
@@ -75,20 +76,26 @@ void rds_trans_put(struct rds_transport *trans)
module_put(trans->t_owner);
}
-struct rds_transport *rds_trans_get_preferred(struct net *net, __be32 addr)
+struct rds_transport *rds_trans_get_preferred(struct net *net,
+ const struct in6_addr *addr,
+ __u32 scope_id)
{
struct rds_transport *ret = NULL;
struct rds_transport *trans;
unsigned int i;
- if (IN_LOOPBACK(ntohl(addr)))
+ if (ipv6_addr_v4mapped(addr)) {
+ if (*(u_int8_t *)&addr->s6_addr32[3] == IN_LOOPBACKNET)
+ return &rds_loop_transport;
+ } else if (ipv6_addr_loopback(addr)) {
return &rds_loop_transport;
+ }
down_read(&rds_trans_sem);
for (i = 0; i < RDS_TRANS_COUNT; i++) {
trans = transports[i];
- if (trans && (trans->laddr_check(net, addr) == 0) &&
+ if (trans && (trans->laddr_check(net, addr, scope_id) == 0) &&
(!trans->t_owner || try_module_get(trans->t_owner))) {
ret = trans;
break;
@@ -152,4 +159,3 @@ unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
return total;
}
-
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 2b463047dd7b..ac44d8afffb1 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -741,7 +741,7 @@ static __poll_t rxrpc_poll(struct file *file, struct socket *sock,
struct rxrpc_sock *rx = rxrpc_sk(sk);
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
mask = 0;
/* the socket is readable if there are any messages waiting on the Rx
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 5fb7d3254d9e..9d9278a13d91 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -420,6 +420,7 @@ struct rxrpc_connection {
struct rxrpc_channel {
unsigned long final_ack_at; /* Time at which to issue final ACK */
struct rxrpc_call __rcu *call; /* Active call */
+ unsigned int call_debug_id; /* call->debug_id */
u32 call_id; /* ID of current call */
u32 call_counter; /* Call ID counter */
u32 last_call; /* ID of last call */
@@ -478,6 +479,7 @@ enum rxrpc_call_flag {
RXRPC_CALL_RETRANS_TIMEOUT, /* Retransmission due to timeout occurred */
RXRPC_CALL_BEGAN_RX_TIMER, /* We began the expect_rx_by timer */
RXRPC_CALL_RX_HEARD, /* The peer responded at least once to this call */
+ RXRPC_CALL_RX_UNDERRUN, /* Got data underrun */
};
/*
@@ -588,7 +590,7 @@ struct rxrpc_call {
*/
#define RXRPC_RXTX_BUFF_SIZE 64
#define RXRPC_RXTX_BUFF_MASK (RXRPC_RXTX_BUFF_SIZE - 1)
-#define RXRPC_INIT_RX_WINDOW_SIZE 32
+#define RXRPC_INIT_RX_WINDOW_SIZE 63
struct sk_buff **rxtx_buffer;
u8 *rxtx_annotations;
#define RXRPC_TX_ANNO_ACK 0
diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c
index a9a9be5519b9..9d1e298b784c 100644
--- a/net/rxrpc/call_accept.c
+++ b/net/rxrpc/call_accept.c
@@ -116,9 +116,9 @@ static int rxrpc_service_prealloc_one(struct rxrpc_sock *rx,
while (*pp) {
parent = *pp;
xcall = rb_entry(parent, struct rxrpc_call, sock_node);
- if (user_call_ID < call->user_call_ID)
+ if (user_call_ID < xcall->user_call_ID)
pp = &(*pp)->rb_left;
- else if (user_call_ID > call->user_call_ID)
+ else if (user_call_ID > xcall->user_call_ID)
pp = &(*pp)->rb_right;
else
goto id_in_use;
diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c
index 20210418904b..8e7434e92097 100644
--- a/net/rxrpc/call_event.c
+++ b/net/rxrpc/call_event.c
@@ -162,7 +162,6 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call)
*/
static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
{
- struct rxrpc_skb_priv *sp;
struct sk_buff *skb;
unsigned long resend_at;
rxrpc_seq_t cursor, seq, top;
@@ -207,7 +206,6 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j)
skb = call->rxtx_buffer[ix];
rxrpc_see_skb(skb, rxrpc_skb_tx_seen);
- sp = rxrpc_skb(skb);
if (anno_type == RXRPC_TX_ANNO_UNACK) {
if (ktime_after(skb->tstamp, max_age)) {
diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c
index 5736f643c516..f8f37188a932 100644
--- a/net/rxrpc/conn_client.c
+++ b/net/rxrpc/conn_client.c
@@ -590,6 +590,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn,
*/
smp_wmb();
chan->call_id = call_id;
+ chan->call_debug_id = call->debug_id;
rcu_assign_pointer(chan->call, call);
wake_up(&call->waitq);
}
@@ -1051,7 +1052,6 @@ void rxrpc_discard_expired_client_conns(struct work_struct *work)
container_of(work, struct rxrpc_net, client_conn_reaper);
unsigned long expiry, conn_expires_at, now;
unsigned int nr_conns;
- bool did_discard = false;
_enter("");
@@ -1113,7 +1113,6 @@ next:
* If someone re-sets the flag and re-gets the ref, that's fine.
*/
rxrpc_put_connection(conn);
- did_discard = true;
nr_conns--;
goto next;
diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c
index 8229a52c2acd..84d40ba9856f 100644
--- a/net/rxrpc/conn_event.c
+++ b/net/rxrpc/conn_event.c
@@ -129,8 +129,10 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
_proto("Tx ABORT %%%u { %d } [re]", serial, conn->local_abort);
break;
case RXRPC_PACKET_TYPE_ACK:
- trace_rxrpc_tx_ack(NULL, serial, chan->last_seq, 0,
- RXRPC_ACK_DUPLICATE, 0);
+ trace_rxrpc_tx_ack(chan->call_debug_id, serial,
+ ntohl(pkt.ack.firstPacket),
+ ntohl(pkt.ack.serial),
+ pkt.ack.reason, 0);
_proto("Tx ACK %%%u [re]", serial);
break;
}
@@ -138,8 +140,11 @@ static void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, ioc, len);
conn->params.peer->last_tx_at = ktime_get_real();
if (ret < 0)
- trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_call_final_resend);
+ trace_rxrpc_tx_fail(chan->call_debug_id, serial, ret,
+ rxrpc_tx_point_call_final_resend);
+ else
+ trace_rxrpc_tx_packet(chan->call_debug_id, &pkt.whdr,
+ rxrpc_tx_point_call_final_resend);
_leave("");
}
@@ -240,11 +245,13 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_conn_abort);
+ rxrpc_tx_point_conn_abort);
_debug("sendmsg failed: %d", ret);
return -EAGAIN;
}
+ trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort);
+
conn->params.peer->last_tx_at = ktime_get_real();
_leave(" = 0");
diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c
index 608d078a4981..cfdc199c6351 100644
--- a/net/rxrpc/input.c
+++ b/net/rxrpc/input.c
@@ -496,7 +496,7 @@ next_subpacket:
return rxrpc_proto_abort("LSA", call, seq);
}
- trace_rxrpc_rx_data(call, seq, serial, flags, annotation);
+ trace_rxrpc_rx_data(call->debug_id, seq, serial, flags, annotation);
if (before_eq(seq, hard_ack)) {
ack = RXRPC_ACK_DUPLICATE;
ack_serial = serial;
@@ -592,9 +592,15 @@ ack:
rxrpc_propose_ACK(call, ack, skew, ack_serial,
immediate_ack, true,
rxrpc_propose_ack_input_data);
+ else
+ rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial,
+ false, true,
+ rxrpc_propose_ack_input_data);
- if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1)
+ if (sp->hdr.seq == READ_ONCE(call->rx_hard_ack) + 1) {
+ trace_rxrpc_notify_socket(call->debug_id, serial);
rxrpc_notify_socket(call);
+ }
_leave(" [queued]");
}
@@ -1262,6 +1268,11 @@ void rxrpc_data_ready(struct sock *udp_sk)
/* But otherwise we need to retransmit the final packet
* from data cached in the connection record.
*/
+ if (sp->hdr.type == RXRPC_PACKET_TYPE_DATA)
+ trace_rxrpc_rx_data(chan->call_debug_id,
+ sp->hdr.seq,
+ sp->hdr.serial,
+ sp->hdr.flags, 0);
rxrpc_post_packet_to_conn(conn, skb);
goto out_unlock;
}
diff --git a/net/rxrpc/local_event.c b/net/rxrpc/local_event.c
index 8325f1b86840..13bd8a4dfac7 100644
--- a/net/rxrpc/local_event.c
+++ b/net/rxrpc/local_event.c
@@ -72,7 +72,10 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
ret = kernel_sendmsg(local->socket, &msg, iov, 2, len);
if (ret < 0)
trace_rxrpc_tx_fail(local->debug_id, 0, ret,
- rxrpc_tx_fail_version_reply);
+ rxrpc_tx_point_version_reply);
+ else
+ trace_rxrpc_tx_packet(local->debug_id, &whdr,
+ rxrpc_tx_point_version_reply);
_leave("");
}
diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c
index f03de1c59ba3..801dbf3d3478 100644
--- a/net/rxrpc/output.c
+++ b/net/rxrpc/output.c
@@ -183,7 +183,7 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
serial = atomic_inc_return(&conn->serial);
pkt->whdr.serial = htonl(serial);
- trace_rxrpc_tx_ack(call, serial,
+ trace_rxrpc_tx_ack(call->debug_id, serial,
ntohl(pkt->ack.firstPacket),
ntohl(pkt->ack.serial),
pkt->ack.reason, pkt->ack.nAcks);
@@ -212,7 +212,10 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping,
conn->params.peer->last_tx_at = ktime_get_real();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_ack);
+ rxrpc_tx_point_call_ack);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr,
+ rxrpc_tx_point_call_ack);
if (call->state < RXRPC_CALL_COMPLETE) {
if (ret < 0) {
@@ -299,7 +302,10 @@ int rxrpc_send_abort_packet(struct rxrpc_call *call)
conn->params.peer->last_tx_at = ktime_get_real();
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_abort);
+ rxrpc_tx_point_call_abort);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr,
+ rxrpc_tx_point_call_abort);
rxrpc_put_connection(conn);
@@ -396,7 +402,10 @@ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb,
up_read(&conn->params.local->defrag_sem);
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_data_nofrag);
+ rxrpc_tx_point_call_data_nofrag);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &whdr,
+ rxrpc_tx_point_call_data_nofrag);
if (ret == -EMSGSIZE)
goto send_fragmentable;
@@ -488,7 +497,10 @@ send_fragmentable:
if (ret < 0)
trace_rxrpc_tx_fail(call->debug_id, serial, ret,
- rxrpc_tx_fail_call_data_frag);
+ rxrpc_tx_point_call_data_frag);
+ else
+ trace_rxrpc_tx_packet(call->debug_id, &whdr,
+ rxrpc_tx_point_call_data_frag);
up_write(&conn->params.local->defrag_sem);
goto done;
@@ -545,7 +557,10 @@ void rxrpc_reject_packets(struct rxrpc_local *local)
ret = kernel_sendmsg(local->socket, &msg, iov, 2, size);
if (ret < 0)
trace_rxrpc_tx_fail(local->debug_id, 0, ret,
- rxrpc_tx_fail_reject);
+ rxrpc_tx_point_reject);
+ else
+ trace_rxrpc_tx_packet(local->debug_id, &whdr,
+ rxrpc_tx_point_reject);
}
rxrpc_free_skb(skb, rxrpc_skb_rx_freed);
@@ -597,7 +612,10 @@ void rxrpc_send_keepalive(struct rxrpc_peer *peer)
ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len);
if (ret < 0)
trace_rxrpc_tx_fail(peer->debug_id, 0, ret,
- rxrpc_tx_fail_version_keepalive);
+ rxrpc_tx_point_version_keepalive);
+ else
+ trace_rxrpc_tx_packet(peer->debug_id, &whdr,
+ rxrpc_tx_point_version_keepalive);
peer->last_tx_at = ktime_get_real();
_leave("");
diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c
index d9fca8c4bcdc..9805e3b85c36 100644
--- a/net/rxrpc/proc.c
+++ b/net/rxrpc/proc.c
@@ -63,6 +63,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
struct rxrpc_peer *peer;
struct rxrpc_call *call;
struct rxrpc_net *rxnet = rxrpc_net(seq_file_net(seq));
+ unsigned long timeout = 0;
rxrpc_seq_t tx_hard_ack, rx_hard_ack;
char lbuff[50], rbuff[50];
@@ -71,7 +72,7 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
"Proto Local "
" Remote "
" SvID ConnID CallID End Use State Abort "
- " UserID\n");
+ " UserID TxSeq TW RxSeq RW RxSerial RxTimo\n");
return 0;
}
@@ -94,11 +95,16 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
else
strcpy(rbuff, "no_connection");
+ if (call->state != RXRPC_CALL_SERVER_PREALLOC) {
+ timeout = READ_ONCE(call->expect_rx_by);
+ timeout -= jiffies;
+ }
+
tx_hard_ack = READ_ONCE(call->tx_hard_ack);
rx_hard_ack = READ_ONCE(call->rx_hard_ack);
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %08x %s %3u"
- " %-8.8s %08x %lx %08x %02x %08x %02x\n",
+ " %-8.8s %08x %lx %08x %02x %08x %02x %08x %06lx\n",
lbuff,
rbuff,
call->service_id,
@@ -110,7 +116,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
call->abort_code,
call->user_call_ID,
tx_hard_ack, READ_ONCE(call->tx_top) - tx_hard_ack,
- rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack);
+ rx_hard_ack, READ_ONCE(call->rx_top) - rx_hard_ack,
+ call->rx_serial,
+ timeout);
return 0;
}
@@ -179,7 +187,7 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
print:
seq_printf(seq,
"UDP %-47.47s %-47.47s %4x %08x %s %3u"
- " %s %08x %08x %08x\n",
+ " %s %08x %08x %08x %08x %08x %08x %08x\n",
lbuff,
rbuff,
conn->service_id,
@@ -189,7 +197,11 @@ print:
rxrpc_conn_states[conn->state],
key_serial(conn->params.key),
atomic_read(&conn->serial),
- conn->hi_serial);
+ conn->hi_serial,
+ conn->channels[0].call_id,
+ conn->channels[1].call_id,
+ conn->channels[2].call_id,
+ conn->channels[3].call_id);
return 0;
}
diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c
index 7bff716e911e..816b19a78809 100644
--- a/net/rxrpc/recvmsg.c
+++ b/net/rxrpc/recvmsg.c
@@ -144,13 +144,11 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial)
trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top);
ASSERTCMP(call->rx_hard_ack, ==, call->rx_top);
-#if 0 // TODO: May want to transmit final ACK under some circumstances anyway
if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) {
- rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false,
+ rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, false, true,
rxrpc_propose_ack_terminal_ack);
- rxrpc_send_ack_packet(call, false, NULL);
+ //rxrpc_send_ack_packet(call, false, NULL);
}
-#endif
write_lock_bh(&call->state_lock);
@@ -315,6 +313,10 @@ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call,
unsigned int rx_pkt_offset, rx_pkt_len;
int ix, copy, ret = -EAGAIN, ret2;
+ if (test_and_clear_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags) &&
+ call->ackr_reason)
+ rxrpc_send_ack_packet(call, false, NULL);
+
rx_pkt_offset = call->rx_pkt_offset;
rx_pkt_len = call->rx_pkt_len;
@@ -414,6 +416,8 @@ out:
done:
trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq,
rx_pkt_offset, rx_pkt_len, ret);
+ if (ret == -EAGAIN)
+ set_bit(RXRPC_CALL_RX_UNDERRUN, &call->flags);
return ret;
}
@@ -607,9 +611,7 @@ wait_error:
* rxrpc_kernel_recv_data - Allow a kernel service to receive data/info
* @sock: The socket that the call exists on
* @call: The call to send data through
- * @buf: The buffer to receive into
- * @size: The size of the buffer, including data already read
- * @_offset: The running offset into the buffer.
+ * @iter: The buffer to receive into
* @want_more: True if more data is expected to be read
* @_abort: Where the abort code is stored if -ECONNABORTED is returned
* @_service: Where to store the actual service ID (may be upgraded)
@@ -622,39 +624,30 @@ wait_error:
* Note that we may return -EAGAIN to drain empty packets at the end of the
* data, even if we've already copied over the requested data.
*
- * This function adds the amount it transfers to *_offset, so this should be
- * precleared as appropriate. Note that the amount remaining in the buffer is
- * taken to be size - *_offset.
- *
* *_abort should also be initialised to 0.
*/
int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
- void *buf, size_t size, size_t *_offset,
+ struct iov_iter *iter,
bool want_more, u32 *_abort, u16 *_service)
{
- struct iov_iter iter;
- struct kvec iov;
+ size_t offset = 0;
int ret;
- _enter("{%d,%s},%zu/%zu,%d",
+ _enter("{%d,%s},%zu,%d",
call->debug_id, rxrpc_call_states[call->state],
- *_offset, size, want_more);
+ iov_iter_count(iter), want_more);
- ASSERTCMP(*_offset, <=, size);
ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING);
- iov.iov_base = buf + *_offset;
- iov.iov_len = size - *_offset;
- iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset);
-
mutex_lock(&call->user_mutex);
switch (READ_ONCE(call->state)) {
case RXRPC_CALL_CLIENT_RECV_REPLY:
case RXRPC_CALL_SERVER_RECV_REQUEST:
case RXRPC_CALL_SERVER_ACK_REQUEST:
- ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0,
- _offset);
+ ret = rxrpc_recvmsg_data(sock, call, NULL, iter,
+ iov_iter_count(iter), 0,
+ &offset);
if (ret < 0)
goto out;
@@ -663,7 +656,7 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
* full buffer or have been given -EAGAIN.
*/
if (ret == 1) {
- if (*_offset < size)
+ if (iov_iter_count(iter) > 0)
goto short_data;
if (!want_more)
goto read_phase_complete;
@@ -686,10 +679,21 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call,
read_phase_complete:
ret = 1;
out:
+ switch (call->ackr_reason) {
+ case RXRPC_ACK_IDLE:
+ break;
+ case RXRPC_ACK_DELAY:
+ if (ret != -EAGAIN)
+ break;
+ /* Fall through */
+ default:
+ rxrpc_send_ack_packet(call, false, NULL);
+ }
+
if (_service)
*_service = call->service_id;
mutex_unlock(&call->user_mutex);
- _leave(" = %d [%zu,%d]", ret, *_offset, *_abort);
+ _leave(" = %d [%zu,%d]", ret, iov_iter_count(iter), *_abort);
return ret;
short_data:
@@ -705,7 +709,7 @@ call_complete:
ret = call->error;
if (call->completion == RXRPC_CALL_SUCCEEDED) {
ret = 1;
- if (size > 0)
+ if (iov_iter_count(iter) > 0)
ret = -ECONNRESET;
}
goto out;
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index 278ac0807a60..eaf8f4f446b0 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -146,10 +146,10 @@ static int rxkad_prime_packet_security(struct rxrpc_connection *conn)
static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
struct sk_buff *skb,
u32 data_size,
- void *sechdr)
+ void *sechdr,
+ struct skcipher_request *req)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxkad_level1_hdr hdr;
struct rxrpc_crypt iv;
struct scatterlist sg;
@@ -183,12 +183,12 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
struct sk_buff *skb,
u32 data_size,
- void *sechdr)
+ void *sechdr,
+ struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr rxkhdr;
struct rxrpc_skb_priv *sp;
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
struct sk_buff *trailer;
@@ -296,11 +296,12 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
ret = 0;
break;
case RXRPC_SECURITY_AUTH:
- ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr);
+ ret = rxkad_secure_packet_auth(call, skb, data_size, sechdr,
+ req);
break;
case RXRPC_SECURITY_ENCRYPT:
ret = rxkad_secure_packet_encrypt(call, skb, data_size,
- sechdr);
+ sechdr, req);
break;
default:
ret = -EPERM;
@@ -316,10 +317,10 @@ static int rxkad_secure_packet(struct rxrpc_call *call,
*/
static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb,
unsigned int offset, unsigned int len,
- rxrpc_seq_t seq)
+ rxrpc_seq_t seq,
+ struct skcipher_request *req)
{
struct rxkad_level1_hdr sechdr;
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
struct sk_buff *trailer;
@@ -402,11 +403,11 @@ nomem:
*/
static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb,
unsigned int offset, unsigned int len,
- rxrpc_seq_t seq)
+ rxrpc_seq_t seq,
+ struct skcipher_request *req)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr sechdr;
- SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist _sg[4], *sg;
struct sk_buff *trailer;
@@ -549,9 +550,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb,
case RXRPC_SECURITY_PLAIN:
return 0;
case RXRPC_SECURITY_AUTH:
- return rxkad_verify_packet_1(call, skb, offset, len, seq);
+ return rxkad_verify_packet_1(call, skb, offset, len, seq, req);
case RXRPC_SECURITY_ENCRYPT:
- return rxkad_verify_packet_2(call, skb, offset, len, seq);
+ return rxkad_verify_packet_2(call, skb, offset, len, seq, req);
default:
return -ENOANO;
}
@@ -665,11 +666,13 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_conn_challenge);
+ rxrpc_tx_point_rxkad_challenge);
return -EAGAIN;
}
conn->params.peer->last_tx_at = ktime_get_real();
+ trace_rxrpc_tx_packet(conn->debug_id, &whdr,
+ rxrpc_tx_point_rxkad_challenge);
_leave(" = 0");
return 0;
}
@@ -721,11 +724,12 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 3, len);
if (ret < 0) {
trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
- rxrpc_tx_fail_conn_response);
+ rxrpc_tx_point_rxkad_response);
return -EAGAIN;
}
conn->params.peer->last_tx_at = ktime_get_real();
+ trace_rxrpc_tx_packet(0, &whdr, rxrpc_tx_point_rxkad_response);
_leave(" = 0");
return 0;
}
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 7af246764a35..e95741388311 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -1,6 +1,6 @@
#
# Traffic control configuration.
-#
+#
menuconfig NET_SCHED
bool "QoS and/or fair queueing"
@@ -251,6 +251,19 @@ config NET_SCH_MQPRIO
If unsure, say N.
+config NET_SCH_SKBPRIO
+ tristate "SKB priority queue scheduler (SKBPRIO)"
+ help
+ Say Y here if you want to use the SKB priority queue
+ scheduler. This schedules packets according to skb->priority,
+ which is useful for request packets in DoS mitigation systems such
+ as Gatekeeper.
+
+ To compile this driver as a module, choose M here: the module will
+ be called sch_skbprio.
+
+ If unsure, say N.
+
config NET_SCH_CHOKE
tristate "CHOose and Keep responsive flow scheduler (CHOKE)"
help
@@ -706,7 +719,7 @@ config NET_CLS_ACT
config NET_ACT_POLICE
tristate "Traffic Policing"
- depends on NET_CLS_ACT
+ depends on NET_CLS_ACT
---help---
Say Y here if you want to do traffic policing, i.e. strict
bandwidth limiting. This action replaces the existing policing
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 673ee7d26ff2..f0403f49edcb 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
-obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
+obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_SCH_SFB) += sch_sfb.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
@@ -46,6 +46,7 @@ obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o
obj-$(CONFIG_NET_SCH_DRR) += sch_drr.o
obj-$(CONFIG_NET_SCH_PLUG) += sch_plug.o
obj-$(CONFIG_NET_SCH_MQPRIO) += sch_mqprio.o
+obj-$(CONFIG_NET_SCH_SKBPRIO) += sch_skbprio.o
obj-$(CONFIG_NET_SCH_CHOKE) += sch_choke.o
obj-$(CONFIG_NET_SCH_QFQ) += sch_qfq.o
obj-$(CONFIG_NET_SCH_CODEL) += sch_codel.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 148a89ab789b..229d63c99be2 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,7 +36,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
if (!tp)
return -EINVAL;
- a->goto_chain = tcf_chain_get(tp->chain->block, chain_index, true);
+ a->goto_chain = tcf_chain_get_by_act(tp->chain->block, chain_index);
if (!a->goto_chain)
return -ENOMEM;
return 0;
@@ -44,7 +44,7 @@ static int tcf_action_goto_chain_init(struct tc_action *a, struct tcf_proto *tp)
static void tcf_action_goto_chain_fini(struct tc_action *a)
{
- tcf_chain_put(a->goto_chain);
+ tcf_chain_put_by_act(a->goto_chain);
}
static void tcf_action_goto_chain_exec(const struct tc_action *a,
@@ -786,6 +786,15 @@ static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb)
return c;
}
+static bool tcf_action_valid(int action)
+{
+ int opcode = TC_ACT_EXT_OPCODE(action);
+
+ if (!opcode)
+ return action <= TC_ACT_VALUE_MAX;
+ return opcode <= TC_ACT_EXT_OPCODE_MAX || action == TC_ACT_UNSPEC;
+}
+
struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
struct nlattr *nla, struct nlattr *est,
char *name, int ovr, int bind,
@@ -895,6 +904,11 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
}
}
+ if (!tcf_action_valid(a->tcfa_action)) {
+ NL_SET_ERR_MSG(extack, "invalid action value, using TC_ACT_UNSPEC instead");
+ a->tcfa_action = TC_ACT_UNSPEC;
+ }
+
return a;
err_mod:
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 06f743d8ed41..6203eb075c9a 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -196,12 +196,10 @@ static int tcf_bpf_init_from_ops(struct nlattr **tb, struct tcf_bpf_cfg *cfg)
if (bpf_size != nla_len(tb[TCA_ACT_BPF_OPS]))
return -EINVAL;
- bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ bpf_ops = kmemdup(nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size, GFP_KERNEL);
if (bpf_ops == NULL)
return -ENOMEM;
- memcpy(bpf_ops, nla_data(tb[TCA_ACT_BPF_OPS]), bpf_size);
-
fprog_tmp.len = bpf_num_ops;
fprog_tmp.filter = bpf_ops;
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index 1e31f0e448e2..2f9bc833d046 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -252,4 +252,3 @@ module_exit(connmark_cleanup_module);
MODULE_AUTHOR("Felix Fietkau <nbd@openwrt.org>");
MODULE_DESCRIPTION("Connection tracking mark restoring");
MODULE_LICENSE("GPL");
-
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index bd232d3bd022..648a3a35b720 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -97,7 +97,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla,
}
params_old = rtnl_dereference(p->params);
- params_new->action = parm->action;
+ p->tcf_action = parm->action;
params_new->update_flags = parm->update_flags;
rcu_assign_pointer(p->params, params_new);
if (params_old)
@@ -561,15 +561,14 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
u32 update_flags;
int action;
- rcu_read_lock();
- params = rcu_dereference(p->params);
+ params = rcu_dereference_bh(p->params);
tcf_lastuse_update(&p->tcf_tm);
bstats_cpu_update(this_cpu_ptr(p->common.cpu_bstats), skb);
- action = params->action;
+ action = READ_ONCE(p->tcf_action);
if (unlikely(action == TC_ACT_SHOT))
- goto drop_stats;
+ goto drop;
update_flags = params->update_flags;
switch (tc_skb_protocol(skb)) {
@@ -583,16 +582,11 @@ static int tcf_csum(struct sk_buff *skb, const struct tc_action *a,
break;
}
-unlock:
- rcu_read_unlock();
return action;
drop:
- action = TC_ACT_SHOT;
-
-drop_stats:
qstats_drop_inc(this_cpu_ptr(p->common.cpu_qstats));
- goto unlock;
+ return TC_ACT_SHOT;
}
static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
@@ -605,11 +599,11 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tc_action *a, int bind,
.index = p->tcf_index,
.refcnt = refcount_read(&p->tcf_refcnt) - ref,
.bindcnt = atomic_read(&p->tcf_bindcnt) - bind,
+ .action = p->tcf_action,
};
struct tcf_t t;
params = rtnl_dereference(p->params);
- opt.action = params->action;
opt.update_flags = params->update_flags;
if (nla_put(skb, TCA_CSUM_PARMS, sizeof(opt), &opt))
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
index 3d6e265758c0..df4060e32d43 100644
--- a/net/sched/act_ife.c
+++ b/net/sched/act_ife.c
@@ -820,14 +820,11 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_ife_params *p;
int ret;
- rcu_read_lock();
- p = rcu_dereference(ife->params);
+ p = rcu_dereference_bh(ife->params);
if (p->flags & IFE_ENCODE) {
ret = tcf_ife_encode(skb, a, res, p);
- rcu_read_unlock();
return ret;
}
- rcu_read_unlock();
return tcf_ife_decode(skb, a, res);
}
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 6afd89a36c69..b26d060da08e 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -25,6 +25,7 @@
#include <net/net_namespace.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
#include <linux/tc_act/tc_mirred.h>
#include <net/tc_act/tc_mirred.h>
@@ -49,6 +50,18 @@ static bool tcf_mirred_act_wants_ingress(int action)
}
}
+static bool tcf_mirred_can_reinsert(int action)
+{
+ switch (action) {
+ case TC_ACT_SHOT:
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ case TC_ACT_TRAP:
+ return true;
+ }
+ return false;
+}
+
static void tcf_mirred_release(struct tc_action *a)
{
struct tcf_mirred *m = to_mirred(a);
@@ -171,21 +184,23 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
struct tcf_mirred *m = to_mirred(a);
+ struct sk_buff *skb2 = skb;
bool m_mac_header_xmit;
struct net_device *dev;
- struct sk_buff *skb2;
int retval, err = 0;
+ bool use_reinsert;
+ bool want_ingress;
+ bool is_redirect;
int m_eaction;
int mac_len;
tcf_lastuse_update(&m->tcf_tm);
bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb);
- rcu_read_lock();
m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit);
m_eaction = READ_ONCE(m->tcfm_eaction);
retval = READ_ONCE(m->tcf_action);
- dev = rcu_dereference(m->tcfm_dev);
+ dev = rcu_dereference_bh(m->tcfm_dev);
if (unlikely(!dev)) {
pr_notice_once("tc mirred: target device is gone\n");
goto out;
@@ -197,16 +212,25 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
goto out;
}
- skb2 = skb_clone(skb, GFP_ATOMIC);
- if (!skb2)
- goto out;
+ /* we could easily avoid the clone only if called by ingress and clsact;
+ * since we can't easily detect the clsact caller, skip clone only for
+ * ingress - that covers the TC S/W datapath.
+ */
+ is_redirect = tcf_mirred_is_act_redirect(m_eaction);
+ use_reinsert = skb_at_tc_ingress(skb) && is_redirect &&
+ tcf_mirred_can_reinsert(retval);
+ if (!use_reinsert) {
+ skb2 = skb_clone(skb, GFP_ATOMIC);
+ if (!skb2)
+ goto out;
+ }
/* If action's target direction differs than filter's direction,
* and devices expect a mac header on xmit, then mac push/pull is
* needed.
*/
- if (skb_at_tc_ingress(skb) != tcf_mirred_act_wants_ingress(m_eaction) &&
- m_mac_header_xmit) {
+ want_ingress = tcf_mirred_act_wants_ingress(m_eaction);
+ if (skb_at_tc_ingress(skb) != want_ingress && m_mac_header_xmit) {
if (!skb_at_tc_ingress(skb)) {
/* caught at egress, act ingress: pull mac */
mac_len = skb_network_header(skb) - skb_mac_header(skb);
@@ -217,15 +241,23 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
}
}
+ skb2->skb_iif = skb->dev->ifindex;
+ skb2->dev = dev;
+
/* mirror is always swallowed */
- if (tcf_mirred_is_act_redirect(m_eaction)) {
+ if (is_redirect) {
skb2->tc_redirected = 1;
skb2->tc_from_ingress = skb2->tc_at_ingress;
+
+ /* let's the caller reinsert the packet, if possible */
+ if (use_reinsert) {
+ res->ingress = want_ingress;
+ res->qstats = this_cpu_ptr(m->common.cpu_qstats);
+ return TC_ACT_REINSERT;
+ }
}
- skb2->skb_iif = skb->dev->ifindex;
- skb2->dev = dev;
- if (!tcf_mirred_act_wants_ingress(m_eaction))
+ if (!want_ingress)
err = dev_queue_xmit(skb2);
else
err = netif_receive_skb(skb2);
@@ -236,7 +268,6 @@ out:
if (tcf_mirred_is_act_redirect(m_eaction))
retval = TC_ACT_SHOT;
}
- rcu_read_unlock();
return retval;
}
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index cc8ffcd1ddb5..43ba999b2d23 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -286,7 +286,7 @@ static int pedit_skb_hdr_offset(struct sk_buff *skb,
default:
ret = -EINVAL;
break;
- };
+ }
return ret;
}
@@ -516,4 +516,3 @@ static void __exit pedit_cleanup_module(void)
module_init(pedit_init_module);
module_exit(pedit_cleanup_module);
-
diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c
index 3079e7be5bde..2608ccc83e5e 100644
--- a/net/sched/act_sample.c
+++ b/net/sched/act_sample.c
@@ -140,8 +140,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb);
retval = READ_ONCE(s->tcf_action);
- rcu_read_lock();
- psample_group = rcu_dereference(s->psample_group);
+ psample_group = rcu_dereference_bh(s->psample_group);
/* randomly sample packets according to rate */
if (psample_group && (prandom_u32() % s->rate == 0)) {
@@ -165,7 +164,6 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a,
skb_pull(skb, skb->mac_len);
}
- rcu_read_unlock();
return retval;
}
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index da56e6938c9e..a6db47ebec11 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -43,8 +43,7 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
tcf_lastuse_update(&d->tcf_tm);
bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb);
- rcu_read_lock();
- params = rcu_dereference(d->params);
+ params = rcu_dereference_bh(d->params);
action = READ_ONCE(d->tcf_action);
if (params->flags & SKBEDIT_F_PRIORITY)
@@ -77,14 +76,11 @@ static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
}
if (params->flags & SKBEDIT_F_PTYPE)
skb->pkt_type = params->ptype;
-
-unlock:
- rcu_read_unlock();
return action;
+
err:
qstats_drop_inc(this_cpu_ptr(d->common.cpu_qstats));
- action = TC_ACT_SHOT;
- goto unlock;
+ return TC_ACT_SHOT;
}
static const struct nla_policy skbedit_policy[TCA_SKBEDIT_MAX + 1] = {
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index cdc6bacfb190..c437c6d51a71 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -41,20 +41,14 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
* then MAX_EDIT_LEN needs to change appropriately
*/
err = skb_ensure_writable(skb, MAX_EDIT_LEN);
- if (unlikely(err)) { /* best policy is to drop on the floor */
- qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
- return TC_ACT_SHOT;
- }
+ if (unlikely(err)) /* best policy is to drop on the floor */
+ goto drop;
- rcu_read_lock();
action = READ_ONCE(d->tcf_action);
- if (unlikely(action == TC_ACT_SHOT)) {
- qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
- rcu_read_unlock();
- return action;
- }
+ if (unlikely(action == TC_ACT_SHOT))
+ goto drop;
- p = rcu_dereference(d->skbmod_p);
+ p = rcu_dereference_bh(d->skbmod_p);
flags = p->flags;
if (flags & SKBMOD_F_DMAC)
ether_addr_copy(eth_hdr(skb)->h_dest, p->eth_dst);
@@ -62,7 +56,6 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
ether_addr_copy(eth_hdr(skb)->h_source, p->eth_src);
if (flags & SKBMOD_F_ETYPE)
eth_hdr(skb)->h_proto = p->eth_type;
- rcu_read_unlock();
if (flags & SKBMOD_F_SWAPMAC) {
u16 tmpaddr[ETH_ALEN / 2]; /* ether_addr_copy() requirement */
@@ -73,6 +66,10 @@ static int tcf_skbmod_run(struct sk_buff *skb, const struct tc_action *a,
}
return action;
+
+drop:
+ qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
+ return TC_ACT_SHOT;
}
static const struct nla_policy skbmod_policy[TCA_SKBMOD_MAX + 1] = {
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 3ec585d58762..d42d9e112789 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -31,13 +31,11 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
struct tcf_tunnel_key_params *params;
int action;
- rcu_read_lock();
-
- params = rcu_dereference(t->params);
+ params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
bstats_cpu_update(this_cpu_ptr(t->common.cpu_bstats), skb);
- action = params->action;
+ action = READ_ONCE(t->tcf_action);
switch (params->tcft_action) {
case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -53,8 +51,6 @@ static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a,
break;
}
- rcu_read_unlock();
-
return action;
}
@@ -197,6 +193,8 @@ static const struct nla_policy tunnel_key_policy[TCA_TUNNEL_KEY_MAX + 1] = {
[TCA_TUNNEL_KEY_ENC_DST_PORT] = {.type = NLA_U16},
[TCA_TUNNEL_KEY_NO_CSUM] = { .type = NLA_U8 },
[TCA_TUNNEL_KEY_ENC_OPTS] = { .type = NLA_NESTED },
+ [TCA_TUNNEL_KEY_ENC_TOS] = { .type = NLA_U8 },
+ [TCA_TUNNEL_KEY_ENC_TTL] = { .type = NLA_U8 },
};
static int tunnel_key_init(struct net *net, struct nlattr *nla,
@@ -216,6 +214,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
int opts_len = 0;
__be64 key_id;
__be16 flags;
+ u8 tos, ttl;
int ret = 0;
int err;
@@ -273,6 +272,13 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
}
}
+ tos = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TOS])
+ tos = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TOS]);
+ ttl = 0;
+ if (tb[TCA_TUNNEL_KEY_ENC_TTL])
+ ttl = nla_get_u8(tb[TCA_TUNNEL_KEY_ENC_TTL]);
+
if (tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC] &&
tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]) {
__be32 saddr;
@@ -281,7 +287,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
saddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_SRC]);
daddr = nla_get_in_addr(tb[TCA_TUNNEL_KEY_ENC_IPV4_DST]);
- metadata = __ip_tun_set_dst(saddr, daddr, 0, 0,
+ metadata = __ip_tun_set_dst(saddr, daddr, tos, ttl,
dst_port, flags,
key_id, opts_len);
} else if (tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC] &&
@@ -292,7 +298,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
saddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_SRC]);
daddr = nla_get_in6_addr(tb[TCA_TUNNEL_KEY_ENC_IPV6_DST]);
- metadata = __ipv6_tun_set_dst(&saddr, &daddr, 0, 0, dst_port,
+ metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port,
0, flags,
key_id, 0);
} else {
@@ -350,7 +356,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
params_old = rtnl_dereference(t->params);
- params_new->action = parm->action;
+ t->tcf_action = parm->action;
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
@@ -479,13 +485,13 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
.index = t->tcf_index,
.refcnt = refcount_read(&t->tcf_refcnt) - ref,
.bindcnt = atomic_read(&t->tcf_bindcnt) - bind,
+ .action = t->tcf_action,
};
struct tcf_t tm;
params = rtnl_dereference(t->params);
opt.t_action = params->tcft_action;
- opt.action = params->action;
if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -504,6 +510,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
!(key->tun_flags & TUNNEL_CSUM)) ||
tunnel_key_opts_dump(skb, info))
goto nla_put_failure;
+
+ if (key->tos && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TOS, key->tos))
+ goto nla_put_failure;
+
+ if (key->ttl && nla_put_u8(skb, TCA_TUNNEL_KEY_ENC_TTL, key->ttl))
+ goto nla_put_failure;
}
tcf_tm_dump(&tm, &t->tcf_tm);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index ad37f308175a..15a0ee214c9c 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -40,11 +40,9 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
if (skb_at_tc_ingress(skb))
skb_push_rcsum(skb, skb->mac_len);
- rcu_read_lock();
-
action = READ_ONCE(v->tcf_action);
- p = rcu_dereference(v->vlan_p);
+ p = rcu_dereference_bh(v->vlan_p);
switch (p->tcfv_action) {
case TCA_VLAN_ACT_POP:
@@ -61,7 +59,7 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
case TCA_VLAN_ACT_MODIFY:
/* No-op if no vlan tag (either hw-accel or in-payload) */
if (!skb_vlan_tagged(skb))
- goto unlock;
+ goto out;
/* extract existing tag (and guarantee no hw-accel tag) */
if (skb_vlan_tag_present(skb)) {
tci = skb_vlan_tag_get(skb);
@@ -86,18 +84,15 @@ static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
BUG();
}
- goto unlock;
-
-drop:
- action = TC_ACT_SHOT;
- qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
-
-unlock:
- rcu_read_unlock();
+out:
if (skb_at_tc_ingress(skb))
skb_pull_rcsum(skb, skb->mac_len);
return action;
+
+drop:
+ qstats_drop_inc(this_cpu_ptr(v->common.cpu_qstats));
+ return TC_ACT_SHOT;
}
static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = {
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 73d9967c3739..194c2e0b2737 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -39,7 +39,7 @@ static DEFINE_RWLOCK(cls_mod_lock);
/* Find classifier type by string name */
-static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
+static const struct tcf_proto_ops *__tcf_proto_lookup_ops(const char *kind)
{
const struct tcf_proto_ops *t, *res = NULL;
@@ -57,6 +57,33 @@ static const struct tcf_proto_ops *tcf_proto_lookup_ops(const char *kind)
return res;
}
+static const struct tcf_proto_ops *
+tcf_proto_lookup_ops(const char *kind, struct netlink_ext_ack *extack)
+{
+ const struct tcf_proto_ops *ops;
+
+ ops = __tcf_proto_lookup_ops(kind);
+ if (ops)
+ return ops;
+#ifdef CONFIG_MODULES
+ rtnl_unlock();
+ request_module("cls_%s", kind);
+ rtnl_lock();
+ ops = __tcf_proto_lookup_ops(kind);
+ /* We dropped the RTNL semaphore in order to perform
+ * the module load. So, even if we succeeded in loading
+ * the module we have to replay the request. We indicate
+ * this using -EAGAIN.
+ */
+ if (ops) {
+ module_put(ops->owner);
+ return ERR_PTR(-EAGAIN);
+ }
+#endif
+ NL_SET_ERR_MSG(extack, "TC classifier not found");
+ return ERR_PTR(-ENOENT);
+}
+
/* Register(unregister) new classifier type */
int register_tcf_proto_ops(struct tcf_proto_ops *ops)
@@ -133,27 +160,9 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol,
if (!tp)
return ERR_PTR(-ENOBUFS);
- err = -ENOENT;
- tp->ops = tcf_proto_lookup_ops(kind);
- if (!tp->ops) {
-#ifdef CONFIG_MODULES
- rtnl_unlock();
- request_module("cls_%s", kind);
- rtnl_lock();
- tp->ops = tcf_proto_lookup_ops(kind);
- /* We dropped the RTNL semaphore in order to perform
- * the module load. So, even if we succeeded in loading
- * the module we have to replay the request. We indicate
- * this using -EAGAIN.
- */
- if (tp->ops) {
- module_put(tp->ops->owner);
- err = -EAGAIN;
- } else {
- NL_SET_ERR_MSG(extack, "TC classifier not found");
- err = -ENOENT;
- }
-#endif
+ tp->ops = tcf_proto_lookup_ops(kind, extack);
+ if (IS_ERR(tp->ops)) {
+ err = PTR_ERR(tp->ops);
goto errout;
}
tp->classify = tp->ops->classify;
@@ -195,11 +204,12 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block,
chain = kzalloc(sizeof(*chain), GFP_KERNEL);
if (!chain)
return NULL;
- INIT_LIST_HEAD(&chain->filter_chain_list);
list_add_tail(&chain->list, &block->chain_list);
chain->block = block;
chain->index = chain_index;
chain->refcnt = 1;
+ if (!chain->index)
+ block->chain0.chain = chain;
return chain;
}
@@ -209,35 +219,28 @@ static void tcf_chain_head_change_item(struct tcf_filter_chain_list_item *item,
if (item->chain_head_change)
item->chain_head_change(tp_head, item->chain_head_change_priv);
}
-static void tcf_chain_head_change(struct tcf_chain *chain,
- struct tcf_proto *tp_head)
+
+static void tcf_chain0_head_change(struct tcf_chain *chain,
+ struct tcf_proto *tp_head)
{
struct tcf_filter_chain_list_item *item;
+ struct tcf_block *block = chain->block;
- list_for_each_entry(item, &chain->filter_chain_list, list)
+ if (chain->index)
+ return;
+ list_for_each_entry(item, &block->chain0.filter_chain_list, list)
tcf_chain_head_change_item(item, tp_head);
}
-static void tcf_chain_flush(struct tcf_chain *chain)
-{
- struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
-
- tcf_chain_head_change(chain, NULL);
- while (tp) {
- RCU_INIT_POINTER(chain->filter_chain, tp->next);
- tcf_proto_destroy(tp, NULL);
- tp = rtnl_dereference(chain->filter_chain);
- tcf_chain_put(chain);
- }
-}
-
static void tcf_chain_destroy(struct tcf_chain *chain)
{
struct tcf_block *block = chain->block;
list_del(&chain->list);
+ if (!chain->index)
+ block->chain0.chain = NULL;
kfree(chain);
- if (list_empty(&block->chain_list))
+ if (list_empty(&block->chain_list) && block->refcnt == 0)
kfree(block);
}
@@ -246,28 +249,119 @@ static void tcf_chain_hold(struct tcf_chain *chain)
++chain->refcnt;
}
-struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
- bool create)
+static bool tcf_chain_held_by_acts_only(struct tcf_chain *chain)
+{
+ /* In case all the references are action references, this
+ * chain should not be shown to the user.
+ */
+ return chain->refcnt == chain->action_refcnt;
+}
+
+static struct tcf_chain *tcf_chain_lookup(struct tcf_block *block,
+ u32 chain_index)
{
struct tcf_chain *chain;
list_for_each_entry(chain, &block->chain_list, list) {
- if (chain->index == chain_index) {
- tcf_chain_hold(chain);
+ if (chain->index == chain_index)
return chain;
- }
}
+ return NULL;
+}
- return create ? tcf_chain_create(block, chain_index) : NULL;
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+ u32 seq, u16 flags, int event, bool unicast);
+
+static struct tcf_chain *__tcf_chain_get(struct tcf_block *block,
+ u32 chain_index, bool create,
+ bool by_act)
+{
+ struct tcf_chain *chain = tcf_chain_lookup(block, chain_index);
+
+ if (chain) {
+ tcf_chain_hold(chain);
+ } else {
+ if (!create)
+ return NULL;
+ chain = tcf_chain_create(block, chain_index);
+ if (!chain)
+ return NULL;
+ }
+
+ if (by_act)
+ ++chain->action_refcnt;
+
+ /* Send notification only in case we got the first
+ * non-action reference. Until then, the chain acts only as
+ * a placeholder for actions pointing to it and user ought
+ * not know about them.
+ */
+ if (chain->refcnt - chain->action_refcnt == 1 && !by_act)
+ tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+ RTM_NEWCHAIN, false);
+
+ return chain;
+}
+
+static struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
+ bool create)
+{
+ return __tcf_chain_get(block, chain_index, create, false);
}
-EXPORT_SYMBOL(tcf_chain_get);
-void tcf_chain_put(struct tcf_chain *chain)
+struct tcf_chain *tcf_chain_get_by_act(struct tcf_block *block, u32 chain_index)
+{
+ return __tcf_chain_get(block, chain_index, true, true);
+}
+EXPORT_SYMBOL(tcf_chain_get_by_act);
+
+static void tc_chain_tmplt_del(struct tcf_chain *chain);
+
+static void __tcf_chain_put(struct tcf_chain *chain, bool by_act)
{
- if (--chain->refcnt == 0)
+ if (by_act)
+ chain->action_refcnt--;
+ chain->refcnt--;
+
+ /* The last dropped non-action reference will trigger notification. */
+ if (chain->refcnt - chain->action_refcnt == 0 && !by_act)
+ tc_chain_notify(chain, NULL, 0, 0, RTM_DELCHAIN, false);
+
+ if (chain->refcnt == 0) {
+ tc_chain_tmplt_del(chain);
tcf_chain_destroy(chain);
+ }
+}
+
+static void tcf_chain_put(struct tcf_chain *chain)
+{
+ __tcf_chain_put(chain, false);
+}
+
+void tcf_chain_put_by_act(struct tcf_chain *chain)
+{
+ __tcf_chain_put(chain, true);
+}
+EXPORT_SYMBOL(tcf_chain_put_by_act);
+
+static void tcf_chain_put_explicitly_created(struct tcf_chain *chain)
+{
+ if (chain->explicitly_created)
+ tcf_chain_put(chain);
+}
+
+static void tcf_chain_flush(struct tcf_chain *chain)
+{
+ struct tcf_proto *tp = rtnl_dereference(chain->filter_chain);
+
+ tcf_chain0_head_change(chain, NULL);
+ while (tp) {
+ RCU_INIT_POINTER(chain->filter_chain, tp->next);
+ tcf_proto_destroy(tp, NULL);
+ tp = rtnl_dereference(chain->filter_chain);
+ tcf_chain_put(chain);
+ }
}
-EXPORT_SYMBOL(tcf_chain_put);
static bool tcf_block_offload_in_use(struct tcf_block *block)
{
@@ -337,10 +431,11 @@ no_offload_dev_dec:
}
static int
-tcf_chain_head_change_cb_add(struct tcf_chain *chain,
- struct tcf_block_ext_info *ei,
- struct netlink_ext_ack *extack)
+tcf_chain0_head_change_cb_add(struct tcf_block *block,
+ struct tcf_block_ext_info *ei,
+ struct netlink_ext_ack *extack)
{
+ struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
item = kmalloc(sizeof(*item), GFP_KERNEL);
@@ -350,23 +445,25 @@ tcf_chain_head_change_cb_add(struct tcf_chain *chain,
}
item->chain_head_change = ei->chain_head_change;
item->chain_head_change_priv = ei->chain_head_change_priv;
- if (chain->filter_chain)
- tcf_chain_head_change_item(item, chain->filter_chain);
- list_add(&item->list, &chain->filter_chain_list);
+ if (chain0 && chain0->filter_chain)
+ tcf_chain_head_change_item(item, chain0->filter_chain);
+ list_add(&item->list, &block->chain0.filter_chain_list);
return 0;
}
static void
-tcf_chain_head_change_cb_del(struct tcf_chain *chain,
- struct tcf_block_ext_info *ei)
+tcf_chain0_head_change_cb_del(struct tcf_block *block,
+ struct tcf_block_ext_info *ei)
{
+ struct tcf_chain *chain0 = block->chain0.chain;
struct tcf_filter_chain_list_item *item;
- list_for_each_entry(item, &chain->filter_chain_list, list) {
+ list_for_each_entry(item, &block->chain0.filter_chain_list, list) {
if ((!ei->chain_head_change && !ei->chain_head_change_priv) ||
(item->chain_head_change == ei->chain_head_change &&
item->chain_head_change_priv == ei->chain_head_change_priv)) {
- tcf_chain_head_change_item(item, NULL);
+ if (chain0)
+ tcf_chain_head_change_item(item, NULL);
list_del(&item->list);
kfree(item);
return;
@@ -402,8 +499,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
struct netlink_ext_ack *extack)
{
struct tcf_block *block;
- struct tcf_chain *chain;
- int err;
block = kzalloc(sizeof(*block), GFP_KERNEL);
if (!block) {
@@ -413,14 +508,8 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
INIT_LIST_HEAD(&block->chain_list);
INIT_LIST_HEAD(&block->cb_list);
INIT_LIST_HEAD(&block->owner_list);
+ INIT_LIST_HEAD(&block->chain0.filter_chain_list);
- /* Create chain 0 by default, it has to be always present. */
- chain = tcf_chain_create(block, 0);
- if (!chain) {
- NL_SET_ERR_MSG(extack, "Failed to create new tcf chain");
- err = -ENOMEM;
- goto err_chain_create;
- }
block->refcnt = 1;
block->net = net;
block->index = block_index;
@@ -429,10 +518,6 @@ static struct tcf_block *tcf_block_create(struct net *net, struct Qdisc *q,
if (!tcf_block_shared(block))
block->q = q;
return block;
-
-err_chain_create:
- kfree(block);
- return ERR_PTR(err);
}
static struct tcf_block *tcf_block_lookup(struct net *net, u32 block_index)
@@ -514,11 +599,6 @@ static struct tcf_block *tcf_block_find(struct net *net, struct Qdisc **q,
return block;
}
-static struct tcf_chain *tcf_block_chain_zero(struct tcf_block *block)
-{
- return list_first_entry(&block->chain_list, struct tcf_chain, list);
-}
-
struct tcf_block_owner_item {
struct list_head list;
struct Qdisc *q;
@@ -612,10 +692,9 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
tcf_block_owner_netif_keep_dst(block, q, ei->binder_type);
- err = tcf_chain_head_change_cb_add(tcf_block_chain_zero(block),
- ei, extack);
+ err = tcf_chain0_head_change_cb_add(block, ei, extack);
if (err)
- goto err_chain_head_change_cb_add;
+ goto err_chain0_head_change_cb_add;
err = tcf_block_offload_bind(block, q, ei, extack);
if (err)
@@ -625,15 +704,14 @@ int tcf_block_get_ext(struct tcf_block **p_block, struct Qdisc *q,
return 0;
err_block_offload_bind:
- tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
-err_chain_head_change_cb_add:
+ tcf_chain0_head_change_cb_del(block, ei);
+err_chain0_head_change_cb_add:
tcf_block_owner_del(block, q, ei->binder_type);
err_block_owner_add:
if (created) {
if (tcf_block_shared(block))
tcf_block_remove(block, net);
err_block_insert:
- kfree(tcf_block_chain_zero(block));
kfree(block);
} else {
block->refcnt--;
@@ -673,10 +751,10 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
if (!block)
return;
- tcf_chain_head_change_cb_del(tcf_block_chain_zero(block), ei);
+ tcf_chain0_head_change_cb_del(block, ei);
tcf_block_owner_del(block, q, ei->binder_type);
- if (--block->refcnt == 0) {
+ if (block->refcnt == 1) {
if (tcf_block_shared(block))
tcf_block_remove(block, block->net);
@@ -692,13 +770,16 @@ void tcf_block_put_ext(struct tcf_block *block, struct Qdisc *q,
tcf_block_offload_unbind(block, q, ei);
- if (block->refcnt == 0) {
+ if (block->refcnt == 1) {
/* At this point, all the chains should have refcnt >= 1. */
- list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) {
+ tcf_chain_put_explicitly_created(chain);
tcf_chain_put(chain);
+ }
- /* Finally, put chain 0 and allow block to be freed. */
- tcf_chain_put(tcf_block_chain_zero(block));
+ block->refcnt--;
+ if (list_empty(&block->chain_list))
+ kfree(block);
}
}
EXPORT_SYMBOL(tcf_block_put_ext);
@@ -818,7 +899,7 @@ int tcf_block_cb_register(struct tcf_block *block,
block_cb = __tcf_block_cb_register(block, cb, cb_ident, cb_priv,
extack);
- return IS_ERR(block_cb) ? PTR_ERR(block_cb) : 0;
+ return PTR_ERR_OR_ZERO(block_cb);
}
EXPORT_SYMBOL(tcf_block_cb_register);
@@ -938,7 +1019,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain,
struct tcf_proto *tp)
{
if (*chain_info->pprev == chain->filter_chain)
- tcf_chain_head_change(chain, tp);
+ tcf_chain0_head_change(chain, tp);
RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info));
rcu_assign_pointer(*chain_info->pprev, tp);
tcf_chain_hold(chain);
@@ -951,7 +1032,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain,
struct tcf_proto *next = rtnl_dereference(chain_info->next);
if (tp == chain->filter_chain)
- tcf_chain_head_change(chain, next);
+ tcf_chain0_head_change(chain, next);
RCU_INIT_POINTER(*chain_info->pprev, next);
tcf_chain_put(chain);
}
@@ -1098,7 +1179,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
for (tp = rtnl_dereference(chain->filter_chain);
tp; tp = rtnl_dereference(tp->next))
tfilter_notify(net, oskb, n, tp, block,
- q, parent, 0, event, false);
+ q, parent, NULL, event, false);
}
static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
@@ -1227,6 +1308,12 @@ replay:
goto errout;
}
+ if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) {
+ NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind");
+ err = -EINVAL;
+ goto errout;
+ }
+
err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh,
n->nlmsg_flags & NLM_F_CREATE ? TCA_ACT_NOREPLACE : TCA_ACT_REPLACE,
extack);
@@ -1302,6 +1389,13 @@ static int tc_del_tfilter(struct sk_buff *skb, struct nlmsghdr *n,
}
chain = tcf_chain_get(block, chain_index, false);
if (!chain) {
+ /* User requested flush on non-existent chain. Nothing to do,
+ * so just return success.
+ */
+ if (prio == 0) {
+ err = 0;
+ goto errout;
+ }
NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
err = -EINVAL;
goto errout;
@@ -1489,7 +1583,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
memset(&cb->args[1], 0,
sizeof(cb->args) - sizeof(cb->args[0]));
if (cb->args[1] == 0) {
- if (tcf_fill_node(net, skb, tp, block, q, parent, 0,
+ if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER) <= 0)
@@ -1508,7 +1602,9 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
arg.w.stop = 0;
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
+ arg.w.cookie = cb->args[2];
tp->ops->walk(tp, &arg.w);
+ cb->args[2] = arg.w.cookie;
cb->args[1] = arg.w.count + 1;
if (arg.w.stop)
return false;
@@ -1606,6 +1702,330 @@ out:
return skb->len;
}
+static int tc_chain_fill_node(struct tcf_chain *chain, struct net *net,
+ struct sk_buff *skb, struct tcf_block *block,
+ u32 portid, u32 seq, u16 flags, int event)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ const struct tcf_proto_ops *ops;
+ struct nlmsghdr *nlh;
+ struct tcmsg *tcm;
+ void *priv;
+
+ ops = chain->tmplt_ops;
+ priv = chain->tmplt_priv;
+
+ nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
+ if (!nlh)
+ goto out_nlmsg_trim;
+ tcm = nlmsg_data(nlh);
+ tcm->tcm_family = AF_UNSPEC;
+ tcm->tcm__pad1 = 0;
+ tcm->tcm__pad2 = 0;
+ tcm->tcm_handle = 0;
+ if (block->q) {
+ tcm->tcm_ifindex = qdisc_dev(block->q)->ifindex;
+ tcm->tcm_parent = block->q->handle;
+ } else {
+ tcm->tcm_ifindex = TCM_IFINDEX_MAGIC_BLOCK;
+ tcm->tcm_block_index = block->index;
+ }
+
+ if (nla_put_u32(skb, TCA_CHAIN, chain->index))
+ goto nla_put_failure;
+
+ if (ops) {
+ if (nla_put_string(skb, TCA_KIND, ops->kind))
+ goto nla_put_failure;
+ if (ops->tmplt_dump(skb, net, priv) < 0)
+ goto nla_put_failure;
+ }
+
+ nlh->nlmsg_len = skb_tail_pointer(skb) - b;
+ return skb->len;
+
+out_nlmsg_trim:
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -EMSGSIZE;
+}
+
+static int tc_chain_notify(struct tcf_chain *chain, struct sk_buff *oskb,
+ u32 seq, u16 flags, int event, bool unicast)
+{
+ u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
+ struct tcf_block *block = chain->block;
+ struct net *net = block->net;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!skb)
+ return -ENOBUFS;
+
+ if (tc_chain_fill_node(chain, net, skb, block, portid,
+ seq, flags, event) <= 0) {
+ kfree_skb(skb);
+ return -EINVAL;
+ }
+
+ if (unicast)
+ return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
+ return rtnetlink_send(skb, net, portid, RTNLGRP_TC, flags & NLM_F_ECHO);
+}
+
+static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ const struct tcf_proto_ops *ops;
+ void *tmplt_priv;
+
+ /* If kind is not set, user did not specify template. */
+ if (!tca[TCA_KIND])
+ return 0;
+
+ ops = tcf_proto_lookup_ops(nla_data(tca[TCA_KIND]), extack);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
+ NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
+ return -EOPNOTSUPP;
+ }
+
+ tmplt_priv = ops->tmplt_create(net, chain, tca, extack);
+ if (IS_ERR(tmplt_priv)) {
+ module_put(ops->owner);
+ return PTR_ERR(tmplt_priv);
+ }
+ chain->tmplt_ops = ops;
+ chain->tmplt_priv = tmplt_priv;
+ return 0;
+}
+
+static void tc_chain_tmplt_del(struct tcf_chain *chain)
+{
+ const struct tcf_proto_ops *ops = chain->tmplt_ops;
+
+ /* If template ops are set, no work to do for us. */
+ if (!ops)
+ return;
+
+ ops->tmplt_destroy(chain->tmplt_priv);
+ module_put(ops->owner);
+}
+
+/* Add/delete/get a chain */
+
+static int tc_ctl_chain(struct sk_buff *skb, struct nlmsghdr *n,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct tcmsg *t;
+ u32 parent;
+ u32 chain_index;
+ struct Qdisc *q = NULL;
+ struct tcf_chain *chain = NULL;
+ struct tcf_block *block;
+ unsigned long cl;
+ int err;
+
+ if (n->nlmsg_type != RTM_GETCHAIN &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+replay:
+ err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack);
+ if (err < 0)
+ return err;
+
+ t = nlmsg_data(n);
+ parent = t->tcm_parent;
+ cl = 0;
+
+ block = tcf_block_find(net, &q, &parent, &cl,
+ t->tcm_ifindex, t->tcm_block_index, extack);
+ if (IS_ERR(block))
+ return PTR_ERR(block);
+
+ chain_index = tca[TCA_CHAIN] ? nla_get_u32(tca[TCA_CHAIN]) : 0;
+ if (chain_index > TC_ACT_EXT_VAL_MASK) {
+ NL_SET_ERR_MSG(extack, "Specified chain index exceeds upper limit");
+ return -EINVAL;
+ }
+ chain = tcf_chain_lookup(block, chain_index);
+ if (n->nlmsg_type == RTM_NEWCHAIN) {
+ if (chain) {
+ if (tcf_chain_held_by_acts_only(chain)) {
+ /* The chain exists only because there is
+ * some action referencing it.
+ */
+ tcf_chain_hold(chain);
+ } else {
+ NL_SET_ERR_MSG(extack, "Filter chain already exists");
+ return -EEXIST;
+ }
+ } else {
+ if (!(n->nlmsg_flags & NLM_F_CREATE)) {
+ NL_SET_ERR_MSG(extack, "Need both RTM_NEWCHAIN and NLM_F_CREATE to create a new chain");
+ return -ENOENT;
+ }
+ chain = tcf_chain_create(block, chain_index);
+ if (!chain) {
+ NL_SET_ERR_MSG(extack, "Failed to create filter chain");
+ return -ENOMEM;
+ }
+ }
+ } else {
+ if (!chain || tcf_chain_held_by_acts_only(chain)) {
+ NL_SET_ERR_MSG(extack, "Cannot find specified filter chain");
+ return -EINVAL;
+ }
+ tcf_chain_hold(chain);
+ }
+
+ switch (n->nlmsg_type) {
+ case RTM_NEWCHAIN:
+ err = tc_chain_tmplt_add(chain, net, tca, extack);
+ if (err)
+ goto errout;
+ /* In case the chain was successfully added, take a reference
+ * to the chain. This ensures that an empty chain
+ * does not disappear at the end of this function.
+ */
+ tcf_chain_hold(chain);
+ chain->explicitly_created = true;
+ tc_chain_notify(chain, NULL, 0, NLM_F_CREATE | NLM_F_EXCL,
+ RTM_NEWCHAIN, false);
+ break;
+ case RTM_DELCHAIN:
+ /* Flush the chain first as the user requested chain removal. */
+ tcf_chain_flush(chain);
+ /* In case the chain was successfully deleted, put a reference
+ * to the chain previously taken during addition.
+ */
+ tcf_chain_put_explicitly_created(chain);
+ chain->explicitly_created = false;
+ break;
+ case RTM_GETCHAIN:
+ err = tc_chain_notify(chain, skb, n->nlmsg_seq,
+ n->nlmsg_seq, n->nlmsg_type, true);
+ if (err < 0)
+ NL_SET_ERR_MSG(extack, "Failed to send chain notify message");
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ NL_SET_ERR_MSG(extack, "Unsupported message type");
+ goto errout;
+ }
+
+errout:
+ tcf_chain_put(chain);
+ if (err == -EAGAIN)
+ /* Replay the request. */
+ goto replay;
+ return err;
+}
+
+/* called with RTNL */
+static int tc_dump_chain(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct nlattr *tca[TCA_MAX + 1];
+ struct Qdisc *q = NULL;
+ struct tcf_block *block;
+ struct tcf_chain *chain;
+ struct tcmsg *tcm = nlmsg_data(cb->nlh);
+ long index_start;
+ long index;
+ u32 parent;
+ int err;
+
+ if (nlmsg_len(cb->nlh) < sizeof(*tcm))
+ return skb->len;
+
+ err = nlmsg_parse(cb->nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL);
+ if (err)
+ return err;
+
+ if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
+ block = tcf_block_lookup(net, tcm->tcm_block_index);
+ if (!block)
+ goto out;
+ /* If we work with block index, q is NULL and parent value
+ * will never be used in the following code. The check
+ * in tcf_fill_node prevents it. However, compiler does not
+ * see that far, so set parent to zero to silence the warning
+ * about parent being uninitialized.
+ */
+ parent = 0;
+ } else {
+ const struct Qdisc_class_ops *cops;
+ struct net_device *dev;
+ unsigned long cl = 0;
+
+ dev = __dev_get_by_index(net, tcm->tcm_ifindex);
+ if (!dev)
+ return skb->len;
+
+ parent = tcm->tcm_parent;
+ if (!parent) {
+ q = dev->qdisc;
+ parent = q->handle;
+ } else {
+ q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent));
+ }
+ if (!q)
+ goto out;
+ cops = q->ops->cl_ops;
+ if (!cops)
+ goto out;
+ if (!cops->tcf_block)
+ goto out;
+ if (TC_H_MIN(tcm->tcm_parent)) {
+ cl = cops->find(q, tcm->tcm_parent);
+ if (cl == 0)
+ goto out;
+ }
+ block = cops->tcf_block(q, cl, NULL);
+ if (!block)
+ goto out;
+ if (tcf_block_shared(block))
+ q = NULL;
+ }
+
+ index_start = cb->args[0];
+ index = 0;
+
+ list_for_each_entry(chain, &block->chain_list, list) {
+ if ((tca[TCA_CHAIN] &&
+ nla_get_u32(tca[TCA_CHAIN]) != chain->index))
+ continue;
+ if (index < index_start) {
+ index++;
+ continue;
+ }
+ if (tcf_chain_held_by_acts_only(chain))
+ continue;
+ err = tc_chain_fill_node(chain, net, skb, block,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ RTM_NEWCHAIN);
+ if (err <= 0)
+ break;
+ index++;
+ }
+
+ cb->args[0] = index;
+
+out:
+ /* If we did no progress, the error (EMSGSIZE) is real */
+ if (skb->len == 0 && err)
+ return err;
+ return skb->len;
+}
+
void tcf_exts_destroy(struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
@@ -1822,6 +2242,10 @@ static int __init tc_filter_init(void)
rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, 0);
rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_get_tfilter,
tc_dump_tfilter, 0);
+ rtnl_register(PF_UNSPEC, RTM_NEWCHAIN, tc_ctl_chain, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_DELCHAIN, tc_ctl_chain, NULL, 0);
+ rtnl_register(PF_UNSPEC, RTM_GETCHAIN, tc_ctl_chain,
+ tc_dump_chain, 0);
return 0;
diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c
index 95367f37098d..6a5dce8baf19 100644
--- a/net/sched/cls_basic.c
+++ b/net/sched/cls_basic.c
@@ -324,4 +324,3 @@ static void __exit exit_basic(void)
module_init(init_basic)
module_exit(exit_basic)
MODULE_LICENSE("GPL");
-
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 66e0ac9811f9..fa6fe2fe0f32 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -349,12 +349,10 @@ static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog)
if (bpf_size != nla_len(tb[TCA_BPF_OPS]))
return -EINVAL;
- bpf_ops = kzalloc(bpf_size, GFP_KERNEL);
+ bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL);
if (bpf_ops == NULL)
return -ENOMEM;
- memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size);
-
fprog_tmp.len = bpf_num_ops;
fprog_tmp.filter = bpf_ops;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 8b2474293db1..a3b69bb6f4b0 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -52,6 +52,7 @@ struct fl_flow_key {
struct flow_dissector_key_mpls mpls;
struct flow_dissector_key_tcp tcp;
struct flow_dissector_key_ip ip;
+ struct flow_dissector_key_ip enc_ip;
} __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */
struct fl_flow_mask_range {
@@ -71,6 +72,13 @@ struct fl_flow_mask {
struct list_head list;
};
+struct fl_flow_tmplt {
+ struct fl_flow_key dummy_key;
+ struct fl_flow_key mask;
+ struct flow_dissector dissector;
+ struct tcf_chain *chain;
+};
+
struct cls_fl_head {
struct rhashtable ht;
struct list_head masks;
@@ -146,6 +154,23 @@ static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key,
*lmkey++ = *lkey++ & *lmask++;
}
+static bool fl_mask_fits_tmplt(struct fl_flow_tmplt *tmplt,
+ struct fl_flow_mask *mask)
+{
+ const long *lmask = fl_key_get_start(&mask->key, mask);
+ const long *ltmplt;
+ int i;
+
+ if (!tmplt)
+ return true;
+ ltmplt = fl_key_get_start(&tmplt->mask, mask);
+ for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) {
+ if (~*ltmplt++ & *lmask++)
+ return false;
+ }
+ return true;
+}
+
static void fl_clear_masked_range(struct fl_flow_key *key,
struct fl_flow_mask *mask)
{
@@ -453,6 +478,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = {
[TCA_FLOWER_KEY_CVLAN_ID] = { .type = NLA_U16 },
[TCA_FLOWER_KEY_CVLAN_PRIO] = { .type = NLA_U8 },
[TCA_FLOWER_KEY_CVLAN_ETH_TYPE] = { .type = NLA_U16 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TOS_MASK] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL] = { .type = NLA_U8 },
+ [TCA_FLOWER_KEY_ENC_IP_TTL_MASK] = { .type = NLA_U8 },
};
static void fl_set_key_val(struct nlattr **tb,
@@ -561,17 +590,17 @@ static int fl_set_key_flags(struct nlattr **tb,
return 0;
}
-static void fl_set_key_ip(struct nlattr **tb,
+static void fl_set_key_ip(struct nlattr **tb, bool encap,
struct flow_dissector_key_ip *key,
struct flow_dissector_key_ip *mask)
{
- fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS,
- &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK,
- sizeof(key->tos));
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
- fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL,
- &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK,
- sizeof(key->ttl));
+ fl_set_key_val(tb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos));
+ fl_set_key_val(tb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl));
}
static int fl_set_key(struct net *net, struct nlattr **tb,
@@ -633,7 +662,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
&mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.ip_proto));
- fl_set_key_ip(tb, &key->ip, &mask->ip);
+ fl_set_key_ip(tb, false, &key->ip, &mask->ip);
}
if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
@@ -768,6 +797,8 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
&mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
sizeof(key->enc_tp.dst));
+ fl_set_key_ip(tb, true, &key->enc_ip, &mask->enc_ip);
+
if (tb[TCA_FLOWER_KEY_FLAGS])
ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags);
@@ -819,49 +850,52 @@ static int fl_init_mask_hashtable(struct fl_flow_mask *mask)
FL_KEY_SET(keys, cnt, id, member); \
} while(0);
-static void fl_init_dissector(struct fl_flow_mask *mask)
+static void fl_init_dissector(struct flow_dissector *dissector,
+ struct fl_flow_key *mask)
{
struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX];
size_t cnt = 0;
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control);
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ETH_ADDRS, eth);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_PORTS, tp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_IP, ip);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_TCP, tcp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ICMP, icmp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ARP, arp);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_MPLS, mpls);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_VLAN, vlan);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_CVLAN, cvlan);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6);
- if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) ||
- FL_KEY_IS_MASKED(&mask->key, enc_ipv6))
+ if (FL_KEY_IS_MASKED(mask, enc_ipv4) ||
+ FL_KEY_IS_MASKED(mask, enc_ipv6))
FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL,
enc_control);
- FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt,
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp);
+ FL_KEY_SET_IF_MASKED(mask, keys, cnt,
+ FLOW_DISSECTOR_KEY_ENC_IP, enc_ip);
- skb_flow_dissector_init(&mask->dissector, keys, cnt);
+ skb_flow_dissector_init(dissector, keys, cnt);
}
static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
@@ -880,7 +914,7 @@ static struct fl_flow_mask *fl_create_new_mask(struct cls_fl_head *head,
if (err)
goto errout_free;
- fl_init_dissector(newmask);
+ fl_init_dissector(&newmask->dissector, &newmask->key);
INIT_LIST_HEAD_RCU(&newmask->filters);
@@ -929,6 +963,7 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
struct cls_fl_filter *f, struct fl_flow_mask *mask,
unsigned long base, struct nlattr **tb,
struct nlattr *est, bool ovr,
+ struct fl_flow_tmplt *tmplt,
struct netlink_ext_ack *extack)
{
int err;
@@ -949,6 +984,11 @@ static int fl_set_parms(struct net *net, struct tcf_proto *tp,
fl_mask_update_range(mask);
fl_set_masked_key(&f->mkey, &f->key, mask);
+ if (!fl_mask_fits_tmplt(tmplt, mask)) {
+ NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -1014,7 +1054,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
}
err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr,
- extack);
+ tp->chain->tmplt_priv, extack);
if (err)
goto errout_idr;
@@ -1099,19 +1139,17 @@ static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
struct cls_fl_filter *f;
- struct fl_flow_mask *mask;
- list_for_each_entry_rcu(mask, &head->masks, list) {
- list_for_each_entry_rcu(f, &mask->filters, list) {
- if (arg->count < arg->skip)
- goto skip;
- if (arg->fn(tp, f, arg) < 0) {
- arg->stop = 1;
- break;
- }
-skip:
- arg->count++;
+ arg->count = arg->skip;
+
+ while ((f = idr_get_next_ul(&head->handle_idr,
+ &arg->cookie)) != NULL) {
+ if (arg->fn(tp, f, arg) < 0) {
+ arg->stop = 1;
+ break;
}
+ arg->cookie = f->handle + 1;
+ arg->count++;
}
}
@@ -1156,6 +1194,93 @@ static int fl_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb,
return 0;
}
+static void fl_hw_create_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = chain->block;
+ struct tcf_exts dummy_exts = { 0, };
+
+ cls_flower.common.chain_index = chain->index;
+ cls_flower.command = TC_CLSFLOWER_TMPLT_CREATE;
+ cls_flower.cookie = (unsigned long) tmplt;
+ cls_flower.dissector = &tmplt->dissector;
+ cls_flower.mask = &tmplt->mask;
+ cls_flower.key = &tmplt->dummy_key;
+ cls_flower.exts = &dummy_exts;
+
+ /* We don't care if driver (any of them) fails to handle this
+ * call. It serves just as a hint for it.
+ */
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+}
+
+static void fl_hw_destroy_tmplt(struct tcf_chain *chain,
+ struct fl_flow_tmplt *tmplt)
+{
+ struct tc_cls_flower_offload cls_flower = {};
+ struct tcf_block *block = chain->block;
+
+ cls_flower.common.chain_index = chain->index;
+ cls_flower.command = TC_CLSFLOWER_TMPLT_DESTROY;
+ cls_flower.cookie = (unsigned long) tmplt;
+
+ tc_setup_cb_call(block, NULL, TC_SETUP_CLSFLOWER,
+ &cls_flower, false);
+}
+
+static void *fl_tmplt_create(struct net *net, struct tcf_chain *chain,
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack)
+{
+ struct fl_flow_tmplt *tmplt;
+ struct nlattr **tb;
+ int err;
+
+ if (!tca[TCA_OPTIONS])
+ return ERR_PTR(-EINVAL);
+
+ tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL);
+ if (!tb)
+ return ERR_PTR(-ENOBUFS);
+ err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS],
+ fl_policy, NULL);
+ if (err)
+ goto errout_tb;
+
+ tmplt = kzalloc(sizeof(*tmplt), GFP_KERNEL);
+ if (!tmplt) {
+ err = -ENOMEM;
+ goto errout_tb;
+ }
+ tmplt->chain = chain;
+ err = fl_set_key(net, tb, &tmplt->dummy_key, &tmplt->mask, extack);
+ if (err)
+ goto errout_tmplt;
+ kfree(tb);
+
+ fl_init_dissector(&tmplt->dissector, &tmplt->mask);
+
+ fl_hw_create_tmplt(chain, tmplt);
+
+ return tmplt;
+
+errout_tmplt:
+ kfree(tmplt);
+errout_tb:
+ kfree(tb);
+ return ERR_PTR(err);
+}
+
+static void fl_tmplt_destroy(void *tmplt_priv)
+{
+ struct fl_flow_tmplt *tmplt = tmplt_priv;
+
+ fl_hw_destroy_tmplt(tmplt->chain, tmplt);
+ kfree(tmplt);
+}
+
static int fl_dump_key_val(struct sk_buff *skb,
void *val, int val_type,
void *mask, int mask_type, int len)
@@ -1210,14 +1335,17 @@ static int fl_dump_key_mpls(struct sk_buff *skb,
return 0;
}
-static int fl_dump_key_ip(struct sk_buff *skb,
+static int fl_dump_key_ip(struct sk_buff *skb, bool encap,
struct flow_dissector_key_ip *key,
struct flow_dissector_key_ip *mask)
{
- if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos,
- TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) ||
- fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl,
- TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl)))
+ int tos_key = encap ? TCA_FLOWER_KEY_ENC_IP_TOS : TCA_FLOWER_KEY_IP_TOS;
+ int ttl_key = encap ? TCA_FLOWER_KEY_ENC_IP_TTL : TCA_FLOWER_KEY_IP_TTL;
+ int tos_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TOS_MASK : TCA_FLOWER_KEY_IP_TOS_MASK;
+ int ttl_mask = encap ? TCA_FLOWER_KEY_ENC_IP_TTL_MASK : TCA_FLOWER_KEY_IP_TTL_MASK;
+
+ if (fl_dump_key_val(skb, &key->tos, tos_key, &mask->tos, tos_mask, sizeof(key->tos)) ||
+ fl_dump_key_val(skb, &key->ttl, ttl_key, &mask->ttl, ttl_mask, sizeof(key->ttl)))
return -1;
return 0;
@@ -1286,29 +1414,9 @@ static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask)
return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask);
}
-static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
- struct sk_buff *skb, struct tcmsg *t)
+static int fl_dump_key(struct sk_buff *skb, struct net *net,
+ struct fl_flow_key *key, struct fl_flow_key *mask)
{
- struct cls_fl_filter *f = fh;
- struct nlattr *nest;
- struct fl_flow_key *key, *mask;
-
- if (!f)
- return skb->len;
-
- t->tcm_handle = f->handle;
-
- nest = nla_nest_start(skb, TCA_OPTIONS);
- if (!nest)
- goto nla_put_failure;
-
- if (f->res.classid &&
- nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
- goto nla_put_failure;
-
- key = &f->key;
- mask = &f->mask->key;
-
if (mask->indev_ifindex) {
struct net_device *dev;
@@ -1317,9 +1425,6 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
goto nla_put_failure;
}
- if (!tc_skip_hw(f->flags))
- fl_hw_update_stats(tp, f);
-
if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST,
mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK,
sizeof(key->eth.dst)) ||
@@ -1342,8 +1447,8 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
TCA_FLOWER_KEY_CVLAN_PRIO,
&key->cvlan, &mask->cvlan) ||
(mask->cvlan.vlan_tpid &&
- nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
- key->cvlan.vlan_tpid)))
+ nla_put_be16(skb, TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+ key->cvlan.vlan_tpid)))
goto nla_put_failure;
if (mask->basic.n_proto) {
@@ -1363,7 +1468,7 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
(fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
&mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.ip_proto)) ||
- fl_dump_key_ip(skb, &key->ip, &mask->ip)))
+ fl_dump_key_ip(skb, false, &key->ip, &mask->ip)))
goto nla_put_failure;
if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS &&
@@ -1488,12 +1593,48 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
TCA_FLOWER_KEY_ENC_UDP_DST_PORT,
&mask->enc_tp.dst,
TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK,
- sizeof(key->enc_tp.dst)))
+ sizeof(key->enc_tp.dst)) ||
+ fl_dump_key_ip(skb, true, &key->enc_ip, &mask->enc_ip))
goto nla_put_failure;
if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags))
goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
+ struct sk_buff *skb, struct tcmsg *t)
+{
+ struct cls_fl_filter *f = fh;
+ struct nlattr *nest;
+ struct fl_flow_key *key, *mask;
+
+ if (!f)
+ return skb->len;
+
+ t->tcm_handle = f->handle;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ if (f->res.classid &&
+ nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid))
+ goto nla_put_failure;
+
+ key = &f->key;
+ mask = &f->mask->key;
+
+ if (fl_dump_key(skb, net, key, mask))
+ goto nla_put_failure;
+
+ if (!tc_skip_hw(f->flags))
+ fl_hw_update_stats(tp, f);
+
if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
goto nla_put_failure;
@@ -1512,6 +1653,31 @@ nla_put_failure:
return -1;
}
+static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv)
+{
+ struct fl_flow_tmplt *tmplt = tmplt_priv;
+ struct fl_flow_key *key, *mask;
+ struct nlattr *nest;
+
+ nest = nla_nest_start(skb, TCA_OPTIONS);
+ if (!nest)
+ goto nla_put_failure;
+
+ key = &tmplt->dummy_key;
+ mask = &tmplt->mask;
+
+ if (fl_dump_key(skb, net, key, mask))
+ goto nla_put_failure;
+
+ nla_nest_end(skb, nest);
+
+ return skb->len;
+
+nla_put_failure:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static void fl_bind_class(void *fh, u32 classid, unsigned long cl)
{
struct cls_fl_filter *f = fh;
@@ -1532,6 +1698,9 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.reoffload = fl_reoffload,
.dump = fl_dump,
.bind_class = fl_bind_class,
+ .tmplt_create = fl_tmplt_create,
+ .tmplt_destroy = fl_tmplt_destroy,
+ .tmplt_dump = fl_tmplt_dump,
.owner = THIS_MODULE,
};
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 30695691e9ff..35fc7252187c 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -80,7 +80,6 @@
#define CAKE_QUEUES (1024)
#define CAKE_FLOW_MASK 63
#define CAKE_FLOW_NAT_FLAG 64
-#define CAKE_SPLIT_GSO_THRESHOLD (125000000) /* 1Gbps */
/* struct cobalt_params - contains codel and blue parameters
* @interval: codel initial drop rate
@@ -1546,7 +1545,7 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
TC_H_MIN(skb->priority) <= q->tin_cnt) {
- tin = TC_H_MIN(skb->priority) - 1;
+ tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
if (q->rate_flags & CAKE_FLAG_WASH)
cake_wash_diffserv(skb);
@@ -2569,10 +2568,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
if (tb[TCA_CAKE_MEMORY])
q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
- if (q->rate_bps && q->rate_bps <= CAKE_SPLIT_GSO_THRESHOLD)
- q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
- else
- q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+ if (tb[TCA_CAKE_SPLIT_GSO]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
+ q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
+ else
+ q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
+ }
if (q->tins) {
sch_tree_lock(sch);
@@ -2608,7 +2609,7 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
q->target = 5000; /* 5ms: codel RFC argues
* for 5 to 10% of interval
*/
-
+ q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
q->cur_tin = 0;
q->cur_flow = 0;
diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c
index cdd96b9a27bc..e26a24017faa 100644
--- a/net/sched/sch_cbs.c
+++ b/net/sched/sch_cbs.c
@@ -78,18 +78,42 @@ struct cbs_sched_data {
s64 sendslope; /* in bytes/s */
s64 idleslope; /* in bytes/s */
struct qdisc_watchdog watchdog;
- int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch);
+ int (*enqueue)(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free);
struct sk_buff *(*dequeue)(struct Qdisc *sch);
+ struct Qdisc *qdisc;
};
-static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch)
+static int cbs_child_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct Qdisc *child,
+ struct sk_buff **to_free)
{
- return qdisc_enqueue_tail(skb, sch);
+ int err;
+
+ err = child->ops->enqueue(skb, child, to_free);
+ if (err != NET_XMIT_SUCCESS)
+ return err;
+
+ qdisc_qstats_backlog_inc(sch, skb);
+ sch->q.qlen++;
+
+ return NET_XMIT_SUCCESS;
}
-static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
+static int cbs_enqueue_offload(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
{
struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+
+ return cbs_child_enqueue(skb, sch, qdisc, to_free);
+}
+
+static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
if (sch->q.qlen == 0 && q->credits > 0) {
/* We need to stop accumulating credits when there's
@@ -99,7 +123,7 @@ static int cbs_enqueue_soft(struct sk_buff *skb, struct Qdisc *sch)
q->last = ktime_get_ns();
}
- return qdisc_enqueue_tail(skb, sch);
+ return cbs_child_enqueue(skb, sch, qdisc, to_free);
}
static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -107,7 +131,7 @@ static int cbs_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
struct cbs_sched_data *q = qdisc_priv(sch);
- return q->enqueue(skb, sch);
+ return q->enqueue(skb, sch, to_free);
}
/* timediff is in ns, slope is in bytes/s */
@@ -132,9 +156,25 @@ static s64 credits_from_len(unsigned int len, s64 slope, s64 port_rate)
return div64_s64(len * slope, port_rate);
}
+static struct sk_buff *cbs_child_dequeue(struct Qdisc *sch, struct Qdisc *child)
+{
+ struct sk_buff *skb;
+
+ skb = child->ops->dequeue(child);
+ if (!skb)
+ return NULL;
+
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+ sch->q.qlen--;
+
+ return skb;
+}
+
static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
{
struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
s64 now = ktime_get_ns();
struct sk_buff *skb;
s64 credits;
@@ -157,8 +197,7 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
return NULL;
}
}
-
- skb = qdisc_dequeue_head(sch);
+ skb = cbs_child_dequeue(sch, qdisc);
if (!skb)
return NULL;
@@ -178,7 +217,10 @@ static struct sk_buff *cbs_dequeue_soft(struct Qdisc *sch)
static struct sk_buff *cbs_dequeue_offload(struct Qdisc *sch)
{
- return qdisc_dequeue_head(sch);
+ struct cbs_sched_data *q = qdisc_priv(sch);
+ struct Qdisc *qdisc = q->qdisc;
+
+ return cbs_child_dequeue(sch, qdisc);
}
static struct sk_buff *cbs_dequeue(struct Qdisc *sch)
@@ -310,6 +352,13 @@ static int cbs_init(struct Qdisc *sch, struct nlattr *opt,
return -EINVAL;
}
+ q->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, extack);
+ if (!q->qdisc)
+ return -ENOMEM;
+
+ qdisc_hash_add(q->qdisc, false);
+
q->queue = sch->dev_queue - netdev_get_tx_queue(dev, 0);
q->enqueue = cbs_enqueue_soft;
@@ -328,6 +377,9 @@ static void cbs_destroy(struct Qdisc *sch)
qdisc_watchdog_cancel(&q->watchdog);
cbs_disable_offload(dev, q);
+
+ if (q->qdisc)
+ qdisc_destroy(q->qdisc);
}
static int cbs_dump(struct Qdisc *sch, struct sk_buff *skb)
@@ -356,8 +408,72 @@ nla_put_failure:
return -1;
}
+static int cbs_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ if (cl != 1 || !q->qdisc) /* only one class */
+ return -ENOENT;
+
+ tcm->tcm_handle |= TC_H_MIN(1);
+ tcm->tcm_info = q->qdisc->handle;
+
+ return 0;
+}
+
+static int cbs_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ if (!new) {
+ new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
+ sch->handle, NULL);
+ if (!new)
+ new = &noop_qdisc;
+ }
+
+ *old = qdisc_replace(sch, new, &q->qdisc);
+ return 0;
+}
+
+static struct Qdisc *cbs_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ struct cbs_sched_data *q = qdisc_priv(sch);
+
+ return q->qdisc;
+}
+
+static unsigned long cbs_find(struct Qdisc *sch, u32 classid)
+{
+ return 1;
+}
+
+static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker)
+{
+ if (!walker->stop) {
+ if (walker->count >= walker->skip) {
+ if (walker->fn(sch, 1, walker) < 0) {
+ walker->stop = 1;
+ return;
+ }
+ }
+ walker->count++;
+ }
+}
+
+static const struct Qdisc_class_ops cbs_class_ops = {
+ .graft = cbs_graft,
+ .leaf = cbs_leaf,
+ .find = cbs_find,
+ .walk = cbs_walk,
+ .dump = cbs_dump_class,
+};
+
static struct Qdisc_ops cbs_qdisc_ops __read_mostly = {
.id = "cbs",
+ .cl_ops = &cbs_class_ops,
.priv_size = sizeof(struct cbs_sched_data),
.enqueue = cbs_enqueue,
.dequeue = cbs_dequeue,
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index cd2e0e342fb6..6c0a9d5dbf94 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -479,24 +479,28 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
q->cparams.mtu = psched_mtu(qdisc_dev(sch));
if (opt) {
- int err = fq_codel_change(sch, opt, extack);
+ err = fq_codel_change(sch, opt, extack);
if (err)
- return err;
+ goto init_failure;
}
err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
if (err)
- return err;
+ goto init_failure;
if (!q->flows) {
q->flows = kvcalloc(q->flows_cnt,
sizeof(struct fq_codel_flow),
GFP_KERNEL);
- if (!q->flows)
- return -ENOMEM;
+ if (!q->flows) {
+ err = -ENOMEM;
+ goto init_failure;
+ }
q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL);
- if (!q->backlogs)
- return -ENOMEM;
+ if (!q->backlogs) {
+ err = -ENOMEM;
+ goto alloc_failure;
+ }
for (i = 0; i < q->flows_cnt; i++) {
struct fq_codel_flow *flow = q->flows + i;
@@ -509,6 +513,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt,
else
sch->flags &= ~TCQ_F_CAN_BYPASS;
return 0;
+
+alloc_failure:
+ kvfree(q->flows);
+ q->flows = NULL;
+init_failure:
+ q->flows_cnt = 0;
+ return err;
}
static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb)
diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c
new file mode 100644
index 000000000000..52c0b6d8f1d7
--- /dev/null
+++ b/net/sched/sch_skbprio.c
@@ -0,0 +1,320 @@
+/*
+ * net/sched/sch_skbprio.c SKB Priority Queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Nishanth Devarajan, <ndev2021@gmail.com>
+ * Cody Doucette, <doucette@bu.edu>
+ * original idea by Michel Machado, Cody Doucette, and Qiaobin Fu
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <net/pkt_sched.h>
+#include <net/sch_generic.h>
+#include <net/inet_ecn.h>
+
+/* SKB Priority Queue
+ * =================================
+ *
+ * Skbprio (SKB Priority Queue) is a queueing discipline that prioritizes
+ * packets according to their skb->priority field. Under congestion,
+ * Skbprio drops already-enqueued lower priority packets to make space
+ * available for higher priority packets; it was conceived as a solution
+ * for denial-of-service defenses that need to route packets with different
+ * priorities as a mean to overcome DoS attacks.
+ */
+
+struct skbprio_sched_data {
+ /* Queue state. */
+ struct sk_buff_head qdiscs[SKBPRIO_MAX_PRIORITY];
+ struct gnet_stats_queue qstats[SKBPRIO_MAX_PRIORITY];
+ u16 highest_prio;
+ u16 lowest_prio;
+};
+
+static u16 calc_new_high_prio(const struct skbprio_sched_data *q)
+{
+ int prio;
+
+ for (prio = q->highest_prio - 1; prio >= q->lowest_prio; prio--) {
+ if (!skb_queue_empty(&q->qdiscs[prio]))
+ return prio;
+ }
+
+ /* SKB queue is empty, return 0 (default highest priority setting). */
+ return 0;
+}
+
+static u16 calc_new_low_prio(const struct skbprio_sched_data *q)
+{
+ int prio;
+
+ for (prio = q->lowest_prio + 1; prio <= q->highest_prio; prio++) {
+ if (!skb_queue_empty(&q->qdiscs[prio]))
+ return prio;
+ }
+
+ /* SKB queue is empty, return SKBPRIO_MAX_PRIORITY - 1
+ * (default lowest priority setting).
+ */
+ return SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static int skbprio_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+ struct sk_buff **to_free)
+{
+ const unsigned int max_priority = SKBPRIO_MAX_PRIORITY - 1;
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *qdisc;
+ struct sk_buff_head *lp_qdisc;
+ struct sk_buff *to_drop;
+ u16 prio, lp;
+
+ /* Obtain the priority of @skb. */
+ prio = min(skb->priority, max_priority);
+
+ qdisc = &q->qdiscs[prio];
+ if (sch->q.qlen < sch->limit) {
+ __skb_queue_tail(qdisc, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+ q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+ /* Check to update highest and lowest priorities. */
+ if (prio > q->highest_prio)
+ q->highest_prio = prio;
+
+ if (prio < q->lowest_prio)
+ q->lowest_prio = prio;
+
+ sch->q.qlen++;
+ return NET_XMIT_SUCCESS;
+ }
+
+ /* If this packet has the lowest priority, drop it. */
+ lp = q->lowest_prio;
+ if (prio <= lp) {
+ q->qstats[prio].drops++;
+ q->qstats[prio].overlimits++;
+ return qdisc_drop(skb, sch, to_free);
+ }
+
+ __skb_queue_tail(qdisc, skb);
+ qdisc_qstats_backlog_inc(sch, skb);
+ q->qstats[prio].backlog += qdisc_pkt_len(skb);
+
+ /* Drop the packet at the tail of the lowest priority qdisc. */
+ lp_qdisc = &q->qdiscs[lp];
+ to_drop = __skb_dequeue_tail(lp_qdisc);
+ BUG_ON(!to_drop);
+ qdisc_qstats_backlog_dec(sch, to_drop);
+ qdisc_drop(to_drop, sch, to_free);
+
+ q->qstats[lp].backlog -= qdisc_pkt_len(to_drop);
+ q->qstats[lp].drops++;
+ q->qstats[lp].overlimits++;
+
+ /* Check to update highest and lowest priorities. */
+ if (skb_queue_empty(lp_qdisc)) {
+ if (q->lowest_prio == q->highest_prio) {
+ /* The incoming packet is the only packet in queue. */
+ BUG_ON(sch->q.qlen != 1);
+ q->lowest_prio = prio;
+ q->highest_prio = prio;
+ } else {
+ q->lowest_prio = calc_new_low_prio(q);
+ }
+ }
+
+ if (prio > q->highest_prio)
+ q->highest_prio = prio;
+
+ return NET_XMIT_CN;
+}
+
+static struct sk_buff *skbprio_dequeue(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ struct sk_buff_head *hpq = &q->qdiscs[q->highest_prio];
+ struct sk_buff *skb = __skb_dequeue(hpq);
+
+ if (unlikely(!skb))
+ return NULL;
+
+ sch->q.qlen--;
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+
+ q->qstats[q->highest_prio].backlog -= qdisc_pkt_len(skb);
+
+ /* Update highest priority field. */
+ if (skb_queue_empty(hpq)) {
+ if (q->lowest_prio == q->highest_prio) {
+ BUG_ON(sch->q.qlen);
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+ } else {
+ q->highest_prio = calc_new_high_prio(q);
+ }
+ }
+ return skb;
+}
+
+static int skbprio_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct tc_skbprio_qopt *ctl = nla_data(opt);
+
+ sch->limit = ctl->limit;
+ return 0;
+}
+
+static int skbprio_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ /* Initialise all queues, one for each possible priority. */
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_head_init(&q->qdiscs[prio]);
+
+ memset(&q->qstats, 0, sizeof(q->qstats));
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+ sch->limit = 64;
+ if (!opt)
+ return 0;
+
+ return skbprio_change(sch, opt, extack);
+}
+
+static int skbprio_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct tc_skbprio_qopt opt;
+
+ opt.limit = sch->limit;
+
+ if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
+ return -1;
+
+ return skb->len;
+}
+
+static void skbprio_reset(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ sch->qstats.backlog = 0;
+ sch->q.qlen = 0;
+
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_purge(&q->qdiscs[prio]);
+
+ memset(&q->qstats, 0, sizeof(q->qstats));
+ q->highest_prio = 0;
+ q->lowest_prio = SKBPRIO_MAX_PRIORITY - 1;
+}
+
+static void skbprio_destroy(struct Qdisc *sch)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ int prio;
+
+ for (prio = 0; prio < SKBPRIO_MAX_PRIORITY; prio++)
+ __skb_queue_purge(&q->qdiscs[prio]);
+}
+
+static struct Qdisc *skbprio_leaf(struct Qdisc *sch, unsigned long arg)
+{
+ return NULL;
+}
+
+static unsigned long skbprio_find(struct Qdisc *sch, u32 classid)
+{
+ return 0;
+}
+
+static int skbprio_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
+{
+ tcm->tcm_handle |= TC_H_MIN(cl);
+ return 0;
+}
+
+static int skbprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
+{
+ struct skbprio_sched_data *q = qdisc_priv(sch);
+ if (gnet_stats_copy_queue(d, NULL, &q->qstats[cl - 1],
+ q->qstats[cl - 1].qlen) < 0)
+ return -1;
+ return 0;
+}
+
+static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+{
+ unsigned int i;
+
+ if (arg->stop)
+ return;
+
+ for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) {
+ if (arg->count < arg->skip) {
+ arg->count++;
+ continue;
+ }
+ if (arg->fn(sch, i + 1, arg) < 0) {
+ arg->stop = 1;
+ break;
+ }
+ arg->count++;
+ }
+}
+
+static const struct Qdisc_class_ops skbprio_class_ops = {
+ .leaf = skbprio_leaf,
+ .find = skbprio_find,
+ .dump = skbprio_dump_class,
+ .dump_stats = skbprio_dump_class_stats,
+ .walk = skbprio_walk,
+};
+
+static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = {
+ .cl_ops = &skbprio_class_ops,
+ .id = "skbprio",
+ .priv_size = sizeof(struct skbprio_sched_data),
+ .enqueue = skbprio_enqueue,
+ .dequeue = skbprio_dequeue,
+ .peek = qdisc_peek_dequeued,
+ .init = skbprio_init,
+ .reset = skbprio_reset,
+ .change = skbprio_change,
+ .dump = skbprio_dump,
+ .destroy = skbprio_destroy,
+ .owner = THIS_MODULE,
+};
+
+static int __init skbprio_module_init(void)
+{
+ return register_qdisc(&skbprio_qdisc_ops);
+}
+
+static void __exit skbprio_module_exit(void)
+{
+ unregister_qdisc(&skbprio_qdisc_ops);
+}
+
+module_init(skbprio_module_init)
+module_exit(skbprio_module_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index c740b189d4ba..950ecf6e7439 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -41,8 +41,8 @@ config SCTP_DBG_OBJCNT
bool "SCTP: Debug object counts"
depends on PROC_FS
help
- If you say Y, this will enable debugging support for counting the
- type of objects that are currently allocated. This is useful for
+ If you say Y, this will enable debugging support for counting the
+ type of objects that are currently allocated. This is useful for
identifying memory leaks. This debug information can be viewed by
'cat /proc/net/sctp/sctp_dbg_objcnt'
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 298112ca8c06..85d393090238 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -1827,4 +1827,3 @@ nomem:
error = -ENOMEM;
goto out;
}
-
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 445b7ef61677..12cac85da994 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -282,7 +282,7 @@ bool sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu)
if (dst) {
/* Re-fetch, as under layers may have a higher minimum size */
- pmtu = SCTP_TRUNC4(dst_mtu(dst));
+ pmtu = sctp_dst_mtu(dst);
change = t->pathmtu != pmtu;
}
t->pathmtu = pmtu;
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f3fdf3714f8b..0fc94f296e54 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -149,7 +149,8 @@ static int smc_release(struct socket *sock)
smc->clcsock = NULL;
}
if (smc->use_fallback) {
- sock_put(sk); /* passive closing */
+ if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
+ sock_put(sk); /* passive closing */
sk->sk_state = SMC_CLOSED;
sk->sk_state_change(sk);
}
@@ -343,20 +344,17 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
rc = smc_ib_modify_qp_rts(link);
if (rc)
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_RDYLNK;
smc_wr_remember_qp_attr(link);
if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_REGRMB;
/* send CONFIRM LINK response over RoCE fabric */
- rc = smc_llc_send_confirm_link(link,
- link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_RESP);
+ rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_CL;
/* receive ADD LINK request from server over RoCE fabric */
rest = wait_for_completion_interruptible_timeout(&link->llc_add,
@@ -372,10 +370,9 @@ static int smc_clnt_conf_first_link(struct smc_sock *smc)
/* send add link reject message, only one link supported for now */
rc = smc_llc_send_add_link(link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_RESP);
+ link->gid, SMC_LLC_RESP);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_AL;
smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
@@ -427,9 +424,10 @@ static void smc_link_save_peer_info(struct smc_link *link,
}
/* fall back during connect */
-static int smc_connect_fallback(struct smc_sock *smc)
+static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
{
smc->use_fallback = true;
+ smc->fallback_rsn = reason_code;
smc_copy_sock_settings_to_clc(smc);
if (smc->sk.sk_state == SMC_INIT)
smc->sk.sk_state = SMC_ACTIVE;
@@ -441,14 +439,20 @@ static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
{
int rc;
- if (reason_code < 0) /* error, fallback is not possible */
+ if (reason_code < 0) { /* error, fallback is not possible */
+ if (smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
return reason_code;
- if (reason_code != SMC_CLC_DECL_REPLY) {
+ }
+ if (reason_code != SMC_CLC_DECL_PEERDECL) {
rc = smc_clc_send_decline(smc, reason_code);
- if (rc < 0)
+ if (rc < 0) {
+ if (smc->sk.sk_state == SMC_INIT)
+ sock_put(&smc->sk); /* passive closing */
return rc;
+ }
}
- return smc_connect_fallback(smc);
+ return smc_connect_fallback(smc, reason_code);
}
/* abort connecting */
@@ -459,15 +463,13 @@ static int smc_connect_abort(struct smc_sock *smc, int reason_code,
smc_lgr_forget(smc->conn.lgr);
mutex_unlock(&smc_create_lgr_pending);
smc_conn_free(&smc->conn);
- if (reason_code < 0 && smc->sk.sk_state == SMC_INIT)
- sock_put(&smc->sk); /* passive closing */
return reason_code;
}
/* check if there is a rdma device available for this connection. */
/* called for connect and listen */
static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
- u8 *ibport)
+ u8 *ibport, unsigned short vlan_id, u8 gid[])
{
int reason_code = 0;
@@ -475,7 +477,8 @@ static int smc_check_rdma(struct smc_sock *smc, struct smc_ib_device **ibdev,
* within same PNETID that also contains the ethernet device
* used for the internal TCP socket
*/
- smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport);
+ smc_pnet_find_roce_resource(smc->clcsock->sk, ibdev, ibport, vlan_id,
+ gid);
if (!(*ibdev))
reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
@@ -521,12 +524,12 @@ static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
static int smc_connect_clc(struct smc_sock *smc, int smc_type,
struct smc_clc_msg_accept_confirm *aclc,
struct smc_ib_device *ibdev, u8 ibport,
- struct smcd_dev *ismdev)
+ u8 gid[], struct smcd_dev *ismdev)
{
int rc = 0;
/* do inband token exchange */
- rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, ismdev);
+ rc = smc_clc_send_proposal(smc, smc_type, ibdev, ibport, gid, ismdev);
if (rc)
return rc;
/* receive SMC Accept CLC message */
@@ -566,7 +569,7 @@ static int smc_connect_rdma(struct smc_sock *smc,
smc_link_save_peer_info(link, aclc);
if (smc_rmb_rtoken_handling(&smc->conn, aclc))
- return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
local_contact);
smc_close_init(smc);
@@ -574,12 +577,12 @@ static int smc_connect_rdma(struct smc_sock *smc,
if (local_contact == SMC_FIRST_CONTACT) {
if (smc_ib_ready_link(link))
- return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
local_contact);
} else {
if (!smc->conn.rmb_desc->reused &&
smc_reg_rmb(link, smc->conn.rmb_desc, true))
- return smc_connect_abort(smc, SMC_CLC_DECL_INTERR,
+ return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
local_contact);
}
smc_rmb_sync_sg_for_device(&smc->conn);
@@ -648,6 +651,7 @@ static int __smc_connect(struct smc_sock *smc)
struct smc_clc_msg_accept_confirm aclc;
struct smc_ib_device *ibdev;
struct smcd_dev *ismdev;
+ u8 gid[SMC_GID_SIZE];
unsigned short vlan;
int smc_type;
int rc = 0;
@@ -656,11 +660,11 @@ static int __smc_connect(struct smc_sock *smc)
sock_hold(&smc->sk); /* sock put in passive closing */
if (smc->use_fallback)
- return smc_connect_fallback(smc);
+ return smc_connect_fallback(smc, smc->fallback_rsn);
/* if peer has not signalled SMC-capability, fall back */
if (!tcp_sk(smc->clcsock->sk)->syn_smc)
- return smc_connect_fallback(smc);
+ return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
/* IPSec connections opt out of SMC-R optimizations */
if (using_ipsec(smc))
@@ -679,7 +683,7 @@ static int __smc_connect(struct smc_sock *smc)
}
/* check if there is a rdma device available */
- if (!smc_check_rdma(smc, &ibdev, &ibport)) {
+ if (!smc_check_rdma(smc, &ibdev, &ibport, vlan, gid)) {
/* RDMA is supported for this connection */
rdma_supported = true;
if (ism_supported)
@@ -690,10 +694,10 @@ static int __smc_connect(struct smc_sock *smc)
/* if neither ISM nor RDMA are supported, fallback */
if (!rdma_supported && !ism_supported)
- return smc_connect_decline_fallback(smc, SMC_CLC_DECL_CNFERR);
+ return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
/* perform CLC handshake */
- rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, ismdev);
+ rc = smc_connect_clc(smc, smc_type, &aclc, ibdev, ibport, gid, ismdev);
if (rc) {
smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return smc_connect_decline_fallback(smc, rc);
@@ -705,7 +709,7 @@ static int __smc_connect(struct smc_sock *smc)
else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
rc = smc_connect_ism(smc, &aclc, ismdev);
else
- rc = SMC_CLC_DECL_CNFERR;
+ rc = SMC_CLC_DECL_MODEUNSUPP;
if (rc) {
smc_connect_ism_vlan_cleanup(smc, ism_supported, ismdev, vlan);
return smc_connect_decline_fallback(smc, rc);
@@ -943,15 +947,12 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
link = &lgr->lnk[SMC_SINGLE_LINK];
if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_REGRMB;
/* send CONFIRM LINK request to client over the RoCE fabric */
- rc = smc_llc_send_confirm_link(link,
- link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_REQ);
+ rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_CL;
/* receive CONFIRM LINK response from client over the RoCE fabric */
rest = wait_for_completion_interruptible_timeout(
@@ -971,10 +972,9 @@ static int smc_serv_conf_first_link(struct smc_sock *smc)
/* send ADD LINK request to client over the RoCE fabric */
rc = smc_llc_send_add_link(link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_REQ);
+ link->gid, SMC_LLC_REQ);
if (rc < 0)
- return SMC_CLC_DECL_TCL;
+ return SMC_CLC_DECL_TIMEOUT_AL;
/* receive ADD LINK response from client over the RoCE fabric */
rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
@@ -1049,7 +1049,8 @@ static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
}
smc_conn_free(&new_smc->conn);
new_smc->use_fallback = true;
- if (reason_code && reason_code != SMC_CLC_DECL_REPLY) {
+ new_smc->fallback_rsn = reason_code;
+ if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
if (smc_clc_send_decline(new_smc, reason_code) < 0) {
smc_listen_out_err(new_smc);
return;
@@ -1140,7 +1141,7 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
if (local_contact != SMC_FIRST_CONTACT) {
if (!new_smc->conn.rmb_desc->reused) {
if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
- return SMC_CLC_DECL_INTERR;
+ return SMC_CLC_DECL_ERR_REGRMB;
}
}
smc_rmb_sync_sg_for_device(&new_smc->conn);
@@ -1160,13 +1161,13 @@ static void smc_listen_rdma_finish(struct smc_sock *new_smc,
smc_link_save_peer_info(link, cclc);
if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
- reason_code = SMC_CLC_DECL_INTERR;
+ reason_code = SMC_CLC_DECL_ERR_RTOK;
goto decline;
}
if (local_contact == SMC_FIRST_CONTACT) {
if (smc_ib_ready_link(link)) {
- reason_code = SMC_CLC_DECL_INTERR;
+ reason_code = SMC_CLC_DECL_ERR_RDYLNK;
goto decline;
}
/* QP confirmation over RoCE fabric */
@@ -1194,6 +1195,7 @@ static void smc_listen_work(struct work_struct *work)
struct smcd_dev *ismdev;
u8 buf[SMC_CLC_MAX_LEN];
int local_contact = 0;
+ unsigned short vlan;
int reason_code = 0;
int rc = 0;
u8 ibport;
@@ -1206,6 +1208,7 @@ static void smc_listen_work(struct work_struct *work)
/* check if peer is smc capable */
if (!tcp_sk(newclcsock->sk)->syn_smc) {
new_smc->use_fallback = true;
+ new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
smc_listen_out_connected(new_smc);
return;
}
@@ -1242,14 +1245,16 @@ static void smc_listen_work(struct work_struct *work)
/* check if RDMA is available */
if (!ism_supported &&
((pclc->hdr.path != SMC_TYPE_R && pclc->hdr.path != SMC_TYPE_B) ||
- smc_check_rdma(new_smc, &ibdev, &ibport) ||
+ smc_vlan_by_tcpsk(new_smc->clcsock, &vlan) ||
+ smc_check_rdma(new_smc, &ibdev, &ibport, vlan, NULL) ||
smc_listen_rdma_check(new_smc, pclc) ||
smc_listen_rdma_init(new_smc, pclc, ibdev, ibport,
&local_contact) ||
smc_listen_rdma_reg(new_smc, local_contact))) {
/* SMC not supported, decline */
mutex_unlock(&smc_create_lgr_pending);
- smc_listen_decline(new_smc, SMC_CLC_DECL_CNFERR, local_contact);
+ smc_listen_decline(new_smc, SMC_CLC_DECL_MODEUNSUPP,
+ local_contact);
return;
}
@@ -1296,6 +1301,7 @@ static void smc_tcp_listen_work(struct work_struct *work)
new_smc->listen_smc = lsmc;
new_smc->use_fallback = lsmc->use_fallback;
+ new_smc->fallback_rsn = lsmc->fallback_rsn;
sock_hold(lsk); /* sock_put in smc_listen_work */
INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
smc_copy_sock_settings_to_smc(new_smc);
@@ -1450,6 +1456,7 @@ static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
if (msg->msg_flags & MSG_FASTOPEN) {
if (sk->sk_state == SMC_INIT) {
smc->use_fallback = true;
+ smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
rc = -EINVAL;
goto out;
@@ -1527,6 +1534,8 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
if (sk->sk_err)
mask |= EPOLLERR;
} else {
+ if (sk->sk_state != SMC_CLOSED)
+ sock_poll_wait(file, wait);
if (sk->sk_err)
mask |= EPOLLERR;
if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
@@ -1552,7 +1561,6 @@ static __poll_t smc_poll(struct file *file, struct socket *sock,
}
if (smc->conn.urg_state == SMC_URG_VALID)
mask |= EPOLLPRI;
-
}
return mask;
@@ -1633,7 +1641,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
if (optlen < sizeof(int))
return -EINVAL;
- get_user(val, (int __user *)optval);
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
lock_sock(sk);
switch (optname) {
@@ -1645,6 +1654,7 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
/* option not supported by SMC */
if (sk->sk_state == SMC_INIT) {
smc->use_fallback = true;
+ smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
} else {
if (!smc->use_fallback)
rc = -EINVAL;
@@ -1701,10 +1711,13 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
return -EBADF;
return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
}
+ lock_sock(&smc->sk);
switch (cmd) {
case SIOCINQ: /* same as FIONREAD */
- if (smc->sk.sk_state == SMC_LISTEN)
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
return -EINVAL;
+ }
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED)
answ = 0;
@@ -1713,8 +1726,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
break;
case SIOCOUTQ:
/* output queue size (not send + not acked) */
- if (smc->sk.sk_state == SMC_LISTEN)
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
return -EINVAL;
+ }
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED)
answ = 0;
@@ -1724,8 +1739,10 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
break;
case SIOCOUTQNSD:
/* output queue size (not send only) */
- if (smc->sk.sk_state == SMC_LISTEN)
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
return -EINVAL;
+ }
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED)
answ = 0;
@@ -1733,25 +1750,25 @@ static int smc_ioctl(struct socket *sock, unsigned int cmd,
answ = smc_tx_prepared_sends(&smc->conn);
break;
case SIOCATMARK:
- if (smc->sk.sk_state == SMC_LISTEN)
+ if (smc->sk.sk_state == SMC_LISTEN) {
+ release_sock(&smc->sk);
return -EINVAL;
+ }
if (smc->sk.sk_state == SMC_INIT ||
smc->sk.sk_state == SMC_CLOSED) {
answ = 0;
} else {
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
- smc_curs_write(&urg,
- smc_curs_read(&conn->urg_curs, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&urg, &conn->urg_curs, conn);
answ = smc_curs_diff(conn->rmb_desc->len,
&cons, &urg) == 1;
}
break;
default:
+ release_sock(&smc->sk);
return -ENOIOCTLCMD;
}
+ release_sock(&smc->sk);
return put_user(answ, (int __user *)arg);
}
@@ -1875,6 +1892,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol,
/* create internal TCP socket for CLC handshake and fallback */
smc = smc_sk(sk);
smc->use_fallback = false; /* assume rdma capability first */
+ smc->fallback_rsn = 0;
rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
&smc->clcsock);
if (rc) {
diff --git a/net/smc/smc.h b/net/smc/smc.h
index be20acd7b5ab..08786ace6010 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -208,6 +208,8 @@ struct smc_sock { /* smc sock container */
struct list_head accept_q; /* sockets to be accepted */
spinlock_t accept_q_lock; /* protects accept_q */
bool use_fallback; /* fallback to tcp */
+ int fallback_rsn; /* reason for fallback */
+ u32 peer_diagnosis; /* decline reason from peer */
int sockopt_defer_accept;
/* sockopt TCP_DEFER_ACCEPT
* value
diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index 621d8cca570b..ed5dcf03fe0b 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -34,14 +34,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
enum ib_wc_status wc_status)
{
struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+ struct smc_connection *conn = cdcpend->conn;
struct smc_sock *smc;
int diff;
- if (!cdcpend->conn)
+ if (!conn)
/* already dismissed */
return;
- smc = container_of(cdcpend->conn, struct smc_sock, conn);
+ smc = container_of(conn, struct smc_sock, conn);
bh_lock_sock(&smc->sk);
if (!wc_status) {
diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
@@ -52,9 +53,7 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
atomic_add(diff, &cdcpend->conn->sndbuf_space);
/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
- smc_curs_write(&cdcpend->conn->tx_curs_fin,
- smc_curs_read(&cdcpend->cursor, cdcpend->conn),
- cdcpend->conn);
+ smc_curs_copy(&conn->tx_curs_fin, &cdcpend->cursor, conn);
}
smc_tx_sndbuf_nonfull(smc);
bh_unlock_sock(&smc->sk);
@@ -110,9 +109,8 @@ int smc_cdc_msg_send(struct smc_connection *conn,
&conn->local_tx_ctrl, conn);
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
if (!rc)
- smc_curs_write(&conn->rx_curs_confirmed,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&conn->rx_curs_confirmed,
+ &conn->local_tx_ctrl.cons, conn);
return rc;
}
@@ -194,8 +192,8 @@ int smcd_cdc_msg_send(struct smc_connection *conn)
rc = smcd_tx_ism_write(conn, &cdc, sizeof(cdc), 0, 1);
if (rc)
return rc;
- smc_curs_write(&conn->rx_curs_confirmed,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn), conn);
+ smc_curs_copy(&conn->rx_curs_confirmed, &conn->local_tx_ctrl.cons,
+ conn);
/* Calculate transmitted data and increment free send buffer space */
diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
&conn->tx_curs_sent);
@@ -204,8 +202,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn)
atomic_add(diff, &conn->sndbuf_space);
/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
- smc_curs_write(&conn->tx_curs_fin,
- smc_curs_read(&conn->tx_curs_sent, conn), conn);
+ smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);
smc_tx_sndbuf_nonfull(smc);
return rc;
@@ -225,9 +222,7 @@ static void smc_cdc_handle_urg_data_arrival(struct smc_sock *smc,
char *base;
/* new data included urgent business */
- smc_curs_write(&conn->urg_curs,
- smc_curs_read(&conn->local_rx_ctrl.prod, conn),
- conn);
+ smc_curs_copy(&conn->urg_curs, &conn->local_rx_ctrl.prod, conn);
conn->urg_state = SMC_URG_VALID;
if (!sock_flag(&smc->sk, SOCK_URGINLINE))
/* we'll skip the urgent byte, so don't account for it */
@@ -247,12 +242,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
struct smc_connection *conn = &smc->conn;
int diff_cons, diff_prod;
- smc_curs_write(&prod_old,
- smc_curs_read(&conn->local_rx_ctrl.prod, conn),
- conn);
- smc_curs_write(&cons_old,
- smc_curs_read(&conn->local_rx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
+ smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
smc_cdc_msg_to_host(&conn->local_rx_ctrl, cdc, conn);
diff_cons = smc_curs_diff(conn->peer_rmbe_size, &cons_old,
@@ -287,7 +278,8 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
/* force immediate tx of current consumer cursor, but
* under send_lock to guarantee arrival in seqno-order
*/
- smc_tx_sndbuf_nonempty(conn);
+ if (smc->sk.sk_state != SMC_INIT)
+ smc_tx_sndbuf_nonempty(conn);
}
}
@@ -374,7 +366,7 @@ static void smc_cdc_rx_handler(struct ib_wc *wc, void *buf)
return; /* invalid message */
/* lookup connection */
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+ lgr = smc_get_lgr(link);
read_lock_bh(&lgr->conns_lock);
conn = smc_lgr_find_conn(ntohl(cdc->token), lgr);
read_unlock_bh(&lgr->conns_lock);
diff --git a/net/smc/smc_cdc.h b/net/smc/smc_cdc.h
index 8fbce4fee3e4..934df4473a7c 100644
--- a/net/smc/smc_cdc.h
+++ b/net/smc/smc_cdc.h
@@ -104,47 +104,34 @@ static inline u64 smc_curs_read(union smc_host_cursor *curs,
#endif
}
-static inline u64 smc_curs_read_net(union smc_cdc_cursor *curs,
- struct smc_connection *conn)
-{
-#ifndef KERNEL_HAS_ATOMIC64
- unsigned long flags;
- u64 ret;
-
- spin_lock_irqsave(&conn->acurs_lock, flags);
- ret = curs->acurs;
- spin_unlock_irqrestore(&conn->acurs_lock, flags);
- return ret;
-#else
- return atomic64_read(&curs->acurs);
-#endif
-}
-
-static inline void smc_curs_write(union smc_host_cursor *curs, u64 val,
- struct smc_connection *conn)
+/* Copy cursor src into tgt */
+static inline void smc_curs_copy(union smc_host_cursor *tgt,
+ union smc_host_cursor *src,
+ struct smc_connection *conn)
{
#ifndef KERNEL_HAS_ATOMIC64
unsigned long flags;
spin_lock_irqsave(&conn->acurs_lock, flags);
- curs->acurs = val;
+ tgt->acurs = src->acurs;
spin_unlock_irqrestore(&conn->acurs_lock, flags);
#else
- atomic64_set(&curs->acurs, val);
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
#endif
}
-static inline void smc_curs_write_net(union smc_cdc_cursor *curs, u64 val,
- struct smc_connection *conn)
+static inline void smc_curs_copy_net(union smc_cdc_cursor *tgt,
+ union smc_cdc_cursor *src,
+ struct smc_connection *conn)
{
#ifndef KERNEL_HAS_ATOMIC64
unsigned long flags;
spin_lock_irqsave(&conn->acurs_lock, flags);
- curs->acurs = val;
+ tgt->acurs = src->acurs;
spin_unlock_irqrestore(&conn->acurs_lock, flags);
#else
- atomic64_set(&curs->acurs, val);
+ atomic64_set(&tgt->acurs, atomic64_read(&src->acurs));
#endif
}
@@ -179,7 +166,7 @@ static inline void smc_host_cursor_to_cdc(union smc_cdc_cursor *peer,
{
union smc_host_cursor temp;
- smc_curs_write(&temp, smc_curs_read(local, conn), conn);
+ smc_curs_copy(&temp, local, conn);
peer->count = htonl(temp.count);
peer->wrap = htons(temp.wrap);
/* peer->reserved = htons(0); must be ensured by caller */
@@ -206,8 +193,8 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
union smc_host_cursor temp, old;
union smc_cdc_cursor net;
- smc_curs_write(&old, smc_curs_read(local, conn), conn);
- smc_curs_write_net(&net, smc_curs_read_net(peer, conn), conn);
+ smc_curs_copy(&old, local, conn);
+ smc_curs_copy_net(&net, peer, conn);
temp.count = ntohl(net.count);
temp.wrap = ntohs(net.wrap);
if ((old.wrap > temp.wrap) && temp.wrap)
@@ -215,7 +202,7 @@ static inline void smc_cdc_cursor_to_host(union smc_host_cursor *local,
if ((old.wrap == temp.wrap) &&
(old.count > temp.count))
return;
- smc_curs_write(local, smc_curs_read(&temp, conn), conn);
+ smc_curs_copy(local, &temp, conn);
}
static inline void smcr_cdc_msg_to_host(struct smc_host_cdc_msg *local,
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 038d70ef7892..83aba9ade060 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -267,6 +267,7 @@ out:
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type)
{
+ long rcvtimeo = smc->clcsock->sk->sk_rcvtimeo;
struct sock *clc_sk = smc->clcsock->sk;
struct smc_clc_msg_hdr *clcm = buf;
struct msghdr msg = {NULL, 0};
@@ -326,7 +327,6 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
memset(&msg, 0, sizeof(struct msghdr));
iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen);
krflags = MSG_WAITALL;
- smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
len = sock_recvmsg(smc->clcsock, &msg, krflags);
if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
smc->sk.sk_err = EPROTO;
@@ -334,7 +334,11 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
goto out;
}
if (clcm->type == SMC_CLC_DECLINE) {
- reason_code = SMC_CLC_DECL_REPLY;
+ struct smc_clc_msg_decline *dclc;
+
+ dclc = (struct smc_clc_msg_decline *)clcm;
+ reason_code = SMC_CLC_DECL_PEERDECL;
+ smc->peer_diagnosis = ntohl(dclc->peer_diagnosis);
if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
smc->conn.lgr->sync_err = 1;
smc_lgr_terminate(smc->conn.lgr);
@@ -342,6 +346,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
}
out:
+ smc->clcsock->sk->sk_rcvtimeo = rcvtimeo;
return reason_code;
}
@@ -377,7 +382,7 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
/* send CLC PROPOSAL message across internal TCP socket */
int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_ib_device *ibdev, u8 ibport,
+ struct smc_ib_device *ibdev, u8 ibport, u8 gid[],
struct smcd_dev *ismdev)
{
struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
@@ -408,7 +413,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
/* add SMC-R specifics */
memcpy(pclc.lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&pclc.lcl.gid, &ibdev->gid[ibport - 1], SMC_GID_SIZE);
+ memcpy(&pclc.lcl.gid, gid, SMC_GID_SIZE);
memcpy(&pclc.lcl.mac, &ibdev->mac[ibport - 1], ETH_ALEN);
pclc.iparea_offset = htons(0);
}
@@ -491,8 +496,7 @@ int smc_clc_send_confirm(struct smc_sock *smc)
cclc.hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
memcpy(cclc.lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
- SMC_GID_SIZE);
+ memcpy(&cclc.lcl.gid, link->gid, SMC_GID_SIZE);
memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1],
ETH_ALEN);
hton24(cclc.qpn, link->roce_qp->qp_num);
@@ -565,8 +569,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
link = &conn->lgr->lnk[SMC_SINGLE_LINK];
memcpy(aclc.lcl.id_for_peer, local_systemid,
sizeof(local_systemid));
- memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
- SMC_GID_SIZE);
+ memcpy(&aclc.lcl.gid, link->gid, SMC_GID_SIZE);
memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1],
ETH_ALEN);
hton24(aclc.qpn, link->roce_qp->qp_num);
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 100e988ad1a8..18da89b681c2 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -28,15 +28,21 @@
#define SMC_TYPE_B 3 /* SMC-R and SMC-D */
#define CLC_WAIT_TIME (6 * HZ) /* max. wait time on clcsock */
#define SMC_CLC_DECL_MEM 0x01010000 /* insufficient memory resources */
-#define SMC_CLC_DECL_TIMEOUT 0x02000000 /* timeout */
+#define SMC_CLC_DECL_TIMEOUT_CL 0x02010000 /* timeout w4 QP confirm link */
+#define SMC_CLC_DECL_TIMEOUT_AL 0x02020000 /* timeout w4 QP add link */
#define SMC_CLC_DECL_CNFERR 0x03000000 /* configuration error */
-#define SMC_CLC_DECL_IPSEC 0x03030000 /* IPsec usage */
+#define SMC_CLC_DECL_PEERNOSMC 0x03010000 /* peer did not indicate SMC */
+#define SMC_CLC_DECL_IPSEC 0x03020000 /* IPsec usage */
+#define SMC_CLC_DECL_NOSMCDEV 0x03030000 /* no SMC device found */
+#define SMC_CLC_DECL_MODEUNSUPP 0x03040000 /* smc modes do not match (R or D)*/
+#define SMC_CLC_DECL_RMBE_EC 0x03050000 /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_OPTUNSUPP 0x03060000 /* fastopen sockopt not supported */
#define SMC_CLC_DECL_SYNCERR 0x04000000 /* synchronization error */
-#define SMC_CLC_DECL_REPLY 0x06000000 /* reply to a received decline */
+#define SMC_CLC_DECL_PEERDECL 0x05000000 /* peer declined during handshake */
#define SMC_CLC_DECL_INTERR 0x99990000 /* internal error */
-#define SMC_CLC_DECL_TCL 0x02040000 /* timeout w4 QP confirm */
-#define SMC_CLC_DECL_SEND 0x07000000 /* sending problem */
-#define SMC_CLC_DECL_RMBE_EC 0x08000000 /* peer has eyecatcher in RMBE */
+#define SMC_CLC_DECL_ERR_RTOK 0x99990001 /* rtoken handling failed */
+#define SMC_CLC_DECL_ERR_RDYLNK 0x99990002 /* ib ready link failed */
+#define SMC_CLC_DECL_ERR_REGRMB 0x99990003 /* reg rmb failed */
struct smc_clc_msg_hdr { /* header1 of clc messages */
u8 eyecatcher[4]; /* eye catcher */
@@ -179,7 +185,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
u8 expected_type);
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info);
int smc_clc_send_proposal(struct smc_sock *smc, int smc_type,
- struct smc_ib_device *smcibdev, u8 ibport,
+ struct smc_ib_device *smcibdev, u8 ibport, u8 gid[],
struct smcd_dev *ismdev);
int smc_clc_send_confirm(struct smc_sock *smc);
int smc_clc_send_accept(struct smc_sock *smc, int srv_first_contact);
diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
index fa41d9881741..ac961dfb1ea1 100644
--- a/net/smc/smc_close.c
+++ b/net/smc/smc_close.c
@@ -107,6 +107,8 @@ static void smc_close_active_abort(struct smc_sock *smc)
}
switch (sk->sk_state) {
case SMC_INIT:
+ sk->sk_state = SMC_PEERABORTWAIT;
+ break;
case SMC_ACTIVE:
sk->sk_state = SMC_PEERABORTWAIT;
release_sock(sk);
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 66741e61a3b0..a46418f45ecd 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -30,6 +30,7 @@
#define SMC_LGR_NUM_INCR 256
#define SMC_LGR_FREE_DELAY_SERV (600 * HZ)
#define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ)
+#define SMC_LGR_FREE_DELAY_FAST (8 * HZ)
static struct smc_lgr_list smc_lgr_list = { /* established link groups */
.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
@@ -51,6 +52,11 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV);
}
+void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr)
+{
+ mod_delayed_work(system_wq, &lgr->free_work, SMC_LGR_FREE_DELAY_FAST);
+}
+
/* Register connection's alert token in our lookup structure.
* To use rbtrees we have to implement our own insert core.
* Requires @conns_lock
@@ -133,6 +139,20 @@ static void smc_lgr_unregister_conn(struct smc_connection *conn)
smc_lgr_schedule_free_work(lgr);
}
+/* Send delete link, either as client to request the initiation
+ * of the DELETE LINK sequence from server; or as server to
+ * initiate the delete processing. See smc_llc_rx_delete_link().
+ */
+static int smc_link_send_delete(struct smc_link *lnk)
+{
+ if (lnk->state == SMC_LNK_ACTIVE &&
+ !smc_llc_send_delete_link(lnk, SMC_LLC_REQ, true)) {
+ smc_llc_link_deleting(lnk);
+ return 0;
+ }
+ return -ENOTCONN;
+}
+
static void smc_lgr_free_work(struct work_struct *work)
{
struct smc_link_group *lgr = container_of(to_delayed_work(work),
@@ -153,10 +173,21 @@ static void smc_lgr_free_work(struct work_struct *work)
list_del_init(&lgr->list); /* remove from smc_lgr_list */
free:
spin_unlock_bh(&smc_lgr_list.lock);
+
+ if (!lgr->is_smcd && !lgr->terminating) {
+ /* try to send del link msg, on error free lgr immediately */
+ if (!smc_link_send_delete(&lgr->lnk[SMC_SINGLE_LINK])) {
+ /* reschedule in case we never receive a response */
+ smc_lgr_schedule_free_work(lgr);
+ return;
+ }
+ }
+
if (!delayed_work_pending(&lgr->free_work)) {
- if (!lgr->is_smcd &&
- lgr->lnk[SMC_SINGLE_LINK].state != SMC_LNK_INACTIVE)
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+ if (!lgr->is_smcd && lnk->state != SMC_LNK_INACTIVE)
+ smc_llc_link_inactive(lnk);
smc_lgr_free(lgr);
}
}
@@ -219,6 +250,10 @@ static int smc_lgr_create(struct smc_sock *smc, bool is_smcd,
get_random_bytes(rndvec, sizeof(rndvec));
lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
(rndvec[2] << 16);
+ rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
+ vlan_id, lnk->gid, &lnk->sgid_index);
+ if (rc)
+ goto free_lgr;
rc = smc_llc_link_init(lnk);
if (rc)
goto free_lgr;
@@ -522,37 +557,6 @@ out:
return rc;
}
-/* determine the link gid matching the vlan id of the link group */
-static int smc_link_determine_gid(struct smc_link_group *lgr)
-{
- struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
- struct ib_gid_attr gattr;
- union ib_gid gid;
- int i;
-
- if (!lgr->vlan_id) {
- lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1];
- return 0;
- }
-
- for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len;
- i++) {
- if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid,
- &gattr))
- continue;
- if (gattr.ndev) {
- if (is_vlan_dev(gattr.ndev) &&
- vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) {
- lnk->gid = gid;
- dev_put(gattr.ndev);
- return 0;
- }
- dev_put(gattr.ndev);
- }
- }
- return -ENODEV;
-}
-
static bool smcr_lgr_match(struct smc_link_group *lgr,
struct smc_clc_msg_local *lcl,
enum smc_lgr_role role)
@@ -631,8 +635,6 @@ create:
if (rc)
goto out;
smc_lgr_register_conn(conn); /* add smc conn to lgr */
- if (!is_smcd)
- rc = smc_link_determine_gid(conn->lgr);
}
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
@@ -1013,8 +1015,14 @@ void smc_core_exit(void)
spin_unlock_bh(&smc_lgr_list.lock);
list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
list_del_init(&lgr->list);
- if (!lgr->is_smcd)
- smc_llc_link_inactive(&lgr->lnk[SMC_SINGLE_LINK]);
+ if (!lgr->is_smcd) {
+ struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK];
+
+ if (lnk->state == SMC_LNK_ACTIVE)
+ smc_llc_send_delete_link(lnk, SMC_LLC_REQ,
+ false);
+ smc_llc_link_inactive(lnk);
+ }
cancel_delayed_work_sync(&lgr->free_work);
smc_lgr_free(lgr); /* free link group */
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 8b47e0168fc3..c156674733c9 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -34,7 +34,8 @@ enum smc_lgr_role { /* possible roles of a link group */
enum smc_link_state { /* possible states of a link */
SMC_LNK_INACTIVE, /* link is inactive */
SMC_LNK_ACTIVATING, /* link is being activated */
- SMC_LNK_ACTIVE /* link is active */
+ SMC_LNK_ACTIVE, /* link is active */
+ SMC_LNK_DELETING, /* link is being deleted */
};
#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
@@ -84,14 +85,15 @@ struct smc_link {
wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */
enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */
- union ib_gid gid; /* gid matching used vlan id */
+ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/
+ u8 sgid_index; /* gid index for vlan id */
u32 peer_qpn; /* QP number of peer */
enum ib_mtu path_mtu; /* used mtu */
enum ib_mtu peer_mtu; /* mtu size of peer */
u32 psn_initial; /* QP tx initial packet seqno */
u32 peer_psn; /* QP rx initial packet seqno */
u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */
- u8 peer_gid[sizeof(union ib_gid)]; /* gid of peer*/
+ u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/
u8 link_id; /* unique # within link group */
enum smc_link_state state; /* state of link */
@@ -192,8 +194,7 @@ struct smc_link_group {
struct smc_rtoken rtokens[SMC_RMBS_PER_LGR_MAX]
[SMC_LINKS_PER_LGR_MAX];
/* remote addr/key pairs */
- unsigned long rtokens_used_mask[BITS_TO_LONGS
- (SMC_RMBS_PER_LGR_MAX)];
+ DECLARE_BITMAP(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);
/* used rtoken elements */
};
struct { /* SMC-D */
@@ -265,5 +266,11 @@ int smc_conn_create(struct smc_sock *smc, bool is_smcd, int srv_first_contact,
struct smc_clc_msg_local *lcl, struct smcd_dev *smcd,
u64 peer_gid);
void smcd_conn_free(struct smc_connection *conn);
+void smc_lgr_schedule_free_work_fast(struct smc_link_group *lgr);
void smc_core_exit(void);
+
+static inline struct smc_link_group *smc_get_lgr(struct smc_link *link)
+{
+ return container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
+}
#endif
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 6d83eef1b743..dbf64a93d68a 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -79,6 +79,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
struct nlattr *bc)
{
struct smc_sock *smc = smc_sk(sk);
+ struct smc_diag_fallback fallback;
struct user_namespace *user_ns;
struct smc_diag_msg *r;
struct nlmsghdr *nlh;
@@ -91,11 +92,21 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
r = nlmsg_data(nlh);
smc_diag_msg_common_fill(r, sk);
r->diag_state = sk->sk_state;
- r->diag_fallback = smc->use_fallback;
+ if (smc->use_fallback)
+ r->diag_mode = SMC_DIAG_MODE_FALLBACK_TCP;
+ else if (smc->conn.lgr && smc->conn.lgr->is_smcd)
+ r->diag_mode = SMC_DIAG_MODE_SMCD;
+ else
+ r->diag_mode = SMC_DIAG_MODE_SMCR;
user_ns = sk_user_ns(NETLINK_CB(cb->skb).sk);
if (smc_diag_msg_attrs_fill(sk, skb, r, user_ns))
goto errout;
+ fallback.reason = smc->fallback_rsn;
+ fallback.peer_diagnosis = smc->peer_diagnosis;
+ if (nla_put(skb, SMC_DIAG_FALLBACK, sizeof(fallback), &fallback) < 0)
+ goto errout;
+
if ((req->diag_ext & (1 << (SMC_DIAG_CONNINFO - 1))) &&
smc->conn.alert_token_local) {
struct smc_connection *conn = &smc->conn;
@@ -149,7 +160,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
smc->conn.lgr->lnk[0].smcibdev->ibdev->name,
sizeof(smc->conn.lgr->lnk[0].smcibdev->ibdev->name));
smc_gid_be16_convert(linfo.lnk[0].gid,
- smc->conn.lgr->lnk[0].gid.raw);
+ smc->conn.lgr->lnk[0].gid);
smc_gid_be16_convert(linfo.lnk[0].peer_gid,
smc->conn.lgr->lnk[0].peer_gid);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 36de2fd76170..2cc64bc8ae20 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -68,7 +68,7 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
- rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0);
+ rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
sizeof(lnk->peer_mac));
@@ -112,8 +112,7 @@ int smc_ib_modify_qp_reset(struct smc_link *lnk)
int smc_ib_ready_link(struct smc_link *lnk)
{
- struct smc_link_group *lgr =
- container_of(lnk, struct smc_link_group, lnk[0]);
+ struct smc_link_group *lgr = smc_get_lgr(lnk);
int rc = 0;
rc = smc_ib_modify_qp_init(lnk);
@@ -143,13 +142,13 @@ out:
return rc;
}
-static int smc_ib_fill_gid_and_mac(struct smc_ib_device *smcibdev, u8 ibport)
+static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
{
struct ib_gid_attr gattr;
+ union ib_gid gid;
int rc;
- rc = ib_query_gid(smcibdev->ibdev, ibport, 0,
- &smcibdev->gid[ibport - 1], &gattr);
+ rc = ib_query_gid(smcibdev->ibdev, ibport, 0, &gid, &gattr);
if (rc || !gattr.ndev)
return -ENODEV;
@@ -176,6 +175,37 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
}
+/* determine the gid for an ib-device port and vlan id */
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index)
+{
+ struct ib_gid_attr gattr;
+ union ib_gid _gid;
+ int i;
+
+ for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
+ memset(&_gid, 0, SMC_GID_SIZE);
+ memset(&gattr, 0, sizeof(gattr));
+ if (ib_query_gid(smcibdev->ibdev, ibport, i, &_gid, &gattr))
+ continue;
+ if (!gattr.ndev)
+ continue;
+ if (((!vlan_id && !is_vlan_dev(gattr.ndev)) ||
+ (vlan_id && is_vlan_dev(gattr.ndev) &&
+ vlan_dev_vlan_id(gattr.ndev) == vlan_id)) &&
+ gattr.gid_type == IB_GID_TYPE_IB) {
+ if (gid)
+ memcpy(gid, &_gid, SMC_GID_SIZE);
+ if (sgid_index)
+ *sgid_index = i;
+ dev_put(gattr.ndev);
+ return 0;
+ }
+ dev_put(gattr.ndev);
+ }
+ return -ENODEV;
+}
+
static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
{
int rc;
@@ -187,7 +217,7 @@ static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
if (rc)
goto out;
/* the SMC protocol requires specification of the RoCE MAC address */
- rc = smc_ib_fill_gid_and_mac(smcibdev, ibport);
+ rc = smc_ib_fill_mac(smcibdev, ibport);
if (rc)
goto out;
if (!strncmp(local_systemid, SMC_LOCAL_SYSTEMID_RESET,
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 7c1223c91229..bac7fd65a4c0 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -40,7 +40,6 @@ struct smc_ib_device { /* ib-device infos for smc */
struct tasklet_struct recv_tasklet; /* called by recv cq handler */
char mac[SMC_MAX_PORTS][ETH_ALEN];
/* mac address per port*/
- union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
u8 pnetid[SMC_MAX_PORTS][SMC_MAX_PNETID_LEN];
/* pnetid per port */
u8 initialized : 1; /* ib dev CQ, evthdl done */
@@ -77,4 +76,6 @@ void smc_ib_sync_sg_for_cpu(struct smc_ib_device *smcibdev,
void smc_ib_sync_sg_for_device(struct smc_ib_device *smcibdev,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
+int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
+ unsigned short vlan_id, u8 gid[], u8 *sgid_index);
#endif
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 5800a6b43d83..9c916c709ca7 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -182,12 +182,10 @@ static int smc_llc_add_pending_send(struct smc_link *link,
}
/* high-level API to send LLC confirm link */
-int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
- union ib_gid *gid,
+int smc_llc_send_confirm_link(struct smc_link *link,
enum smc_llc_reqresp reqresp)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
struct smc_llc_msg_confirm_link *confllc;
struct smc_wr_tx_pend_priv *pend;
struct smc_wr_buf *wr_buf;
@@ -203,8 +201,9 @@ int smc_llc_send_confirm_link(struct smc_link *link, u8 mac[],
confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
if (reqresp == SMC_LLC_RESP)
confllc->hd.flags |= SMC_LLC_FLAG_RESP;
- memcpy(confllc->sender_mac, mac, ETH_ALEN);
- memcpy(confllc->sender_gid, gid, SMC_GID_SIZE);
+ memcpy(confllc->sender_mac, link->smcibdev->mac[link->ibport - 1],
+ ETH_ALEN);
+ memcpy(confllc->sender_gid, link->gid, SMC_GID_SIZE);
hton24(confllc->sender_qp_num, link->roce_qp->qp_num);
confllc->link_num = link->link_id;
memcpy(confllc->link_uid, lgr->id, SMC_LGR_ID_SIZE);
@@ -241,8 +240,7 @@ static int smc_llc_send_confirm_rkey(struct smc_link *link,
/* prepare an add link message */
static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
- struct smc_link *link, u8 mac[],
- union ib_gid *gid,
+ struct smc_link *link, u8 mac[], u8 gid[],
enum smc_llc_reqresp reqresp)
{
memset(addllc, 0, sizeof(*addllc));
@@ -259,8 +257,7 @@ static void smc_llc_prep_add_link(struct smc_llc_msg_add_link *addllc,
}
/* send ADD LINK request or response */
-int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
- union ib_gid *gid,
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
enum smc_llc_reqresp reqresp)
{
struct smc_llc_msg_add_link *addllc;
@@ -281,7 +278,7 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[],
/* prepare a delete link message */
static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
struct smc_link *link,
- enum smc_llc_reqresp reqresp)
+ enum smc_llc_reqresp reqresp, bool orderly)
{
memset(delllc, 0, sizeof(*delllc));
delllc->hd.common.type = SMC_LLC_DELETE_LINK;
@@ -290,13 +287,14 @@ static void smc_llc_prep_delete_link(struct smc_llc_msg_del_link *delllc,
delllc->hd.flags |= SMC_LLC_FLAG_RESP;
/* DEL_LINK_ALL because only 1 link supported */
delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
- delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
+ if (orderly)
+ delllc->hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
delllc->link_num = link->link_id;
}
/* send DELETE LINK request or response */
int smc_llc_send_delete_link(struct smc_link *link,
- enum smc_llc_reqresp reqresp)
+ enum smc_llc_reqresp reqresp, bool orderly)
{
struct smc_llc_msg_del_link *delllc;
struct smc_wr_tx_pend_priv *pend;
@@ -307,7 +305,7 @@ int smc_llc_send_delete_link(struct smc_link *link,
if (rc)
return rc;
delllc = (struct smc_llc_msg_del_link *)wr_buf;
- smc_llc_prep_delete_link(delllc, link, reqresp);
+ smc_llc_prep_delete_link(delllc, link, reqresp, orderly);
/* send llc message */
rc = smc_wr_tx_send(link, pend);
return rc;
@@ -381,11 +379,9 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf, int llclen)
static void smc_llc_rx_confirm_link(struct smc_link *link,
struct smc_llc_msg_confirm_link *llc)
{
- struct smc_link_group *lgr;
+ struct smc_link_group *lgr = smc_get_lgr(link);
int conf_rc;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
-
/* RMBE eyecatchers are not supported */
if (llc->hd.flags & SMC_LLC_FLAG_NO_RMBE_EYEC)
conf_rc = 0;
@@ -411,8 +407,7 @@ static void smc_llc_rx_confirm_link(struct smc_link *link,
static void smc_llc_rx_add_link(struct smc_link *link,
struct smc_llc_msg_add_link *llc)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
if (link->state == SMC_LNK_ACTIVATING)
@@ -426,14 +421,12 @@ static void smc_llc_rx_add_link(struct smc_link *link,
if (lgr->role == SMC_SERV) {
smc_llc_prep_add_link(llc, link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_REQ);
+ link->gid, SMC_LLC_REQ);
} else {
smc_llc_prep_add_link(llc, link,
link->smcibdev->mac[link->ibport - 1],
- &link->smcibdev->gid[link->ibport - 1],
- SMC_LLC_RESP);
+ link->gid, SMC_LLC_RESP);
}
smc_llc_send_message(link, llc, sizeof(*llc));
}
@@ -442,22 +435,23 @@ static void smc_llc_rx_add_link(struct smc_link *link,
static void smc_llc_rx_delete_link(struct smc_link *link,
struct smc_llc_msg_del_link *llc)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
if (lgr->role == SMC_SERV)
- smc_lgr_terminate(lgr);
+ smc_lgr_schedule_free_work_fast(lgr);
} else {
+ smc_lgr_forget(lgr);
+ smc_llc_link_deleting(link);
if (lgr->role == SMC_SERV) {
- smc_lgr_forget(lgr);
- smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ);
- smc_llc_send_message(link, llc, sizeof(*llc));
+ /* client asks to delete this link, send request */
+ smc_llc_prep_delete_link(llc, link, SMC_LLC_REQ, true);
} else {
- smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP);
- smc_llc_send_message(link, llc, sizeof(*llc));
- smc_lgr_terminate(lgr);
+ /* server requests to delete this link, send response */
+ smc_llc_prep_delete_link(llc, link, SMC_LLC_RESP, true);
}
+ smc_llc_send_message(link, llc, sizeof(*llc));
+ smc_lgr_schedule_free_work_fast(lgr);
}
}
@@ -476,17 +470,14 @@ static void smc_llc_rx_test_link(struct smc_link *link,
static void smc_llc_rx_confirm_rkey(struct smc_link *link,
struct smc_llc_msg_confirm_rkey *llc)
{
- struct smc_link_group *lgr;
int rc;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
-
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
link->llc_confirm_rkey_rc = llc->hd.flags &
SMC_LLC_FLAG_RKEY_NEG;
complete(&link->llc_confirm_rkey);
} else {
- rc = smc_rtoken_add(lgr,
+ rc = smc_rtoken_add(smc_get_lgr(link),
llc->rtoken[0].rmb_vaddr,
llc->rtoken[0].rmb_key);
@@ -514,18 +505,15 @@ static void smc_llc_rx_confirm_rkey_cont(struct smc_link *link,
static void smc_llc_rx_delete_rkey(struct smc_link *link,
struct smc_llc_msg_delete_rkey *llc)
{
- struct smc_link_group *lgr;
u8 err_mask = 0;
int i, max;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
-
if (llc->hd.flags & SMC_LLC_FLAG_RESP) {
/* unused as long as we don't send this type of msg */
} else {
max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
for (i = 0; i < max; i++) {
- if (smc_rtoken_delete(lgr, llc->rkey[i]))
+ if (smc_rtoken_delete(smc_get_lgr(link), llc->rkey[i]))
err_mask |= 1 << (SMC_LLC_DEL_RKEY_MAX - 1 - i);
}
@@ -583,12 +571,10 @@ static void smc_llc_testlink_work(struct work_struct *work)
struct smc_link *link = container_of(to_delayed_work(work),
struct smc_link, llc_testlink_wrk);
unsigned long next_interval;
- struct smc_link_group *lgr;
unsigned long expire_time;
u8 user_data[16] = { 0 };
int rc;
- lgr = container_of(link, struct smc_link_group, lnk[SMC_SINGLE_LINK]);
if (link->state != SMC_LNK_ACTIVE)
return; /* don't reschedule worker */
expire_time = link->wr_rx_tstamp + link->llc_testlink_time;
@@ -602,7 +588,7 @@ static void smc_llc_testlink_work(struct work_struct *work)
rc = wait_for_completion_interruptible_timeout(&link->llc_testlink_resp,
SMC_LLC_WAIT_TIME);
if (rc <= 0) {
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
return;
}
next_interval = link->llc_testlink_time;
@@ -613,8 +599,7 @@ out:
int smc_llc_link_init(struct smc_link *link)
{
- struct smc_link_group *lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
+ struct smc_link_group *lgr = smc_get_lgr(link);
link->llc_wq = alloc_ordered_workqueue("llc_wq-%x:%x)", WQ_MEM_RECLAIM,
*((u32 *)lgr->id),
link->link_id);
@@ -640,6 +625,11 @@ void smc_llc_link_active(struct smc_link *link, int testlink_time)
}
}
+void smc_llc_link_deleting(struct smc_link *link)
+{
+ link->state = SMC_LNK_DELETING;
+}
+
/* called in tasklet context */
void smc_llc_link_inactive(struct smc_link *link)
{
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index 65c8645e96a1..9e2ff088e301 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -36,14 +36,15 @@ enum smc_llc_msg_type {
};
/* transmit */
-int smc_llc_send_confirm_link(struct smc_link *lnk, u8 mac[], union ib_gid *gid,
+int smc_llc_send_confirm_link(struct smc_link *lnk,
enum smc_llc_reqresp reqresp);
-int smc_llc_send_add_link(struct smc_link *link, u8 mac[], union ib_gid *gid,
+int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
enum smc_llc_reqresp reqresp);
int smc_llc_send_delete_link(struct smc_link *link,
- enum smc_llc_reqresp reqresp);
+ enum smc_llc_reqresp reqresp, bool orderly);
int smc_llc_link_init(struct smc_link *link);
void smc_llc_link_active(struct smc_link *link, int testlink_time);
+void smc_llc_link_deleting(struct smc_link *link);
void smc_llc_link_inactive(struct smc_link *link);
void smc_llc_link_clear(struct smc_link *link);
int smc_llc_do_confirm_rkey(struct smc_link *link,
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 1b6c066d3495..01c6ce042a1c 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -535,11 +535,13 @@ static struct net_device *pnet_find_base_ndev(struct net_device *ndev)
}
/* Determine the corresponding IB device port based on the hardware PNETID.
- * Searching stops at the first matching active IB device port.
+ * Searching stops at the first matching active IB device port with vlan_id
+ * configured.
*/
static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
struct smc_ib_device **smcibdev,
- u8 *ibport)
+ u8 *ibport, unsigned short vlan_id,
+ u8 gid[])
{
u8 ndev_pnetid[SMC_MAX_PNETID_LEN];
struct smc_ib_device *ibdev;
@@ -553,15 +555,20 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev,
spin_lock(&smc_ib_devices.lock);
list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
for (i = 1; i <= SMC_MAX_PORTS; i++) {
+ if (!rdma_is_port_valid(ibdev->ibdev, i))
+ continue;
if (!memcmp(ibdev->pnetid[i - 1], ndev_pnetid,
SMC_MAX_PNETID_LEN) &&
- smc_ib_port_active(ibdev, i)) {
+ smc_ib_port_active(ibdev, i) &&
+ !smc_ib_determine_gid(ibdev, i, vlan_id, gid,
+ NULL)) {
*smcibdev = ibdev;
*ibport = i;
- break;
+ goto out;
}
}
}
+out:
spin_unlock(&smc_ib_devices.lock);
}
@@ -589,7 +596,8 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
/* Lookup of coupled ib_device via SMC pnet table */
static void smc_pnet_find_roce_by_table(struct net_device *netdev,
struct smc_ib_device **smcibdev,
- u8 *ibport)
+ u8 *ibport, unsigned short vlan_id,
+ u8 gid[])
{
struct smc_pnetentry *pnetelem;
@@ -597,7 +605,10 @@ static void smc_pnet_find_roce_by_table(struct net_device *netdev,
list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) {
if (netdev == pnetelem->ndev) {
if (smc_ib_port_active(pnetelem->smcibdev,
- pnetelem->ib_port)) {
+ pnetelem->ib_port) &&
+ !smc_ib_determine_gid(pnetelem->smcibdev,
+ pnetelem->ib_port, vlan_id,
+ gid, NULL)) {
*smcibdev = pnetelem->smcibdev;
*ibport = pnetelem->ib_port;
}
@@ -612,7 +623,8 @@ static void smc_pnet_find_roce_by_table(struct net_device *netdev,
* ethernet interface.
*/
void smc_pnet_find_roce_resource(struct sock *sk,
- struct smc_ib_device **smcibdev, u8 *ibport)
+ struct smc_ib_device **smcibdev, u8 *ibport,
+ unsigned short vlan_id, u8 gid[])
{
struct dst_entry *dst = sk_dst_get(sk);
@@ -625,12 +637,12 @@ void smc_pnet_find_roce_resource(struct sock *sk,
goto out_rel;
/* if possible, lookup via hardware-defined pnetid */
- smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport);
+ smc_pnet_find_roce_by_pnetid(dst->dev, smcibdev, ibport, vlan_id, gid);
if (*smcibdev)
goto out_rel;
/* lookup via SMC PNET table */
- smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport);
+ smc_pnet_find_roce_by_table(dst->dev, smcibdev, ibport, vlan_id, gid);
out_rel:
dst_release(dst);
diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h
index 1e94fd4df7bc..8ff777636e32 100644
--- a/net/smc/smc_pnet.h
+++ b/net/smc/smc_pnet.h
@@ -33,7 +33,8 @@ int smc_pnet_init(void) __init;
void smc_pnet_exit(void);
int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev);
void smc_pnet_find_roce_resource(struct sock *sk,
- struct smc_ib_device **smcibdev, u8 *ibport);
+ struct smc_ib_device **smcibdev, u8 *ibport,
+ unsigned short vlan_id, u8 gid[]);
void smc_pnet_find_ism_resource(struct sock *sk, struct smcd_dev **smcismdev);
#endif
diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c
index b329803c8339..bbcf0fe4ae10 100644
--- a/net/smc/smc_rx.c
+++ b/net/smc/smc_rx.c
@@ -82,8 +82,7 @@ static int smc_rx_update_consumer(struct smc_sock *smc,
}
}
- smc_curs_write(&conn->local_tx_ctrl.cons, smc_curs_read(&cons, conn),
- conn);
+ smc_curs_copy(&conn->local_tx_ctrl.cons, &cons, conn);
/* send consumer cursor update if required */
/* similar to advertising new TCP rcv_wnd if required */
@@ -97,8 +96,7 @@ static void smc_rx_update_cons(struct smc_sock *smc, size_t len)
struct smc_connection *conn = &smc->conn;
union smc_host_cursor cons;
- smc_curs_write(&cons, smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
smc_rx_update_consumer(smc, cons, len);
}
@@ -157,10 +155,8 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
struct splice_pipe_desc spd;
struct partial_page partial;
struct smc_spd_priv *priv;
- struct page *page;
int bytes;
- page = virt_to_page(smc->conn.rmb_desc->cpu_addr);
priv = kzalloc(sizeof(*priv), GFP_KERNEL);
if (!priv)
return -ENOMEM;
@@ -172,7 +168,7 @@ static int smc_rx_splice(struct pipe_inode_info *pipe, char *src, size_t len,
spd.nr_pages_max = 1;
spd.nr_pages = 1;
- spd.pages = &page;
+ spd.pages = &smc->conn.rmb_desc->pages;
spd.partial = &partial;
spd.ops = &smc_pipe_ops;
spd.spd_release = smc_rx_spd_release;
@@ -245,10 +241,7 @@ static int smc_rx_recv_urg(struct smc_sock *smc, struct msghdr *msg, int len,
if (!(flags & MSG_TRUNC))
rc = memcpy_to_msg(msg, &conn->urg_rx_byte, 1);
len = 1;
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons,
- conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
if (smc_curs_diff(conn->rmb_desc->len, &cons,
&conn->urg_curs) > 1)
conn->urg_rx_skip_pend = true;
@@ -370,9 +363,7 @@ copy:
continue;
}
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
/* subsequent splice() calls pick up where previous left */
if (splbytes)
smc_curs_add(conn->rmb_desc->len, &cons, splbytes);
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 142bcb134dd6..2f5e324e54b9 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -181,9 +181,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
copylen = min_t(size_t, send_remaining, writespace);
/* determine start of sndbuf */
sndbuf_base = conn->sndbuf_desc->cpu_addr;
- smc_curs_write(&prep,
- smc_curs_read(&conn->tx_curs_prep, conn),
- conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
tx_cnt_prep = prep.count;
/* determine chunks where to write into sndbuf */
/* either unwrapped case, or 1st chunk of wrapped case */
@@ -214,9 +212,7 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
smc_sndbuf_sync_sg_for_device(conn);
/* update cursors */
smc_curs_add(conn->sndbuf_desc->len, &prep, copylen);
- smc_curs_write(&conn->tx_curs_prep,
- smc_curs_read(&prep, conn),
- conn);
+ smc_curs_copy(&conn->tx_curs_prep, &prep, conn);
/* increased in send tasklet smc_cdc_tx_handler() */
smp_mb__before_atomic();
atomic_sub(copylen, &conn->sndbuf_space);
@@ -417,8 +413,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
int rc;
/* source: sndbuf */
- smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
- smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
/* cf. wmem_alloc - (snd_max - snd_una) */
to_send = smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
if (to_send <= 0)
@@ -429,12 +425,8 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
rmbespace = atomic_read(&conn->peer_rmbe_space);
if (rmbespace <= 0)
return 0;
- smc_curs_write(&prod,
- smc_curs_read(&conn->local_tx_ctrl.prod, conn),
- conn);
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_rx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&prod, &conn->local_tx_ctrl.prod, conn);
+ smc_curs_copy(&cons, &conn->local_rx_ctrl.cons, conn);
/* if usable snd_wnd closes ask peer to advertise once it opens again */
pflags = &conn->local_tx_ctrl.prod_flags;
@@ -481,14 +473,9 @@ static int smc_tx_rdma_writes(struct smc_connection *conn)
pflags->urg_data_present = 1;
smc_tx_advance_cursors(conn, &prod, &sent, len);
/* update connection's cursors with advanced local cursors */
- smc_curs_write(&conn->local_tx_ctrl.prod,
- smc_curs_read(&prod, conn),
- conn);
+ smc_curs_copy(&conn->local_tx_ctrl.prod, &prod, conn);
/* dst: peer RMBE */
- smc_curs_write(&conn->tx_curs_sent,
- smc_curs_read(&sent, conn),
- conn);
- /* src: local sndbuf */
+ smc_curs_copy(&conn->tx_curs_sent, &sent, conn);/* src: local sndbuf */
return 0;
}
@@ -606,17 +593,11 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
int sender_free = conn->rmb_desc->len;
int to_confirm;
- smc_curs_write(&cons,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
- smc_curs_write(&cfed,
- smc_curs_read(&conn->rx_curs_confirmed, conn),
- conn);
+ smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
+ smc_curs_copy(&cfed, &conn->rx_curs_confirmed, conn);
to_confirm = smc_curs_diff(conn->rmb_desc->len, &cfed, &cons);
if (to_confirm > conn->rmbe_update_limit) {
- smc_curs_write(&prod,
- smc_curs_read(&conn->local_rx_ctrl.prod, conn),
- conn);
+ smc_curs_copy(&prod, &conn->local_rx_ctrl.prod, conn);
sender_free = conn->rmb_desc->len -
smc_curs_diff(conn->rmb_desc->len, &prod, &cfed);
}
@@ -632,9 +613,8 @@ void smc_tx_consumer_update(struct smc_connection *conn, bool force)
SMC_TX_WORK_DELAY);
return;
}
- smc_curs_write(&conn->rx_curs_confirmed,
- smc_curs_read(&conn->local_tx_ctrl.cons, conn),
- conn);
+ smc_curs_copy(&conn->rx_curs_confirmed,
+ &conn->local_tx_ctrl.cons, conn);
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
}
if (conn->local_rx_ctrl.prod_flags.write_blocked &&
diff --git a/net/smc/smc_tx.h b/net/smc/smc_tx.h
index b22bdc5694c4..07e6ad76224a 100644
--- a/net/smc/smc_tx.h
+++ b/net/smc/smc_tx.h
@@ -22,8 +22,8 @@ static inline int smc_tx_prepared_sends(struct smc_connection *conn)
{
union smc_host_cursor sent, prep;
- smc_curs_write(&sent, smc_curs_read(&conn->tx_curs_sent, conn), conn);
- smc_curs_write(&prep, smc_curs_read(&conn->tx_curs_prep, conn), conn);
+ smc_curs_copy(&sent, &conn->tx_curs_sent, conn);
+ smc_curs_copy(&prep, &conn->tx_curs_prep, conn);
return smc_curs_diff(conn->sndbuf_desc->len, &sent, &prep);
}
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index dbd2605d1962..f856b8402b3f 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -92,8 +92,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
return;
if (wc->status) {
- struct smc_link_group *lgr;
-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
/* clear full struct smc_wr_tx_pend including .priv */
memset(&link->wr_tx_pends[i], 0,
@@ -103,9 +101,7 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
clear_bit(i, link->wr_tx_mask);
}
/* terminate connections of this link group abnormally */
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
}
if (pnd_snd.handler)
pnd_snd.handler(&pnd_snd.priv, link, wc->status);
@@ -186,18 +182,14 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
if (rc)
return rc;
} else {
- struct smc_link_group *lgr;
-
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
rc = wait_event_timeout(
link->wr_tx_wait,
- list_empty(&lgr->list) || /* lgr terminated */
+ link->state == SMC_LNK_INACTIVE ||
(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
SMC_WR_TX_WAIT_FREE_SLOT_TIME);
if (!rc) {
/* timeout - terminate connections */
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
return -EPIPE;
}
if (idx == link->wr_tx_cnt)
@@ -250,12 +242,8 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
rc = ib_post_send(link->roce_qp, &link->wr_tx_ibs[pend->idx],
&failed_wr);
if (rc) {
- struct smc_link_group *lgr =
- container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
-
smc_wr_tx_put_slot(link, priv);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
}
return rc;
}
@@ -283,11 +271,7 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
SMC_WR_REG_MR_WAIT_TIME);
if (!rc) {
/* timeout - terminate connections */
- struct smc_link_group *lgr;
-
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
return -EPIPE;
}
if (rc == -ERESTARTSYS)
@@ -380,8 +364,6 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
smc_wr_rx_demultiplex(&wc[i]);
smc_wr_rx_post(link); /* refill WR RX */
} else {
- struct smc_link_group *lgr;
-
/* handle status errors */
switch (wc[i].status) {
case IB_WC_RETRY_EXC_ERR:
@@ -390,9 +372,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num)
/* terminate connections of this link group
* abnormally
*/
- lgr = container_of(link, struct smc_link_group,
- lnk[SMC_SINGLE_LINK]);
- smc_lgr_terminate(lgr);
+ smc_lgr_terminate(smc_get_lgr(link));
break;
default:
smc_wr_rx_post(link); /* refill WR RX */
diff --git a/net/socket.c b/net/socket.c
index 85633622c94d..b91949168a87 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -89,6 +89,7 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/xattr.h>
+#include <linux/nospec.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -251,7 +252,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
wq->flags = 0;
- RCU_INIT_POINTER(ei->socket.wq, wq);
+ ei->socket.wq = wq;
ei->socket.state = SS_UNCONNECTED;
ei->socket.flags = 0;
@@ -265,11 +266,9 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
static void sock_destroy_inode(struct inode *inode)
{
struct socket_alloc *ei;
- struct socket_wq *wq;
ei = container_of(inode, struct socket_alloc, vfs_inode);
- wq = rcu_dereference_protected(ei->socket.wq, 1);
- kfree_rcu(wq, rcu);
+ kfree_rcu(ei->socket.wq, rcu);
kmem_cache_free(sock_inode_cachep, ei);
}
@@ -603,7 +602,7 @@ static void __sock_release(struct socket *sock, struct inode *inode)
module_put(owner);
}
- if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
+ if (sock->wq->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
if (!sock->file) {
@@ -1130,12 +1129,21 @@ EXPORT_SYMBOL(sock_create_lite);
static __poll_t sock_poll(struct file *file, poll_table *wait)
{
struct socket *sock = file->private_data;
- __poll_t events = poll_requested_events(wait);
+ __poll_t events = poll_requested_events(wait), flag = 0;
- sock_poll_busy_loop(sock, events);
if (!sock->ops->poll)
return 0;
- return sock->ops->poll(file, sock, wait) | sock_poll_busy_flag(sock);
+
+ if (sk_can_busy_loop(sock->sk)) {
+ /* poll once if requested by the syscall */
+ if (events & POLL_BUSY_LOOP)
+ sk_busy_loop(sock->sk, 1);
+
+ /* if this socket can poll_ll, tell the system call */
+ flag = POLL_BUSY_LOOP;
+ }
+
+ return sock->ops->poll(file, sock, wait) | flag;
}
static int sock_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1172,7 +1180,7 @@ static int sock_fasync(int fd, struct file *filp, int on)
return -EINVAL;
lock_sock(sk);
- wq = rcu_dereference_protected(sock->wq, lockdep_sock_is_held(sk));
+ wq = sock->wq;
fasync_helper(fd, filp, on, &wq->fasync_list);
if (!wq->fasync_list)
@@ -2522,6 +2530,7 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
if (call < 1 || call > SYS_SENDMMSG)
return -EINVAL;
+ call = array_index_nospec(call, SYS_SENDMMSG + 1);
len = nargs[call];
if (len > sizeof(a))
@@ -2688,7 +2697,8 @@ EXPORT_SYMBOL(sock_unregister);
bool sock_is_registered(int family)
{
- return family < NPROTO && rcu_access_pointer(net_families[family]);
+ return family < NPROTO &&
+ rcu_access_pointer(net_families[array_index_nospec(family, NPROTO)]);
}
static int __init sock_init(void)
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 3a512936eea9..da1a676860ca 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -408,8 +408,6 @@ EXPORT_SYMBOL_GPL(strp_data_ready);
static void do_strp_work(struct strparser *strp)
{
- read_descriptor_t rd_desc;
-
/* We need the read lock to synchronize with strp_data_ready. We
* need the socket lock for calling strp_read_sock.
*/
@@ -421,8 +419,6 @@ static void do_strp_work(struct strparser *strp)
if (strp->paused)
goto out;
- rd_desc.arg.data = strp;
-
if (strp_read_sock(strp) == -ENOMEM)
queue_work(strp_wq, &strp->work);
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index be8f103d22fd..0fc397fae42b 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -517,7 +517,7 @@ gss_alloc_msg(struct gss_auth *gss_auth,
err = gss_encode_v1_msg(gss_msg, service_name, gss_auth->target_name);
if (err)
goto err_put_pipe_version;
- };
+ }
kref_get(&gss_auth->kref);
return gss_msg;
err_put_pipe_version:
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index f3711176be45..9ee6cfea56dd 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -512,7 +512,7 @@ int tipc_bcast_init(struct net *net)
struct tipc_bc_base *bb = NULL;
struct tipc_link *l = NULL;
- bb = kzalloc(sizeof(*bb), GFP_ATOMIC);
+ bb = kzalloc(sizeof(*bb), GFP_KERNEL);
if (!bb)
goto enomem;
tn->bcbase = bb;
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index fd6d8f18955c..418f03d0be90 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -395,6 +395,7 @@ int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
tipc_net_init(net, node_id, 0);
}
if (!tipc_own_id(net)) {
+ dev_put(dev);
pr_warn("Failed to obtain node identity\n");
return -EINVAL;
}
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index 9f666e0650e2..2830709957bd 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -133,6 +133,8 @@ static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
}
/* tipc_disc_addr_trial(): - handle an address uniqueness trial from peer
+ * Returns true if message should be dropped by caller, i.e., if it is a
+ * trial message or we are inside trial period. Otherwise false.
*/
static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
struct tipc_media_addr *maddr,
@@ -168,8 +170,9 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d,
msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
}
+ /* Accept regular link requests/responses only after trial period */
if (mtyp != DSC_TRIAL_MSG)
- return false;
+ return trial;
sugg_addr = tipc_node_try_addr(net, peer_id, src);
if (sugg_addr)
@@ -284,7 +287,6 @@ static void tipc_disc_timeout(struct timer_list *t)
{
struct tipc_discoverer *d = from_timer(d, t, timer);
struct tipc_net *tn = tipc_net(d->net);
- u32 self = tipc_own_addr(d->net);
struct tipc_media_addr maddr;
struct sk_buff *skb = NULL;
struct net *net = d->net;
@@ -298,12 +300,14 @@ static void tipc_disc_timeout(struct timer_list *t)
goto exit;
}
- /* Did we just leave the address trial period ? */
- if (!self && !time_before(jiffies, tn->addr_trial_end)) {
- self = tn->trial_addr;
- tipc_net_finalize(net, self);
- msg_set_prevnode(buf_msg(d->skb), self);
+ /* Trial period over ? */
+ if (!time_before(jiffies, tn->addr_trial_end)) {
+ /* Did we just leave it ? */
+ if (!tipc_own_addr(net))
+ tipc_net_finalize(net, tn->trial_addr);
+
msg_set_type(buf_msg(d->skb), DSC_REQ_MSG);
+ msg_set_prevnode(buf_msg(d->skb), tipc_own_addr(net));
}
/* Adjust timeout interval according to discovery phase */
diff --git a/net/tipc/group.c b/net/tipc/group.c
index cbe39e8db39c..e82f13cb2dc5 100644
--- a/net/tipc/group.c
+++ b/net/tipc/group.c
@@ -159,11 +159,6 @@ u32 tipc_group_exclude(struct tipc_group *grp)
return 0;
}
-int tipc_group_size(struct tipc_group *grp)
-{
- return grp->member_cnt;
-}
-
struct tipc_group *tipc_group_create(struct net *net, u32 portid,
struct tipc_group_req *mreq,
bool *group_is_open)
@@ -232,8 +227,8 @@ void tipc_group_delete(struct net *net, struct tipc_group *grp)
kfree(grp);
}
-struct tipc_member *tipc_group_find_member(struct tipc_group *grp,
- u32 node, u32 port)
+static struct tipc_member *tipc_group_find_member(struct tipc_group *grp,
+ u32 node, u32 port)
{
struct rb_node *n = grp->members.rb_node;
u64 nkey, key = (u64)node << 32 | port;
diff --git a/net/tipc/link.c b/net/tipc/link.c
index df763be38541..b1f0bee54eac 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -297,11 +297,6 @@ static bool link_is_bc_rcvlink(struct tipc_link *l)
return ((l->bc_rcvlink == l) && !link_is_bc_sndlink(l));
}
-int tipc_link_is_active(struct tipc_link *l)
-{
- return l->active;
-}
-
void tipc_link_set_active(struct tipc_link *l, bool active)
{
l->active = active;
@@ -378,7 +373,7 @@ int tipc_link_bc_peers(struct tipc_link *l)
return l->ackers;
}
-u16 link_bc_rcv_gap(struct tipc_link *l)
+static u16 link_bc_rcv_gap(struct tipc_link *l)
{
struct sk_buff *skb = skb_peek(&l->deferdq);
u16 gap = 0;
@@ -825,7 +820,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
* Wake up a number of waiting users, as permitted by available space
* in the send queue
*/
-void link_prepare_wakeup(struct tipc_link *l)
+static void link_prepare_wakeup(struct tipc_link *l)
{
struct sk_buff *skb, *tmp;
int imp, i = 0;
@@ -961,7 +956,8 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
return rc;
}
-void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
+static void tipc_link_advance_backlog(struct tipc_link *l,
+ struct sk_buff_head *xmitq)
{
struct sk_buff *skb, *_skb;
struct tipc_msg *hdr;
@@ -1011,8 +1007,8 @@ static void link_retransmit_failure(struct tipc_link *l, struct sk_buff *skb)
* @to: retransmit to (inclusive) this sequence number
* xmitq: queue for accumulating the retransmitted packets
*/
-int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
- u16 from, u16 to, struct sk_buff_head *xmitq)
+static int tipc_link_retrans(struct tipc_link *l, struct tipc_link *r,
+ u16 from, u16 to, struct sk_buff_head *xmitq)
{
struct sk_buff *_skb, *skb = skb_peek(&l->transmq);
u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c
index 5453e564da82..67f69389ec17 100644
--- a/net/tipc/monitor.c
+++ b/net/tipc/monitor.c
@@ -684,7 +684,8 @@ int tipc_nl_monitor_get_threshold(struct net *net)
return tn->mon_threshold;
}
-int __tipc_nl_add_monitor_peer(struct tipc_peer *peer, struct tipc_nl_msg *msg)
+static int __tipc_nl_add_monitor_peer(struct tipc_peer *peer,
+ struct tipc_nl_msg *msg)
{
struct tipc_mon_domain *dom = peer->domain;
struct nlattr *attrs;
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index bebe88cae07b..88f027b502f6 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -735,7 +735,7 @@ int tipc_nametbl_init(struct net *net)
struct name_table *nt;
int i;
- nt = kzalloc(sizeof(*nt), GFP_ATOMIC);
+ nt = kzalloc(sizeof(*nt), GFP_KERNEL);
if (!nt)
return -ENOMEM;
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 4fbaa0464405..a7f6964c3a4b 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -121,12 +121,17 @@ int tipc_net_init(struct net *net, u8 *node_id, u32 addr)
void tipc_net_finalize(struct net *net, u32 addr)
{
- tipc_set_node_addr(net, addr);
- smp_mb();
- tipc_named_reinit(net);
- tipc_sk_reinit(net);
- tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
- TIPC_CLUSTER_SCOPE, 0, addr);
+ struct tipc_net *tn = tipc_net(net);
+
+ spin_lock_bh(&tn->node_list_lock);
+ if (!tipc_own_addr(net)) {
+ tipc_set_node_addr(net, addr);
+ tipc_named_reinit(net);
+ tipc_sk_reinit(net);
+ tipc_nametbl_publish(net, TIPC_CFG_SRV, addr, addr,
+ TIPC_CLUSTER_SCOPE, 0, addr);
+ }
+ spin_unlock_bh(&tn->node_list_lock);
}
void tipc_net_stop(struct net *net)
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 52fd80b0e728..68014f1b6976 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -370,13 +370,17 @@ static struct tipc_node *tipc_node_create(struct net *net, u32 addr,
spin_lock_bh(&tn->node_list_lock);
n = tipc_node_find(net, addr);
if (n) {
+ if (n->capabilities == capabilities)
+ goto exit;
/* Same node may come back with new capabilities */
+ write_lock_bh(&n->lock);
n->capabilities = capabilities;
for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
l = n->links[bearer_id].link;
if (l)
tipc_link_update_caps(l, capabilities);
}
+ write_unlock_bh(&n->lock);
goto exit;
}
n = kzalloc(sizeof(*n), GFP_ATOMIC);
@@ -858,6 +862,7 @@ static u32 tipc_node_suggest_addr(struct net *net, u32 addr)
}
/* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not
+ * Returns suggested address if any, otherwise 0
*/
u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
{
@@ -880,12 +885,14 @@ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr)
if (n) {
addr = n->addr;
tipc_node_put(n);
+ return addr;
}
- /* Even this node may be in trial phase */
+
+ /* Even this node may be in conflict */
if (tn->trial_addr == addr)
return tipc_node_suggest_addr(net, addr);
- return addr;
+ return 0;
}
void tipc_node_check_dest(struct net *net, u32 addr,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 3d21414ba357..c1e93c9515bc 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -411,7 +411,6 @@ static int tipc_sk_sock_err(struct socket *sock, long *timeout)
static int tipc_sk_create(struct net *net, struct socket *sock,
int protocol, int kern)
{
- struct tipc_net *tn;
const struct proto_ops *ops;
struct sock *sk;
struct tipc_sock *tsk;
@@ -446,7 +445,6 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
INIT_LIST_HEAD(&tsk->publications);
INIT_LIST_HEAD(&tsk->cong_links);
msg = &tsk->phdr;
- tn = net_generic(sock_net(sk), tipc_net_id);
/* Finish initializing socket data structures */
sock->ops = ops;
@@ -716,7 +714,7 @@ static __poll_t tipc_poll(struct file *file, struct socket *sock,
struct tipc_sock *tsk = tipc_sk(sk);
__poll_t revents = 0;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
if (sk->sk_shutdown & RCV_SHUTDOWN)
revents |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
@@ -1117,7 +1115,7 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
u32 self = tipc_own_addr(net);
u32 type, lower, upper, scope;
struct sk_buff *skb, *_skb;
- u32 portid, oport, onode;
+ u32 portid, onode;
struct sk_buff_head tmpq;
struct list_head dports;
struct tipc_msg *hdr;
@@ -1133,7 +1131,6 @@ void tipc_sk_mcast_rcv(struct net *net, struct sk_buff_head *arrvq,
user = msg_user(hdr);
mtyp = msg_type(hdr);
hlen = skb_headroom(skb) + msg_hdr_sz(hdr);
- oport = msg_origport(hdr);
onode = msg_orignode(hdr);
type = msg_nametype(hdr);
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index a7a8f8e20ff3..292742e50bfa 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -52,9 +52,12 @@ static DEFINE_SPINLOCK(tls_device_lock);
static void tls_device_free_ctx(struct tls_context *ctx)
{
- struct tls_offload_context *offload_ctx = tls_offload_ctx(ctx);
+ if (ctx->tx_conf == TLS_HW)
+ kfree(tls_offload_ctx_tx(ctx));
+
+ if (ctx->rx_conf == TLS_HW)
+ kfree(tls_offload_ctx_rx(ctx));
- kfree(offload_ctx);
kfree(ctx);
}
@@ -71,10 +74,11 @@ static void tls_device_gc_task(struct work_struct *work)
list_for_each_entry_safe(ctx, tmp, &gc_list, list) {
struct net_device *netdev = ctx->netdev;
- if (netdev) {
+ if (netdev && ctx->tx_conf == TLS_HW) {
netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
TLS_OFFLOAD_CTX_DIR_TX);
dev_put(netdev);
+ ctx->netdev = NULL;
}
list_del(&ctx->list);
@@ -82,6 +86,22 @@ static void tls_device_gc_task(struct work_struct *work)
}
}
+static void tls_device_attach(struct tls_context *ctx, struct sock *sk,
+ struct net_device *netdev)
+{
+ if (sk->sk_destruct != tls_device_sk_destruct) {
+ refcount_set(&ctx->refcount, 1);
+ dev_hold(netdev);
+ ctx->netdev = netdev;
+ spin_lock_irq(&tls_device_lock);
+ list_add_tail(&ctx->list, &tls_device_list);
+ spin_unlock_irq(&tls_device_lock);
+
+ ctx->sk_destruct = sk->sk_destruct;
+ sk->sk_destruct = tls_device_sk_destruct;
+ }
+}
+
static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
{
unsigned long flags;
@@ -125,7 +145,7 @@ static void destroy_record(struct tls_record_info *record)
kfree(record);
}
-static void delete_all_records(struct tls_offload_context *offload_ctx)
+static void delete_all_records(struct tls_offload_context_tx *offload_ctx)
{
struct tls_record_info *info, *temp;
@@ -141,14 +161,14 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_record_info *info, *temp;
- struct tls_offload_context *ctx;
+ struct tls_offload_context_tx *ctx;
u64 deleted_records = 0;
unsigned long flags;
if (!tls_ctx)
return;
- ctx = tls_offload_ctx(tls_ctx);
+ ctx = tls_offload_ctx_tx(tls_ctx);
spin_lock_irqsave(&ctx->lock, flags);
info = ctx->retransmit_hint;
@@ -179,15 +199,17 @@ static void tls_icsk_clean_acked(struct sock *sk, u32 acked_seq)
void tls_device_sk_destruct(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
- if (ctx->open_record)
- destroy_record(ctx->open_record);
+ tls_ctx->sk_destruct(sk);
- delete_all_records(ctx);
- crypto_free_aead(ctx->aead_send);
- ctx->sk_destruct(sk);
- clean_acked_data_disable(inet_csk(sk));
+ if (tls_ctx->tx_conf == TLS_HW) {
+ if (ctx->open_record)
+ destroy_record(ctx->open_record);
+ delete_all_records(ctx);
+ crypto_free_aead(ctx->aead_send);
+ clean_acked_data_disable(inet_csk(sk));
+ }
if (refcount_dec_and_test(&tls_ctx->refcount))
tls_device_queue_ctx_destruction(tls_ctx);
@@ -219,7 +241,7 @@ static void tls_append_frag(struct tls_record_info *record,
static int tls_push_record(struct sock *sk,
struct tls_context *ctx,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct tls_record_info *record,
struct page_frag *pfrag,
int flags,
@@ -264,7 +286,7 @@ static int tls_push_record(struct sock *sk,
return tls_push_sg(sk, ctx, offload_ctx->sg_tx_data, 0, flags);
}
-static int tls_create_new_record(struct tls_offload_context *offload_ctx,
+static int tls_create_new_record(struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
{
@@ -290,7 +312,7 @@ static int tls_create_new_record(struct tls_offload_context *offload_ctx,
}
static int tls_do_allocation(struct sock *sk,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct page_frag *pfrag,
size_t prepend_size)
{
@@ -324,7 +346,7 @@ static int tls_push_data(struct sock *sk,
unsigned char record_type)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST;
int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE);
struct tls_record_info *record = ctx->open_record;
@@ -477,7 +499,7 @@ out:
return rc;
}
-struct tls_record_info *tls_get_record(struct tls_offload_context *context,
+struct tls_record_info *tls_get_record(struct tls_offload_context_tx *context,
u32 seq, u64 *p_record_sn)
{
u64 record_sn = context->hint_record_sn;
@@ -520,11 +542,123 @@ static int tls_device_push_pending_record(struct sock *sk, int flags)
return tls_push_data(sk, &msg_iter, 0, flags, TLS_RECORD_TYPE_DATA);
}
+void handle_device_resync(struct sock *sk, u32 seq, u64 rcd_sn)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct net_device *netdev = tls_ctx->netdev;
+ struct tls_offload_context_rx *rx_ctx;
+ u32 is_req_pending;
+ s64 resync_req;
+ u32 req_seq;
+
+ if (tls_ctx->rx_conf != TLS_HW)
+ return;
+
+ rx_ctx = tls_offload_ctx_rx(tls_ctx);
+ resync_req = atomic64_read(&rx_ctx->resync_req);
+ req_seq = ntohl(resync_req >> 32) - ((u32)TLS_HEADER_SIZE - 1);
+ is_req_pending = resync_req;
+
+ if (unlikely(is_req_pending) && req_seq == seq &&
+ atomic64_try_cmpxchg(&rx_ctx->resync_req, &resync_req, 0))
+ netdev->tlsdev_ops->tls_dev_resync_rx(netdev, sk,
+ seq + TLS_HEADER_SIZE - 1,
+ rcd_sn);
+}
+
+static int tls_device_reencrypt(struct sock *sk, struct sk_buff *skb)
+{
+ struct strp_msg *rxm = strp_msg(skb);
+ int err = 0, offset = rxm->offset, copy, nsg;
+ struct sk_buff *skb_iter, *unused;
+ struct scatterlist sg[1];
+ char *orig_buf, *buf;
+
+ orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation);
+ if (!orig_buf)
+ return -ENOMEM;
+ buf = orig_buf;
+
+ nsg = skb_cow_data(skb, 0, &unused);
+ if (unlikely(nsg < 0)) {
+ err = nsg;
+ goto free_buf;
+ }
+
+ sg_init_table(sg, 1);
+ sg_set_buf(&sg[0], buf,
+ rxm->full_len + TLS_HEADER_SIZE +
+ TLS_CIPHER_AES_GCM_128_IV_SIZE);
+ skb_copy_bits(skb, offset, buf,
+ TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE);
+
+ /* We are interested only in the decrypted data not the auth */
+ err = decrypt_skb(sk, skb, sg);
+ if (err != -EBADMSG)
+ goto free_buf;
+ else
+ err = 0;
+
+ copy = min_t(int, skb_pagelen(skb) - offset,
+ rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+ if (skb->decrypted)
+ skb_store_bits(skb, offset, buf, copy);
+
+ offset += copy;
+ buf += copy;
+
+ skb_walk_frags(skb, skb_iter) {
+ copy = min_t(int, skb_iter->len,
+ rxm->full_len - offset + rxm->offset -
+ TLS_CIPHER_AES_GCM_128_TAG_SIZE);
+
+ if (skb_iter->decrypted)
+ skb_store_bits(skb_iter, offset, buf, copy);
+
+ offset += copy;
+ buf += copy;
+ }
+
+free_buf:
+ kfree(orig_buf);
+ return err;
+}
+
+int tls_device_decrypted(struct sock *sk, struct sk_buff *skb)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
+ int is_decrypted = skb->decrypted;
+ int is_encrypted = !is_decrypted;
+ struct sk_buff *skb_iter;
+
+ /* Skip if it is already decrypted */
+ if (ctx->sw.decrypted)
+ return 0;
+
+ /* Check if all the data is decrypted already */
+ skb_walk_frags(skb, skb_iter) {
+ is_decrypted &= skb_iter->decrypted;
+ is_encrypted &= !skb_iter->decrypted;
+ }
+
+ ctx->sw.decrypted |= is_decrypted;
+
+ /* Return immedeatly if the record is either entirely plaintext or
+ * entirely ciphertext. Otherwise handle reencrypt partially decrypted
+ * record.
+ */
+ return (is_encrypted || is_decrypted) ? 0 :
+ tls_device_reencrypt(sk, skb);
+}
+
int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
{
u16 nonce_size, tag_size, iv_size, rec_seq_size;
struct tls_record_info *start_marker_record;
- struct tls_offload_context *offload_ctx;
+ struct tls_offload_context_tx *offload_ctx;
struct tls_crypto_info *crypto_info;
struct net_device *netdev;
char *iv, *rec_seq;
@@ -546,7 +680,7 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
goto out;
}
- offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE, GFP_KERNEL);
+ offload_ctx = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_TX, GFP_KERNEL);
if (!offload_ctx) {
rc = -ENOMEM;
goto free_marker_record;
@@ -582,12 +716,11 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
ctx->tx.rec_seq_size = rec_seq_size;
- ctx->tx.rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+ ctx->tx.rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
if (!ctx->tx.rec_seq) {
rc = -ENOMEM;
goto free_iv;
}
- memcpy(ctx->tx.rec_seq, rec_seq, rec_seq_size);
rc = tls_sw_fallback_init(sk, offload_ctx, crypto_info);
if (rc)
@@ -609,7 +742,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
clean_acked_data_enable(inet_csk(sk), &tls_icsk_clean_acked);
ctx->push_pending_record = tls_device_push_pending_record;
- offload_ctx->sk_destruct = sk->sk_destruct;
/* TLS offload is greatly simplified if we don't send
* SKBs where only part of the payload needs to be encrypted.
@@ -619,8 +751,6 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
if (skb)
TCP_SKB_CB(skb)->eor = 1;
- refcount_set(&ctx->refcount, 1);
-
/* We support starting offload on multiple sockets
* concurrently, so we only need a read lock here.
* This lock must precede get_netdev_for_sock to prevent races between
@@ -655,19 +785,14 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx)
if (rc)
goto release_netdev;
- ctx->netdev = netdev;
-
- spin_lock_irq(&tls_device_lock);
- list_add_tail(&ctx->list, &tls_device_list);
- spin_unlock_irq(&tls_device_lock);
+ tls_device_attach(ctx, sk, netdev);
- sk->sk_validate_xmit_skb = tls_validate_xmit_skb;
/* following this assignment tls_is_sk_tx_device_offloaded
* will return true and the context might be accessed
* by the netdev's xmit function.
*/
- smp_store_release(&sk->sk_destruct,
- &tls_device_sk_destruct);
+ smp_store_release(&sk->sk_validate_xmit_skb, tls_validate_xmit_skb);
+ dev_put(netdev);
up_read(&device_offload_lock);
goto out;
@@ -690,6 +815,105 @@ out:
return rc;
}
+int tls_set_device_offload_rx(struct sock *sk, struct tls_context *ctx)
+{
+ struct tls_offload_context_rx *context;
+ struct net_device *netdev;
+ int rc = 0;
+
+ /* We support starting offload on multiple sockets
+ * concurrently, so we only need a read lock here.
+ * This lock must precede get_netdev_for_sock to prevent races between
+ * NETDEV_DOWN and setsockopt.
+ */
+ down_read(&device_offload_lock);
+ netdev = get_netdev_for_sock(sk);
+ if (!netdev) {
+ pr_err_ratelimited("%s: netdev not found\n", __func__);
+ rc = -EINVAL;
+ goto release_lock;
+ }
+
+ if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
+ pr_err_ratelimited("%s: netdev %s with no TLS offload\n",
+ __func__, netdev->name);
+ rc = -ENOTSUPP;
+ goto release_netdev;
+ }
+
+ /* Avoid offloading if the device is down
+ * We don't want to offload new flows after
+ * the NETDEV_DOWN event
+ */
+ if (!(netdev->flags & IFF_UP)) {
+ rc = -EINVAL;
+ goto release_netdev;
+ }
+
+ context = kzalloc(TLS_OFFLOAD_CONTEXT_SIZE_RX, GFP_KERNEL);
+ if (!context) {
+ rc = -ENOMEM;
+ goto release_netdev;
+ }
+
+ ctx->priv_ctx_rx = context;
+ rc = tls_set_sw_offload(sk, ctx, 0);
+ if (rc)
+ goto release_ctx;
+
+ rc = netdev->tlsdev_ops->tls_dev_add(netdev, sk, TLS_OFFLOAD_CTX_DIR_RX,
+ &ctx->crypto_recv,
+ tcp_sk(sk)->copied_seq);
+ if (rc) {
+ pr_err_ratelimited("%s: The netdev has refused to offload this socket\n",
+ __func__);
+ goto free_sw_resources;
+ }
+
+ tls_device_attach(ctx, sk, netdev);
+ goto release_netdev;
+
+free_sw_resources:
+ tls_sw_free_resources_rx(sk);
+release_ctx:
+ ctx->priv_ctx_rx = NULL;
+release_netdev:
+ dev_put(netdev);
+release_lock:
+ up_read(&device_offload_lock);
+ return rc;
+}
+
+void tls_device_offload_cleanup_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct net_device *netdev;
+
+ down_read(&device_offload_lock);
+ netdev = tls_ctx->netdev;
+ if (!netdev)
+ goto out;
+
+ if (!(netdev->features & NETIF_F_HW_TLS_RX)) {
+ pr_err_ratelimited("%s: device is missing NETIF_F_HW_TLS_RX cap\n",
+ __func__);
+ goto out;
+ }
+
+ netdev->tlsdev_ops->tls_dev_del(netdev, tls_ctx,
+ TLS_OFFLOAD_CTX_DIR_RX);
+
+ if (tls_ctx->tx_conf != TLS_HW) {
+ dev_put(netdev);
+ tls_ctx->netdev = NULL;
+ }
+out:
+ up_read(&device_offload_lock);
+ kfree(tls_ctx->rx.rec_seq);
+ kfree(tls_ctx->rx.iv);
+ tls_sw_release_resources_rx(sk);
+}
+
static int tls_device_down(struct net_device *netdev)
{
struct tls_context *ctx, *tmp;
@@ -710,8 +934,12 @@ static int tls_device_down(struct net_device *netdev)
spin_unlock_irqrestore(&tls_device_lock, flags);
list_for_each_entry_safe(ctx, tmp, &list, list) {
- netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
- TLS_OFFLOAD_CTX_DIR_TX);
+ if (ctx->tx_conf == TLS_HW)
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_TX);
+ if (ctx->rx_conf == TLS_HW)
+ netdev->tlsdev_ops->tls_dev_del(netdev, ctx,
+ TLS_OFFLOAD_CTX_DIR_RX);
ctx->netdev = NULL;
dev_put(netdev);
list_del_init(&ctx->list);
@@ -732,12 +960,16 @@ static int tls_dev_event(struct notifier_block *this, unsigned long event,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
- if (!(dev->features & NETIF_F_HW_TLS_TX))
+ if (!(dev->features & (NETIF_F_HW_TLS_RX | NETIF_F_HW_TLS_TX)))
return NOTIFY_DONE;
switch (event) {
case NETDEV_REGISTER:
case NETDEV_FEAT_CHANGE:
+ if ((dev->features & NETIF_F_HW_TLS_RX) &&
+ !dev->tlsdev_ops->tls_dev_resync_rx)
+ return NOTIFY_BAD;
+
if (dev->tlsdev_ops &&
dev->tlsdev_ops->tls_dev_add &&
dev->tlsdev_ops->tls_dev_del)
diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c
index 748914abdb60..e3313c45663f 100644
--- a/net/tls/tls_device_fallback.c
+++ b/net/tls/tls_device_fallback.c
@@ -214,7 +214,7 @@ static void complete_skb(struct sk_buff *nskb, struct sk_buff *skb, int headln)
static int fill_sg_in(struct scatterlist *sg_in,
struct sk_buff *skb,
- struct tls_offload_context *ctx,
+ struct tls_offload_context_tx *ctx,
u64 *rcd_sn,
s32 *sync_size,
int *resync_sgs)
@@ -299,7 +299,7 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx,
s32 sync_size, u64 rcd_sn)
{
int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int payload_len = skb->len - tcp_payload_offset;
void *buf, *iv, *aad, *dummy_buf;
struct aead_request *aead_req;
@@ -361,7 +361,7 @@ static struct sk_buff *tls_sw_fallback(struct sock *sk, struct sk_buff *skb)
{
int tcp_payload_offset = skb_transport_offset(skb) + tcp_hdrlen(skb);
struct tls_context *tls_ctx = tls_get_ctx(sk);
- struct tls_offload_context *ctx = tls_offload_ctx(tls_ctx);
+ struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx);
int payload_len = skb->len - tcp_payload_offset;
struct scatterlist *sg_in, sg_out[3];
struct sk_buff *nskb = NULL;
@@ -413,9 +413,10 @@ struct sk_buff *tls_validate_xmit_skb(struct sock *sk,
return tls_sw_fallback(sk, skb);
}
+EXPORT_SYMBOL_GPL(tls_validate_xmit_skb);
int tls_sw_fallback_init(struct sock *sk,
- struct tls_offload_context *offload_ctx,
+ struct tls_offload_context_tx *offload_ctx,
struct tls_crypto_info *crypto_info)
{
const u8 *key;
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 301f22430469..b09867c8b817 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -51,15 +51,6 @@ enum {
TLSV6,
TLS_NUM_PROTS,
};
-enum {
- TLS_BASE,
- TLS_SW,
-#ifdef CONFIG_TLS_DEVICE
- TLS_HW,
-#endif
- TLS_HW_RECORD,
- TLS_NUM_CONFIG,
-};
static struct proto *saved_tcpv6_prot;
static DEFINE_MUTEX(tcpv6_prot_mutex);
@@ -290,7 +281,10 @@ static void tls_sk_proto_close(struct sock *sk, long timeout)
}
#ifdef CONFIG_TLS_DEVICE
- if (ctx->tx_conf != TLS_HW) {
+ if (ctx->rx_conf == TLS_HW)
+ tls_device_offload_cleanup_rx(sk);
+
+ if (ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW) {
#else
{
#endif
@@ -470,8 +464,16 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval,
conf = TLS_SW;
}
} else {
- rc = tls_set_sw_offload(sk, ctx, 0);
- conf = TLS_SW;
+#ifdef CONFIG_TLS_DEVICE
+ rc = tls_set_device_offload_rx(sk, ctx);
+ conf = TLS_HW;
+ if (rc) {
+#else
+ {
+#endif
+ rc = tls_set_sw_offload(sk, ctx, 0);
+ conf = TLS_SW;
+ }
}
if (rc)
@@ -629,6 +631,12 @@ static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG],
prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW];
prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg;
prot[TLS_HW][TLS_SW].sendpage = tls_device_sendpage;
+
+ prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW];
+
+ prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW];
+
+ prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW];
#endif
prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base;
diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index 7453f5ae0819..83d67df33f0c 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -53,7 +53,6 @@ static int tls_do_decryption(struct sock *sk,
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
- struct strp_msg *rxm = strp_msg(skb);
struct aead_request *aead_req;
int ret;
@@ -71,18 +70,6 @@ static int tls_do_decryption(struct sock *sk,
ret = crypto_wait_req(crypto_aead_decrypt(aead_req), &ctx->async_wait);
- if (ret < 0)
- goto out;
-
- rxm->offset += tls_ctx->rx.prepend_size;
- rxm->full_len -= tls_ctx->rx.overhead_size;
- tls_advance_record_sn(sk, &tls_ctx->rx);
-
- ctx->decrypted = true;
-
- ctx->saved_data_ready(sk);
-
-out:
aead_request_free(aead_req);
return ret;
}
@@ -324,7 +311,12 @@ static int zerocopy_from_iter(struct sock *sk, struct iov_iter *from,
}
}
+ /* Mark the end in the last sg entry if newly added */
+ if (num_elem > *pages_used)
+ sg_mark_end(&to[num_elem - 1]);
out:
+ if (rc)
+ iov_iter_revert(from, size - *size_used);
*size_used = size;
*pages_used = num_elem;
@@ -373,6 +365,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
int record_room;
bool full_record;
int orig_size;
+ bool is_kvec = msg->msg_iter.type & ITER_KVEC;
if (msg->msg_flags & ~(MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL))
return -ENOTSUPP;
@@ -421,8 +414,7 @@ alloc_encrypted:
try_to_copy -= required_size - ctx->sg_encrypted_size;
full_record = true;
}
-
- if (full_record || eor) {
+ if (!is_kvec && (full_record || eor)) {
ret = zerocopy_from_iter(sk, &msg->msg_iter,
try_to_copy, &ctx->sg_plaintext_num_elem,
&ctx->sg_plaintext_size,
@@ -434,15 +426,11 @@ alloc_encrypted:
copied += try_to_copy;
ret = tls_push_record(sk, msg->msg_flags, record_type);
- if (!ret)
- continue;
- if (ret == -EAGAIN)
+ if (ret)
goto send_end;
+ continue;
- copied -= try_to_copy;
fallback_to_reg_send:
- iov_iter_revert(&msg->msg_iter,
- ctx->sg_plaintext_size - orig_size);
trim_sg(sk, ctx->sg_plaintext_data,
&ctx->sg_plaintext_num_elem,
&ctx->sg_plaintext_size,
@@ -642,6 +630,9 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
return NULL;
}
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ return NULL;
+
if (sock_flag(sk, SOCK_DONE))
return NULL;
@@ -666,8 +657,38 @@ static struct sk_buff *tls_wait_data(struct sock *sk, int flags,
return skb;
}
-static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
- struct scatterlist *sgout)
+static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ struct scatterlist *sgout, bool *zc)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct strp_msg *rxm = strp_msg(skb);
+ int err = 0;
+
+#ifdef CONFIG_TLS_DEVICE
+ err = tls_device_decrypted(sk, skb);
+ if (err < 0)
+ return err;
+#endif
+ if (!ctx->decrypted) {
+ err = decrypt_skb(sk, skb, sgout);
+ if (err < 0)
+ return err;
+ } else {
+ *zc = false;
+ }
+
+ rxm->offset += tls_ctx->rx.prepend_size;
+ rxm->full_len -= tls_ctx->rx.overhead_size;
+ tls_advance_record_sn(sk, &tls_ctx->rx);
+ ctx->decrypted = true;
+ ctx->saved_data_ready(sk);
+
+ return err;
+}
+
+int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+ struct scatterlist *sgout)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
@@ -697,6 +718,10 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
nsg = skb_to_sgvec(skb, &sgin[1],
rxm->offset + tls_ctx->rx.prepend_size,
rxm->full_len - tls_ctx->rx.prepend_size);
+ if (nsg < 0) {
+ ret = nsg;
+ goto out;
+ }
tls_make_aad(ctx->rx_aad_ciphertext,
rxm->full_len - tls_ctx->rx.overhead_size,
@@ -708,6 +733,7 @@ static int decrypt_skb(struct sock *sk, struct sk_buff *skb,
rxm->full_len - tls_ctx->rx.overhead_size,
skb, sk->sk_allocation);
+out:
if (sgin != &sgin_arr[0])
kfree(sgin);
@@ -752,6 +778,7 @@ int tls_sw_recvmsg(struct sock *sk,
bool cmsg = false;
int target, err = 0;
long timeo;
+ bool is_kvec = msg->msg_iter.type & ITER_KVEC;
flags |= nonblock;
@@ -795,7 +822,7 @@ int tls_sw_recvmsg(struct sock *sk,
page_count = iov_iter_npages(&msg->msg_iter,
MAX_SKB_FRAGS);
to_copy = rxm->full_len - tls_ctx->rx.overhead_size;
- if (to_copy <= len && page_count < MAX_SKB_FRAGS &&
+ if (!is_kvec && to_copy <= len && page_count < MAX_SKB_FRAGS &&
likely(!(flags & MSG_PEEK))) {
struct scatterlist sgin[MAX_SKB_FRAGS + 1];
int pages = 0;
@@ -812,7 +839,7 @@ int tls_sw_recvmsg(struct sock *sk,
if (err < 0)
goto fallback_to_reg_recv;
- err = decrypt_skb(sk, skb, sgin);
+ err = decrypt_skb_update(sk, skb, sgin, &zc);
for (; pages > 0; pages--)
put_page(sg_page(&sgin[pages]));
if (err < 0) {
@@ -821,7 +848,7 @@ int tls_sw_recvmsg(struct sock *sk,
}
} else {
fallback_to_reg_recv:
- err = decrypt_skb(sk, skb, NULL);
+ err = decrypt_skb_update(sk, skb, NULL, &zc);
if (err < 0) {
tls_err_abort(sk, EBADMSG);
goto recv_end;
@@ -876,6 +903,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
int err = 0;
long timeo;
int chunk;
+ bool zc;
lock_sock(sk);
@@ -892,7 +920,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
}
if (!ctx->decrypted) {
- err = decrypt_skb(sk, skb, NULL);
+ err = decrypt_skb_update(sk, skb, NULL, &zc);
if (err < 0) {
tls_err_abort(sk, EBADMSG);
@@ -981,6 +1009,10 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
goto read_failure;
}
+#ifdef CONFIG_TLS_DEVICE
+ handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset,
+ *(u64*)tls_ctx->rx.rec_seq);
+#endif
return data_len + TLS_HEADER_SIZE;
read_failure:
@@ -999,7 +1031,7 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
ctx->recv_pkt = skb;
strp_pause(strp);
- strp->sk->sk_state_change(strp->sk);
+ ctx->saved_data_ready(strp->sk);
}
static void tls_data_ready(struct sock *sk)
@@ -1015,23 +1047,20 @@ void tls_sw_free_resources_tx(struct sock *sk)
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
- if (ctx->aead_send)
- crypto_free_aead(ctx->aead_send);
+ crypto_free_aead(ctx->aead_send);
tls_free_both_sg(sk);
kfree(ctx);
}
-void tls_sw_free_resources_rx(struct sock *sk)
+void tls_sw_release_resources_rx(struct sock *sk)
{
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
if (ctx->aead_recv) {
- if (ctx->recv_pkt) {
- kfree_skb(ctx->recv_pkt);
- ctx->recv_pkt = NULL;
- }
+ kfree_skb(ctx->recv_pkt);
+ ctx->recv_pkt = NULL;
crypto_free_aead(ctx->aead_recv);
strp_stop(&ctx->strp);
write_lock_bh(&sk->sk_callback_lock);
@@ -1041,6 +1070,14 @@ void tls_sw_free_resources_rx(struct sock *sk)
strp_done(&ctx->strp);
lock_sock(sk);
}
+}
+
+void tls_sw_free_resources_rx(struct sock *sk)
+{
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+
+ tls_sw_release_resources_rx(sk);
kfree(ctx);
}
@@ -1065,28 +1102,38 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
}
if (tx) {
- sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
- if (!sw_ctx_tx) {
- rc = -ENOMEM;
- goto out;
+ if (!ctx->priv_ctx_tx) {
+ sw_ctx_tx = kzalloc(sizeof(*sw_ctx_tx), GFP_KERNEL);
+ if (!sw_ctx_tx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ctx->priv_ctx_tx = sw_ctx_tx;
+ } else {
+ sw_ctx_tx =
+ (struct tls_sw_context_tx *)ctx->priv_ctx_tx;
}
- crypto_init_wait(&sw_ctx_tx->async_wait);
- ctx->priv_ctx_tx = sw_ctx_tx;
} else {
- sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
- if (!sw_ctx_rx) {
- rc = -ENOMEM;
- goto out;
+ if (!ctx->priv_ctx_rx) {
+ sw_ctx_rx = kzalloc(sizeof(*sw_ctx_rx), GFP_KERNEL);
+ if (!sw_ctx_rx) {
+ rc = -ENOMEM;
+ goto out;
+ }
+ ctx->priv_ctx_rx = sw_ctx_rx;
+ } else {
+ sw_ctx_rx =
+ (struct tls_sw_context_rx *)ctx->priv_ctx_rx;
}
- crypto_init_wait(&sw_ctx_rx->async_wait);
- ctx->priv_ctx_rx = sw_ctx_rx;
}
if (tx) {
+ crypto_init_wait(&sw_ctx_tx->async_wait);
crypto_info = &ctx->crypto_send;
cctx = &ctx->tx;
aead = &sw_ctx_tx->aead_send;
} else {
+ crypto_init_wait(&sw_ctx_rx->async_wait);
crypto_info = &ctx->crypto_recv;
cctx = &ctx->rx;
aead = &sw_ctx_rx->aead_recv;
@@ -1129,12 +1176,11 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx)
memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size);
cctx->rec_seq_size = rec_seq_size;
- cctx->rec_seq = kmalloc(rec_seq_size, GFP_KERNEL);
+ cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL);
if (!cctx->rec_seq) {
rc = -ENOMEM;
goto free_iv;
}
- memcpy(cctx->rec_seq, rec_seq, rec_seq_size);
if (sw_ctx_tx) {
sg_init_table(sw_ctx_tx->sg_encrypted_data,
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e5473c03d667..d1edfa3cad61 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -430,7 +430,12 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
connected = unix_dgram_peer_wake_connect(sk, other);
- if (unix_recvq_full(other))
+ /* If other is SOCK_DEAD, we want to make sure we signal
+ * POLLOUT, such that a subsequent write() can get a
+ * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
+ * to other and its full, we will hang waiting for POLLOUT.
+ */
+ if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
return 1;
if (connected)
@@ -2635,7 +2640,7 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa
struct sock *sk = sock->sk;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
mask = 0;
/* exceptional events? */
@@ -2672,7 +2677,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
unsigned int writable;
__poll_t mask;
- sock_poll_wait(file, sk_sleep(sk), wait);
+ sock_poll_wait(file, wait);
mask = 0;
/* exceptional events? */
diff --git a/net/wimax/Makefile b/net/wimax/Makefile
index eb2db0d3b880..c2a71ae487ac 100644
--- a/net/wimax/Makefile
+++ b/net/wimax/Makefile
@@ -11,5 +11,3 @@ wimax-y := \
stack.o
wimax-$(CONFIG_DEBUG_FS) += debugfs.o
-
-
diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c
index 6c9bedb7431e..24514840746e 100644
--- a/net/wimax/debugfs.c
+++ b/net/wimax/debugfs.c
@@ -76,5 +76,3 @@ void wimax_debugfs_rm(struct wimax_dev *wimax_dev)
{
debugfs_remove_recursive(wimax_dev->debugfs_dentry);
}
-
-
diff --git a/net/wimax/op-msg.c b/net/wimax/op-msg.c
index 54aa146930bd..101b2fa3f32e 100644
--- a/net/wimax/op-msg.c
+++ b/net/wimax/op-msg.c
@@ -404,4 +404,3 @@ error_no_wimax_dev:
d_fnend(3, NULL, "(skb %p info %p) = %d\n", skb, info, result);
return result;
}
-
diff --git a/net/wimax/stack.c b/net/wimax/stack.c
index 5db731512014..a6307813b6d5 100644
--- a/net/wimax/stack.c
+++ b/net/wimax/stack.c
@@ -486,7 +486,8 @@ int wimax_dev_add(struct wimax_dev *wimax_dev, struct net_device *net_dev)
d_fnstart(3, dev, "(wimax_dev %p net_dev %p)\n", wimax_dev, net_dev);
/* Do the RFKILL setup before locking, as RFKILL will call
- * into our functions. */
+ * into our functions.
+ */
wimax_dev->net_dev = net_dev;
result = wimax_rfkill_add(wimax_dev);
if (result < 0)
@@ -629,4 +630,3 @@ module_exit(wimax_subsys_exit);
MODULE_AUTHOR("Intel Corporation <linux-wimax@intel.com>");
MODULE_DESCRIPTION("Linux WiMAX stack");
MODULE_LICENSE("GPL");
-
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index ba0a1f398ce5..e6bce1f130c9 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -65,9 +65,9 @@ struct lib80211_tkip_data {
int key_idx;
struct crypto_skcipher *rx_tfm_arc4;
- struct crypto_ahash *rx_tfm_michael;
+ struct crypto_shash *rx_tfm_michael;
struct crypto_skcipher *tx_tfm_arc4;
- struct crypto_ahash *tx_tfm_michael;
+ struct crypto_shash *tx_tfm_michael;
/* scratch buffers for virt_to_page() (crypto API) */
u8 rx_hdr[16], tx_hdr[16];
@@ -106,8 +106,7 @@ static void *lib80211_tkip_init(int key_idx)
goto fail;
}
- priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
+ priv->tx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0);
if (IS_ERR(priv->tx_tfm_michael)) {
priv->tx_tfm_michael = NULL;
goto fail;
@@ -120,8 +119,7 @@ static void *lib80211_tkip_init(int key_idx)
goto fail;
}
- priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
+ priv->rx_tfm_michael = crypto_alloc_shash("michael_mic", 0, 0);
if (IS_ERR(priv->rx_tfm_michael)) {
priv->rx_tfm_michael = NULL;
goto fail;
@@ -131,9 +129,9 @@ static void *lib80211_tkip_init(int key_idx)
fail:
if (priv) {
- crypto_free_ahash(priv->tx_tfm_michael);
+ crypto_free_shash(priv->tx_tfm_michael);
crypto_free_skcipher(priv->tx_tfm_arc4);
- crypto_free_ahash(priv->rx_tfm_michael);
+ crypto_free_shash(priv->rx_tfm_michael);
crypto_free_skcipher(priv->rx_tfm_arc4);
kfree(priv);
}
@@ -145,9 +143,9 @@ static void lib80211_tkip_deinit(void *priv)
{
struct lib80211_tkip_data *_priv = priv;
if (_priv) {
- crypto_free_ahash(_priv->tx_tfm_michael);
+ crypto_free_shash(_priv->tx_tfm_michael);
crypto_free_skcipher(_priv->tx_tfm_arc4);
- crypto_free_ahash(_priv->rx_tfm_michael);
+ crypto_free_shash(_priv->rx_tfm_michael);
crypto_free_skcipher(_priv->rx_tfm_arc4);
}
kfree(priv);
@@ -510,29 +508,36 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
return keyidx;
}
-static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr,
- u8 * data, size_t data_len, u8 * mic)
+static int michael_mic(struct crypto_shash *tfm_michael, u8 *key, u8 *hdr,
+ u8 *data, size_t data_len, u8 *mic)
{
- AHASH_REQUEST_ON_STACK(req, tfm_michael);
- struct scatterlist sg[2];
+ SHASH_DESC_ON_STACK(desc, tfm_michael);
int err;
if (tfm_michael == NULL) {
pr_warn("%s(): tfm_michael == NULL\n", __func__);
return -1;
}
- sg_init_table(sg, 2);
- sg_set_buf(&sg[0], hdr, 16);
- sg_set_buf(&sg[1], data, data_len);
- if (crypto_ahash_setkey(tfm_michael, key, 8))
+ desc->tfm = tfm_michael;
+ desc->flags = 0;
+
+ if (crypto_shash_setkey(tfm_michael, key, 8))
return -1;
- ahash_request_set_tfm(req, tfm_michael);
- ahash_request_set_callback(req, 0, NULL, NULL);
- ahash_request_set_crypt(req, sg, mic, data_len + 16);
- err = crypto_ahash_digest(req);
- ahash_request_zero(req);
+ err = crypto_shash_init(desc);
+ if (err)
+ goto out;
+ err = crypto_shash_update(desc, hdr, 16);
+ if (err)
+ goto out;
+ err = crypto_shash_update(desc, data, data_len);
+ if (err)
+ goto out;
+ err = crypto_shash_final(desc, mic);
+
+out:
+ shash_desc_zero(desc);
return err;
}
@@ -654,9 +659,9 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
{
struct lib80211_tkip_data *tkey = priv;
int keyidx;
- struct crypto_ahash *tfm = tkey->tx_tfm_michael;
+ struct crypto_shash *tfm = tkey->tx_tfm_michael;
struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4;
- struct crypto_ahash *tfm3 = tkey->rx_tfm_michael;
+ struct crypto_shash *tfm3 = tkey->rx_tfm_michael;
struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4;
keyidx = tkey->key_idx;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index e4e5f025d16b..5fb9b7dd9831 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -4466,6 +4466,7 @@ static int parse_station_flags(struct genl_info *info,
params->sta_flags_mask = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
BIT(NL80211_STA_FLAG_MFP) |
BIT(NL80211_STA_FLAG_AUTHORIZED);
+ break;
default:
return -EINVAL;
}
@@ -15029,20 +15030,24 @@ void cfg80211_mgmt_tx_status(struct wireless_dev *wdev, u64 cookie,
EXPORT_SYMBOL(cfg80211_mgmt_tx_status);
static int __nl80211_rx_control_port(struct net_device *dev,
- const u8 *buf, size_t len,
- const u8 *addr, u16 proto,
+ struct sk_buff *skb,
bool unencrypted, gfp_t gfp)
{
struct wireless_dev *wdev = dev->ieee80211_ptr;
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy);
+ struct ethhdr *ehdr = eth_hdr(skb);
+ const u8 *addr = ehdr->h_source;
+ u16 proto = be16_to_cpu(skb->protocol);
struct sk_buff *msg;
void *hdr;
+ struct nlattr *frame;
+
u32 nlportid = READ_ONCE(wdev->conn_owner_nlportid);
if (!nlportid)
return -ENOENT;
- msg = nlmsg_new(100 + len, gfp);
+ msg = nlmsg_new(100 + skb->len, gfp);
if (!msg)
return -ENOMEM;
@@ -15056,13 +15061,17 @@ static int __nl80211_rx_control_port(struct net_device *dev,
nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex) ||
nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
NL80211_ATTR_PAD) ||
- nla_put(msg, NL80211_ATTR_FRAME, len, buf) ||
nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, addr) ||
nla_put_u16(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE, proto) ||
(unencrypted && nla_put_flag(msg,
NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT)))
goto nla_put_failure;
+ frame = nla_reserve(msg, NL80211_ATTR_FRAME, skb->len);
+ if (!frame)
+ goto nla_put_failure;
+
+ skb_copy_bits(skb, 0, nla_data(frame), skb->len);
genlmsg_end(msg, hdr);
return genlmsg_unicast(wiphy_net(&rdev->wiphy), msg, nlportid);
@@ -15073,14 +15082,12 @@ static int __nl80211_rx_control_port(struct net_device *dev,
}
bool cfg80211_rx_control_port(struct net_device *dev,
- const u8 *buf, size_t len,
- const u8 *addr, u16 proto, bool unencrypted)
+ struct sk_buff *skb, bool unencrypted)
{
int ret;
- trace_cfg80211_rx_control_port(dev, buf, len, addr, proto, unencrypted);
- ret = __nl80211_rx_control_port(dev, buf, len, addr, proto,
- unencrypted, GFP_ATOMIC);
+ trace_cfg80211_rx_control_port(dev, skb, unencrypted);
+ ret = __nl80211_rx_control_port(dev, skb, unencrypted, GFP_ATOMIC);
trace_cfg80211_return_bool(ret == 0);
return ret == 0;
}
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index bbe6298e4bb9..4fc66a117b7d 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -2240,7 +2240,9 @@ static void wiphy_update_regulatory(struct wiphy *wiphy,
* as some drivers used this to restore its orig_* reg domain.
*/
if (initiator == NL80211_REGDOM_SET_BY_CORE &&
- wiphy->regulatory_flags & REGULATORY_CUSTOM_REG)
+ wiphy->regulatory_flags & REGULATORY_CUSTOM_REG &&
+ !(wiphy->regulatory_flags &
+ REGULATORY_WIPHY_SELF_MANAGED))
reg_call_notifier(wiphy, lr);
return;
}
@@ -2787,26 +2789,6 @@ static void notify_self_managed_wiphys(struct regulatory_request *request)
}
}
-static bool reg_only_self_managed_wiphys(void)
-{
- struct cfg80211_registered_device *rdev;
- struct wiphy *wiphy;
- bool self_managed_found = false;
-
- ASSERT_RTNL();
-
- list_for_each_entry(rdev, &cfg80211_rdev_list, list) {
- wiphy = &rdev->wiphy;
- if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED)
- self_managed_found = true;
- else
- return false;
- }
-
- /* make sure at least one self-managed wiphy exists */
- return self_managed_found;
-}
-
/*
* Processes regulatory hints, this is all the NL80211_REGDOM_SET_BY_*
* Regulatory hints come on a first come first serve basis and we
@@ -2839,10 +2821,6 @@ static void reg_process_pending_hints(void)
spin_unlock(&reg_requests_lock);
notify_self_managed_wiphys(reg_request);
- if (reg_only_self_managed_wiphys()) {
- reg_free_request(reg_request);
- return;
- }
reg_process_hint(reg_request);
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2b417a2fe63f..7c73510b161f 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -2627,23 +2627,25 @@ TRACE_EVENT(cfg80211_mgmt_tx_status,
);
TRACE_EVENT(cfg80211_rx_control_port,
- TP_PROTO(struct net_device *netdev, const u8 *buf, size_t len,
- const u8 *addr, u16 proto, bool unencrypted),
- TP_ARGS(netdev, buf, len, addr, proto, unencrypted),
+ TP_PROTO(struct net_device *netdev, struct sk_buff *skb,
+ bool unencrypted),
+ TP_ARGS(netdev, skb, unencrypted),
TP_STRUCT__entry(
NETDEV_ENTRY
- MAC_ENTRY(addr)
+ __field(int, len)
+ MAC_ENTRY(from)
__field(u16, proto)
__field(bool, unencrypted)
),
TP_fast_assign(
NETDEV_ASSIGN;
- MAC_ASSIGN(addr, addr);
- __entry->proto = proto;
+ __entry->len = skb->len;
+ MAC_ASSIGN(from, eth_hdr(skb)->h_source);
+ __entry->proto = be16_to_cpu(skb->protocol);
__entry->unencrypted = unencrypted;
),
- TP_printk(NETDEV_PR_FMT ", " MAC_PR_FMT " proto: 0x%x, unencrypted: %s",
- NETDEV_PR_ARG, MAC_PR_ARG(addr),
+ TP_printk(NETDEV_PR_FMT ", len=%d, " MAC_PR_FMT ", proto: 0x%x, unencrypted: %s",
+ NETDEV_PR_ARG, __entry->len, MAC_PR_ARG(from),
__entry->proto, BOOL_TO_STR(__entry->unencrypted))
);
diff --git a/net/x25/Kconfig b/net/x25/Kconfig
index e2fa133f9fba..59fcb41fc5e6 100644
--- a/net/x25/Kconfig
+++ b/net/x25/Kconfig
@@ -31,5 +31,3 @@ config X25
To compile this driver as a module, choose M here: the module
will be called x25. If unsure, say N.
-
-
diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c
index 9c214ec681ac..743103786652 100644
--- a/net/x25/x25_subr.c
+++ b/net/x25/x25_subr.c
@@ -381,4 +381,3 @@ void x25_check_rbuf(struct sock *sk)
x25_stop_timer(sk);
}
}
-
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index f47abb46c587..911ca6d3cb5a 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -11,6 +11,8 @@
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
#include "xdp_umem.h"
#include "xsk_queue.h"
@@ -40,6 +42,21 @@ void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
}
}
+int xdp_umem_query(struct net_device *dev, u16 queue_id)
+{
+ struct netdev_bpf bpf;
+
+ ASSERT_RTNL();
+
+ memset(&bpf, 0, sizeof(bpf));
+ bpf.command = XDP_QUERY_XSK_UMEM;
+ bpf.xsk.queue_id = queue_id;
+
+ if (!dev->netdev_ops->ndo_bpf)
+ return 0;
+ return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
+}
+
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
u32 queue_id, u16 flags)
{
@@ -56,41 +73,36 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
if (force_copy)
return 0;
- dev_hold(dev);
-
- if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) {
- bpf.command = XDP_QUERY_XSK_UMEM;
+ if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
+ return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
- rtnl_lock();
- err = dev->netdev_ops->ndo_bpf(dev, &bpf);
- rtnl_unlock();
+ bpf.command = XDP_QUERY_XSK_UMEM;
- if (err) {
- dev_put(dev);
- return force_zc ? -ENOTSUPP : 0;
- }
-
- bpf.command = XDP_SETUP_XSK_UMEM;
- bpf.xsk.umem = umem;
- bpf.xsk.queue_id = queue_id;
+ rtnl_lock();
+ err = xdp_umem_query(dev, queue_id);
+ if (err) {
+ err = err < 0 ? -ENOTSUPP : -EBUSY;
+ goto err_rtnl_unlock;
+ }
- rtnl_lock();
- err = dev->netdev_ops->ndo_bpf(dev, &bpf);
- rtnl_unlock();
+ bpf.command = XDP_SETUP_XSK_UMEM;
+ bpf.xsk.umem = umem;
+ bpf.xsk.queue_id = queue_id;
- if (err) {
- dev_put(dev);
- return force_zc ? err : 0; /* fail or fallback */
- }
+ err = dev->netdev_ops->ndo_bpf(dev, &bpf);
+ if (err)
+ goto err_rtnl_unlock;
+ rtnl_unlock();
- umem->dev = dev;
- umem->queue_id = queue_id;
- umem->zc = true;
- return 0;
- }
+ dev_hold(dev);
+ umem->dev = dev;
+ umem->queue_id = queue_id;
+ umem->zc = true;
+ return 0;
- dev_put(dev);
- return force_zc ? -ENOTSUPP : 0; /* fail or fallback */
+err_rtnl_unlock:
+ rtnl_unlock();
+ return force_zc ? err : 0; /* fail or fallback */
}
static void xdp_umem_clear_dev(struct xdp_umem *umem)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 59fb7d3c36a3..4e937cd7c17d 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -84,10 +84,8 @@ static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
{
int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len);
- if (err) {
- xdp_return_buff(xdp);
+ if (err)
xs->rx_dropped++;
- }
return err;
}
@@ -199,8 +197,11 @@ static void xsk_destruct_skb(struct sk_buff *skb)
{
u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg;
struct xdp_sock *xs = xdp_sk(skb->sk);
+ unsigned long flags;
+ spin_lock_irqsave(&xs->tx_completion_lock, flags);
WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr));
+ spin_unlock_irqrestore(&xs->tx_completion_lock, flags);
sock_wfree(skb);
}
@@ -215,9 +216,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
struct sk_buff *skb;
int err = 0;
- if (unlikely(!xs->tx))
- return -ENOBUFS;
-
mutex_lock(&xs->mutex);
while (xskq_peek_desc(xs->tx, &desc)) {
@@ -230,22 +228,13 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
goto out;
}
- if (xskq_reserve_addr(xs->umem->cq)) {
- err = -EAGAIN;
+ if (xskq_reserve_addr(xs->umem->cq))
goto out;
- }
- len = desc.len;
- if (unlikely(len > xs->dev->mtu)) {
- err = -EMSGSIZE;
+ if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
- }
-
- if (xs->queue_id >= xs->dev->real_num_tx_queues) {
- err = -ENXIO;
- goto out;
- }
+ len = desc.len;
skb = sock_alloc_send_skb(sk, len, 1, &err);
if (unlikely(!skb)) {
err = -EAGAIN;
@@ -268,15 +257,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m,
skb->destructor = xsk_destruct_skb;
err = dev_direct_xmit(skb, xs->queue_id);
+ xskq_discard_desc(xs->tx);
/* Ignore NET_XMIT_CN as packet might have been sent */
if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) {
- err = -EAGAIN;
- /* SKB consumed by dev_direct_xmit() */
+ /* SKB completed but not sent */
+ err = -EBUSY;
goto out;
}
sent_frame = true;
- xskq_discard_desc(xs->tx);
}
out:
@@ -297,6 +286,8 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len)
return -ENXIO;
if (unlikely(!(xs->dev->flags & IFF_UP)))
return -ENETDOWN;
+ if (unlikely(!xs->tx))
+ return -ENOBUFS;
if (need_wait)
return -EOPNOTSUPP;
@@ -755,6 +746,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol,
xs = xdp_sk(sk);
mutex_init(&xs->mutex);
+ spin_lock_init(&xs->tx_completion_lock);
local_bh_disable();
sock_prot_inuse_add(net, &xsk_proto, 1);
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index ef6a6f0ec949..8a64b150be54 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -62,14 +62,9 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt)
return (entries > dcnt) ? dcnt : entries;
}
-static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer)
-{
- return q->nentries - (producer - q->cons_tail);
-}
-
static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt)
{
- u32 free_entries = xskq_nb_free_lazy(q, producer);
+ u32 free_entries = q->nentries - (producer - q->cons_tail);
if (free_entries >= dcnt)
return free_entries;
@@ -129,7 +124,7 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr)
{
struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring;
- if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0)
+ if (xskq_nb_free(q, q->prod_tail, 1) == 0)
return -ENOSPC;
ring->desc[q->prod_tail++ & q->ring_mask] = addr;
@@ -255,7 +250,7 @@ static inline bool xskq_full_desc(struct xsk_queue *q)
static inline bool xskq_empty_desc(struct xsk_queue *q)
{
- return xskq_nb_free(q, q->prod_tail, 1) == q->nentries;
+ return xskq_nb_free(q, q->prod_tail, q->nentries) == q->nentries;
}
void xskq_set_umem(struct xsk_queue *q, struct xdp_umem_props *umem_props);
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index 286ed25c1a69..4a9ee2d83158 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -25,6 +25,14 @@ config XFRM_USER
If unsure, say Y.
+config XFRM_INTERFACE
+ tristate "Transformation virtual interface"
+ depends on XFRM && IPV6
+ ---help---
+ This provides a virtual interface to route IPsec traffic.
+
+ If unsure, say N.
+
config XFRM_SUB_POLICY
bool "Transformation sub policy support"
depends on XFRM
@@ -87,4 +95,3 @@ config NET_KEY_MIGRATE
<draft-sugimoto-mip6-pfkey-migrate>.
If unsure, say N.
-
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index 0bd2465a8c5a..fbc4552d17b8 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
obj-$(CONFIG_XFRM_USER) += xfrm_user.o
obj-$(CONFIG_XFRM_IPCOMP) += xfrm_ipcomp.o
+obj-$(CONFIG_XFRM_INTERFACE) += xfrm_interface.o
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 175941e15a6e..5611b7521020 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -56,7 +56,7 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur
if (skb_is_gso(skb)) {
struct net_device *dev = skb->dev;
- if (unlikely(!x->xso.offload_handle || (x->xso.dev != dev))) {
+ if (unlikely(x->xso.dev != dev)) {
struct sk_buff *segs;
/* Packet got rerouted, fixup features and segment it. */
@@ -162,7 +162,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x,
}
dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr,
- x->props.family, x->props.output_mark);
+ x->props.family,
+ xfrm_smark_get(0, x));
if (IS_ERR(dst))
return 0;
@@ -210,8 +211,8 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
if (!x->type_offload || x->encap)
return false;
- if ((!dev || (x->xso.offload_handle && (dev == xfrm_dst_path(dst)->dev))) &&
- (!xdst->child->xfrm && x->type->get_mtu)) {
+ if ((!dev || (dev == xfrm_dst_path(dst)->dev)) &&
+ (!xdst->child->xfrm && x->type->get_mtu)) {
mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
if (skb->len <= mtu)
@@ -306,12 +307,6 @@ static int xfrm_dev_register(struct net_device *dev)
return xfrm_api_check(dev);
}
-static int xfrm_dev_unregister(struct net_device *dev)
-{
- xfrm_policy_cache_flush();
- return NOTIFY_DONE;
-}
-
static int xfrm_dev_feat_change(struct net_device *dev)
{
return xfrm_api_check(dev);
@@ -322,7 +317,6 @@ static int xfrm_dev_down(struct net_device *dev)
if (dev->features & NETIF_F_HW_ESP)
xfrm_dev_state_flush(dev_net(dev), dev, true);
- xfrm_policy_cache_flush();
return NOTIFY_DONE;
}
@@ -334,9 +328,6 @@ static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void
case NETDEV_REGISTER:
return xfrm_dev_register(dev);
- case NETDEV_UNREGISTER:
- return xfrm_dev_unregister(dev);
-
case NETDEV_FEAT_CHANGE:
return xfrm_dev_feat_change(dev);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 352abca2605f..b89c9c7f8c5c 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -320,6 +320,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
seq = 0;
if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
goto drop;
}
@@ -328,17 +329,21 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
XFRM_SPI_SKB_CB(skb)->daddroff);
do {
if (skb->sp->len == XFRM_MAX_DEPTH) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
goto drop;
}
x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family);
if (x == NULL) {
+ secpath_reset(skb);
XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
xfrm_audit_state_notfound(skb, family, spi, seq);
goto drop;
}
+ skb->mark = xfrm_smark_get(skb->mark, x);
+
skb->sp->xvec[skb->sp->len++] = x;
lock:
diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c
new file mode 100644
index 000000000000..31acc6f33d98
--- /dev/null
+++ b/net/xfrm/xfrm_interface.c
@@ -0,0 +1,975 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * XFRM virtual interface
+ *
+ * Copyright (C) 2018 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ */
+
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/sockios.h>
+#include <linux/icmp.h>
+#include <linux/if.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/in6.h>
+#include <linux/netdevice.h>
+#include <linux/if_link.h>
+#include <linux/if_arp.h>
+#include <linux/icmpv6.h>
+#include <linux/init.h>
+#include <linux/route.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+#include <linux/uaccess.h>
+#include <linux/atomic.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+#include <net/xfrm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <linux/etherdevice.h>
+
+static int xfrmi_dev_init(struct net_device *dev);
+static void xfrmi_dev_setup(struct net_device *dev);
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly;
+static unsigned int xfrmi_net_id __read_mostly;
+
+struct xfrmi_net {
+ /* lists for storing interfaces in use */
+ struct xfrm_if __rcu *xfrmi[1];
+};
+
+#define for_each_xfrmi_rcu(start, xi) \
+ for (xi = rcu_dereference(start); xi; xi = rcu_dereference(xi->next))
+
+static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x)
+{
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ struct xfrm_if *xi;
+
+ for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
+ if (x->if_id == xi->p.if_id &&
+ (xi->dev->flags & IFF_UP))
+ return xi;
+ }
+
+ return NULL;
+}
+
+static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb)
+{
+ struct xfrmi_net *xfrmn;
+ int ifindex;
+ struct xfrm_if *xi;
+
+ if (!skb->dev)
+ return NULL;
+
+ xfrmn = net_generic(dev_net(skb->dev), xfrmi_net_id);
+ ifindex = skb->dev->ifindex;
+
+ for_each_xfrmi_rcu(xfrmn->xfrmi[0], xi) {
+ if (ifindex == xi->dev->ifindex &&
+ (xi->dev->flags & IFF_UP))
+ return xi;
+ }
+
+ return NULL;
+}
+
+static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+ struct xfrm_if __rcu **xip = &xfrmn->xfrmi[0];
+
+ rcu_assign_pointer(xi->next , rtnl_dereference(*xip));
+ rcu_assign_pointer(*xip, xi);
+}
+
+static void xfrmi_unlink(struct xfrmi_net *xfrmn, struct xfrm_if *xi)
+{
+ struct xfrm_if __rcu **xip;
+ struct xfrm_if *iter;
+
+ for (xip = &xfrmn->xfrmi[0];
+ (iter = rtnl_dereference(*xip)) != NULL;
+ xip = &iter->next) {
+ if (xi == iter) {
+ rcu_assign_pointer(*xip, xi->next);
+ break;
+ }
+ }
+}
+
+static void xfrmi_dev_free(struct net_device *dev)
+{
+ free_percpu(dev->tstats);
+}
+
+static int xfrmi_create2(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ int err;
+
+ dev->rtnl_link_ops = &xfrmi_link_ops;
+ err = register_netdevice(dev);
+ if (err < 0)
+ goto out;
+
+ strcpy(xi->p.name, dev->name);
+
+ dev_hold(dev);
+ xfrmi_link(xfrmn, xi);
+
+ return 0;
+
+out:
+ return err;
+}
+
+static struct xfrm_if *xfrmi_create(struct net *net, struct xfrm_if_parms *p)
+{
+ struct net_device *dev;
+ struct xfrm_if *xi;
+ char name[IFNAMSIZ];
+ int err;
+
+ if (p->name[0]) {
+ strlcpy(name, p->name, IFNAMSIZ);
+ } else {
+ err = -EINVAL;
+ goto failed;
+ }
+
+ dev = alloc_netdev(sizeof(*xi), name, NET_NAME_UNKNOWN, xfrmi_dev_setup);
+ if (!dev) {
+ err = -EAGAIN;
+ goto failed;
+ }
+
+ dev_net_set(dev, net);
+
+ xi = netdev_priv(dev);
+ xi->p = *p;
+ xi->net = net;
+ xi->dev = dev;
+ xi->phydev = dev_get_by_index(net, p->link);
+ if (!xi->phydev) {
+ err = -ENODEV;
+ goto failed_free;
+ }
+
+ err = xfrmi_create2(dev);
+ if (err < 0)
+ goto failed_dev_put;
+
+ return xi;
+
+failed_dev_put:
+ dev_put(xi->phydev);
+failed_free:
+ free_netdev(dev);
+failed:
+ return ERR_PTR(err);
+}
+
+static struct xfrm_if *xfrmi_locate(struct net *net, struct xfrm_if_parms *p,
+ int create)
+{
+ struct xfrm_if __rcu **xip;
+ struct xfrm_if *xi;
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+ for (xip = &xfrmn->xfrmi[0];
+ (xi = rtnl_dereference(*xip)) != NULL;
+ xip = &xi->next) {
+ if (xi->p.if_id == p->if_id) {
+ if (create)
+ return ERR_PTR(-EEXIST);
+
+ return xi;
+ }
+ }
+ if (!create)
+ return ERR_PTR(-ENODEV);
+ return xfrmi_create(net, p);
+}
+
+static void xfrmi_dev_uninit(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id);
+
+ xfrmi_unlink(xfrmn, xi);
+ dev_put(xi->phydev);
+ dev_put(dev);
+}
+
+static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet)
+{
+ skb->tstamp = 0;
+ skb->pkt_type = PACKET_HOST;
+ skb->skb_iif = 0;
+ skb->ignore_df = 0;
+ skb_dst_drop(skb);
+ nf_reset(skb);
+ nf_reset_trace(skb);
+
+ if (!xnet)
+ return;
+
+ ipvs_reset(skb);
+ secpath_reset(skb);
+ skb_orphan(skb);
+ skb->mark = 0;
+}
+
+static int xfrmi_rcv_cb(struct sk_buff *skb, int err)
+{
+ struct pcpu_sw_netstats *tstats;
+ struct xfrm_mode *inner_mode;
+ struct net_device *dev;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ bool xnet;
+
+ if (err && !skb->sp)
+ return 0;
+
+ x = xfrm_input_state(skb);
+
+ xi = xfrmi_lookup(xs_net(x), x);
+ if (!xi)
+ return 1;
+
+ dev = xi->dev;
+ skb->dev = dev;
+
+ if (err) {
+ dev->stats.rx_errors++;
+ dev->stats.rx_dropped++;
+
+ return 0;
+ }
+
+ xnet = !net_eq(xi->net, dev_net(skb->dev));
+
+ if (xnet) {
+ inner_mode = x->inner_mode;
+
+ if (x->sel.family == AF_UNSPEC) {
+ inner_mode = xfrm_ip2inner_mode(x, XFRM_MODE_SKB_CB(skb)->protocol);
+ if (inner_mode == NULL) {
+ XFRM_INC_STATS(dev_net(skb->dev),
+ LINUX_MIB_XFRMINSTATEMODEERROR);
+ return -EINVAL;
+ }
+ }
+
+ if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb,
+ inner_mode->afinfo->family))
+ return -EPERM;
+ }
+
+ xfrmi_scrub_packet(skb, xnet);
+
+ tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->rx_packets++;
+ tstats->rx_bytes += skb->len;
+ u64_stats_update_end(&tstats->syncp);
+
+ return 0;
+}
+
+static int
+xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device_stats *stats = &xi->dev->stats;
+ struct dst_entry *dst = skb_dst(skb);
+ unsigned int length = skb->len;
+ struct net_device *tdev;
+ struct xfrm_state *x;
+ int err = -1;
+ int mtu;
+
+ if (!dst)
+ goto tx_err_link_failure;
+
+ dst_hold(dst);
+ dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id);
+ if (IS_ERR(dst)) {
+ err = PTR_ERR(dst);
+ dst = NULL;
+ goto tx_err_link_failure;
+ }
+
+ x = dst->xfrm;
+ if (!x)
+ goto tx_err_link_failure;
+
+ if (x->if_id != xi->p.if_id)
+ goto tx_err_link_failure;
+
+ tdev = dst->dev;
+
+ if (tdev == dev) {
+ stats->collisions++;
+ net_warn_ratelimited("%s: Local routing loop detected!\n",
+ xi->p.name);
+ goto tx_err_dst_release;
+ }
+
+ mtu = dst_mtu(dst);
+ if (!skb->ignore_df && skb->len > mtu) {
+ skb_dst_update_pmtu(skb, mtu);
+
+ if (skb->protocol == htons(ETH_P_IPV6)) {
+ if (mtu < IPV6_MIN_MTU)
+ mtu = IPV6_MIN_MTU;
+
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+ } else {
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
+ htonl(mtu));
+ }
+
+ dst_release(dst);
+ return -EMSGSIZE;
+ }
+
+ xfrmi_scrub_packet(skb, !net_eq(xi->net, dev_net(dev)));
+ skb_dst_set(skb, dst);
+ skb->dev = tdev;
+
+ err = dst_output(xi->net, skb->sk, skb);
+ if (net_xmit_eval(err) == 0) {
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ tstats->tx_bytes += length;
+ tstats->tx_packets++;
+ u64_stats_update_end(&tstats->syncp);
+ } else {
+ stats->tx_errors++;
+ stats->tx_aborted_errors++;
+ }
+
+ return 0;
+tx_err_link_failure:
+ stats->tx_carrier_errors++;
+ dst_link_failure(skb);
+tx_err_dst_release:
+ dst_release(dst);
+ return err;
+}
+
+static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device_stats *stats = &xi->dev->stats;
+ struct flowi fl;
+ int ret;
+
+ memset(&fl, 0, sizeof(fl));
+
+ switch (skb->protocol) {
+ case htons(ETH_P_IPV6):
+ xfrm_decode_session(skb, &fl, AF_INET6);
+ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));
+ break;
+ case htons(ETH_P_IP):
+ xfrm_decode_session(skb, &fl, AF_INET);
+ memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
+ break;
+ default:
+ goto tx_err;
+ }
+
+ fl.flowi_oif = xi->phydev->ifindex;
+
+ ret = xfrmi_xmit2(skb, dev, &fl);
+ if (ret < 0)
+ goto tx_err;
+
+ return NETDEV_TX_OK;
+
+tx_err:
+ stats->tx_errors++;
+ stats->tx_dropped++;
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int xfrmi4_err(struct sk_buff *skb, u32 info)
+{
+ const struct iphdr *iph = (const struct iphdr *)skb->data;
+ struct net *net = dev_net(skb->dev);
+ int protocol = iph->protocol;
+ struct ip_comp_hdr *ipch;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah ;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ __be32 spi;
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2));
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2));
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ switch (icmp_hdr(skb)->type) {
+ case ICMP_DEST_UNREACH:
+ if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED)
+ return 0;
+ case ICMP_REDIRECT:
+ break;
+ default:
+ return 0;
+ }
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET);
+ if (!x)
+ return 0;
+
+ xi = xfrmi_lookup(net, x);
+ if (!xi) {
+ xfrm_state_put(x);
+ return -1;
+ }
+
+ if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH)
+ ipv4_update_pmtu(skb, net, info, 0, 0, protocol, 0);
+ else
+ ipv4_redirect(skb, net, 0, 0, protocol, 0);
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int xfrmi6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
+ u8 type, u8 code, int offset, __be32 info)
+{
+ const struct ipv6hdr *iph = (const struct ipv6hdr *)skb->data;
+ struct net *net = dev_net(skb->dev);
+ int protocol = iph->nexthdr;
+ struct ip_comp_hdr *ipch;
+ struct ip_esp_hdr *esph;
+ struct ip_auth_hdr *ah;
+ struct xfrm_state *x;
+ struct xfrm_if *xi;
+ __be32 spi;
+
+ switch (protocol) {
+ case IPPROTO_ESP:
+ esph = (struct ip_esp_hdr *)(skb->data + offset);
+ spi = esph->spi;
+ break;
+ case IPPROTO_AH:
+ ah = (struct ip_auth_hdr *)(skb->data + offset);
+ spi = ah->spi;
+ break;
+ case IPPROTO_COMP:
+ ipch = (struct ip_comp_hdr *)(skb->data + offset);
+ spi = htonl(ntohs(ipch->cpi));
+ break;
+ default:
+ return 0;
+ }
+
+ if (type != ICMPV6_PKT_TOOBIG &&
+ type != NDISC_REDIRECT)
+ return 0;
+
+ x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr,
+ spi, protocol, AF_INET6);
+ if (!x)
+ return 0;
+
+ xi = xfrmi_lookup(net, x);
+ if (!xi) {
+ xfrm_state_put(x);
+ return -1;
+ }
+
+ if (type == NDISC_REDIRECT)
+ ip6_redirect(skb, net, skb->dev->ifindex, 0,
+ sock_net_uid(net, NULL));
+ else
+ ip6_update_pmtu(skb, net, info, 0, 0, sock_net_uid(net, NULL));
+ xfrm_state_put(x);
+
+ return 0;
+}
+
+static int xfrmi_change(struct xfrm_if *xi, const struct xfrm_if_parms *p)
+{
+ if (xi->p.link != p->link)
+ return -EINVAL;
+
+ xi->p.if_id = p->if_id;
+
+ return 0;
+}
+
+static int xfrmi_update(struct xfrm_if *xi, struct xfrm_if_parms *p)
+{
+ struct net *net = dev_net(xi->dev);
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+ int err;
+
+ xfrmi_unlink(xfrmn, xi);
+ synchronize_net();
+ err = xfrmi_change(xi, p);
+ xfrmi_link(xfrmn, xi);
+ netdev_state_change(xi->dev);
+ return err;
+}
+
+static void xfrmi_get_stats64(struct net_device *dev,
+ struct rtnl_link_stats64 *s)
+{
+ int cpu;
+
+ if (!dev->tstats)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct pcpu_sw_netstats *stats;
+ struct pcpu_sw_netstats tmp;
+ int start;
+
+ stats = per_cpu_ptr(dev->tstats, cpu);
+ do {
+ start = u64_stats_fetch_begin_irq(&stats->syncp);
+ tmp.rx_packets = stats->rx_packets;
+ tmp.rx_bytes = stats->rx_bytes;
+ tmp.tx_packets = stats->tx_packets;
+ tmp.tx_bytes = stats->tx_bytes;
+ } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+
+ s->rx_packets += tmp.rx_packets;
+ s->rx_bytes += tmp.rx_bytes;
+ s->tx_packets += tmp.tx_packets;
+ s->tx_bytes += tmp.tx_bytes;
+ }
+
+ s->rx_dropped = dev->stats.rx_dropped;
+ s->tx_dropped = dev->stats.tx_dropped;
+}
+
+static int xfrmi_get_iflink(const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+
+ return xi->phydev->ifindex;
+}
+
+
+static const struct net_device_ops xfrmi_netdev_ops = {
+ .ndo_init = xfrmi_dev_init,
+ .ndo_uninit = xfrmi_dev_uninit,
+ .ndo_start_xmit = xfrmi_xmit,
+ .ndo_get_stats64 = xfrmi_get_stats64,
+ .ndo_get_iflink = xfrmi_get_iflink,
+};
+
+static void xfrmi_dev_setup(struct net_device *dev)
+{
+ dev->netdev_ops = &xfrmi_netdev_ops;
+ dev->type = ARPHRD_NONE;
+ dev->hard_header_len = ETH_HLEN;
+ dev->min_header_len = ETH_HLEN;
+ dev->mtu = ETH_DATA_LEN;
+ dev->min_mtu = ETH_MIN_MTU;
+ dev->max_mtu = ETH_DATA_LEN;
+ dev->addr_len = ETH_ALEN;
+ dev->flags = IFF_NOARP;
+ dev->needs_free_netdev = true;
+ dev->priv_destructor = xfrmi_dev_free;
+ netif_keep_dst(dev);
+}
+
+static int xfrmi_dev_init(struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net_device *phydev = xi->phydev;
+ int err;
+
+ dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
+ if (!dev->tstats)
+ return -ENOMEM;
+
+ err = gro_cells_init(&xi->gro_cells, dev);
+ if (err) {
+ free_percpu(dev->tstats);
+ return err;
+ }
+
+ dev->features |= NETIF_F_LLTX;
+
+ dev->needed_headroom = phydev->needed_headroom;
+ dev->needed_tailroom = phydev->needed_tailroom;
+
+ if (is_zero_ether_addr(dev->dev_addr))
+ eth_hw_addr_inherit(dev, phydev);
+ if (is_zero_ether_addr(dev->broadcast))
+ memcpy(dev->broadcast, phydev->broadcast, dev->addr_len);
+
+ return 0;
+}
+
+static int xfrmi_validate(struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static void xfrmi_netlink_parms(struct nlattr *data[],
+ struct xfrm_if_parms *parms)
+{
+ memset(parms, 0, sizeof(*parms));
+
+ if (!data)
+ return;
+
+ if (data[IFLA_XFRM_LINK])
+ parms->link = nla_get_u32(data[IFLA_XFRM_LINK]);
+
+ if (data[IFLA_XFRM_IF_ID])
+ parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]);
+}
+
+static int xfrmi_newlink(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = dev_net(dev);
+ struct xfrm_if_parms *p;
+ struct xfrm_if *xi;
+
+ xi = netdev_priv(dev);
+ p = &xi->p;
+
+ xfrmi_netlink_parms(data, p);
+
+ if (!tb[IFLA_IFNAME])
+ return -EINVAL;
+
+ nla_strlcpy(p->name, tb[IFLA_IFNAME], IFNAMSIZ);
+
+ xi = xfrmi_locate(net, p, 1);
+ return PTR_ERR_OR_ZERO(xi);
+}
+
+static void xfrmi_dellink(struct net_device *dev, struct list_head *head)
+{
+ unregister_netdevice_queue(dev, head);
+}
+
+static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[],
+ struct nlattr *data[],
+ struct netlink_ext_ack *extack)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct net *net = dev_net(dev);
+
+ xfrmi_netlink_parms(data, &xi->p);
+
+ xi = xfrmi_locate(net, &xi->p, 0);
+
+ if (IS_ERR_OR_NULL(xi)) {
+ xi = netdev_priv(dev);
+ } else {
+ if (xi->dev != dev)
+ return -EEXIST;
+ }
+
+ return xfrmi_update(xi, &xi->p);
+}
+
+static size_t xfrmi_get_size(const struct net_device *dev)
+{
+ return
+ /* IFLA_XFRM_LINK */
+ nla_total_size(4) +
+ /* IFLA_XFRM_IF_ID */
+ nla_total_size(4) +
+ 0;
+}
+
+static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+ struct xfrm_if_parms *parm = &xi->p;
+
+ if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) ||
+ nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+struct net *xfrmi_get_link_net(const struct net_device *dev)
+{
+ struct xfrm_if *xi = netdev_priv(dev);
+
+ return dev_net(xi->phydev);
+}
+
+static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = {
+ [IFLA_XFRM_LINK] = { .type = NLA_U32 },
+ [IFLA_XFRM_IF_ID] = { .type = NLA_U32 },
+};
+
+static struct rtnl_link_ops xfrmi_link_ops __read_mostly = {
+ .kind = "xfrm",
+ .maxtype = IFLA_XFRM_MAX,
+ .policy = xfrmi_policy,
+ .priv_size = sizeof(struct xfrm_if),
+ .setup = xfrmi_dev_setup,
+ .validate = xfrmi_validate,
+ .newlink = xfrmi_newlink,
+ .dellink = xfrmi_dellink,
+ .changelink = xfrmi_changelink,
+ .get_size = xfrmi_get_size,
+ .fill_info = xfrmi_fill_info,
+ .get_link_net = xfrmi_get_link_net,
+};
+
+static void __net_exit xfrmi_destroy_interfaces(struct xfrmi_net *xfrmn)
+{
+ struct xfrm_if *xi;
+ LIST_HEAD(list);
+
+ xi = rtnl_dereference(xfrmn->xfrmi[0]);
+ if (!xi)
+ return;
+
+ unregister_netdevice_queue(xi->dev, &list);
+ unregister_netdevice_many(&list);
+}
+
+static int __net_init xfrmi_init_net(struct net *net)
+{
+ return 0;
+}
+
+static void __net_exit xfrmi_exit_net(struct net *net)
+{
+ struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id);
+
+ rtnl_lock();
+ xfrmi_destroy_interfaces(xfrmn);
+ rtnl_unlock();
+}
+
+static struct pernet_operations xfrmi_net_ops = {
+ .init = xfrmi_init_net,
+ .exit = xfrmi_exit_net,
+ .id = &xfrmi_net_id,
+ .size = sizeof(struct xfrmi_net),
+};
+
+static struct xfrm6_protocol xfrmi_esp6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm6_protocol xfrmi_ah6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm6_protocol xfrmi_ipcomp6_protocol __read_mostly = {
+ .handler = xfrm6_rcv,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi6_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_esp4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_ah4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static struct xfrm4_protocol xfrmi_ipcomp4_protocol __read_mostly = {
+ .handler = xfrm4_rcv,
+ .input_handler = xfrm_input,
+ .cb_handler = xfrmi_rcv_cb,
+ .err_handler = xfrmi4_err,
+ .priority = 10,
+};
+
+static int __init xfrmi4_init(void)
+{
+ int err;
+
+ err = xfrm4_protocol_register(&xfrmi_esp4_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm4_protocol_register(&xfrmi_ah4_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm4_protocol_register(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ return 0;
+
+xfrm_proto_comp_failed:
+ xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ return err;
+}
+
+static void xfrmi4_fini(void)
+{
+ xfrm4_protocol_deregister(&xfrmi_ipcomp4_protocol, IPPROTO_COMP);
+ xfrm4_protocol_deregister(&xfrmi_ah4_protocol, IPPROTO_AH);
+ xfrm4_protocol_deregister(&xfrmi_esp4_protocol, IPPROTO_ESP);
+}
+
+static int __init xfrmi6_init(void)
+{
+ int err;
+
+ err = xfrm6_protocol_register(&xfrmi_esp6_protocol, IPPROTO_ESP);
+ if (err < 0)
+ goto xfrm_proto_esp_failed;
+ err = xfrm6_protocol_register(&xfrmi_ah6_protocol, IPPROTO_AH);
+ if (err < 0)
+ goto xfrm_proto_ah_failed;
+ err = xfrm6_protocol_register(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+ if (err < 0)
+ goto xfrm_proto_comp_failed;
+
+ return 0;
+
+xfrm_proto_comp_failed:
+ xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+xfrm_proto_ah_failed:
+ xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+xfrm_proto_esp_failed:
+ return err;
+}
+
+static void xfrmi6_fini(void)
+{
+ xfrm6_protocol_deregister(&xfrmi_ipcomp6_protocol, IPPROTO_COMP);
+ xfrm6_protocol_deregister(&xfrmi_ah6_protocol, IPPROTO_AH);
+ xfrm6_protocol_deregister(&xfrmi_esp6_protocol, IPPROTO_ESP);
+}
+
+static const struct xfrm_if_cb xfrm_if_cb = {
+ .decode_session = xfrmi_decode_session,
+};
+
+static int __init xfrmi_init(void)
+{
+ const char *msg;
+ int err;
+
+ pr_info("IPsec XFRM device driver\n");
+
+ msg = "tunnel device";
+ err = register_pernet_device(&xfrmi_net_ops);
+ if (err < 0)
+ goto pernet_dev_failed;
+
+ msg = "xfrm4 protocols";
+ err = xfrmi4_init();
+ if (err < 0)
+ goto xfrmi4_failed;
+
+ msg = "xfrm6 protocols";
+ err = xfrmi6_init();
+ if (err < 0)
+ goto xfrmi6_failed;
+
+
+ msg = "netlink interface";
+ err = rtnl_link_register(&xfrmi_link_ops);
+ if (err < 0)
+ goto rtnl_link_failed;
+
+ xfrm_if_register_cb(&xfrm_if_cb);
+
+ return err;
+
+rtnl_link_failed:
+ xfrmi6_fini();
+xfrmi6_failed:
+ xfrmi4_fini();
+xfrmi4_failed:
+ unregister_pernet_device(&xfrmi_net_ops);
+pernet_dev_failed:
+ pr_err("xfrmi init: failed to register %s\n", msg);
+ return err;
+}
+
+static void __exit xfrmi_fini(void)
+{
+ xfrm_if_unregister_cb();
+ rtnl_link_unregister(&xfrmi_link_ops);
+ xfrmi4_fini();
+ xfrmi6_fini();
+ unregister_pernet_device(&xfrmi_net_ops);
+}
+
+module_init(xfrmi_init);
+module_exit(xfrmi_fini);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_RTNL_LINK("xfrm");
+MODULE_ALIAS_NETDEV("xfrm0");
+MODULE_AUTHOR("Steffen Klassert");
+MODULE_DESCRIPTION("XFRM virtual interface");
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 89b178a78dc7..45ba07ab3e4f 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -66,8 +66,7 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
goto error_nolock;
}
- if (x->props.output_mark)
- skb->mark = x->props.output_mark;
+ skb->mark = xfrm_smark_get(skb->mark, x);
err = x->outer_mode->output(x, skb);
if (err) {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 5f48251c1319..3110c3fbee20 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -45,8 +45,9 @@ struct xfrm_flo {
u8 flags;
};
-static DEFINE_PER_CPU(struct xfrm_dst *, xfrm_last_dst);
-static struct work_struct *xfrm_pcpu_work __read_mostly;
+static DEFINE_SPINLOCK(xfrm_if_cb_lock);
+static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly;
+
static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock);
static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1]
__read_mostly;
@@ -119,6 +120,12 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa
return afinfo;
}
+/* Called with rcu_read_lock(). */
+static const struct xfrm_if_cb *xfrm_if_get_cb(void)
+{
+ return rcu_dereference(xfrm_if_cb);
+}
+
struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
const xfrm_address_t *saddr,
const xfrm_address_t *daddr,
@@ -182,8 +189,8 @@ static inline unsigned long make_jiffies(long secs)
static void xfrm_policy_timer(struct timer_list *t)
{
struct xfrm_policy *xp = from_timer(xp, t, timer);
- unsigned long now = get_seconds();
- long next = LONG_MAX;
+ time64_t now = ktime_get_real_seconds();
+ time64_t next = TIME64_MAX;
int warn = 0;
int dir;
@@ -195,7 +202,7 @@ static void xfrm_policy_timer(struct timer_list *t)
dir = xfrm_policy_id2dir(xp->index);
if (xp->lft.hard_add_expires_seconds) {
- long tmo = xp->lft.hard_add_expires_seconds +
+ time64_t tmo = xp->lft.hard_add_expires_seconds +
xp->curlft.add_time - now;
if (tmo <= 0)
goto expired;
@@ -203,7 +210,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.hard_use_expires_seconds) {
- long tmo = xp->lft.hard_use_expires_seconds +
+ time64_t tmo = xp->lft.hard_use_expires_seconds +
(xp->curlft.use_time ? : xp->curlft.add_time) - now;
if (tmo <= 0)
goto expired;
@@ -211,7 +218,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.soft_add_expires_seconds) {
- long tmo = xp->lft.soft_add_expires_seconds +
+ time64_t tmo = xp->lft.soft_add_expires_seconds +
xp->curlft.add_time - now;
if (tmo <= 0) {
warn = 1;
@@ -221,7 +228,7 @@ static void xfrm_policy_timer(struct timer_list *t)
next = tmo;
}
if (xp->lft.soft_use_expires_seconds) {
- long tmo = xp->lft.soft_use_expires_seconds +
+ time64_t tmo = xp->lft.soft_use_expires_seconds +
(xp->curlft.use_time ? : xp->curlft.add_time) - now;
if (tmo <= 0) {
warn = 1;
@@ -233,7 +240,7 @@ static void xfrm_policy_timer(struct timer_list *t)
if (warn)
km_policy_expired(xp, dir, 0, 0);
- if (next != LONG_MAX &&
+ if (next != TIME64_MAX &&
!mod_timer(&xp->timer, jiffies + make_jiffies(next)))
xfrm_pol_hold(xp);
@@ -747,6 +754,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
newpos = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == policy->type &&
+ pol->if_id == policy->if_id &&
!selector_cmp(&pol->selector, &policy->selector) &&
xfrm_policy_mark_match(policy, pol) &&
xfrm_sec_ctx_match(pol->security, policy->security) &&
@@ -783,7 +791,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index);
hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index));
- policy->curlft.add_time = get_seconds();
+ policy->curlft.add_time = ktime_get_real_seconds();
policy->curlft.use_time = 0;
if (!mod_timer(&policy->timer, jiffies + HZ))
xfrm_pol_hold(policy);
@@ -798,8 +806,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl)
}
EXPORT_SYMBOL(xfrm_policy_insert);
-struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
- int dir, struct xfrm_selector *sel,
+struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u32 if_id,
+ u8 type, int dir,
+ struct xfrm_selector *sel,
struct xfrm_sec_ctx *ctx, int delete,
int *err)
{
@@ -812,6 +821,7 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
ret = NULL;
hlist_for_each_entry(pol, chain, bydst) {
if (pol->type == type &&
+ pol->if_id == if_id &&
(mark & pol->mark.m) == pol->mark.v &&
!selector_cmp(sel, &pol->selector) &&
xfrm_sec_ctx_match(ctx, pol->security)) {
@@ -837,8 +847,9 @@ struct xfrm_policy *xfrm_policy_bysel_ctx(struct net *net, u32 mark, u8 type,
}
EXPORT_SYMBOL(xfrm_policy_bysel_ctx);
-struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
- int dir, u32 id, int delete, int *err)
+struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u32 if_id,
+ u8 type, int dir, u32 id, int delete,
+ int *err)
{
struct xfrm_policy *pol, *ret;
struct hlist_head *chain;
@@ -853,6 +864,7 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, u32 mark, u8 type,
ret = NULL;
hlist_for_each_entry(pol, chain, byidx) {
if (pol->type == type && pol->index == id &&
+ pol->if_id == if_id &&
(mark & pol->mark.m) == pol->mark.v) {
xfrm_pol_hold(pol);
if (delete) {
@@ -1056,13 +1068,14 @@ EXPORT_SYMBOL(xfrm_policy_walk_done);
*/
static int xfrm_policy_match(const struct xfrm_policy *pol,
const struct flowi *fl,
- u8 type, u16 family, int dir)
+ u8 type, u16 family, int dir, u32 if_id)
{
const struct xfrm_selector *sel = &pol->selector;
int ret = -ESRCH;
bool match;
if (pol->family != family ||
+ pol->if_id != if_id ||
(fl->flowi_mark & pol->mark.m) != pol->mark.v ||
pol->type != type)
return ret;
@@ -1077,7 +1090,8 @@ static int xfrm_policy_match(const struct xfrm_policy *pol,
static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
const struct flowi *fl,
- u16 family, u8 dir)
+ u16 family, u8 dir,
+ u32 if_id)
{
int err;
struct xfrm_policy *pol, *ret;
@@ -1101,7 +1115,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
priority = ~0U;
ret = NULL;
hlist_for_each_entry_rcu(pol, chain, bydst) {
- err = xfrm_policy_match(pol, fl, type, family, dir);
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
if (err) {
if (err == -ESRCH)
continue;
@@ -1120,7 +1134,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type,
if ((pol->priority >= priority) && ret)
break;
- err = xfrm_policy_match(pol, fl, type, family, dir);
+ err = xfrm_policy_match(pol, fl, type, family, dir, if_id);
if (err) {
if (err == -ESRCH)
continue;
@@ -1145,21 +1159,25 @@ fail:
return ret;
}
-static struct xfrm_policy *
-xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir)
+static struct xfrm_policy *xfrm_policy_lookup(struct net *net,
+ const struct flowi *fl,
+ u16 family, u8 dir, u32 if_id)
{
#ifdef CONFIG_XFRM_SUB_POLICY
struct xfrm_policy *pol;
- pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir);
+ pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family,
+ dir, if_id);
if (pol != NULL)
return pol;
#endif
- return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir);
+ return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family,
+ dir, if_id);
}
static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
- const struct flowi *fl, u16 family)
+ const struct flowi *fl,
+ u16 family, u32 if_id)
{
struct xfrm_policy *pol;
@@ -1177,7 +1195,8 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
match = xfrm_selector_match(&pol->selector, fl, family);
if (match) {
- if ((sk->sk_mark & pol->mark.m) != pol->mark.v) {
+ if ((sk->sk_mark & pol->mark.m) != pol->mark.v ||
+ pol->if_id != if_id) {
pol = NULL;
goto out;
}
@@ -1268,7 +1287,7 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
old_pol = rcu_dereference_protected(sk->sk_policy[dir],
lockdep_is_held(&net->xfrm.xfrm_policy_lock));
if (pol) {
- pol->curlft.add_time = get_seconds();
+ pol->curlft.add_time = ktime_get_real_seconds();
pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
xfrm_sk_policy_link(pol, dir);
}
@@ -1305,6 +1324,7 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
newp->lft = old->lft;
newp->curlft = old->curlft;
newp->mark = old->mark;
+ newp->if_id = old->if_id;
newp->action = old->action;
newp->flags = old->flags;
newp->xfrm_nr = old->xfrm_nr;
@@ -1390,7 +1410,8 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl,
}
}
- x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family);
+ x = xfrm_state_find(remote, local, fl, tmpl, policy, &error,
+ family, policy->if_id);
if (x && x->km.state == XFRM_STATE_VALID) {
xfrm[nx++] = x;
@@ -1607,10 +1628,11 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy,
dst_copy_metrics(dst1, dst);
if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+ __u32 mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]);
+
family = xfrm[i]->props.family;
dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif,
- &saddr, &daddr, family,
- xfrm[i]->props.output_mark);
+ &saddr, &daddr, family, mark);
err = PTR_ERR(dst);
if (IS_ERR(dst))
goto put_states;
@@ -1692,7 +1714,8 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]),
XFRM_POLICY_TYPE_MAIN,
fl, family,
- XFRM_POLICY_OUT);
+ XFRM_POLICY_OUT,
+ pols[0]->if_id);
if (pols[1]) {
if (IS_ERR(pols[1])) {
xfrm_pols_put(pols, *num_pols);
@@ -1714,108 +1737,6 @@ static int xfrm_expand_policies(const struct flowi *fl, u16 family,
}
-static void xfrm_last_dst_update(struct xfrm_dst *xdst, struct xfrm_dst *old)
-{
- this_cpu_write(xfrm_last_dst, xdst);
- if (old)
- dst_release(&old->u.dst);
-}
-
-static void __xfrm_pcpu_work_fn(void)
-{
- struct xfrm_dst *old;
-
- old = this_cpu_read(xfrm_last_dst);
- if (old && !xfrm_bundle_ok(old))
- xfrm_last_dst_update(NULL, old);
-}
-
-static void xfrm_pcpu_work_fn(struct work_struct *work)
-{
- local_bh_disable();
- rcu_read_lock();
- __xfrm_pcpu_work_fn();
- rcu_read_unlock();
- local_bh_enable();
-}
-
-void xfrm_policy_cache_flush(void)
-{
- struct xfrm_dst *old;
- bool found = false;
- int cpu;
-
- might_sleep();
-
- local_bh_disable();
- rcu_read_lock();
- for_each_possible_cpu(cpu) {
- old = per_cpu(xfrm_last_dst, cpu);
- if (old && !xfrm_bundle_ok(old)) {
- if (smp_processor_id() == cpu) {
- __xfrm_pcpu_work_fn();
- continue;
- }
- found = true;
- break;
- }
- }
-
- rcu_read_unlock();
- local_bh_enable();
-
- if (!found)
- return;
-
- get_online_cpus();
-
- for_each_possible_cpu(cpu) {
- bool bundle_release;
-
- rcu_read_lock();
- old = per_cpu(xfrm_last_dst, cpu);
- bundle_release = old && !xfrm_bundle_ok(old);
- rcu_read_unlock();
-
- if (!bundle_release)
- continue;
-
- if (cpu_online(cpu)) {
- schedule_work_on(cpu, &xfrm_pcpu_work[cpu]);
- continue;
- }
-
- rcu_read_lock();
- old = per_cpu(xfrm_last_dst, cpu);
- if (old && !xfrm_bundle_ok(old)) {
- per_cpu(xfrm_last_dst, cpu) = NULL;
- dst_release(&old->u.dst);
- }
- rcu_read_unlock();
- }
-
- put_online_cpus();
-}
-
-static bool xfrm_xdst_can_reuse(struct xfrm_dst *xdst,
- struct xfrm_state * const xfrm[],
- int num)
-{
- const struct dst_entry *dst = &xdst->u.dst;
- int i;
-
- if (xdst->num_xfrms != num)
- return false;
-
- for (i = 0; i < num; i++) {
- if (!dst || dst->xfrm != xfrm[i])
- return false;
- dst = xfrm_dst_child(dst);
- }
-
- return xfrm_bundle_ok(xdst);
-}
-
static struct xfrm_dst *
xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
const struct flowi *fl, u16 family,
@@ -1824,34 +1745,21 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
struct net *net = xp_net(pols[0]);
struct xfrm_state *xfrm[XFRM_MAX_DEPTH];
struct xfrm_dst *bundle[XFRM_MAX_DEPTH];
- struct xfrm_dst *xdst, *old;
+ struct xfrm_dst *xdst;
struct dst_entry *dst;
int err;
/* Try to instantiate a bundle */
err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family);
if (err <= 0) {
- if (err != 0 && err != -EAGAIN)
+ if (err == 0)
+ return NULL;
+
+ if (err != -EAGAIN)
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR);
return ERR_PTR(err);
}
- xdst = this_cpu_read(xfrm_last_dst);
- if (xdst &&
- xdst->u.dst.dev == dst_orig->dev &&
- xdst->num_pols == num_pols &&
- memcmp(xdst->pols, pols,
- sizeof(struct xfrm_policy *) * num_pols) == 0 &&
- xfrm_xdst_can_reuse(xdst, xfrm, err)) {
- dst_hold(&xdst->u.dst);
- xfrm_pols_put(pols, num_pols);
- while (err > 0)
- xfrm_state_put(xfrm[--err]);
- return xdst;
- }
-
- old = xdst;
-
dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig);
if (IS_ERR(dst)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR);
@@ -1864,9 +1772,6 @@ xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols,
memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols);
xdst->policy_genid = atomic_read(&pols[0]->genid);
- atomic_set(&xdst->u.dst.__refcnt, 2);
- xfrm_last_dst_update(xdst, old);
-
return xdst;
}
@@ -2047,8 +1952,10 @@ free_dst:
goto out;
}
-static struct xfrm_dst *
-xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo)
+static struct xfrm_dst *xfrm_bundle_lookup(struct net *net,
+ const struct flowi *fl,
+ u16 family, u8 dir,
+ struct xfrm_flo *xflo, u32 if_id)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
int num_pols = 0, num_xfrms = 0, err;
@@ -2057,7 +1964,7 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
/* Resolve policies to use if we couldn't get them from
* previous cache entry */
num_pols = 1;
- pols[0] = xfrm_policy_lookup(net, fl, family, dir);
+ pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id);
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
@@ -2067,13 +1974,15 @@ xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir,
if (num_xfrms <= 0)
goto make_dummy_bundle;
- local_bh_disable();
xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family,
xflo->dst_orig);
- local_bh_enable();
-
if (IS_ERR(xdst)) {
err = PTR_ERR(xdst);
+ if (err == -EREMOTE) {
+ xfrm_pols_put(pols, num_pols);
+ return NULL;
+ }
+
if (err != -EAGAIN)
goto error;
goto make_dummy_bundle;
@@ -2123,14 +2032,19 @@ static struct dst_entry *make_blackhole(struct net *net, u16 family,
return ret;
}
-/* Main function: finds/creates a bundle for given flow.
+/* Finds/creates a bundle for given flow and if_id
*
* At the moment we eat a raw IP route. Mostly to speed up lookups
* on interfaces with disabled IPsec.
+ *
+ * xfrm_lookup uses an if_id of 0 by default, and is provided for
+ * compatibility
*/
-struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
- const struct flowi *fl,
- const struct sock *sk, int flags)
+struct dst_entry *xfrm_lookup_with_ifid(struct net *net,
+ struct dst_entry *dst_orig,
+ const struct flowi *fl,
+ const struct sock *sk,
+ int flags, u32 if_id)
{
struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX];
struct xfrm_dst *xdst;
@@ -2146,7 +2060,8 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
sk = sk_const_to_full_sk(sk);
if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
num_pols = 1;
- pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family);
+ pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family,
+ if_id);
err = xfrm_expand_policies(fl, family, pols,
&num_pols, &num_xfrms);
if (err < 0)
@@ -2158,15 +2073,16 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
goto no_transform;
}
- local_bh_disable();
xdst = xfrm_resolve_and_create_bundle(
pols, num_pols, fl,
family, dst_orig);
- local_bh_enable();
if (IS_ERR(xdst)) {
xfrm_pols_put(pols, num_pols);
err = PTR_ERR(xdst);
+ if (err == -EREMOTE)
+ goto nopol;
+
goto dropdst;
} else if (xdst == NULL) {
num_xfrms = 0;
@@ -2189,7 +2105,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
!net->xfrm.policy_count[XFRM_POLICY_OUT])
goto nopol;
- xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo);
+ xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id);
if (xdst == NULL)
goto nopol;
if (IS_ERR(xdst)) {
@@ -2234,7 +2150,7 @@ no_transform:
}
for (i = 0; i < num_pols; i++)
- pols[i]->curlft.use_time = get_seconds();
+ pols[i]->curlft.use_time = ktime_get_real_seconds();
if (num_xfrms < 0) {
/* Prohibit the flow */
@@ -2270,6 +2186,19 @@ dropdst:
xfrm_pols_put(pols, drop_pols);
return ERR_PTR(err);
}
+EXPORT_SYMBOL(xfrm_lookup_with_ifid);
+
+/* Main function: finds/creates a bundle for given flow.
+ *
+ * At the moment we eat a raw IP route. Mostly to speed up lookups
+ * on interfaces with disabled IPsec.
+ */
+struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
+ const struct flowi *fl, const struct sock *sk,
+ int flags)
+{
+ return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0);
+}
EXPORT_SYMBOL(xfrm_lookup);
/* Callers of xfrm_lookup_route() must ensure a call to dst_output().
@@ -2286,6 +2215,9 @@ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig,
if (IS_ERR(dst) && PTR_ERR(dst) == -EREMOTE)
return make_blackhole(net, dst_orig->ops->family, dst_orig);
+ if (IS_ERR(dst))
+ dst_release(dst_orig);
+
return dst;
}
EXPORT_SYMBOL(xfrm_lookup_route);
@@ -2365,6 +2297,7 @@ int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl,
return -EAFNOSUPPORT;
afinfo->decode_session(skb, fl, reverse);
+
err = security_xfrm_decode_session(skb, &fl->flowi_secid);
rcu_read_unlock();
return err;
@@ -2395,6 +2328,19 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
int reverse;
struct flowi fl;
int xerr_idx = -1;
+ const struct xfrm_if_cb *ifcb;
+ struct xfrm_if *xi;
+ u32 if_id = 0;
+
+ rcu_read_lock();
+ ifcb = xfrm_if_get_cb();
+
+ if (ifcb) {
+ xi = ifcb->decode_session(skb);
+ if (xi)
+ if_id = xi->p.if_id;
+ }
+ rcu_read_unlock();
reverse = dir & ~XFRM_POLICY_MASK;
dir &= XFRM_POLICY_MASK;
@@ -2422,7 +2368,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
pol = NULL;
sk = sk_to_full_sk(sk);
if (sk && sk->sk_policy[dir]) {
- pol = xfrm_sk_policy_lookup(sk, dir, &fl, family);
+ pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id);
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
return 0;
@@ -2430,7 +2376,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
}
if (!pol)
- pol = xfrm_policy_lookup(net, &fl, family, dir);
+ pol = xfrm_policy_lookup(net, &fl, family, dir, if_id);
if (IS_ERR(pol)) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
@@ -2446,7 +2392,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
return 1;
}
- pol->curlft.use_time = get_seconds();
+ pol->curlft.use_time = ktime_get_real_seconds();
pols[0] = pol;
npols++;
@@ -2454,13 +2400,13 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) {
pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN,
&fl, family,
- XFRM_POLICY_IN);
+ XFRM_POLICY_IN, if_id);
if (pols[1]) {
if (IS_ERR(pols[1])) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR);
return 0;
}
- pols[1]->curlft.use_time = get_seconds();
+ pols[1]->curlft.use_time = ktime_get_real_seconds();
npols++;
}
}
@@ -2819,6 +2765,21 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
+void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb)
+{
+ spin_lock(&xfrm_if_cb_lock);
+ rcu_assign_pointer(xfrm_if_cb, ifcb);
+ spin_unlock(&xfrm_if_cb_lock);
+}
+EXPORT_SYMBOL(xfrm_if_register_cb);
+
+void xfrm_if_unregister_cb(void)
+{
+ RCU_INIT_POINTER(xfrm_if_cb, NULL);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL(xfrm_if_unregister_cb);
+
#ifdef CONFIG_XFRM_STATISTICS
static int __net_init xfrm_statistics_init(struct net *net)
{
@@ -2986,19 +2947,13 @@ static struct pernet_operations __net_initdata xfrm_net_ops = {
void __init xfrm_init(void)
{
- int i;
-
- xfrm_pcpu_work = kmalloc_array(NR_CPUS, sizeof(*xfrm_pcpu_work),
- GFP_KERNEL);
- BUG_ON(!xfrm_pcpu_work);
-
- for (i = 0; i < NR_CPUS; i++)
- INIT_WORK(&xfrm_pcpu_work[i], xfrm_pcpu_work_fn);
-
register_pernet_subsys(&xfrm_net_ops);
xfrm_dev_init();
seqcount_init(&xfrm_policy_hash_generation);
xfrm_input_init();
+
+ RCU_INIT_POINTER(xfrm_if_cb, NULL);
+ synchronize_rcu();
}
#ifdef CONFIG_AUDITSYSCALL
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 8308281f3253..b669262682c9 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -475,8 +475,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
{
struct tasklet_hrtimer *thr = container_of(me, struct tasklet_hrtimer, timer);
struct xfrm_state *x = container_of(thr, struct xfrm_state, mtimer);
- unsigned long now = get_seconds();
- long next = LONG_MAX;
+ time64_t now = ktime_get_real_seconds();
+ time64_t next = TIME64_MAX;
int warn = 0;
int err = 0;
@@ -537,7 +537,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me)
if (warn)
km_state_expired(x, 0, 0);
resched:
- if (next != LONG_MAX) {
+ if (next != TIME64_MAX) {
tasklet_hrtimer_start(&x->mtimer, ktime_set(next, 0), HRTIMER_MODE_REL);
}
@@ -577,7 +577,7 @@ struct xfrm_state *xfrm_state_alloc(struct net *net)
tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler,
CLOCK_BOOTTIME, HRTIMER_MODE_ABS);
timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0);
- x->curlft.add_time = get_seconds();
+ x->curlft.add_time = ktime_get_real_seconds();
x->lft.soft_byte_limit = XFRM_INF;
x->lft.soft_packet_limit = XFRM_INF;
x->lft.hard_byte_limit = XFRM_INF;
@@ -735,10 +735,9 @@ restart:
}
out:
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
- if (cnt) {
+ if (cnt)
err = 0;
- xfrm_policy_cache_flush();
- }
+
return err;
}
EXPORT_SYMBOL(xfrm_state_flush);
@@ -931,7 +930,7 @@ struct xfrm_state *
xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
const struct flowi *fl, struct xfrm_tmpl *tmpl,
struct xfrm_policy *pol, int *err,
- unsigned short family)
+ unsigned short family, u32 if_id)
{
static xfrm_address_t saddr_wildcard = { };
struct net *net = xp_net(pol);
@@ -955,6 +954,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_state_addr_check(x, daddr, saddr, encap_family) &&
tmpl->mode == x->props.mode &&
@@ -971,6 +971,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr,
if (x->props.family == encap_family &&
x->props.reqid == tmpl->reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_addr_equal(&x->id.daddr, daddr, encap_family) &&
tmpl->mode == x->props.mode &&
@@ -1010,6 +1011,7 @@ found:
* to current session. */
xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family);
memcpy(&x->mark, &pol->mark, sizeof(x->mark));
+ x->if_id = if_id;
error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid);
if (error) {
@@ -1067,7 +1069,7 @@ out:
}
struct xfrm_state *
-xfrm_stateonly_find(struct net *net, u32 mark,
+xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id,
xfrm_address_t *daddr, xfrm_address_t *saddr,
unsigned short family, u8 mode, u8 proto, u32 reqid)
{
@@ -1080,6 +1082,7 @@ xfrm_stateonly_find(struct net *net, u32 mark,
if (x->props.family == family &&
x->props.reqid == reqid &&
(mark & x->mark.m) == x->mark.v &&
+ x->if_id == if_id &&
!(x->props.flags & XFRM_STATE_WILDRECV) &&
xfrm_state_addr_check(x, daddr, saddr, family) &&
mode == x->props.mode &&
@@ -1160,11 +1163,13 @@ static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
struct xfrm_state *x;
unsigned int h;
u32 mark = xnew->mark.v & xnew->mark.m;
+ u32 if_id = xnew->if_id;
h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family);
hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) {
if (x->props.family == family &&
x->props.reqid == reqid &&
+ x->if_id == if_id &&
(mark & x->mark.m) == x->mark.v &&
xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) &&
xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family))
@@ -1187,7 +1192,7 @@ EXPORT_SYMBOL(xfrm_state_insert);
static struct xfrm_state *__find_acq_core(struct net *net,
const struct xfrm_mark *m,
unsigned short family, u8 mode,
- u32 reqid, u8 proto,
+ u32 reqid, u32 if_id, u8 proto,
const xfrm_address_t *daddr,
const xfrm_address_t *saddr,
int create)
@@ -1242,6 +1247,7 @@ static struct xfrm_state *__find_acq_core(struct net *net,
x->props.family = family;
x->props.mode = mode;
x->props.reqid = reqid;
+ x->if_id = if_id;
x->mark.v = m->v;
x->mark.m = m->m;
x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires;
@@ -1296,7 +1302,7 @@ int xfrm_state_add(struct xfrm_state *x)
if (use_spi && !x1)
x1 = __find_acq_core(net, &x->mark, family, x->props.mode,
- x->props.reqid, x->id.proto,
+ x->props.reqid, x->if_id, x->id.proto,
&x->id.daddr, &x->props.saddr, 0);
__xfrm_state_bump_genids(x);
@@ -1395,6 +1401,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig,
x->props.flags = orig->props.flags;
x->props.extra_flags = orig->props.extra_flags;
+ x->if_id = orig->if_id;
x->tfcpad = orig->tfcpad;
x->replay_maxdiff = orig->replay_maxdiff;
x->replay_maxage = orig->replay_maxage;
@@ -1554,6 +1561,19 @@ out:
if (x1->curlft.use_time)
xfrm_state_check_expire(x1);
+ if (x->props.smark.m || x->props.smark.v || x->if_id) {
+ spin_lock_bh(&net->xfrm.xfrm_state_lock);
+
+ if (x->props.smark.m || x->props.smark.v)
+ x1->props.smark = x->props.smark;
+
+ if (x->if_id)
+ x1->if_id = x->if_id;
+
+ __xfrm_state_bump_genids(x1);
+ spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+ }
+
err = 0;
x->km.state = XFRM_STATE_DEAD;
__xfrm_state_put(x);
@@ -1571,7 +1591,7 @@ EXPORT_SYMBOL(xfrm_state_update);
int xfrm_state_check_expire(struct xfrm_state *x)
{
if (!x->curlft.use_time)
- x->curlft.use_time = get_seconds();
+ x->curlft.use_time = ktime_get_real_seconds();
if (x->curlft.bytes >= x->lft.hard_byte_limit ||
x->curlft.packets >= x->lft.hard_packet_limit) {
@@ -1619,13 +1639,13 @@ EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
struct xfrm_state *
xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid,
- u8 proto, const xfrm_address_t *daddr,
+ u32 if_id, u8 proto, const xfrm_address_t *daddr,
const xfrm_address_t *saddr, int create, unsigned short family)
{
struct xfrm_state *x;
spin_lock_bh(&net->xfrm.xfrm_state_lock);
- x = __find_acq_core(net, mark, family, mode, reqid, proto, daddr, saddr, create);
+ x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create);
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
return x;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 080035f056d9..4791aa8b8185 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -527,6 +527,19 @@ static void xfrm_update_ae_params(struct xfrm_state *x, struct nlattr **attrs,
x->replay_maxdiff = nla_get_u32(rt);
}
+static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m)
+{
+ if (attrs[XFRMA_SET_MARK]) {
+ m->v = nla_get_u32(attrs[XFRMA_SET_MARK]);
+ if (attrs[XFRMA_SET_MARK_MASK])
+ m->m = nla_get_u32(attrs[XFRMA_SET_MARK_MASK]);
+ else
+ m->m = 0xffffffff;
+ } else {
+ m->v = m->m = 0;
+ }
+}
+
static struct xfrm_state *xfrm_state_construct(struct net *net,
struct xfrm_usersa_info *p,
struct nlattr **attrs,
@@ -579,8 +592,10 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
xfrm_mark_get(attrs, &x->mark);
- if (attrs[XFRMA_OUTPUT_MARK])
- x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]);
+ xfrm_smark_init(attrs, &x->props.smark);
+
+ if (attrs[XFRMA_IF_ID])
+ x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]);
if (err)
@@ -824,6 +839,18 @@ static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
return 0;
}
+static int xfrm_smark_put(struct sk_buff *skb, struct xfrm_mark *m)
+{
+ int ret = 0;
+
+ if (m->v | m->m) {
+ ret = nla_put_u32(skb, XFRMA_SET_MARK, m->v);
+ if (!ret)
+ ret = nla_put_u32(skb, XFRMA_SET_MARK_MASK, m->m);
+ }
+ return ret;
+}
+
/* Don't change this without updating xfrm_sa_len! */
static int copy_to_user_state_extra(struct xfrm_state *x,
struct xfrm_usersa_info *p,
@@ -887,6 +914,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
ret = xfrm_mark_put(skb, &x->mark);
if (ret)
goto out;
+
+ ret = xfrm_smark_put(skb, &x->props.smark);
+ if (ret)
+ goto out;
+
if (x->replay_esn)
ret = nla_put(skb, XFRMA_REPLAY_ESN_VAL,
xfrm_replay_state_esn_len(x->replay_esn),
@@ -900,8 +932,8 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
ret = copy_user_offload(&x->xso, skb);
if (ret)
goto out;
- if (x->props.output_mark) {
- ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark);
+ if (x->if_id) {
+ ret = nla_put_u32(skb, XFRMA_IF_ID, x->if_id);
if (ret)
goto out;
}
@@ -1025,10 +1057,12 @@ static inline int xfrm_nlmsg_multicast(struct net *net, struct sk_buff *skb,
{
struct sock *nlsk = rcu_dereference(net->xfrm.nlsk);
- if (nlsk)
- return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
- else
- return -1;
+ if (!nlsk) {
+ kfree_skb(skb);
+ return -EPIPE;
+ }
+
+ return nlmsg_multicast(nlsk, skb, pid, group, GFP_ATOMIC);
}
static inline unsigned int xfrm_spdinfo_msgsize(void)
@@ -1253,6 +1287,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
int err;
u32 mark;
struct xfrm_mark m;
+ u32 if_id = 0;
p = nlmsg_data(nlh);
err = verify_spi_info(p->info.id.proto, p->min, p->max);
@@ -1265,6 +1300,10 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
x = NULL;
mark = xfrm_mark_get(attrs, &m);
+
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->info.seq) {
x = xfrm_find_acq_byseq(net, mark, p->info.seq);
if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) {
@@ -1275,7 +1314,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh,
if (!x)
x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid,
- p->info.id.proto, daddr,
+ if_id, p->info.id.proto, daddr,
&p->info.saddr, 1,
family);
err = -ENOENT;
@@ -1563,6 +1602,9 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us
xfrm_mark_get(attrs, &xp->mark);
+ if (attrs[XFRMA_IF_ID])
+ xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
return xp;
error:
*errp = err;
@@ -1671,9 +1713,11 @@ static inline unsigned int userpolicy_type_attrsize(void)
#ifdef CONFIG_XFRM_SUB_POLICY
static int copy_to_user_policy_type(u8 type, struct sk_buff *skb)
{
- struct xfrm_userpolicy_type upt = {
- .type = type,
- };
+ struct xfrm_userpolicy_type upt;
+
+ /* Sadly there are two holes in struct xfrm_userpolicy_type */
+ memset(&upt, 0, sizeof(upt));
+ upt.type = type;
return nla_put(skb, XFRMA_POLICY_TYPE, sizeof(upt), &upt);
}
@@ -1708,6 +1752,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -1789,6 +1835,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
int delete;
struct xfrm_mark m;
u32 mark = xfrm_mark_get(attrs, &m);
+ u32 if_id = 0;
p = nlmsg_data(nlh);
delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
@@ -1801,8 +1848,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->index)
- xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, delete, &err);
+ xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, delete, &err);
else {
struct nlattr *rt = attrs[XFRMA_SEC_CTX];
struct xfrm_sec_ctx *ctx;
@@ -1819,7 +1869,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
}
- xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir, &p->sel,
+ xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir, &p->sel,
ctx, delete, &err);
security_xfrm_policy_free(ctx);
}
@@ -1942,6 +1992,10 @@ static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct
if (err)
goto out_cancel;
+ err = xfrm_if_id_put(skb, x->if_id);
+ if (err)
+ goto out_cancel;
+
nlmsg_end(skb, nlh);
return 0;
@@ -2084,6 +2138,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
int err = -ENOENT;
struct xfrm_mark m;
u32 mark = xfrm_mark_get(attrs, &m);
+ u32 if_id = 0;
err = copy_from_user_policy_type(&type, attrs);
if (err)
@@ -2093,8 +2148,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
+ if (attrs[XFRMA_IF_ID])
+ if_id = nla_get_u32(attrs[XFRMA_IF_ID]);
+
if (p->index)
- xp = xfrm_policy_byid(net, mark, type, p->dir, p->index, 0, &err);
+ xp = xfrm_policy_byid(net, mark, if_id, type, p->dir, p->index, 0, &err);
else {
struct nlattr *rt = attrs[XFRMA_SEC_CTX];
struct xfrm_sec_ctx *ctx;
@@ -2111,7 +2169,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err)
return err;
}
- xp = xfrm_policy_bysel_ctx(net, mark, type, p->dir,
+ xp = xfrm_policy_bysel_ctx(net, mark, if_id, type, p->dir,
&p->sel, ctx, 0, &err);
security_xfrm_policy_free(ctx);
}
@@ -2493,7 +2551,9 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
[XFRMA_PROTO] = { .type = NLA_U8 },
[XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) },
[XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) },
- [XFRMA_OUTPUT_MARK] = { .type = NLA_U32 },
+ [XFRMA_SET_MARK] = { .type = NLA_U32 },
+ [XFRMA_SET_MARK_MASK] = { .type = NLA_U32 },
+ [XFRMA_IF_ID] = { .type = NLA_U32 },
};
static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2625,6 +2685,10 @@ static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct
if (err)
return err;
+ err = xfrm_if_id_put(skb, x->if_id);
+ if (err)
+ return err;
+
nlmsg_end(skb, nlh);
return 0;
}
@@ -2719,8 +2783,12 @@ static inline unsigned int xfrm_sa_len(struct xfrm_state *x)
l += nla_total_size(sizeof(x->props.extra_flags));
if (x->xso.dev)
l += nla_total_size(sizeof(x->xso));
- if (x->props.output_mark)
- l += nla_total_size(sizeof(x->props.output_mark));
+ if (x->props.smark.v | x->props.smark.m) {
+ l += nla_total_size(sizeof(x->props.smark.v));
+ l += nla_total_size(sizeof(x->props.smark.m));
+ }
+ if (x->if_id)
+ l += nla_total_size(sizeof(x->if_id));
/* Must count x->lastused as it may become non-zero behind our back. */
l += nla_total_size_64bit(sizeof(u64));
@@ -2850,6 +2918,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -2966,6 +3036,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err) {
nlmsg_cancel(skb, nlh);
return err;
@@ -3047,6 +3119,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e
err = copy_to_user_policy_type(xp->type, skb);
if (!err)
err = xfrm_mark_put(skb, &xp->mark);
+ if (!err)
+ err = xfrm_if_id_put(skb, xp->if_id);
if (err)
goto out_free_skb;
@@ -3280,4 +3354,3 @@ module_init(xfrm_user_init);
module_exit(xfrm_user_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_XFRM);
-